In [1]:
from dotenv import load_dotenv
import os
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.llms import OpenAI
from serpapi import GoogleSearch

load_dotenv()

OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
SERPAPI_KEY = os.getenv("SERPAPI_KEY")
print(OPENAI_API_KEY[0:6], SERPAPI_KEY[0:6])

sk-859 24ada9


In [19]:
import requests
import json

search = GoogleSearch(
    {
        "q": "Swift Logistics + website",
        "engine": "google",
        "location": "Austin, Texas",
        "api_key": SERPAPI_KEY,
    }
)
response = search.get_dict()


In [20]:
response["organic_results"][0]["link"]

'https://www.swiftlogistics.com/'

In [4]:

llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0)



In [5]:



template = """
    I'll give you three inputs. These inputs will be the name of the company, 
    the country of the company, and the website company. The website of the company
    is not mandatory.
    You have to give me the products and services that the company offers as output.
    you dont need to give me nothing more than the ouput.



    input:
    IKEA Deutschland GmbH & Co. KG
    Germany
    ikea.com

    the output must be in this format, please use it:
    "Products/services": Furniture, Home decor, Kitchen and Dining;
    "Keywords":furniture, storage, lighting;
    "Company Classification":5712 (Furniture Stores) – SIC, 442110 (Furniture Stores) – NAICS
    do it yourself now.
    input:
    {name_of_company}
    {country_of_company}
    {website_of_company}

    what is the output?
"""

prompt = PromptTemplate(
    input_variables=["name_of_company", "country_of_company", "website_of_company"],
    template=template,
)

In [6]:

chain = LLMChain(llm=llm, prompt=prompt)
output = chain.run({"name_of_company": "Google", 
           "country_of_company": "United States", 
           "website_of_company":"google.com"

})

In [7]:
print(output)

"Products/services": Search engine, Advertising, Cloud computing, Software;
    "Keywords": Search, Advertising, Cloud, Software, Technology;
    "Company Classification": 7370 (Computer Programming, Data Processing, and Related Services) – SIC, 511210 (Software Publishers) – NAICS.


In [None]:
#query de busca no serpapi:
#nomedaempresa + paisdaempresa + palavrachave1 + palavrachave2 + palavrachave3

In [8]:
def parse_output(output_langchain: str) -> dict:
  """ 
  Will parse the output_langchain of the Langchain query
  
  Args:
    output(str): The output_langchain of Langchain query.

  Returns:
    result_dict(dict): the parsed output_langchain to dict
  """

  result_dict = {}

  sections = [section.strip() for section in output.split(";")]

  for section in sections:
      if section:
          header, values_str = section.split(":")
          header = header.strip('"')
          values = [value.strip() for value in values_str.strip('[]').split(",")]

          result_dict[header] = values

  return result_dict

In [9]:
parsed = parse_output(output)

In [11]:
parsed["images"] = "test"
parsed

{'Products/services': ['Search engine',
  'Advertising',
  'Cloud computing',
  'Software'],
 'Keywords': ['Search', 'Advertising', 'Cloud', 'Software', 'Technology'],
 'Company Classification': ['7370 (Computer Programming',
  'Data Processing',
  'and Related Services) – SIC',
  '511210 (Software Publishers) – NAICS.'],
 'images': 'test'}

In [8]:
parsed["Products/services"]

['Search engine', 'Advertising', 'Cloud computing', 'Software']

# Teste com SerpAPI

In [9]:
#!pip install google-search-results


In [19]:
search = GoogleSearch({
    "q": "Innoscripta + Germany + Innovation consulting + Digital transformation + Product development ",
    "engine": "google_images",
    "location": "Austin, Texas",
    "api_key": SERPAPI_KEY
})

In [12]:
search

<serpapi.google_search.GoogleSearch at 0x7f240b485190>

In [27]:
response = search.get_dict()
print(response["images_results"][2])

{'position': 3, 'thumbnail': 'https://serpapi.com/searches/64657183f0adfbcd97e4a773/images/4f3abf25eaa5fccfe017f296d62f7f3cb3cc25ed527034b09f8049b5aab91d65.jpeg', 'related_content_id': 'Y1NsQlBId0NjZE04eE1cIixcIjdxUlFlNWdFc1d1V3NN', 'serpapi_related_content_link': 'https://serpapi.com/search.json?engine=google_images_related_content&gl=us&hl=en&q=Innoscripta+%2B+Germany+%2B+Innovation+consulting+%2B+Digital+transformation+%2B+Product+development+&related_content_id=Y1NsQlBId0NjZE04eE1cIixcIjdxUlFlNWdFc1d1V3NN', 'source': 'innoscripta', 'title': 'Who we are looking for: What we offer: Your tasks: Are you interested?  Apply now!', 'link': 'https://it.innoscripta.com/documents/positions/innoscripta_Product_Manager_IT.pdf', 'is_product': False}


In [16]:
response.keys()

dict_keys(['search_metadata', 'search_parameters', 'search_information', 'suggested_searches', 'images_results', 'related_searches'])

In [30]:
imgs = [r.get("original", None) for r in response["images_results"][:5]]
imgs

['https://d3ml3b6vywsj0z.cloudfront.net/company_images/5ba611307c86660d3c6b2536_images.png',
 'https://image.pitchbook.com/3T2zgG15lp2J4ZRw44DGxAmhbJe1598972475402_200x200',
 None,
 'https://www.cparityevent.com/wp-content/uploads/2023/05/Web-Innoscripta-HQ-300x300.png',
 'https://www.innoscripta.com/6cc987b6a5d16e4b06b5.jpg?url']

In [26]:
def google_search(query: str) -> list:
    """
    Will query google for images based in the output of OpenAIAPI
    
    Args:
        query(str): Formated query using the output openaiAPI
    
    Results:
        imgs(list): List with URLs for images
    """
    search = GoogleSearch({
    "q": query,
    "engine": "google_images",
    "location": "Austin, Texas",
    "api_key": SERPAPI_KEY
    })
    response = search.get_dict()
    imgs = [r["original"] for r in response["images_results"][:5]]

    return imgs

In [27]:
tst = google_search("Polar bear")
tst

['https://upload.wikimedia.org/wikipedia/commons/6/66/Polar_Bear_-_Alaska_%28cropped%29.jpg',
 'https://i.natgeofe.com/k/55256f3f-2cf1-4b93-9d95-a13b0faa30a6/Mom-and-Babies_Polar-Bear_KIDS_0223-crop_3x2.jpg',
 'https://files.worldwildlife.org/wwfcmsprod/images/Polar_bear_on_ice_in_Svalbard_Norway_WW294883/story_full_width/42ny6cwj8t_Polar_bear_on_ice_in_Svalbard_Norway_WW294883.jpg',
 'https://good-nature-blog-uploads.s3.amazonaws.com/uploads/2022/07/Polar-Bear-playing-in-the-snow-by-Eddy-Savage-1280x640.png',
 'https://optimise2.assets-servd.host/maniacal-finch/production/animals/polar-bear-01-01.jpg?w=1200&auto=compress%2Cformat&fit=crop&dm=1658950229&s=92bb7b274a3ab178ae54ddf5b186306b']

In [28]:
#prompt creation -> openai query -> google query creation -> google search

def gpt_call(name: str, country: str, website: str = None) -> dict:
    """
    Will call gpt-3.5-turbo for querying informations about a company.

    Args:
        name(str): Name of the company
        country(str): Country of the company
        website(str): Website of the company

    Results:
        output(dict) = Parsed output of OpenAIAPI
    """
    if not website:
        website = ""
    llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0)

    template = """
    I'll give you three inputs. These inputs will be the name of the company, 
    the country of the company, and the website company. The website of the company
    is not mandatory.
    You have to give me the products and services that the company offers as output.
    you dont need to give me nothing more than the ouput.



    input:
    IKEA Deutschland GmbH & Co. KG
    Germany
    ikea.com

    the output must be in this format, please use it:
    "Products/services": Furniture, Home decor, Kitchen and Dining;
    "Keywords":furniture, storage, lighting;
    "Company Classification":5712 (Furniture Stores) – SIC, 442110 (Furniture Stores) – NAICS
    do it yourself now.
    input:
    {name_of_company}
    {country_of_company}
    {website_of_company}

    what is the output?
    """

    prompt = PromptTemplate(
        input_variables=["name_of_company", "country_of_company", "website_of_company"],
        template=template,
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    output = chain.run({"name_of_company": name, 
            "country_of_company": country, 
            "website_of_company":website

    })
    parsed = parse_output(output)

    return parsed


In [29]:
parsed = gpt_call(name="Google", country="United States")
print(parsed)



{'Products/services': ['Search engine', 'Advertising', 'Cloud computing', 'Software'], 'Keywords': ['Search', 'Advertising', 'Cloud', 'Software', 'Technology'], 'Company Classification': ['7370 (Computer Programming', 'Data Processing', 'and Other Computer Related Services) – SIC', '511210 (Software Publishers) – NAICS']}
{'Products/services': ['Search engine', 'Advertising', 'Cloud computing', 'Software'], 'Keywords': ['Search', 'Advertising', 'Cloud', 'Software', 'Technology'], 'Company Classification': ['7370 (Computer Programming', 'Data Processing', 'and Other Computer Related Services) – SIC', '511210 (Software Publishers) – NAICS']}


In [37]:
products_services_str = " + ".join(parsed["Products/services"])

In [43]:
def google_query_formation(name: str, country: str, products: list) -> str:
    """
    Will manipulate strings to create Google search query

    Args:
        name(str): Name of company
        countr(str): Country of company
        products(list): Products that the company offers

    Return:
        google_query(str): Google query
    """
    products_services_str = " + ".join(products)

    return " + ".join([name, country, products_services_str])

In [21]:
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.llms import OpenAI
from serpapi import GoogleSearch
class Innoscripta:
    """Class of innoscript solution"""

    def __init__(self, name: str, country: str, website: str = None):
        """
        Initialize the Innoscripta search engine.

        Args:
            name (str): name of the company
            country (str): name of the country of the company
            website (str): website of the company
        """
        self.name = name
        self.country = country
        if not website:
            print("website not inserted by client... searching")
            self.website = self.find_website(company_name=name)
            print(f"website found.... {self.website}")
        else:
            self.website = website
        self.llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0)

    def find_website(self, company_name) -> str:
        """
        Try to find the website of the company

        Args:
            company_name (str): The name of the company. 
        Returns:
            website (str): website of the company
        """
        search = GoogleSearch(
            {
                "q": f"{company_name} + website",
                "engine": "google",
                "location": "Austin, Texas",
                "api_key": SERPAPI_KEY,
            }
        )
        response = search.get_dict()

        website = response["organic_results"][0]["link"]

        if website:
            return website
        else:
            return " " 

    def website_prompt_template(self):
        template = """
        I'll provide you a name of a company, I want as result the website of this company. Please use this format:
        input: Google

        output: google.com

        If you dont know the website just answer "Not available" the output format must be:

        output: Not Available
        """

        prompt = PromptTemplate(
            input_variables=[
                "name_of_company",
                "country_of_company",
                "website_of_company",
            ],
            template=template,
        )

        return prompt

    def main(self):
        """
        Will do the innoscripta querying
        """
        parsed_gpt_ouput = self.gpt_call()
        google_query = self.google_query_formation(parsed_gpt_ouput["products_services"])
        imgs = self.google_search(google_query)
        parsed_gpt_ouput["images"] = imgs

        return parsed_gpt_ouput

    def gpt_call(self) -> dict:
        """
        Will call gpt-3.5-turbo for querying informations about a company.

        Args:
            name(str): Name of the company
            country(str): Country of the company
            website(str): Website of the company

        Results:
            output(dict) = Parsed output of OpenAIAPI
        """

        prompt = self.prompt_template()

        chain = LLMChain(llm=self.llm, prompt=prompt)
        output = chain.run(
            {
                "name_of_company": self.name,
                "country_of_company": self.country,
                "website_of_company": self.website,
            }
        )
        parsed = self.parse_output(output)

        return parsed

    def google_query_formation(self, products: list) -> str:
        """
        Will manipulate strings to create Google search query

        Return:
            google_query(str): Google query
        """
        products_services_str = " + ".join(products)

        return " + ".join([self.name, products_services_str])

    def google_search(self, query: str) -> list:
        """
        Will query google for images based in the output of OpenAIAPI

        Args:
            query(str): Formated query using the output openaiAPI

        Results:
            imgs(list): List with URLs for images
        """
        search = GoogleSearch(
            {
                "q": query,
                "engine": "google_images",
                "location": "Austin, Texas",
                "api_key": SERPAPI_KEY,
            }
        )
        response = search.get_dict()
        imgs = [r.get("original", None) for r in response["images_results"][:5]]

        return imgs

    def prompt_template(self):
        template = """
        I'll give you three inputs. These inputs will be the name of the company, 
        the country of the company, and the website company. The website of the company
        is not mandatory, so it can be just an empty string.
        If the website was not provided, gather all info you can with just name and country.
        You have to give me the products and services that the company offers as output.
        you dont need to give me nothing more than the ouput.



        input:
        IKEA Deutschland GmbH & Co. KG
        Germany
        ikea.com

        the output must be in this format, please use it:
        "products_services": Furniture, Home decor, Kitchen and Dining;
        "keywords":furniture, storage, lighting;
        "company_classification":5712 (Furniture Stores) – SIC, 442110 (Furniture Stores) – NAICS
        do it yourself now.
        input:
        {name_of_company}
        {country_of_company}
        {website_of_company}

        what is the output?
        """

        prompt = PromptTemplate(
            input_variables=[
                "name_of_company",
                "country_of_company",
                "website_of_company",
            ],
            template=template,
        )

        return prompt

    def parse_output(self, output_langchain: str) -> dict:
        """
        Will parse the output_langchain of the Langchain query

        Args:
            output(str): The output_langchain of Langchain query.

        Returns:
            result_dict(dict): the parsed output_langchain to dict
        """

        result_dict = {}

        sections = [section.strip() for section in output_langchain.split(";")]

        for section in sections:
            if section:
                header, values_str = section.split(":")
                header = header.strip('"')
                values = [value.strip() for value in values_str.strip("[]").split(",")]

                result_dict[header] = values

        return result_dict

In [22]:
inno = Innoscripta(name="Innoscripta", country="Germany", website=None)

website not inserted by client... searching
website found.... https://www.innoscripta.com/




In [23]:
tst = inno.main()

In [24]:
tst

{'products_services': ['Innovation consulting',
  'Digital transformation',
  'Product development'],
 'keywords': ['innovation',
  'consulting',
  'digital transformation',
  'product development'],
 'company_classification': ['Not applicable (Innoscripta is a consulting firm and does not have a specific SIC or NAICS code).'],
 'images': ['https://www.innosight.com/wp-content/uploads/2018/02/Figure2_700x438.png',
  'https://www.innosight.com/wp-content/uploads/2018/02/Figure3_500x358.png',
  'https://www.cparityevent.com/wp-content/uploads/2023/05/Web-Innoscripta-HQ-300x300.png',
  'https://www.innosight.com/wp-content/uploads/2018/02/Figure4_600x390.png',
  'https://www.innoscripta.com/6cc987b6a5d16e4b06b5.jpg?url']}

In [25]:
"innoscripta" in "innoscripta.com"

True