In [1]:
from dotenv import load_dotenv
import os
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.llms import OpenAI
from serpapi import GoogleSearch

load_dotenv()

OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")
SERPAPI_KEY = os.getenv("SERPAPI_KEY")
print(OPENAI_API_KEY[0:6], SERPAPI_KEY[0:6])

sk-859 00c641


In [19]:
import requests
import json

search = GoogleSearch(
    {
        "q": "Swift Logistics + website",
        "engine": "google",
        "location": "Austin, Texas",
        "api_key": SERPAPI_KEY,
    }
)
response = search.get_dict()


In [20]:
response["organic_results"][0]["link"]

'https://www.swiftlogistics.com/'

In [4]:

llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0)



In [5]:



template = """
    I'll give you three inputs. These inputs will be the name of the company, 
    the country of the company, and the website company. The website of the company
    is not mandatory.
    You have to give me the products and services that the company offers as output.
    you dont need to give me nothing more than the ouput.



    input:
    IKEA Deutschland GmbH & Co. KG
    Germany
    ikea.com

    the output must be in this format, please use it:
    "Products/services": Furniture, Home decor, Kitchen and Dining;
    "Keywords":furniture, storage, lighting;
    "Company Classification":5712 (Furniture Stores) – SIC, 442110 (Furniture Stores) – NAICS
    do it yourself now.
    input:
    {name_of_company}
    {country_of_company}
    {website_of_company}

    what is the output?
"""

prompt = PromptTemplate(
    input_variables=["name_of_company", "country_of_company", "website_of_company"],
    template=template,
)

In [6]:

chain = LLMChain(llm=llm, prompt=prompt)
output = chain.run({"name_of_company": "Google", 
           "country_of_company": "United States", 
           "website_of_company":"google.com"

})

In [7]:
print(output)

"Products/services": Search engine, Advertising, Cloud computing, Software;
    "Keywords": Search, Advertising, Cloud, Software, Technology;
    "Company Classification": 7370 (Computer Programming, Data Processing, and Related Services) – SIC, 511210 (Software Publishers) – NAICS.


In [None]:
#query de busca no serpapi:
#nomedaempresa + paisdaempresa + palavrachave1 + palavrachave2 + palavrachave3

In [8]:
def parse_output(output_langchain: str) -> dict:
  """ 
  Will parse the output_langchain of the Langchain query
  
  Args:
    output(str): The output_langchain of Langchain query.

  Returns:
    result_dict(dict): the parsed output_langchain to dict
  """

  result_dict = {}

  sections = [section.strip() for section in output.split(";")]

  for section in sections:
      if section:
          header, values_str = section.split(":")
          header = header.strip('"')
          values = [value.strip() for value in values_str.strip('[]').split(",")]

          result_dict[header] = values

  return result_dict

In [9]:
parsed = parse_output(output)

In [11]:
parsed["images"] = "test"
parsed

{'Products/services': ['Search engine',
  'Advertising',
  'Cloud computing',
  'Software'],
 'Keywords': ['Search', 'Advertising', 'Cloud', 'Software', 'Technology'],
 'Company Classification': ['7370 (Computer Programming',
  'Data Processing',
  'and Related Services) – SIC',
  '511210 (Software Publishers) – NAICS.'],
 'images': 'test'}

In [8]:
parsed["Products/services"]

['Search engine', 'Advertising', 'Cloud computing', 'Software']

# Teste com SerpAPI

In [9]:
#!pip install google-search-results


In [19]:
search = GoogleSearch({
    "q": "Innoscripta + Germany + Innovation consulting + Digital transformation + Product development ",
    "engine": "google_images",
    "location": "Austin, Texas",
    "api_key": SERPAPI_KEY
})

In [12]:
search

<serpapi.google_search.GoogleSearch at 0x7f240b485190>

In [27]:
response = search.get_dict()
print(response["images_results"][2])

{'position': 3, 'thumbnail': 'https://serpapi.com/searches/64657183f0adfbcd97e4a773/images/4f3abf25eaa5fccfe017f296d62f7f3cb3cc25ed527034b09f8049b5aab91d65.jpeg', 'related_content_id': 'Y1NsQlBId0NjZE04eE1cIixcIjdxUlFlNWdFc1d1V3NN', 'serpapi_related_content_link': 'https://serpapi.com/search.json?engine=google_images_related_content&gl=us&hl=en&q=Innoscripta+%2B+Germany+%2B+Innovation+consulting+%2B+Digital+transformation+%2B+Product+development+&related_content_id=Y1NsQlBId0NjZE04eE1cIixcIjdxUlFlNWdFc1d1V3NN', 'source': 'innoscripta', 'title': 'Who we are looking for: What we offer: Your tasks: Are you interested?  Apply now!', 'link': 'https://it.innoscripta.com/documents/positions/innoscripta_Product_Manager_IT.pdf', 'is_product': False}


In [16]:
response.keys()

dict_keys(['search_metadata', 'search_parameters', 'search_information', 'suggested_searches', 'images_results', 'related_searches'])

In [30]:
imgs = [r.get("original", None) for r in response["images_results"][:5]]
imgs

['https://d3ml3b6vywsj0z.cloudfront.net/company_images/5ba611307c86660d3c6b2536_images.png',
 'https://image.pitchbook.com/3T2zgG15lp2J4ZRw44DGxAmhbJe1598972475402_200x200',
 None,
 'https://www.cparityevent.com/wp-content/uploads/2023/05/Web-Innoscripta-HQ-300x300.png',
 'https://www.innoscripta.com/6cc987b6a5d16e4b06b5.jpg?url']

In [26]:
def google_search(query: str) -> list:
    """
    Will query google for images based in the output of OpenAIAPI
    
    Args:
        query(str): Formated query using the output openaiAPI
    
    Results:
        imgs(list): List with URLs for images
    """
    search = GoogleSearch({
    "q": query,
    "engine": "google_images",
    "location": "Austin, Texas",
    "api_key": SERPAPI_KEY
    })
    response = search.get_dict()
    imgs = [r["original"] for r in response["images_results"][:5]]

    return imgs

In [27]:
tst = google_search("Polar bear")
tst

['https://upload.wikimedia.org/wikipedia/commons/6/66/Polar_Bear_-_Alaska_%28cropped%29.jpg',
 'https://i.natgeofe.com/k/55256f3f-2cf1-4b93-9d95-a13b0faa30a6/Mom-and-Babies_Polar-Bear_KIDS_0223-crop_3x2.jpg',
 'https://files.worldwildlife.org/wwfcmsprod/images/Polar_bear_on_ice_in_Svalbard_Norway_WW294883/story_full_width/42ny6cwj8t_Polar_bear_on_ice_in_Svalbard_Norway_WW294883.jpg',
 'https://good-nature-blog-uploads.s3.amazonaws.com/uploads/2022/07/Polar-Bear-playing-in-the-snow-by-Eddy-Savage-1280x640.png',
 'https://optimise2.assets-servd.host/maniacal-finch/production/animals/polar-bear-01-01.jpg?w=1200&auto=compress%2Cformat&fit=crop&dm=1658950229&s=92bb7b274a3ab178ae54ddf5b186306b']

In [28]:
#prompt creation -> openai query -> google query creation -> google search

def gpt_call(name: str, country: str, website: str = None) -> dict:
    """
    Will call gpt-3.5-turbo for querying informations about a company.

    Args:
        name(str): Name of the company
        country(str): Country of the company
        website(str): Website of the company

    Results:
        output(dict) = Parsed output of OpenAIAPI
    """
    if not website:
        website = ""
    llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0)

    template = """
    I'll give you three inputs. These inputs will be the name of the company, 
    the country of the company, and the website company. The website of the company
    is not mandatory.
    You have to give me the products and services that the company offers as output.
    you dont need to give me nothing more than the ouput.



    input:
    IKEA Deutschland GmbH & Co. KG
    Germany
    ikea.com

    the output must be in this format, please use it:
    "Products/services": Furniture, Home decor, Kitchen and Dining;
    "Keywords":furniture, storage, lighting;
    "Company Classification":5712 (Furniture Stores) – SIC, 442110 (Furniture Stores) – NAICS
    do it yourself now.
    input:
    {name_of_company}
    {country_of_company}
    {website_of_company}

    what is the output?
    """

    prompt = PromptTemplate(
        input_variables=["name_of_company", "country_of_company", "website_of_company"],
        template=template,
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    output = chain.run({"name_of_company": name, 
            "country_of_company": country, 
            "website_of_company":website

    })
    parsed = parse_output(output)

    return parsed


In [29]:
parsed = gpt_call(name="Google", country="United States")
print(parsed)



{'Products/services': ['Search engine', 'Advertising', 'Cloud computing', 'Software'], 'Keywords': ['Search', 'Advertising', 'Cloud', 'Software', 'Technology'], 'Company Classification': ['7370 (Computer Programming', 'Data Processing', 'and Other Computer Related Services) – SIC', '511210 (Software Publishers) – NAICS']}
{'Products/services': ['Search engine', 'Advertising', 'Cloud computing', 'Software'], 'Keywords': ['Search', 'Advertising', 'Cloud', 'Software', 'Technology'], 'Company Classification': ['7370 (Computer Programming', 'Data Processing', 'and Other Computer Related Services) – SIC', '511210 (Software Publishers) – NAICS']}


In [37]:
products_services_str = " + ".join(parsed["Products/services"])

In [43]:
def google_query_formation(name: str, country: str, products: list) -> str:
    """
    Will manipulate strings to create Google search query

    Args:
        name(str): Name of company
        countr(str): Country of company
        products(list): Products that the company offers

    Return:
        google_query(str): Google query
    """
    products_services_str = " + ".join(products)

    return " + ".join([name, country, products_services_str])

In [51]:
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.llms import OpenAI
from serpapi import GoogleSearch
import logging

logger = logging.getLogger(__name__)
class Innoscripta:
    """Class of innoscript solution"""

    def __init__(self, name: str, country: str, website: str = None):
        """
        Initialize the Innoscripta search engine.

        Args:
            name (str): name of the company
            country (str): name of the country of the company
            website (str): website of the company
        """
        self.name = name
        self.country = country
        if not website:
            logger.info("Website not provided, searching for it....")
            self.website = self.find_website(company_name=name)
            logger.info(f"Found website! {self.website}")
        else:
            self.website = website
        self.llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0)

    def find_website(self, company_name) -> str:
        """
        Try to find the website of the company

        Args:
            company_name (str): The name of the company.
        Returns:
            website (str): website of the company
        """
        search = GoogleSearch(
            {
                "q": f"{company_name} + website",
                "engine": "google",
                "location": "Austin, Texas",
                "api_key": SERPAPI_KEY,
            }
        )
        response = search.get_dict()

        website = response["organic_results"][0]["link"]

        if website:
            return website
        else:
            return " "

    def main(self):
        """
        Will do the innoscripta querying
        """
        logger.info("Doing GPT search...")
        parsed_gpt_ouput = self.gpt_call()
        logger.info("Doing Google search...")
        google_query = self.google_query_formation(
            parsed_gpt_ouput["products_services"]
        )
        imgs = self.google_search(google_query)
        parsed_gpt_ouput["images"] = imgs
        # parsed_gpt_ouput["additional_informations"] = [
        #     txt.replace("\n", " ")
        #     for txt in parsed_gpt_ouput["additional_informations"]
        # ]
        # parsed_gpt_ouput["additional_informations"] = "".join(
        #     parsed_gpt_ouput["additional_informations"]
        # )
        return parsed_gpt_ouput

    def gpt_call(self) -> dict:
        """
        Will call gpt-3.5-turbo for querying informations about a company.

        Args:
            name(str): Name of the company
            country(str): Country of the company
            website(str): Website of the company

        Results:
            output(dict) = Parsed output of OpenAIAPI
        """

        prompt = self.prompt_template()

        chain = LLMChain(llm=self.llm, prompt=prompt)
        output = chain.run(
            {
                "name_of_company": self.name,
                "country_of_company": self.country,
                "website_of_company": self.website,
            }
        )
        parsed = self.parse_output(output)

        return parsed

    def google_query_formation(self, products: list) -> str:
        """
        Will manipulate strings to create Google search query

        Return:
            google_query(str): Google query
        """
        products_services_str = " + ".join(products)

        return " + ".join([self.name, products_services_str])

    def google_search(self, query: str) -> list:
        """
        Will query google for images based in the output of OpenAIAPI

        Args:
            query(str): Formated query using the output openaiAPI

        Results:
            imgs(list): List with URLs for images
        """
        search = GoogleSearch(
            {
                "q": query,
                "engine": "google_images",
                "location": "Austin, Texas",
                "api_key": SERPAPI_KEY,
            }
        )
        response = search.get_dict()
        imgs = [r.get("original", None) for r in response["images_results"][:5]]

        return imgs

    def prompt_template(self):
        template = """
        I'll give you three inputs. These inputs will be the name of the company, 
        the country of the company, and the website company. The website of the company
        is not mandatory, so it can be just an empty string.
        If the website was not provided, gather all info you can with just name and country.
        You have to give me the products and services that the company offers as output.
        you dont need to give me nothing more than the ouput.

        input:
        IKEA Deutschland GmbH & Co. KG
        Germany
        ikea.com

        the output must be in this format, please use it:
        "products_services": Furniture, Home decor, Kitchen and Dining;
        "keywords":furniture, storage, lighting;
        "company_classification":5712 (Furniture Stores) – SIC, 442110 (Furniture Stores) – NAICS;
        "additional_informations":
            "Furniture" = "IKEA is well-known for its wide range of stylish and affordable furniture. They offer various furniture pieces for every room in the home, including living room, bedroom, kitchen, dining, and outdoor furniture. Their products feature modern designs, functionality, and often come in flat-pack form for easy transportation and assembly.",
            "Home Decor and Accessories" =  "In addition to furniture, IKEA provides a variety of home decor and accessories to enhance the style and functionality of living spaces. This includes items such as rugs, curtains, lighting fixtures, mirrors, frames, plants, and decorative storage solutions.",
            "Kitchen and Dining" = "IKEA offers a comprehensive range of kitchen and dining products. This includes kitchen cabinets, countertops, appliances, cookware, utensils, tableware, and dining furniture. They provide solutions for various kitchen styles, sizes, and budgets.",
            "Storage and Organization" = "IKEA specializes in storage and organization solutions to help keep homes tidy and efficient. They offer a wide selection of shelves, storage units, wardrobes, drawers, and closet systems. These products are designed to maximize space and provide smart storage solutions.",
            "Bedroom Furniture and Mattresses"= "IKEA provides bedroom furniture and mattresses that cater to different preferences and needs. They offer beds, bed frames, mattresses, dressers, wardrobes, and bedding accessories. Their products focus on comfort, functionality, and innovative design.",
            "Bathroom Furnishings" = "For bathrooms, IKEA offers a range of furnishings and accessories, including vanities, cabinets, sinks, faucets, showers, storage solutions, and bathroom textiles. These products aim to optimize space utilization and create a stylish and functional bathroom environment.",
            "Children's Furniture and Toys" = "IKEA features a variety of furniture and toys designed specifically for children. They offer children's beds, desks, storage systems, playroom furniture, toys, and decor items. Their products prioritize safety, durability, and imaginative play.",
            "Textiles and Fabrics" = "IKEA provides an assortment of textiles and fabrics for home decor, including curtains, blinds, rugs, cushions, bedding, and fabrics by the yard. They offer a wide selection of colors, patterns, and materials to suit different styles and preferences.",
            "Smart Home Solutions" = "In line with the growing trend of smart homes, IKEA offers smart home solutions such as lighting systems, wireless chargers, smart plugs, and integrated furniture with built-in technology. These products aim to enhance convenience, energy efficiency, and connectivity in the home.",
            "Home Delivery and Assembly Services" = "IKEA offers home delivery services to bring purchased products directly to customers' homes. They also provide assembly services, where IKEA's professionals can assemble the furniture and ensure it is ready to use.",


        do it yourself now.
        input:
        {name_of_company}
        {country_of_company}
        {website_of_company}

        what is the output?
        """

        prompt = PromptTemplate(
            input_variables=[
                "name_of_company",
                "country_of_company",
                "website_of_company",
            ],
            template=template,
        )

        return prompt

    def parse_output(self, output_langchain: str) -> dict:
        """
        Will parse the output_langchain of the Langchain query

        Args:
            output(str): The output_langchain of Langchain query.

        Returns:
            result_dict(dict): the parsed output_langchain to dict
        """

        result_dict = {}

        sections = [section.strip() for section in output_langchain.split(";")]

        for section in sections:
            print(f"---->{section}")
            if section:
                if "additional_informations" in section:
                    print("---> oi porra")
                    header = "additional_informations"
                    values = self.parse_additional_info(section)
                else:
                    header, values_str = section.split(":")
                    header = header.strip('"')
                    values = [value.strip() for value in values_str.strip("[]").split(",")]

                result_dict[header] = values

        return result_dict

    def parse_additional_info(self, data):
        # Parse to dict
        additional_informations = {}

        # Split into lines and iterate
        for line in data.split('\n'):
            # Ignore lines without '='
            if '=' not in line:
                continue

            # Split line into key-value pair
            key, value = line.split('=', 1)

            # Remove unwanted characters from key and value
            key = key.replace('"', '').strip()
            value = value.replace('"', '').strip()

            # Add to dictionary
            additional_informations[key] = value
        return additional_informations

In [52]:
inno = Innoscripta(name="Innoscripta", country="Germany", website=None)

In [53]:
tst = inno.main()

Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


---->"products_services": Digital transformation, Software development, IT consulting
---->"keywords":digital transformation, software development, IT consulting, innovation, technology
---->"company_classification":62020 (Information Technology (IT) Consulting) – NAICS
---->"additional_informations":
            "Digital Transformation" = "Innoscripta offers digital transformation services to help businesses adapt to the changing digital landscape. They provide solutions for digital strategy, customer experience, data analytics, and process optimization. Their services aim to improve efficiency, productivity, and customer satisfaction.",
            "Software Development" =  "Innoscripta specializes in software development services, including web and mobile app development, software integration, and custom software solutions. They use the latest technologies and agile methodologies to deliver high-quality software products that meet clients' needs.",
            "IT Consulting" = "Inn

In [54]:
tst

{'products_services': ['Digital transformation',
  'Software development',
  'IT consulting'],
 'keywords': ['digital transformation',
  'software development',
  'IT consulting',
  'innovation',
  'technology'],
 'company_classification': ['62020 (Information Technology (IT) Consulting) – NAICS'],
 'additional_informations': {'Digital Transformation': 'Innoscripta offers digital transformation services to help businesses adapt to the changing digital landscape. They provide solutions for digital strategy, customer experience, data analytics, and process optimization. Their services aim to improve efficiency, productivity, and customer satisfaction.,',
  'Software Development': "Innoscripta specializes in software development services, including web and mobile app development, software integration, and custom software solutions. They use the latest technologies and agile methodologies to deliver high-quality software products that meet clients' needs.,",
  'IT Consulting': 'Innoscripta

In [25]:
"innoscripta" in "innoscripta.com"

True

In [1]:
import requests

# define the base url
base_url = 'http://localhost:8885'  # replace with your actual server address and port

# define the endpoint
endpoint = "/company/"

# define the parameters for the GET request
params = {
    "company_name": "OpenAI",
    "company_country": "USA",
    "company_website": "https://www.openai.com",
}

# send a GET request
response = requests.get(base_url + endpoint, params=params)

# print the response
print(response.json())

{'products_services': ['Artificial Intelligence research', 'development', 'and deployment'], 'keywords': ['AI', 'machine learning', 'natural language processing'], 'company_classification': ['7371 (Computer Programming Services) – SIC', '541511 (Custom Computer Programming Services) – NAICS'], 'images': ['https://assets.weforum.org/editor/WAi7awMyhM1beUh5qAHB-m7gkrZ9BCjQrhRovg5OHVA.jpeg', 'https://fourweekmba.com/wp-content/uploads/2023/01/openai-organizational-structur-1200x900.png', 'https://www.delcotimes.com/wp-content/uploads/2023/05/OpenAI_CEO_Congress_63364.jpg?w=525', 'https://i0.wp.com/fourweekmba.com/wp-content/uploads/2023/02/how-does-openai-make-money.png?fit=2560%2C1901&ssl=1', 'https://news.microsoft.com/source/wp-content/uploads/2023/01/openai3-1020x960-1-768x720.png'], 'additional_informations': "OpenAI is a research organization that focuses on developing and deploying          artificial intelligence in a safe and beneficial way. They work on a wide range of AI-relate

In [40]:
data = """
"additional_informations":
            "Digital Transformation" = "Innoscripta offers digital transformation services to help businesses adapt to the digital age. They provide solutions for digital strategy, customer experience, data analytics, and process optimization. Their services aim to improve efficiency, productivity, and customer satisfaction.",
            "Software Development" = "Innoscripta specializes in software development services, including web and mobile app development, software architecture, and quality assurance. They use agile methodologies and cutting-edge technologies to deliver high-quality software solutions.",
            "IT Consulting" = "Innoscripta provides IT consulting services to help businesses align their IT strategy with their business goals. They offer services such as IT strategy development, IT governance, risk management, and IT project management. Their services aim to improve IT efficiency, reduce costs, and mitigate risks."
"""

In [42]:
# Parse to dict
additional_informations = {}

# Split into lines and iterate
for line in tst.split('\n'):
    # Ignore lines without '='
    if '=' not in line:
        continue

    # Split line into key-value pair
    key, value = line.split('=', 1)

    # Remove unwanted characters from key and value
    key = key.replace('"', '').strip()
    value = value.replace('"', '').strip()

    # Add to dictionary
    additional_informations[key] = value


In [43]:
additional_informations

{'Digital Transformation': 'Innoscripta offers digital transformation services to help businesses adapt to the digital age. They provide solutions for digital strategy, customer experience, data analytics, and process optimization. Their services aim to improve efficiency, productivity, and customer satisfaction.,',
 'Software Development': 'Innoscripta specializes in software development services, including web and mobile app development, software architecture, and quality assurance. They use agile methodologies and cutting-edge technologies to deliver high-quality software solutions.,',
 'IT Consulting': 'Innoscripta provides IT consulting services to help businesses align their IT strategy with their business goals. They offer services such as IT strategy development, IT governance, risk management, and IT project management. Their services aim to improve IT efficiency, reduce costs, and mitigate risks.'}

In [44]:
tst2 = {"oi": additional_informations}

In [2]:
search = GoogleSearch(
    {
        "q": "Horsch GmbH & Co. KG",
        "engine": "google_images",
        "location": "Germany",
        "api_key": SERPAPI_KEY,
        "google_domain": "google.de",
        "hl": "de",
        "gl": "de"
    }
)
response = search.get_dict()

In [3]:
response

{'search_metadata': {'id': '64713f5ef26ac692946906b8',
  'status': 'Success',
  'json_endpoint': 'https://serpapi.com/searches/bf7159697f498450/64713f5ef26ac692946906b8.json',
  'created_at': '2023-05-26 23:23:10 UTC',
  'processed_at': '2023-05-26 23:23:10 UTC',
  'google_images_url': 'https://www.google.de/search?q=Horsch+GmbH+%26+Co.+KG&oq=Horsch+GmbH+%26+Co.+KG&uule=w+CAIQICIHR2VybWFueQ&hl=de&gl=de&tbm=isch',
  'raw_html_file': 'https://serpapi.com/searches/bf7159697f498450/64713f5ef26ac692946906b8.html',
  'total_time_taken': 1.39},
 'search_parameters': {'engine': 'google_images',
  'q': 'Horsch GmbH & Co. KG',
  'location_requested': 'Germany',
  'location_used': 'Germany',
  'google_domain': 'google.de',
  'hl': 'de',
  'gl': 'de',
  'device': 'desktop'},
 'search_information': {'image_results_state': 'Results for exact spelling',
  'menu_items': [{'position': 1,
    'title': 'Alle',
    'link': 'https://www.google.de/search?q=Horsch+GmbH+%26+Co.+KG&source=lmns&gl=de&hl=de&sa=X

In [5]:
import json

with open("../external_data/google-countries.json", "r") as f:
    oi = json.load(f)

In [8]:
for i in oi:
    if i["country_name"].lower() == "Germany".lower():
        print(i["country_code"])

de
