In [20]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

In [21]:
# Helper functions: each one extracts a single field from an already-fetched Crunchbase organization page
def comp_description(comp_name, response):
    """Pull the company description text out of a fetched organization page.

    Args:
        comp_name: Company name; only used to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``.

    Returns:
        The stripped text of the ``<span class="description ng-star-inserted">``
        element, ``"No description Found"`` when that element has no text,
        a "No Description found for ..." message when the element is absent,
        or a "Failed to get info" message when the status code is not 200.
    """
    # Anything other than a 200 is reported back as a message, not raised.
    if response.status_code != 200:
        return f"Failed to get info. Error: {response.status_code}"

    page = BeautifulSoup(response.text, 'html.parser')
    about_span = page.find('span', {'class': 'description ng-star-inserted'})

    # The 'About' section is missing from the page.
    if not about_span:
        return f"No Description found for {comp_name}"

    # The element exists; fall back to a placeholder when it carries no text.
    if about_span.text:
        return about_span.text.strip()
    return "No description Found"

In [22]:
def fsector(comp_name, response):
    """Extract the sector field from a fetched organization page.

    Args:
        comp_name: Company name; only used to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``.

    Returns:
        The stripped text of the
        ``<span class="component--field-formatter field-type-enum ng-star-inserted">``
        element, ``"No Sector available"`` when that element has no text,
        a "No sector found for ..." message when the element is absent,
        or a "Failed to get info" message when the status code is not 200.
    """
    if response.status_code != 200:
        # Message now reads "Failed ..." (it was missing the leading "F"),
        # matching the wording used by the sibling extractors.
        return f"Failed to get info. Error: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    sector = soup.find('span', {'class': 'component--field-formatter field-type-enum ng-star-inserted'})

    # The sector element is missing from the page.
    if not sector:
        return f"No sector found for {comp_name}"

    # The element exists; fall back to a placeholder when it carries no text.
    return sector.text.strip() if sector.text else "No Sector available"

In [23]:
def f_ind(comp_name, response):
    """Collect the industry chips from a fetched organization page.

    Args:
        comp_name: Company name; only used to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``.

    Returns:
        The stripped text of every ``<div class="chip-text">`` element joined
        with ``", "``, a "No Info found for ..." message when there are none,
        or a "Failed to get info" message when the status code is not 200.
    """
    if response.status_code != 200:
        return f"Failed to get info. Error: {response.status_code}"

    page = BeautifulSoup(response.text, 'html.parser')
    chips = page.find_all('div', {'class': 'chip-text'})

    # No chip elements at all on the page.
    if not chips:
        return f"No Info found for {comp_name}"

    # One comma-separated string, in the order the chips appear.
    return ', '.join(chip.text.strip() for chip in chips)

In [24]:
def l_fund(comp_name, response):
    """Read the funding field from a fetched organization page.

    Args:
        comp_name: Company name; only used to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``.

    Returns:
        The stripped text of the second matching ``<a>`` element, a
        "No Funding found for ..." message when fewer than two such elements
        exist, or a "Failed to get info" message when the status code is
        not 200.
    """
    if response.status_code != 200:
        return f"Failed to get info. Error: {response.status_code}"

    page = BeautifulSoup(response.text, 'html.parser')
    enum_links = page.find_all(
        'a',
        {'class': 'component--field-formatter field-type-enum accent highlight-color-contrast-light ng-star-inserted'},
    )

    # The value is taken from the second link, so at least two must be present.
    if len(enum_links) < 2:
        return f"No Funding found for {comp_name}"

    second_link = enum_links[1]
    return second_link.text.strip()

In [25]:
def f_compet(comp_name, timeout=30):
    """Fetch the similar-companies page and extract competitor names.

    Competitor information lives on a separate page
    (``.../org_similarity_overview``), so this function issues its own request
    instead of reusing the main organization page response.

    Args:
        comp_name: Company name or URL slug. It is lower-cased and spaces are
            replaced with hyphens, so an already-slugged name is left as is.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A string in every case, so the value can be written to a spreadsheet
        cell: the competitor names joined with ``", "`` (possibly empty when
        nothing matches), a "No Competitors found for ..." message when the
        expected elements are absent, or a "Failed to get info" message when
        the request fails or the status code is not 200.
    """
    slug = comp_name.lower().replace(' ', '-')
    url = f'https://www.crunchbase.com/organization/{slug}/org_similarity_overview'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report network problems the same way as a bad status code.
        return f"Failed to get info. Error: {exc}"

    print(response.status_code)
    if response.status_code != 200:
        return f"Failed to get info. Error: {response.status_code}"

    soup = BeautifulSoup(response.text, 'html.parser')
    reason_blocks = soup.find_all('div', {'class': 'reasons-container ng-star-inserted'})
    profile_heading = soup.find('h1', {'class': 'profile-name'})

    if not reason_blocks or not profile_heading:
        return f"No Competitors found for {comp_name}"

    # The displayed name is scraped text, so escape it before using it in a
    # regex; compile once because the pattern is the same for every block.
    # Note: (\w+) captures only the first word after "<name> and ".
    displayed_name = profile_heading.text.strip()
    pattern = re.compile(re.escape(displayed_name) + r'\s+and\s+(\w+)')

    competitors = []
    for block in reason_blocks:
        competitors.extend(pattern.findall(block.text.strip()))

    return ', '.join(competitors)

In [26]:
def products(comp_name, response):
    """Extract the products/services description from a fetched organization page.

    Args:
        comp_name: Company name; only used to build the "not found" message.
        response: HTTP response object exposing ``status_code`` and ``text``.

    Returns:
        The stripped text of the
        ``<span class="description has-overflow ng-star-inserted">`` element,
        ``"No info available"`` when that element has no text, a
        "No Products/Services found for ..." message when the element is
        absent, or a "Failed to get info" message when the status code is
        not 200.
    """
    if response.status_code != 200:
        return f"Failed to get info. Error: {response.status_code}"

    page = BeautifulSoup(response.text, 'html.parser')
    overflow_span = page.find('span', {'class': 'description has-overflow ng-star-inserted'})

    # The element is missing from the page.
    if not overflow_span:
        return f"No Products/Services found for {comp_name}"

    # The element exists; fall back to a placeholder when it carries no text.
    if overflow_span.text:
        return overflow_span.text.strip()
    return "No info available"

In [27]:
def test(comp_name, timeout=30):
    """Fetch a company's organization page and gather all extracted fields.

    Args:
        comp_name: Human-readable company name. It is lower-cased and spaces
            are replaced with hyphens to build the URL slug.
        timeout: Seconds to wait for the organization page request.

    Returns:
        A dict with the keys 'Company Name', 'Company Description', 'Sector',
        'Industry', 'Funding', 'Products/Services Description' and
        'Competitors', or ``None`` when the page could not be fetched
        (request error or a status code other than 200).
    """
    # Process the company name before it goes into the URL.
    t_name = comp_name.lower().replace(' ', '-')
    url = f'https://www.crunchbase.com/organization/{t_name}'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the whole batch.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a bad status: report and skip the company.
        print(f"Failed to find information for {comp_name}: {exc}")
        return None

    print(response.status_code)
    if response.status_code != 200:
        print(f"Failed to find information for {comp_name}")
        return None

    # All extractors except f_compet parse the same response; f_compet
    # requests a separate page and therefore takes the slug instead.
    desc = comp_description(comp_name, response)
    sec = fsector(comp_name, response)
    ind = f_ind(comp_name, response)
    fund = l_fund(comp_name, response)
    compet = f_compet(t_name)
    prod = products(comp_name, response)

    return {
        'Company Name': comp_name,
        'Company Description': desc,
        'Sector': sec,
        'Industry': ind,
        'Funding': fund,
        'Products/Services Description': prod,
        'Competitors': compet,
    }

In [28]:
#Generating Excel File for List of Companies
def generate_excel_file(comp_names, excel_filename='out_file.xlsx', delay=2):
    """Look up each company and write the collected rows to an Excel file.

    Args:
        comp_names: Iterable of company names passed one by one to ``test``.
        excel_filename: Path of the workbook to write.
        delay: Seconds to pause between consecutive company lookups.

    Companies for which ``test`` returns a falsy value are skipped. The
    workbook is written without the DataFrame index, and a confirmation line
    is printed afterwards.
    """
    data = []
    for position, comp_name in enumerate(comp_names):
        # Pause between lookups only; nothing follows the last one, so
        # sleeping after it would just delay the export.
        if position:
            time.sleep(delay)
        company_info = test(comp_name)
        if company_info:
            data.append(company_info)

    # Build the frame once from the list of row dicts.
    df = pd.DataFrame(data)
    df.to_excel(excel_filename, index=False)

    print(f'Excel file "{excel_filename}" generated successfully.')



# Companies to look up; the results are written to the Excel file.
company_names = [
    'Amazon',
    'Microsoft',
    'Facebook',
    'Tesla',
]
generate_excel_file(company_names)

200
200
200
200
200
200
200
200
Excel file "out_file.xlsx" generated successfully.
