In [1]:
import csv
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Function to append a single row to CSV
def append_row_to_csv(file_path, row_dict, header=None):
    """Append one record to a CSV file, writing the header row first if needed.

    Args:
        file_path: Path of the CSV file; it is created when missing.
        row_dict: Mapping of column name -> value for the row to append.
        header: Column order for the file. Defaults to the keys of
            ``row_dict`` when omitted.

    Raises:
        ValueError: If ``row_dict`` contains keys that are not in the header
            (``csv.DictWriter``'s default ``extrasaction='raise'``).
    """
    # DictWriter cannot work with fieldnames=None, so fall back to the row's keys.
    fieldnames = list(header) if header is not None else list(row_dict)

    # A header is needed for a brand-new file and for an existing but empty one.
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

    # Explicit UTF-8 so scraped non-ASCII text is written the same on every platform.
    with open(file_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row_dict)

# Destination CSV for the scraped rows (relative to the working directory)
file_path = 'data.csv'

# Column order of the CSV: listing-page fields first, then the fields
# gathered from each company's detail content.
header_cols = [
    'company_name',
    'short_description',
    'current_stage',
    'partners',
    'full_description',
    'logo_link',
    'website_link',
    'twitter_link',
    'linkedin_link',
    'instagram_link',
    'youtube_link',
    'job_titles',
    'team_members',
    'founded_year',
    'partnered_year',
    'ipo_year',
    'acquired_year',
    'categories',
    'job_link',
]

In [3]:
# Browser-like request headers. In this notebook they are passed to the
# 'facetwp_refresh' listing POST; fetch_company_data() does not send them.
# NOTE(review): 'Content-Type: application/json' matches the json= body of the
# listing request; it would not match a form-encoded data= body.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:126.0) Gecko/20100101 Firefox/126.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': 'https://www.sequoiacap.com/our-companies/',
    'Content-Type': 'application/json',
    'Origin': 'https://www.sequoiacap.com',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'Priority': 'u=1',
}

def _attr_or_none(tag, attr):
    """Return ``tag[attr]``, or None when the tag is missing or lacks ``attr``."""
    if tag is None:
        return None
    return tag.get(attr)


def _texts(soup, selector):
    """Return the stripped text of every element matching the CSS ``selector``."""
    return [node.get_text(strip=True) for node in soup.select(selector)]


def _milestone_year(soup, label):
    """Return the last whitespace-separated token of the "Milestones" item containing ``label``."""
    item = soup.select_one(
        '.clist__title:-soup-contains("Milestones") ~ ul.clist__list '
        f'.clist__item:-soup-contains("{label}")'
    )
    if item is None:
        return None
    tokens = item.get_text(strip=True).split()
    return tokens[-1] if tokens else None


def fetch_company_data(post_id:str, nonce:str, timeout:float=30):
    """Fetch one company's detail content via admin-ajax and extract its fields.

    Args:
        post_id: Value sent as ``post_id`` in the 'load_company_content' request.
        nonce: Value sent as ``nonce`` in the same request.
        timeout: Seconds to wait for the HTTP response before requests raises.

    Returns:
        dict with the keys ``full_description``, ``logo_link``, ``website_link``,
        ``twitter_link``, ``linkedin_link``, ``instagram_link``, ``youtube_link``,
        ``job_titles``, ``team_members``, ``founded_year``, ``partnered_year``,
        ``ipo_year``, ``acquired_year``, ``categories`` and ``job_link``.
        Scalar fields are None when the element (or its attribute) is absent;
        list fields are empty lists when nothing matches.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    url = 'https://www.sequoiacap.com/wp-admin/admin-ajax.php'
    data = {
        'action': 'load_company_content',
        'post_id': post_id,
        'nonce': nonce,
    }

    # No custom headers are sent (the original passed headers=None). The
    # module-level `headers` dict declares 'Content-Type: application/json',
    # which would not match the form-encoded body that data= produces.
    response = requests.post(url, data=data, timeout=timeout)

    # Collapse tab runs (and the whitespace around them) into single spaces.
    cleaned_text = re.sub(r'\s*\t+\s*', ' ', response.text)
    text_soup = BeautifulSoup(cleaned_text, 'html.parser')

    description_node = text_soup.find(class_='wysiwyg')
    full_description = (
        description_node.get_text(strip=True) if description_node is not None else None
    )

    return {
        'full_description': full_description,
        # First <img> / first <a> of the returned fragment.
        'logo_link': _attr_or_none(text_soup.select_one('img'), 'src'),
        'website_link': _attr_or_none(text_soup.select_one('a'), 'href'),
        'twitter_link': _attr_or_none(text_soup.find(class_='ico--twitter'), 'href'),
        'linkedin_link': _attr_or_none(text_soup.find(class_='ico--linkedin'), 'href'),
        'instagram_link': _attr_or_none(text_soup.find(class_='ico--instagram'), 'href'),
        'youtube_link': _attr_or_none(text_soup.find(class_='ico--youtube'), 'href'),
        'job_titles': _texts(
            text_soup,
            '.clist__title:-soup-contains("Jobs") ~ ul.clist__list .clist__link',
        ),
        'team_members': _texts(
            text_soup,
            '.clist__title:-soup-contains("Team") ~ ul.clist__list .clist__link',
        ),
        'founded_year': _milestone_year(text_soup, 'Founded'),
        'partnered_year': _milestone_year(text_soup, 'Partnered'),
        'ipo_year': _milestone_year(text_soup, 'IPO'),
        'acquired_year': _milestone_year(text_soup, 'Acquired'),
        'categories': _texts(text_soup, '.l-hr-row__item a.pill'),
        'job_link': _attr_or_none(text_soup.select_one('.caption.caption--14'), 'href'),
    }

In [4]:
def extract_nonce(timeout=30):
  """
  Downloads the companies listing page and extracts the nonce embedded in it.

  The nonce is read from the inline <script id="theme-scripts-js-before"> tag
  by matching the first `"nonce":"..."` pair in its text.

  Args:
    timeout: Seconds to wait for the HTTP response before requests raises.

  Returns:
    The nonce value as a string, or None if the script tag or the nonce
    pattern is not found.
  """
  response = requests.get('https://www.sequoiacap.com/our-companies/#all-panel',
                          timeout=timeout)
  soup = BeautifulSoup(response.content, 'html.parser')
  script_tag = soup.find('script', {'id': 'theme-scripts-js-before'})
  if script_tag and script_tag.string:
    nonce_match = re.search(r'"nonce":"(.*?)"', script_tag.string)
    if nonce_match:
      return nonce_match.group(1)
  return None


# Fetch the nonce once; the scraping loop passes it to fetch_company_data()
# for every company. Printed as a sanity check (None means it was not found).
nonce = extract_nonce()
print(nonce)

ed7915fd32


In [5]:
def get_sequoia_pagination_info(timeout=30):
    """Read the listing's pager settings from the page's ``window.FWP_JSON`` object.

    Args:
        timeout: Seconds to wait for the HTTP response before requests raises.

    Returns:
        Tuple ``(total_pages, total_rows)`` taken from
        ``preload_data.settings.pager`` of the embedded JSON.

    Raises:
        RuntimeError: If the page does not return HTTP 200, or the script tag /
            JSON assignment cannot be located.
        json.JSONDecodeError: If the text after ``window.FWP_JSON =`` is not
            valid JSON.
        KeyError: If the decoded object lacks the expected pager keys.
    """
    # Fetch the HTML content
    response = requests.get('https://www.sequoiacap.com/our-companies/#all-panel',
                            timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch data: {response.status_code}")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the <script> tag containing the desired JavaScript object
    page_script_tag = soup.find('script', string=re.compile(r'window\.FWP_JSON'))

    if not page_script_tag:
        raise RuntimeError("Unable to find the script tag containing 'window.FWP_JSON'")

    js_code = page_script_tag.string

    # Locate the start of the object literal, then let the JSON decoder find
    # where it ends. Unlike a greedy regex, this works when the object spans
    # several lines or is followed by more JavaScript.
    assignment = re.search(r'window\.FWP_JSON\s*=\s*(?=\{)', js_code)

    if not assignment:
        raise RuntimeError("Unable to extract JSON from the script tag")

    data, _ = json.JSONDecoder().raw_decode(js_code, assignment.end())

    # Extract pagination information
    pager = data['preload_data']['settings']['pager']
    return pager['total_pages'], pager['total_rows']

# total_pages bounds the page loop in the scraping cell; total_rows is not
# used by any cell visible in this notebook.
total_pages, total_rows = get_sequoia_pagination_info()

In [6]:
# Walk every page of the company listing and append one CSV row per company.
# NOTE: rows are appended to `file_path`, so re-running this cell against an
# existing file adds duplicate rows.
gen_count = 0  # companies written across all pages
for pg in range(1, total_pages + 1):
    payload = {
        'action': 'facetwp_refresh',
        'data': {
            'facets': {
                'categories': [],
                'partners': [],
                'stage_current': [],
                'stage_at_investment': [],
                'load-more': [],
            },
            'frozen_facets': {},
            'http_params': {
                'get': [],
                'uri': 'our-companies',
                'url_vars': [],
            },
            'template': 'wp',
            'extras': {
                'selections': True,
                'sort': 'default',
            },
            'soft_refresh': 1,
            'is_bfcache': 1,
            'first_load': 0,
            'paged': pg,
        },
    }

    response = requests.post('https://www.sequoiacap.com/our-companies/#all-panel',
                             headers=headers, json=payload, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError below.
    response.raise_for_status()

    # The response is JSON whose 'template' entry holds the listing HTML.
    page_json = json.loads(response.content.decode('utf-8'))
    html_content = page_json['template']

    soup = BeautifulSoup(html_content, 'html.parser')

    comp_names = [
        node.get_text()
        for node in soup.find_all(class_="company-listing__cell-wide company-listing__head")
    ]

    # One comma-joined string of names per company.
    partners_list = [
        node.get_text(',')
        for node in soup.find_all(class_="u-lg-hide company-listing__list")
    ]

    # Find the table and then extract data from its rows
    table = soup.find('table')
    rows = table.find_all('tr')

    # Index into comp_names / partners_list; advanced only for company rows.
    row_count = 0
    for row in rows:
        row_data = [cell.get_text().strip() for cell in row.find_all('td')]

        # Rows without <td> cells carry no company data.
        if not row_data:
            continue

        # Company rows start with a numeric post id. This used to be detected
        # with eval(), which executes scraped text and crashes on anything
        # that is not a Python expression.
        if not row_data[0].isdigit():
            continue

        fields = row_data[:-1]  # the last cell is not used
        post_id = fields[0]

        # ajax call for more company info
        companies_data = fetch_company_data(post_id, nonce)

        result = {
            'company_name': comp_names[row_count],
            'short_description': fields[1],
            'current_stage': fields[2],
            'partners': partners_list[row_count],
        }
        merged_dict = {**result, **companies_data}
        append_row_to_csv(file_path, row_dict=merged_dict, header=header_cols)

        row_count += 1
        gen_count += 1

    print('page', pg)

page 1
page 2
page 3
page 4
page 5
page 6
page 7


In [7]:
# Preview the scraped data. Read from `file_path` (the CSV the scraping loop
# wrote) rather than a hard-coded absolute path, so the preview always loads
# the file that was just produced regardless of the working directory.
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,company_name,short_description,current_stage,partners,full_description,logo_link,website_link,twitter_link,linkedin_link,instagram_link,youtube_link,job_titles,team_members,founded_year,partnered_year,ipo_year,acquired_year,categories,job_link
0,[24]7.ai,"[24]7 helps businesses create a personalized, ...",Growth,Michael Moritz,[24]7.aiuses AI technology to help businesses ...,https://www.sequoiacap.com/wp-content/uploads/...,https://www.247.ai/,https://www.twitter.com/24_7_inc,https://www.linkedin.com/company/24-7-inc,,,"['DevOps Engineer - 2', 'DevOps Engineer - 2',...","['PV Kannan', 'Shanmugam ""Nags"" Nagarajan']",2000.0,2003.0,,,"['AI/ML', 'Enterprise']",https://jobs.sequoiacap.com/jobs/247ai
1,100 Thieves,100 Thieves is a lifestyle brand for gamers.,Growth,,100 Thieves is a lifestyle brand for gamers. I...,https://www.sequoiacap.com/wp-content/uploads/...,https://100thieves.com/,https://twitter.com/100Thieves,,https://www.instagram.com/100thieves/,https://www.youtube.com/channel/UCnrX2_FoKieob...,"['Marketing Manager', 'Product Project Manager']",[],2017.0,2018.0,,,['Consumer'],
2,23andMe,23andMe is the leading personal genetics infor...,IPO,Roelof Botha,23andMe is dedicated to helping individuals un...,https://www.sequoiacap.com/wp-content/uploads/...,https://www.23andme.com/,https://www.twitter.com/23andMe?ref_src=twsrc%...,https://www.linkedin.com/company/57783/,,,"['Sr. Product Manager, Health', 'Head of Socia...",['Anne Wojcicki'],2006.0,2017.0,2021.0,,['Healthcare'],https://jobs.sequoiacap.com/jobs/23andme
3,Aalto,Aalto is an online alternative to traditional ...,Early,Bryan Schreier,For skilled buyers wanting to increase their c...,https://www.sequoiacap.com/wp-content/uploads/...,https://aalto.com/,https://www.twitter.com/aaltohomes,https://www.linkedin.com/company/aalto-inc/,https://www.instagram.com/aalto/,,[],"['Jon Carpenter', 'Nicholas Narodny']",2018.0,2019.0,,,['Consumer'],
4,ActionIQ,ActionIQ is developing the next generation of ...,Growth,Doug Leone,ActionIQ is developing the next generation of ...,https://www.sequoiacap.com/wp-content/uploads/...,http://www.actioniq.co/,https://www.twitter.com/actioniqinc,,,,[],"['Tasso Argyros', 'Nitay Joffe']",2014.0,2014.0,,,['Enterprise'],


In [8]:
# Summarize column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341 entries, 0 to 340
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company_name       341 non-null    object 
 1   short_description  341 non-null    object 
 2   current_stage      340 non-null    object 
 3   partners           274 non-null    object 
 4   full_description   341 non-null    object 
 5   logo_link          332 non-null    object 
 6   website_link       341 non-null    object 
 7   twitter_link       289 non-null    object 
 8   linkedin_link      305 non-null    object 
 9   instagram_link     75 non-null     object 
 10  youtube_link       33 non-null     object 
 11  job_titles         341 non-null    object 
 12  team_members       341 non-null    object 
 13  founded_year       340 non-null    float64
 14  partnered_year     340 non-null    float64
 15  ipo_year           50 non-null     float64
 16  acquired_year      36 non-

In [11]:
# Copy the scraped CSV to Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# cell is visible in this notebook (execution counts jump from 8 to 11).
!cp -r /content/data.csv /content/drive/MyDrive/mldata/mylane