# Get Companies

In [25]:
import re
from typing import List

import pandas as pd
from bs4 import BeautifulSoup


def get_VC_name(content_soup: BeautifulSoup) -> str:
    """Return the investor (VC) name shown in a table row's name cell.

    The name is read as the text of the node that immediately follows the
    logo container inside the name cell.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        The name text, or the string "None" when the logo container is not
        found or has no following sibling.
    """
    logo_selector = (
        "div[data-walk-through-id$=-cell-name] .comp-txn-logo-container"
    )
    try:
        logo_container = content_soup.select(logo_selector)[0]
        return logo_container.next_sibling.text
    except (AttributeError, IndexError):
        # Missing cell / missing sibling: fall back to the placeholder.
        return "None"


def get_VC_url(content_soup: BeautifulSoup) -> str:
    """Return the link target attached to a table row's name cell.

    The URL is the ``href`` attribute of the element that directly wraps the
    logo container inside the name cell.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        The ``href`` value, or the string "None" when the logo container is
        not found, has no parent, or its parent carries no ``href``.
    """
    logo_selector = (
        "div[data-walk-through-id$=-cell-name] .comp-txn-logo-container"
    )
    try:
        return content_soup.select(logo_selector)[0].parent["href"]
    except (AttributeError, IndexError, KeyError, TypeError):
        # KeyError: the parent element has no href attribute.
        # TypeError: the logo container has no parent (parent is None).
        # Either way one odd row must not abort the whole scrape.
        return "None"


def get_VC_score(content_soup: BeautifulSoup) -> str:
    """Return the text of the first link in a row's participation-score cell.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        The link text, or the string "None" when no such link exists.
    """
    try:
        score_links = content_soup.select(
            "div[data-walk-through-id$=-cell-participationScore] a"
        )
        first_link = score_links[0]
        return first_link.text
    except (AttributeError, IndexError):
        return "None"


def get_VC_rounds(content_soup: BeautifulSoup) -> str:
    """Return the text of the first link in a row's investments-count cell.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        The link text, or the string "None" when no such link exists.
    """
    rounds_selector = "div[data-walk-through-id$=-cell-investmentsCount] a"
    try:
        rounds_text = content_soup.select(rounds_selector)[0].text
    except (AttributeError, IndexError):
        rounds_text = "None"
    return rounds_text


def get_VC_portfolio_companies(content_soup: BeautifulSoup) -> str:
    """Return a row's portfolio company names joined with ";".

    Each name is the text of a ``span`` that directly follows a logo
    container inside the portfolio-companies cell.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        The names joined by ";" in document order, or the string "None" when
        the row lists no companies.
    """
    company_selector = (
        "div[data-walk-through-id$=-cell-portfoliocompanies] > div > div "
        ".comp-txn-logo-container + span"
    )
    try:
        company_spans = content_soup.select(company_selector)
        joined_names = ";".join(span.text for span in company_spans)
    except AttributeError:
        return "None"
    # An empty selection does not raise, so check for it explicitly; without
    # this the row would yield "" instead of the "None" placeholder used by
    # every other field extractor.
    return joined_names if company_spans else "None"


def get_VC_location(content_soup: BeautifulSoup) -> str:
    """Return the full text of a row's locations cell.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        The cell text, or the string "None" when the cell is not found.
    """
    try:
        location_cells = content_soup.select(
            "div[data-walk-through-id$=-cell-locations]"
        )
        return location_cells[0].text
    except (AttributeError, IndexError):
        return "None"


def get_VC_stages(content_soup: BeautifulSoup) -> str:
    """Return a row's investment stages with "," separators turned into ";".

    The value is taken from the first ``span`` directly under the
    investment-stages cell.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        The span text with every "," replaced by ";", or the string "None"
        when the span is not found.
    """
    stages_selector = (
        "div[data-walk-through-id$=-cell-investmentStages] > span"
    )
    try:
        first_span = content_soup.select(stages_selector)[0]
        # "," would clash with the CSV delimiter used when saving the table.
        return first_span.text.replace(",", ";")
    except (AttributeError, IndexError):
        return "None"


def get_VC_sectors(content_soup: BeautifulSoup) -> str:
    """Return a row's investment sectors with "," separators turned into ";".

    The value is taken from the first ``span`` directly under the
    investment-sectors cell.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        The span text with every "," replaced by ";", or the string "None"
        when the span is not found.
    """
    try:
        matching_spans = content_soup.select(
            "div[data-walk-through-id$=-cell-investmentSectors] > span"
        )
        sector_text = matching_spans[0].text
        result = sector_text.replace(",", ";")
    except (AttributeError, IndexError):
        result = "None"
    return result


def get_VC_investment_locations(content_soup: BeautifulSoup) -> str:
    """Return a row's investment locations with "," turned into ";".

    The value is taken from the first ``span`` directly under the
    investment-locations cell.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        The span text with every "," replaced by ";", or the string "None"
        when the span is not found.
    """
    locations_selector = (
        "div[data-walk-through-id$=-cell-investmentLocations] > span"
    )
    try:
        return content_soup.select(locations_selector)[0].text.replace(
            ",", ";"
        )
    except (AttributeError, IndexError):
        return "None"


def get_VC_row_content(content_soup: BeautifulSoup) -> List[str]:
    """Extract every field of one table row as a list of strings.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        A nine-item list in this order: name, score, rounds, portfolio
        companies, location, stages, sectors, investment locations, URL.
    """
    # Order matters: it defines the column order of the resulting table,
    # with the URL deliberately last.
    field_extractors = (
        get_VC_name,
        get_VC_score,
        get_VC_rounds,
        get_VC_portfolio_companies,
        get_VC_location,
        get_VC_stages,
        get_VC_sectors,
        get_VC_investment_locations,
        get_VC_url,
    )
    return [extract(content_soup) for extract in field_extractors]


def get_VC_contents(soup: BeautifulSoup) -> List[List[str]]:
    """Extract the field lists of all table rows found in the page.

    Rows are the ``div`` elements whose ``data-walk-through-id`` attribute
    matches ``gridtable-row-<digits>``.

    Args:
        soup: Parsed markup of the whole page.

    Returns:
        One list of field strings per matching row, in document order.
    """
    row_id_pattern = re.compile(r"^gridtable-row-[0-9]*$")
    row_divs = soup.find_all(
        "div", attrs={"data-walk-through-id": row_id_pattern}
    )
    return [get_VC_row_content(row_div) for row_div in row_divs]


def get_VC_title(soup: BeautifulSoup) -> List[str]:
    """Return the column titles of the table header.

    For each child of the first ``gridtable-column`` element, the title is
    the ``title`` attribute of the first node inside that child's
    ``.comp--gridtable__column-cell--menu-middle`` element.

    Args:
        soup: Parsed markup of the whole page.

    Returns:
        The column titles in header order.
    """
    header_row = soup.find_all(
        attrs={"data-walk-through-id": "gridtable-column"}
    )[0]
    column_titles = []
    for header_cell in header_row.children:
        label_holder = header_cell.select(
            ".comp--gridtable__column-cell--menu-middle"
        )[0]
        column_titles.append(label_holder.contents[0]["title"])
    return column_titles


def get_VCs_from_html(
    html_file_path: str, parser: str = "html.parser"
) -> pd.DataFrame:
    """Parse a saved investor-list HTML page into a DataFrame.

    Column names are the header titles found in the page plus a trailing
    "Company URL" column; each DataFrame row is one table row of the page.

    Args:
        html_file_path: Path to the UTF-8 encoded HTML file.
        parser: Name of the Beautiful Soup parser to use. Defaults to the
            standard-library "html.parser"; pass e.g. "lxml" if that parser
            is installed and preferred.

    Returns:
        A DataFrame with one row per table row found in the page.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(html_file_path, encoding="utf-8") as html_fp:
        # Name the parser explicitly: without it Beautiful Soup picks
        # whichever parser happens to be installed (and warns), so the same
        # file could be parsed differently on another machine.
        soup = BeautifulSoup(html_fp, parser)

    # The file is only needed while parsing; everything below works on the
    # in-memory tree.
    vc_titles = get_VC_title(soup)
    # Each extracted row carries the company URL as its last field, which has
    # no header in the page itself.
    vc_titles.append("Company URL")
    vc_contents = get_VC_contents(soup)
    return pd.DataFrame(data=vc_contents, columns=vc_titles)

In [26]:
# Input: saved HTML of the investor list; output: CSV holding the same table.
vc_html_path = "../data/Tracxn_SeriesA_AI.html"
vc_csv_save_path = "../result_csv/Tracxn_SeriesA_AI.csv"
# Parse the saved page into a DataFrame (one row per investor table row).
vc_df = get_VCs_from_html(vc_html_path)
# index=False keeps the DataFrame's row index out of the CSV file.
# NOTE(review): the target directory is not created here; presumably it
# already exists -- confirm before running on a fresh checkout.
vc_df.to_csv(vc_csv_save_path, sep=",", index=False)
# Bare expression so the notebook renders the DataFrame as the cell output.
vc_df

Unnamed: 0,Investor Name,Score,#Rounds,Portfolio Companies,Investor Location,Stages of Entry,Sectors of Investment,Locations of Investment,Company URL
0,Sequoia Capital,15,81,Stripe;Bitmain;PicsArt,Menlo Park,Series A (578);Seed (306)[+13],Enterprise Applications (530);Consumer (358)[+39],United States (629);China (421)[+44],https://tracxn.com/a/companies/PuyKDxi6s4MDikT...
1,Y Combinator,13,69,Replit;Scale;Observe.AI,Mountain View,Seed (4153);Series A (108)[+9],Enterprise Applications (1991);Consumer (902)[...,United States (3002);India (213)[+82],https://tracxn.com/a/companies/srAiTt8Aevx0dkP...
2,Accel,12,65,Notion;UiPath;Freshworks,Palo Alto [+3],Series A (372);Seed (300)[+11],Enterprise Applications (484);Consumer (254)[+37],United States (544);India (214)[+40],https://tracxn.com/a/companies/asmY3flzarkQxfc...
3,Khosla Ventures,12,63,Gitlab;Checkr;Upstart,Menlo Park,Series A (242);Seed (159)[+6],Enterprise Applications (205);High Tech (103)[...,United States (505);India (15)[+18],https://tracxn.com/a/companies/-24Z99iK6UymolR...
4,Lightspeed Venture Partners,12,58,ThoughtSpot;Yellow Freight;People.ai,Menlo Park [+13],Series A (269);Seed (189)[+11],Enterprise Applications (356);Consumer (184)[+37],United States (507);China (64)[+27],https://tracxn.com/a/companies/aqLNsYlO_gpgAgY...
...,...,...,...,...,...,...,...,...,...
115,Serena,3,14,Dataiku;Lifen;Implicity,Paris,Series A (31);Seed (24)[+5],Enterprise Applications (41);Consumer (19)[+28],France (57);United States (18)[+7],https://tracxn.com/a/companies/iV_toH2_ePTPbiD...
116,TLV Partners,2,13,Immunai;AIdoc;Oribi,Tel Aviv,Seed (25);Series A (15)[+2],Enterprise Applications (54);Enterprise Infras...,Israel (41);United States (27),https://tracxn.com/a/companies/FX8q4lwleC1lNn6...
117,StageOne Ventures,2,13,Capitalise.ai;theator;Avanan,Herzliya,Seed (25);Series A (16)[+2],Enterprise Applications (40);Enterprise Infras...,Israel (35);United States (16),https://tracxn.com/a/companies/5GyKI7OOhPjFzGo...
118,Benhamou Global Ventures,2,13,AiDash;Drishti;German Bionic System,Tel Aviv [+4],Series A (26);Seed (16)[+3],Enterprise Applications (44);Enterprise Infras...,United States (51);Israel (9)[+6],https://tracxn.com/a/companies/rulJd1Fz9Y7uORe...


# Get LinkedIn Profile Link from Company Page

In [35]:
def get_employee_row_content(content_soup: BeautifulSoup) -> List[str]:
    """Extract an employee's name and LinkedIn profile link from a table row.

    The name is the text of the first link inside the row's
    ``.employeeCard__wrapper`` element; the profile URL is the ``href`` of
    the first link inside the node that follows that name link.

    Args:
        content_soup: Parsed markup of a single table row.

    Returns:
        ``[name, linkedin_profile]``. Any piece that cannot be found is
        replaced by the string "None"; a row with no employee card yields
        ``["None", "None"]``.
    """
    placeholder = "None"

    # Look the card up defensively: a row without a card should produce a
    # placeholder row rather than abort the whole page with an IndexError.
    try:
        employee_div = content_soup.select(".employeeCard__wrapper")[0]
    except (AttributeError, IndexError):
        return [placeholder, placeholder]

    try:
        employee_name = employee_div.a.text
    except (AttributeError, IndexError):
        employee_name = placeholder

    try:
        employee_linkedin_profile = employee_div.a.next_sibling.a["href"]
    except (AttributeError, IndexError, KeyError, TypeError):
        # KeyError: the profile link has no href attribute.
        # TypeError: the sibling contains no link at all (``.a`` is None).
        employee_linkedin_profile = placeholder

    return [employee_name, employee_linkedin_profile]


def get_employee_contents(soup: BeautifulSoup) -> List[List[str]]:
    """Extract name/profile pairs for all table rows found in the page.

    Rows are the ``div`` elements whose ``data-walk-through-id`` attribute
    matches ``gridtable-row-<digits>``.

    Args:
        soup: Parsed markup of the whole page.

    Returns:
        One ``[name, linkedin_profile]`` list per matching row, in document
        order.
    """
    row_id_pattern = re.compile(r"^gridtable-row-[0-9]*$")
    row_divs = soup.find_all(
        "div", attrs={"data-walk-through-id": row_id_pattern}
    )
    return [get_employee_row_content(row_div) for row_div in row_divs]


def get_employees_from_html(
    html_file_path: str, parser: str = "html.parser"
) -> pd.DataFrame:
    """Parse a saved company page into a DataFrame of employees.

    Args:
        html_file_path: Path to the UTF-8 encoded HTML file.
        parser: Name of the Beautiful Soup parser to use. Defaults to the
            standard-library "html.parser"; pass e.g. "lxml" if that parser
            is installed and preferred.

    Returns:
        A DataFrame with the columns "Name" and "LinkedIn Profile" and one
        row per table row found in the page.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(html_file_path, encoding="utf-8") as employee_fp:
        # Name the parser explicitly: without it Beautiful Soup picks
        # whichever parser happens to be installed (and warns), so the same
        # file could be parsed differently on another machine.
        soup = BeautifulSoup(employee_fp, parser)

    # The file is only needed while parsing; the rest works on the tree.
    employee_titles = ["Name", "LinkedIn Profile"]
    employee_contents = get_employee_contents(soup)
    return pd.DataFrame(data=employee_contents, columns=employee_titles)

In [36]:
# Input: saved HTML of one company page; output: CSV of its employee table.
employee_html_path = "../data/Sequoia Capital _ Tracxn.html"
employee_csv_save_path = "../result_csv/Sequoia Capital _ Tracxn.csv"
# Parse the saved page into a DataFrame of (name, LinkedIn profile) rows.
employee_df = get_employees_from_html(employee_html_path)
# index=False keeps the DataFrame's row index out of the CSV file.
# NOTE(review): the target directory is not created here; presumably it
# already exists -- confirm before running on a fresh checkout.
employee_df.to_csv(employee_csv_save_path, sep=",", index=False)
# Bare expression so the notebook renders the DataFrame as the cell output.
employee_df

Unnamed: 0,Name,LinkedIn Profile
0,Matanya Horowitz,https://linkedin.com/in/matanya-horowitz-87805519
1,Nolan Bushnell,https://linkedin.com/in/nolanbushnell
2,Bryan Schreier,https://linkedin.com/in/bryanschreier
3,David Cahn,https://linkedin.com/in/david-cahn-60150793
4,Roelof Botha,https://linkedin.com/in/roelofbotha
5,Clement Delangue,http://linkedin.com/in/clementdelangue
6,Wei Deng,https://linkedin.com/in/dengwei
7,Mike Vernal,https://linkedin.com/in/vernal
8,Bogomil Balkansky,https://linkedin.com/in/balkansky
9,Jess Lee,https://linkedin.com/in/jesskah
