In [203]:
from bs4 import BeautifulSoup
import pandas as pd
import re

In [227]:
def get_VC_name(content_soup):
    """Return the investor name found in one grid row.

    Selects the logo container inside the row's name cell and reads the
    text of the node that directly follows it.

    Args:
        content_soup: Row element exposing a CSS ``select`` method (the
            elements that ``get_contents`` passes in).

    Returns:
        The name text, or the string ``"None"`` when any step of the
        lookup raises (for example, the selector matches nothing).
    """
    selector = "div[data-walk-through-id$=-cell-name] .comp-txn-logo-container"
    try:
        logo_container = content_soup.select(selector)[0]
        return logo_container.next_sibling.text
    except Exception:
        # Best-effort scrape: every failure collapses to the "None" placeholder.
        return "None"

def get_VC_score(content_soup):
    """Return the participation score shown in one grid row.

    Args:
        content_soup: Row element exposing a CSS ``select`` method.

    Returns:
        Text of the first link inside the participation-score cell, or the
        string ``"None"`` when the lookup raises.
    """
    try:
        score_link = content_soup.select(
            "div[data-walk-through-id$=-cell-participationScore] a"
        )[0]
        return score_link.text
    except Exception:
        # Missing cell / missing link / bad input all map to the placeholder.
        return "None"

def get_VC_rounds(content_soup):
    """Return the investment-round count shown in one grid row.

    Args:
        content_soup: Row element exposing a CSS ``select`` method.

    Returns:
        Text of the first link inside the investments-count cell, or the
        string ``"None"`` when the lookup raises.
    """
    selector = "div[data-walk-through-id$=-cell-investmentsCount] a"
    try:
        matching_links = content_soup.select(selector)
        return matching_links[0].text
    except Exception:
        # An empty match list raises IndexError and lands here as well.
        return "None"

def get_VC_portfolio_companies(content_soup):
    """Return the portfolio companies of one grid row as a ';'-joined string.

    Args:
        content_soup: Row element exposing a CSS ``select`` method.

    Returns:
        The texts of every matching company ``span`` joined with ``";"``.
        Returns the string ``"None"`` when nothing matches or when the
        lookup raises, in line with the other ``get_VC_*`` helpers.
    """
    selector = (
        "div[data-walk-through-id$=-cell-portfoliocompanies] "
        "> div > div .comp-txn-logo-container + span"
    )
    try:
        company_spans = content_soup.select(selector)
        joined_names = ";".join(span.text for span in company_spans)
    except Exception:
        return "None"
    # An empty selection does not raise; it would join to "". Map that case to
    # the same "None" placeholder the sibling getters use for a missing cell.
    if not company_spans:
        return "None"
    return joined_names

def get_VC_location(content_soup):
    """Return the investor-location text shown in one grid row.

    Args:
        content_soup: Row element exposing a CSS ``select`` method.

    Returns:
        Full text of the first locations cell, or the string ``"None"``
        when the lookup raises.
    """
    try:
        location_cells = content_soup.select("div[data-walk-through-id$=-cell-locations]")
        first_cell = location_cells[0]
        return first_cell.text
    except Exception:
        # Best-effort: a row without the cell yields the placeholder.
        return "None"

def get_VC_stages(content_soup):
    """Return the investment stages of one grid row, ';'-separated.

    Args:
        content_soup: Row element exposing a CSS ``select`` method.

    Returns:
        Text of the first ``span`` directly under the investment-stages
        cell with every ``","`` replaced by ``";"``, or the string
        ``"None"`` when the lookup raises.
    """
    selector = "div[data-walk-through-id$=-cell-investmentStages] > span"
    try:
        raw_text = content_soup.select(selector)[0].text
        # Swap the in-cell comma separator for ';' (same separator the
        # portfolio-companies column uses).
        return raw_text.replace(",", ";")
    except Exception:
        return "None"

def get_VC_sectors(content_soup):
    """Return the investment sectors of one grid row, ';'-separated.

    Args:
        content_soup: Row element exposing a CSS ``select`` method.

    Returns:
        Text of the first ``span`` directly under the investment-sectors
        cell with every ``","`` replaced by ``";"``, or the string
        ``"None"`` when the lookup raises.
    """
    try:
        sector_span = content_soup.select(
            "div[data-walk-through-id$=-cell-investmentSectors] > span"
        )[0]
        comma_separated = sector_span.text
        return comma_separated.replace(",", ";")
    except Exception:
        # Any failure (no match, unexpected node type) yields the placeholder.
        return "None"

def get_VC_investment_locations(content_soup):
    """Return the investment locations of one grid row, ';'-separated.

    Args:
        content_soup: Row element exposing a CSS ``select`` method.

    Returns:
        Text of the first ``span`` directly under the investment-locations
        cell with every ``","`` replaced by ``";"``, or the string
        ``"None"`` when the lookup raises.
    """
    selector = "div[data-walk-through-id$=-cell-investmentLocations] > span"
    try:
        first_span = content_soup.select(selector)[0]
        return first_span.text.replace(",", ";")
    except Exception:
        # Best-effort scrape: failures collapse to the "None" placeholder.
        return "None"
    
def get_row_content(content_soup):
    """Extract every column value for one grid row.

    Args:
        content_soup: Row element handed to each ``get_VC_*`` helper.

    Returns:
        A list of eight values in this order: name, score, rounds,
        portfolio companies, location, stages, sectors, investment
        locations.
    """
    # Order matters: it defines the column order of each output row.
    column_extractors = (
        get_VC_name,
        get_VC_score,
        get_VC_rounds,
        get_VC_portfolio_companies,
        get_VC_location,
        get_VC_stages,
        get_VC_sectors,
        get_VC_investment_locations,
    )
    return [extract(content_soup) for extract in column_extractors]

def get_contents(soup):
    """Collect the extracted values of every grid row in the document.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        A list with one ``get_row_content`` result per ``div`` whose
        ``data-walk-through-id`` matches ``gridtable-row-<digits>``.
    """
    row_id_pattern = re.compile(r"^gridtable-row-[0-9]*$")
    row_divs = soup.find_all("div", attrs={"data-walk-through-id": row_id_pattern})
    return [get_row_content(row_div) for row_div in row_divs]

def get_title(soup):
    """Read the column titles from the grid's header element.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        A list with one title per child of the first element whose
        ``data-walk-through-id`` is ``gridtable-column``. Each title is the
        ``title`` attribute of the first node inside that child's
        ``.comp--gridtable__column-cell--menu-middle`` element.

    Raises:
        IndexError: If the header element, or the menu-middle element of
            any child, is not found.
    """
    header = soup.find_all(attrs={"data-walk-through-id": "gridtable-column"})[0]
    column_titles = []
    for column_cell in header.children:
        label_holder = column_cell.select(".comp--gridtable__column-cell--menu-middle")[0]
        column_titles.append(label_holder.contents[0]["title"])
    return column_titles

def get_VCs_from_html(filepath):
    """Parse a saved grid-table HTML page into a DataFrame.

    Args:
        filepath: Path of the HTML file to read.

    Returns:
        pandas.DataFrame with one row per grid row returned by
        ``get_contents`` and column labels taken from ``get_title``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even when parsing raises. The
    # document is parsed inside the block so the handle can be released
    # before extraction starts.
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed — consider pinning one for reproducible parsing.
    with open(filepath) as fp:
        soup = BeautifulSoup(fp)
    title = get_title(soup)
    contents = get_contents(soup)
    # Pass the list of rows directly: no NumPy round-trip is needed, and an
    # empty page still yields an empty frame with the expected columns.
    return pd.DataFrame(data=contents, columns=title)


In [230]:
# Parse the saved HTML page at `filepath` into a DataFrame of investors.
filepath = "../data/Tracxn_SeriesA_AI.html"
df = get_VCs_from_html(filepath)
# NOTE(review): to_csv() is called without a path, so pandas returns the CSV
# text as the cell's value and writes no file — confirm that is intended.
df.to_csv()

Unnamed: 0,Investor Name,Score,#Rounds,Portfolio Companies,Investor Location,Stages of Entry,Sectors of Investment,Locations of Investment
0,Sequoia Capital,15,81,Stripe;Bitmain;PicsArt,Menlo Park,Series A (578);Seed (306)[+13],Enterprise Applications (530);Consumer (358)[+39],United States (629);China (421)[+44]
1,Y Combinator,13,69,Replit;Scale;Observe.AI,Mountain View,Seed (4153);Series A (108)[+9],Enterprise Applications (1991);Consumer (902)[...,United States (3002);India (213)[+82]
2,Accel,12,65,Notion;UiPath;Freshworks,Palo Alto [+3],Series A (372);Seed (300)[+11],Enterprise Applications (484);Consumer (254)[+37],United States (544);India (214)[+40]
3,Khosla Ventures,12,63,Gitlab;Checkr;Upstart,Menlo Park,Series A (242);Seed (159)[+6],Enterprise Applications (205);High Tech (103)[...,United States (505);India (15)[+18]
4,Lightspeed Venture Partners,12,58,ThoughtSpot;Yellow Freight;People.ai,Menlo Park [+13],Series A (269);Seed (189)[+11],Enterprise Applications (356);Consumer (184)[+37],United States (507);China (64)[+27]
...,...,...,...,...,...,...,...,...
115,Serena,3,14,Dataiku;Lifen;Implicity,Paris,Series A (31);Seed (24)[+5],Enterprise Applications (41);Consumer (19)[+28],France (57);United States (18)[+7]
116,TLV Partners,2,13,Immunai;AIdoc;Oribi,Tel Aviv,Seed (25);Series A (15)[+2],Enterprise Applications (54);Enterprise Infras...,Israel (41);United States (27)
117,StageOne Ventures,2,13,Capitalise.ai;theator;Avanan,Herzliya,Seed (25);Series A (16)[+2],Enterprise Applications (40);Enterprise Infras...,Israel (35);United States (16)
118,Benhamou Global Ventures,2,13,AiDash;Drishti;German Bionic System,Tel Aviv [+4],Series A (26);Seed (16)[+3],Enterprise Applications (44);Enterprise Infras...,United States (51);Israel (9)[+6]
