In [88]:
# Import statements
import csv
import json
import os
from getpass import getpass
from time import sleep

import numpy as np
import pandas as pd
from parsel import Selector
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [99]:
class Company:
    '''
    Class to describe a given company
        @field: name: a string for the company name
        @field: description: a string for the company description
        @field: founders: a list of Founder objects
        @field: industries: a list of strings for different industries
        @field: website: a string for the website
        @field: lastStage: a string for the last stage of funding (eg. Series A)
        @field: linkedin: a string for the company's LinkedIn profile
        @field: location: a string for the company's location
    Every field except name starts out empty (None or []) and is filled in later.
    '''
    def __init__(self, companyName):
        self.name = companyName
        self.description = None
        self.founders = []
        self.industries = []
        self.website = None
        self.lastStage = None
        self.linkedin = None
        # Initialised here so that reading .location on a fresh Company returns None
        # instead of raising AttributeError
        self.location = None

    def toJson(self):
        '''
        Serializes this company, including its nested objects, to a JSON string
            @return: a JSON string
        '''
        # Objects that json cannot encode natively are written as their attribute dictionary
        return json.dumps(self, default=lambda o: o.__dict__)
    
class Founder:
    '''
    Class to describe a founder
        @field: name: a string for the founder's name
        @field: education: a list of Education objects (starts empty)
        @field: experience: a list of Experience objects (starts empty)
    '''
    def __init__(self, founderName):
        self.name = founderName
        # Each founder gets its own fresh lists
        self.education = list()
        self.experience = list()

class Education:
    '''
    Class to help describe a founder's education
        @field: school: a string for the school attended
        @field: degree: a string to describe the degree objective (None until set)
        @field: field: a string to describe the major (None until set)
    '''
    def __init__(self, schoolName):
        # Targets are assigned left to right, so the attribute order is school, degree, field
        self.school, self.degree, self.field = schoolName, None, None
class Experience:
    '''
    Class to help describe a founder's experience
        @field: companyName: a string to describe the company's name
        @field: title: a string to describe the title held (None until set)
        @field: dates: a string to describe the dates employed (None until set)
    '''
    def __init__(self, companyName):
        self.companyName = companyName
        # Chained assignment binds title first, then dates
        self.title = self.dates = None

In [90]:
def loadBacktestData(csv1, csv2):
    '''
    Loads the dataframes from Query 1 and Query 2, stacks them, and drops duplicates and NaNs
        @param: csv1: path to CSV 1 (formed by Query 1)
        @param: csv2: path to CSV 2 (formed by Query 2)
        @return: a dataframe with the rows of csv1 followed by the rows of csv2, keeping only
                 the first row per 'Organization Name' and only rows where both 'Founders'
                 and 'LinkedIn' are present
    The row labels of the two inputs are kept as-is (not reset), so labels can repeat;
    use positional access (.iloc) on the result.
    '''
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the replacement
    merged = pd.concat([pd.read_csv(csv1), pd.read_csv(csv2)])
    merged = merged.drop_duplicates(subset=['Organization Name'])
    # Rows without founders or a LinkedIn link cannot be scraped, so they are dropped
    return merged.dropna(subset=['Founders', 'LinkedIn'])

# Loads and combines both CSVs into the df1 dataframe (paths are relative to the working directory)
df1 = loadBacktestData('backtest1.csv', 'backtest2.csv')
# Creates the company_data dictionary that extractLIInfo fills in, keyed by company name
# NOTE: re-running this cell rebinds company_data to an empty dict, discarding scraped entries
company_data = {}

In [91]:
def setupDriver(driverPath, liUsername, liPassword):
    '''
    Starts a Chrome session with chromedriver, opens LinkedIn, and submits the sign-in form
        @param: driverPath: path to the chromedriver executable
        @param: liUsername: string of LI username
        @param: liPassword: string of LI password
        @return: the Chrome Webdriver (can be passed into future function arguments)
    '''
    pause = 2.0  # fixed delay (seconds) used between the steps below

    # Opens Chrome on the LinkedIn landing page
    browser = webdriver.Chrome(driverPath)
    browser.get('https://www.linkedin.com/')
    sleep(pause)

    # Follows the "Sign in" link, fills in both credential fields, then submits the form
    browser.find_element_by_xpath('//a[text()="Sign in"]').click()
    sleep(pause)
    for fieldName, fieldValue in (('session_key', liUsername), ('session_password', liPassword)):
        browser.find_element_by_name(fieldName).send_keys(fieldValue)
    sleep(pause)
    browser.find_element_by_xpath('//button[text()="Sign in"]').click()
    return browser

# Launches LinkedIn and logs in.
# Credentials are read from the LI_USERNAME / LI_PASSWORD environment variables, falling back
# to an interactive prompt, so that real credentials are never saved inside the notebook.
driver = setupDriver(
    './chromedriver',
    os.environ.get('LI_USERNAME') or input('LinkedIn username: '),
    os.environ.get('LI_PASSWORD') or getpass('LinkedIn password: '),
)

In [94]:
def _splitCsvField(value):
    # Splits a comma-separated cell into stripped, non-empty items; non-string cells (e.g. NaN) give []
    if not isinstance(value, str):
        return []
    return [item.strip() for item in value.split(',') if item.strip()]


def _parseEducation(lines):
    # Builds an Education from text lines laid out as
    # [School Name, 'Degree Name', Degree Name, 'Field of Study', FOS]; missing items stay None
    educ_ = Education(lines[0])
    if len(lines) > 2:
        educ_.degree = lines[2]
    if len(lines) > 4:
        educ_.field = lines[4]
    return educ_


def _parseExperience(lines):
    # Builds an Experience from text lines laid out as
    # [title, 'Company Name', companyname, 'Dates Employed', datesemployed, ..],
    # or as ['Company Name', companyname, ..] when the entry has no leading title.
    # Returns None when the company name is missing from the lines.
    if lines[0] == 'Company Name':
        return Experience(lines[1]) if len(lines) > 1 else None
    if len(lines) < 3:
        return None
    exp_ = Experience(lines[2])
    exp_.title = lines[0]
    if len(lines) > 4:
        exp_.dates = lines[4]
    return exp_


def _scrapeFounderProfile(founderName):
    # Opens the first people-search result and reads its education and experience sections
    driver.find_element_by_xpath('//a[@data-control-name = "people_profile_card_name_link"]').click()
    sleep(2.0)
    # Scrolls to the bottom of the webpage (if no scroll, error where the full webpage doesn't load)
    driver.find_element_by_tag_name('html').send_keys(Keys.END)
    sleep(0.75)

    founder_ = Founder(founderName)
    for school in driver.find_elements_by_xpath('//div[@class="pv-entity__degree-info"]'):
        founder_.education.append(_parseEducation(school.text.split('\n')))
    for exp in driver.find_elements_by_xpath('//a[@data-control-name="background_details_company"]'):
        exp_ = _parseExperience(exp.text.split('\n'))
        if exp_ is not None:
            founder_.experience.append(exp_)
    return founder_


def extractLIInfo(entry):
    '''
    Method to add LI information for a company into the company_data dictionary with a new company object
        @param: entry: a pandas series extracted from a single row in the dataframe from loadBacktestData
        @return: None
    This method uses the module-level driver (already logged in) and updates the module-level
    company_data dictionary, keyed by company name. A founder whose profile cannot be scraped is
    still recorded, with empty education and experience lists. The entry itself is not modified.
    '''
    # Creates the company object and populates the fields that come straight from the row
    company_ = Company(entry['Organization Name'])
    company_.description = entry['Description']
    company_.industries = _splitCsvField(entry['Industries'])
    company_.website = entry['Website']
    company_.lastStage = entry['Last Funding Type']
    # Edge case where the LinkedIn link does not end in '/'
    linkedinUrl = entry['LinkedIn']
    if not linkedinUrl.endswith('/'):
        linkedinUrl += '/'
    company_.linkedin = linkedinUrl
    company_.location = entry['Headquarters Location']

    # Navigates to the company's primary LinkedIn page [People Tab]
    peopleUrl = linkedinUrl + "people/"
    driver.get(peopleUrl)
    sleep(1.0)
    # For each founder in the list, the education and experience entries are extracted
    for founder in _splitCsvField(entry['Founders']):
        name_input = driver.find_element_by_id('people-search-keywords')
        name_input.send_keys(founder)
        driver.find_element_by_id("people-search-keywords").send_keys(Keys.ENTER)
        sleep(1.0)
        try:
            founder_ = _scrapeFounderProfile(founder)
        except Exception:
            # Best effort: keep an empty founder record and move on to the next name.
            # Exception (not a bare except) so that Ctrl-C / kernel interrupt still stops the run.
            founder_ = Founder(founder)
            print("{} not found for {}".format(company_.name, founder_.name))
        # Appends the founder object to the company
        company_.founders.append(founder_)
        # Re-navigates to company's LinkedIn page before searching for the next founder
        driver.get(peopleUrl)
        sleep(1.0)
    # Adds the company to the company_data dictionary
    company_data[company_.name] = company_

In [95]:
# I haven't tried running this across all entries in the dataframe yet
# General idea is outlined below:

# Stores data for the first two companies (rows 0 and 1 of df1) in company_data
for i in range(2):
    print(i)
    extractLIInfo(df1.iloc[i])
# Data can then be extracted from company_data, e.g.:
# company_data['Compound'].founders[0].education[0].school

0
1


# Major recorded on the first education entry of the first founder stored for 'Bison Trails'
# (raises KeyError unless that company is already in company_data)
company_data['Bison Trails'].founders[0].education[0].field

In [14]:
# Dates recorded on the first experience entry of the second founder stored for 'Tecton'
# (Experience keeps this value in `dates`; it has no `location` attribute)
company_data['Tecton'].founders[1].experience[0].dates

'2018 – Present'

In [83]:
# Title recorded on the first experience entry of the second founder stored for 'Kodiak Robotics'
# (raises KeyError unless that company is already in company_data)
company_data['Kodiak Robotics'].founders[1].experience[0].title

'Co-Founder & COO'

In [108]:
# Serializes the whole company_data dictionary to one JSON string; any object json cannot
# encode natively is written as its attribute dictionary (__dict__)
company_json = json.dumps(company_data, default=lambda x: x.__dict__)

In [109]:
# Displays the full serialized JSON string as the cell output
company_json

'{"Bright Machines": {"name": "Bright Machines", "description": "Bright Machines brings together flexible factory robots with intelligent software, production data and machine learning.", "founders": [{"name": "Amar Hanspal", "education": [{"school": "Stanford University", "degree": "Director\'s College", "field": null}, {"school": "State University of New York at Stony Brook", "degree": "MS", "field": "Mechanical Engineering"}, {"school": "University Of Bombay", "degree": "Bachelor of Engineering (B.Eng.)", "field": "Mechanical Engineering"}], "experience": [{"companyName": "Bright Machines", "title": "Chief Executive Officer", "dates": "May 2018 \\u2013 Present"}, {"companyName": "BeyondTrust", "title": "Board Member", "dates": "Jul 2019 \\u2013 Present"}, {"companyName": "eSilicon", "title": "Board Member", "dates": "Jun 2014 \\u2013 Nov 2019"}, {"companyName": "RedSpark Inc", "title": "Vice President and Co-Founder", "dates": "Oct 1999 \\u2013 Oct 2001"}]}], "industries": ["Compute