In [1]:
import time
import os
import random
import logging
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from tqdm import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Configure logging
logging.basicConfig(filename='scraping_errors.log', level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')

# Set up Chrome options
chrome_options = Options()

# # Run Chrome in headless mode, i.e., without opening a browser window
chrome_options.add_argument('--headless')  

# Set ChromeDriver path explicitly
webdriver.chrome.driver = r'C:\Users\hafiz\AppData\Local\Programs\chromedriver-win64'

# Run the driver
driver = webdriver.Chrome(options=chrome_options)

In [3]:
file_name = 'links_of_companies.txt'

if not os.path.exists(file_name):
    print(file_name, 'file does not exist, starting scraping the links...')

    # URL of the website to scrape
    url = 'https://www.findfunding.vc/'

    # Load the webpage
    driver.get(url)

    # Clicking the 'see more' button at the bottom of 
    # the page until everything is loaded
    count_click = 0
    while True:
        try:
            # Find the button element
            time.sleep(5)
            xpath = '//*[@id="list2"]/section/div/div[2]/div[2]/div/div[2]/div[2]/button'
            button = driver.find_element(By.XPATH, xpath)

            # If the button is found, click it
            button.click()
            count_click += 1
            print("Button clicked successfully. No of clicks:", count_click)
        except NoSuchElementException:
            # Stop if button not found
            print("Button not found.")
            break

    # Find the elements
    xpath = '//*[@id="list2"]/section/div/div[2]/div[2]/div/div[2]/div[1]/div/div/div/div/div[1]/a'
    link_elements = driver.find_elements(By.XPATH, xpath)

    # Use list comprehension with tqdm for the progress bar
    links = [link_element.get_attribute('href') for link_element in 
            tqdm(link_elements, desc="Extracting links", unit=" links")]

    # Save the links to the local directory
    with open(file_name, 'w') as file:
        for link in links:
            file.write("%s\n" % link)
else:
    # Open the file in read mode
    with open(file_name, 'r') as file:
        # Read lines from the file and store them into a list
        links = file.readlines()
        # Strip newline characters from each line
        links = [link.strip() for link in links]

    print(file_name, 'file exist, links retrieved from the file.')

links_of_companies.txt file exist, links retrieved from the file.


In [4]:
# Initialize the dataframe
df = pd.DataFrame()

for count, link in enumerate(tqdm(links, desc="Extracting company info", unit=" companies")):
    # Restart the chrome browser
    if count % 100 == 0 and count > 0:
        driver.quit()
        driver = webdriver.Chrome(options=chrome_options)

    # Load the webpage
    driver.get(link)
    time.sleep(3)

    while True:
        try:
            # Add the data into a json file
            details2_element = driver.find_element(By.XPATH, '//*[@id="list-details2"]/section/div/div/div/div/div[2]')
            details3_element = driver.find_element(By.ID, 'list-details3')
            details4_element_p1 = driver.find_element(By.XPATH, '//*[@id="list-details4"]/section/div/div/div[1]')
            details4_element_p2 = driver.find_element(By.XPATH, '//*[@id="list-details4"]/section/div/div/div[2]/div/div[2]')

            company_detail = {
                'company_name': details3_element.find_element(By.TAG_NAME, 'h1').text,
                'about': details3_element.find_element(By.TAG_NAME, 'p').text,
                'website': details4_element_p1.find_element(By.XPATH, 'a[1]').get_attribute('href'),
                'email': details4_element_p2.find_element(By.XPATH, 'div[1]/div/div/div/div/div').text,
                'investor_names': details4_element_p2.find_element(By.XPATH, 'div[2]/div/div/div/div/div').text,
                'location/hq': details4_element_p2.find_element(By.XPATH, 'div[3]/div/div/div/div/div').text,
                'lead/follow': details2_element.find_element(By.XPATH, 'div[1]/div/div/div/div/div').text,
                'initial_check_size': details2_element.find_element(By.XPATH, 'div[4]/div/div/div/div/div').text,
                'hardware/software': details2_element.find_element(By.XPATH, 'div[7]/div/div/div/div/div').text,
                'primary_stage_focus': details2_element.find_element(By.XPATH, 'div[2]/div/div/div/div/div').text,
                'initial_ownership_target': details2_element.find_element(By.XPATH, 'div[5]/div/div/div/div/div').text,
                'b2b/b2c/b2b2c': details2_element.find_element(By.XPATH, 'div[8]/div/div/div/div/div').text,
                'can_also_enter_at': details2_element.find_element(By.XPATH, 'div[3]/div/div/div/div/div').text,
                'investment_focus': details2_element.find_element(By.XPATH, 'div[6]/div/div/div/div/div').text,
                'does_not_invest_in': details2_element.find_element(By.XPATH, 'div[9]/div/div/div/div/div').text
            }

            # Trying to get the Linkedin URL
            try:
                company_detail = {
                    **company_detail, 'linkedin': details4_element_p1.find_element(By.XPATH, 'a[2]').get_attribute('href')
                    }
            except NoSuchElementException as e:
                logging.error('No LinkedIn profile found! URL: %s\n%s', link, e.msg)

            # Trying to get the Crunchbase URL
            try:
                # Click the button
                driver.find_element(By.XPATH, '//*[@id="dot-button"]').click()
                company_detail = {
                    **company_detail, 'crunchbase': driver.find_element(By.XPATH, '//*[@id="demo-positioned-menu"]/div[3]/ul/div/a').get_attribute('href')
                }
            except NoSuchElementException as e:
                logging.error('No CrunchBase URL found! URL: %s\n%s', link, e.msg)
            except ElementClickInterceptedException as e:
                logging.error('No CrunchBase URL found! URL: %s\n%s', link, e.msg)
            
            # Move to next page
            break
        # Trying again if fails
        except NoSuchElementException:
            logging.error('Element not found! URL: %s\n%s', link, e.msg)
            time.sleep(1)

    # Appending to a pandas dataframe
    df = pd.concat([df, pd.DataFrame.from_records([company_detail])])

Extracting company info: 100%|██████████| 1234/1234 [1:45:31<00:00,  5.13s/ companies]


In [5]:
# Quit the selenium driver
driver.quit()

In [6]:
# Show samples of the data
df.head()

Unnamed: 0,company_name,about,website,email,investor_names,location/hq,lead/follow,initial_check_size,hardware/software,primary_stage_focus,initial_ownership_target,b2b/b2c/b2b2c,can_also_enter_at,investment_focus,does_not_invest_in,linkedin,crunchbase
0,.406 Ventures,.406 Ventures is an early-stage venture capita...,http://www.406ventures.com/,contact@406ventures.com,"Liam Donohue, Graham Brooks, Payal Agrawal Div...",Massachusetts,Mostly Follow,$1-3m\n> $3m,Software / Code,Series A,-,-,-,Privacy & Security\nHealthcare\nData & Analytics,-,https://www.linkedin.com/company/.406-ventures,https://www.crunchbase.com/organization/406-ve...
0,(ERA) Entrepreneurs Roundtable Accelerator,"ERA combines seed capital, hands-on help and a...",http://eranyc.com/,info@eranyc.com,"Murat Aktihanoglu, Jonathan Axelrod",New York,Mostly Follow,$100-250k,Software / Code,Seed,5-7%,-,-,Financial Services / Payments\nSaaS\nE-Commerc...,-,https://www.linkedin.com/company/er-accelerator/,https://www.crunchbase.com/organization/er-acc...
0,1confirmation,1confirmation is a venture fund that supports ...,http://www.1confirmation.com/,-,"Nick Tomaino, Richard Chen, Frank Hourigan, Mi...",California,Can Lead,-,Software / Code,Seed,-,-,-,Web3 / Blockchain,-,https://www.linkedin.com/company/1confirmation/,https://www.crunchbase.com/organization/1confi...
0,1Sharpe Ventures,_[1Sharpe Ventures](http://1sharpe.ventures/) ...,https://1sharpe.ventures/,Email me! Riley@1sharpe.com,"Ginny Miller, Kat Collins, Riley Finch",California,Can Lead,$500k-1m\n$1-3m,Software / Code\nServices,Pre-Seed\nSeed\nSeries A,7-10%,Agnostic,-,Agriculture & Farming\nAI / ML\nCleanTech / Im...,"Infrastructure, Dev Tools, B2B Productivity to...",https://www.linkedin.com/company/1sharpe-capit...,https://www.crunchbase.com/organization/1sharp...
0,1Up Ventures,At 1Up Ventures we are building a diverse and ...,https://1upfund.com/,Contact page on website,"Ed Fries, Kelly Wallick",Washington,Mostly Follow,$500k-1m,Software / Code,Seed,No Target,B2C,Seed,Gaming,We only invest in games and content focused st...,https://www.linkedin.com/company/1upfund/about/,https://www.crunchbase.com/organization/1up-ve...


In [7]:
# Save the DataFrame to a CSV file
df.to_csv('find_funding_companies.csv', index=False)