In [1]:
import time
import requests
import json
import pandas as pd
import numpy as np
import polars as pl
import re
import os
import glob
import shutil
from pathlib import Path

# disable truncation with respect to the maximum string length in some column
# max_len = df["Headquarters Location"].str.lengths().max()
# pl.Config.set_fmt_str_lengths(max_len)

In [2]:
# Crunchbase API Key goes here
api_key = "5f2dfeab559fe9ca8a710e3db58c5f0c"

# Base URL for the Crunchbase API
base_url = "https://api.crunchbase.com/api/v4/searches/organizations?user_key="
# Headers for the POST request
headers = {
    "Content-Type": "application/json",
}

# Function to make API request
def make_request(data, page_number):
    data["page"] = {"number": page_number}
    response = requests.post(base_url + api_key, headers=headers, data=json.dumps(data))
    return response.json()

# The body request
data = {
    "field_ids": [
        "identifier",
        "founded_on",
        "name",
        "rank_org",
        "short_description",
        "categories",
        "location_identifiers"
    ],
    "order": [
        {
            "field_id": "name",
            "sort": "asc"
        }
    ],
    "query": [
        {
            "type": "predicate",
            "field_id": "category_group_list",
            "operator_id": "contains",
            "values": ["Artificial Intelligence", "Machine Learning", "Intelligent Systems", "Natural Language Processing"]
        },
        {
            "type": "predicate",
            "field_id": "location_identifiers",
            "operator_id": "includes",
            "values": ["498a147a51854faeb55ae18beb8d8548"]
        }
    ],
    "limit": 1000
}

all_data = []
for i in range(10):  # 10,000 organizations, 1,000 per request
    print(f"Making request {i+1}")
    res = make_request(data, i+1)
    print(res)
    # all_data.extend(res["data"]["items"])
    
    # Sleep for 2.4 seconds to respect rate limit of 25 requests per minute
    time.sleep(2.4)
    
# Now all_data contains 10,000 organization data
# print(all_data)

Making request 1
[{'message': 'invalid field_id', 'code': 'MD101', 'entity_def_id': 'organization', 'collection_id': 'organizations', 'field_id': 'category_group_list'}]
Making request 2
[{'message': 'invalid field_id', 'code': 'MD101', 'entity_def_id': 'organization', 'collection_id': 'organizations', 'field_id': 'category_group_list'}]
Making request 3
[{'message': 'invalid field_id', 'code': 'MD101', 'entity_def_id': 'organization', 'collection_id': 'organizations', 'field_id': 'category_group_list'}]
Making request 4
[{'message': 'invalid field_id', 'code': 'MD101', 'entity_def_id': 'organization', 'collection_id': 'organizations', 'field_id': 'category_group_list'}]
Making request 5
[{'message': 'invalid field_id', 'code': 'MD101', 'entity_def_id': 'organization', 'collection_id': 'organizations', 'field_id': 'category_group_list'}]
Making request 6
[{'message': 'invalid field_id', 'code': 'MD101', 'entity_def_id': 'organization', 'collection_id': 'organizations', 'field_id': 'cat

# DATA IMPORT

## AI company data

In [3]:
# create the input_dir（input directory）
source_path = os.path.dirname(os.path.abspath('__file__'))
# source_path = '/scratch/bell/sido/m&a'
INPUT_DIR = os.path.join(source_path, 'data')

# if INPUT_DIR has not been created yet, create it
if not os.path.isdir(INPUT_DIR):
    os.mkdir(INPUT_DIR)

# output_dir(output directory) creation
OUTPUT_DIR = os.path.join(source_path, 'outputs')

# if OUTPUT_DIR has not been created yet, create it
if not os.path.isdir(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

In [5]:
# Once you run this code, comment it out
# move csv files to `data` directory(=folder)
unique_dir_names = []
for f in Path(f'{source_path}').rglob('*.csv'):
    unique_dir_names.append(f)
for g in Path(f'{source_path}').rglob('*.xlsx'):
    unique_dir_names.append(g)

for file in list(set(unique_dir_names)):
    print(f'moved file: {file}')
    shutil.move(f'{file}', f'{INPUT_DIR}')

moved file: /Users/satoshiido/Documents/programming/RA/crunchbase/ai-project-2023-2017-0102-1231-all-US.csv
moved file: /Users/satoshiido/Documents/programming/RA/crunchbase/ai-project-2023-2012-07-12.csv
moved file: /Users/satoshiido/Documents/programming/RA/crunchbase/ai-project-2023-2016-0101-0101-all-US.csv
moved file: /Users/satoshiido/Documents/programming/RA/crunchbase/ai-project-2023-2014-0101-0101-others.csv
moved file: /Users/satoshiido/Documents/programming/RA/crunchbase/ai-project-2023-2017-0101-0101-all-US.csv
moved file: /Users/satoshiido/Documents/programming/RA/crunchbase/ai-project-2023-2019-0101-0101-all-US.csv
moved file: /Users/satoshiido/Documents/programming/RA/crunchbase/ai-project-2023-2018-0102-1231-all-US.csv
moved file: /Users/satoshiido/Documents/programming/RA/crunchbase/ai-project-2023-2012-01-06.csv
moved file: /Users/satoshiido/Documents/programming/RA/crunchbase/ai-project-2023-2015-0101-0101-all-US.csv
moved file: /Users/satoshiido/Documents/programmin

In [4]:
# Pandas function to let us read csv files without having to specify the directory
def read_csv(name, **kwrgs):
    path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {path}')
    return pd.read_csv(path, **kwrgs)

# Polars function to let us read csv files without having to specify the directory
def read_csvpl(name, **kwrgs):
    path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {path}')
    return pl.read_csv(path, **kwrgs)

In [132]:
a = read_csvpl("ai-project-2023-2010",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
b = read_csvpl("ai-project-2023-2011",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
c = read_csvpl("ai-project-2023-2012-01-06",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
d = read_csvpl("ai-project-2023-2012-07-12",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
e = read_csvpl("ai-project-2023-2013-0101-0101-ai",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
f = read_csvpl("ai-project-2023-2013-0102-1231-ai",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
g = read_csvpl("ai-project-2023-2013-0101-1231-others",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
h = read_csvpl("ai-project-2023-2014-0101-0101-ai",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
i = read_csvpl("ai-project-2023-2014-0101-0101-others",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
j = read_csvpl("ai-project-2023-2014-0102-1231-ai",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
k = read_csvpl("ai-project-2023-2014-0102-1231-others",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
l = read_csvpl("ai-project-2023-2015-0101-0101-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
m = read_csvpl("ai-project-2023-2015-0102-1231-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
n = read_csvpl("ai-project-2023-2016-0101-0101-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
o = read_csvpl("ai-project-2023-2016-0102-1231-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
p = read_csvpl("ai-project-2023-2017-0101-0101-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
q = read_csvpl("ai-project-2023-2017-0102-1231-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
r = read_csvpl("ai-project-2023-2018-0101-0101-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
s = read_csvpl("ai-project-2023-2018-0102-1231-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
t = read_csvpl("ai-project-2023-2019-0101-0101-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
u = read_csvpl("ai-project-2023-2019-0102-1231-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
v = read_csvpl("ai-project-2023-2020-0101-0101-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})
w = read_csvpl("ai-project-2023-2020-0102-1231-all-US",infer_schema_length=10000, dtypes={"Number of Founders (Alumni)": str, "Number of Alumni": str, "CB Rank (School)": str, "Similar Companies":str, "Accelerator Duration (in weeks)": str, "Number of Articles": str, "Valuation at IPO": str, "Valuation at IPO Currency (in USD)": str, "IPqwery - Patents Granted": str, "Number of Employees": str, "Number of Diversity Investments": str, "Money Raised at IPO": str, "Money Raised at IPO Currency (in USD)": str, "Number of Lead Investments": str, "Number of Exits": str, "Number of Exits (IPO)": str, "Price": str, "Price Currency (in USD)": str})

Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2010.csv
Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2011.csv
Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2012-01-06.csv
Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2012-07-12.csv
Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2013-0101-0101-ai.csv
Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2013-0102-1231-ai.csv
Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2013-0101-1231-others.csv
Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2014-0101-0101-ai.csv
Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2014-0101-0101-others.csv
Load: /Users/satoshiido/Documents/programming/RA/crunchbase/data/ai-project-2023-2014-0102-1231-ai.csv
Lo

In [150]:
# df = pl.concat([a, b, c, d, e, f, g, h, i, j, k, l, m, n ,o, p, q, r, s, t, u, v, w],how="vertical_relaxed")
df = pl.concat([a, b, c, d, e, f, g, h, i, j, k, l, m, n ,o, p, q, r, s, t, u, v, w])

# drop the duplicate rows
df = df.unique(keep="first")

df.head()

Organization Name,Organization Name URL,Industries,Headquarters Location,Description,CB Rank (Company),Headquarters Regions,Estimated Revenue Range,Operating Status,Founded Date,Founded Date Precision,Exit Date,Exit Date Precision,Closed Date,Closed Date Precision,Company Type,Website,Twitter,Facebook,LinkedIn,Contact Email,Phone Number,Number of Articles,Hub Tags,Full Description,Investor Type,Investment Stage,Number of Portfolio Organizations,Number of Investments,Number of Lead Investments,Number of Exits,Number of Exits (IPO),Accelerator Program Type,Accelerator Application Deadline,Accelerator Duration (in weeks),School Type,School Program,…,Announced Date Precision,Price,Price Currency,Price Currency (in USD),Acquisition Type,Acquisition Terms,IPO Status,IPO Date,Delisted Date,Delisted Date Precision,Money Raised at IPO,Money Raised at IPO Currency,Money Raised at IPO Currency (in USD),Valuation at IPO,Valuation at IPO Currency,Valuation at IPO Currency (in USD),Stock Symbol,Stock Symbol URL,Stock Exchange,Number of Events,CB Rank (Organization),CB Rank (School),Trend Score (7 Days),Trend Score (30 Days),Trend Score (90 Days),G2 Stack - Total Products Active,Apptopia - Number of Apps,BuiltWith - Active Tech Count,IPqwery - Patents Granted,IPqwery - Most Popular Patent Class,Aberdeen - IT Spend,Aberdeen - IT Spend Currency,Aberdeen - IT Spend Currency (in USD),Number of Diversity Investments,Last Leadership Hiring Date,Last Layoff Mention Date,Similar Companies
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,str,str,str,str,str,str,str,str,…,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,str,str,f64,f64,f64,i64,i64,i64,str,str,i64,str,i64,str,str,str,str
"""3rdPlace""","""https://www.crunchbase.com/organization/3rdpla…","""Analytics, Artificial Intelligence, Internet, …","""Milano, Lombardia, Italy""","""Data-driven tech company focusing on data gove…","""801,127""","""European Union (EU), Europe, Middle East, and …","""$1M to $10M""","""Active""","""2010-01-01""","""day""",,,,,"""For Profit""","""http://3rdplace.com/""","""https://twitter.com/3rdplacedigital""",,"""https://www.linkedin.com/company/1459001/""","""info@3rdplace.com""","""+39 02 76 28 10 64""","""4""",,"""3rdPlace is a data-driven tech company founded…",,,,,,,,,,,,,…,,,,,,,"""Private""",,,,,,,,,,,,,,"""822,732""",,-0.6,-0.3,-0.9,20.0,,70,,,9987.0,"""USD""",9987.0,,,,"""3"""
"""AlphaGenius""","""https://www.crunchbase.com/organization/alphag…","""Fashion, Finance, FinTech, Machine Learning, S…","""Los Angeles, California, United States""","""Technology and Investment Company""","""862,752""","""Greater Los Angeles Area, West Coast, Western …","""$1M to $10M""","""Active""","""2010-01-01""","""year""",,,,,"""For Profit""","""http://alphagenius.com""","""http://twitter.com/randysaaf""",,"""http://www.linkedin.com/company/alphagenius-in…","""info@alphagenius.com""",,"""1""",,"""AlphaGenius is a technology and investment com…",,,,,,,,,,,,,…,,,,,,,"""Private""",,,,,,,,,,,,,,"""885,631""",,0.2,-0.6,-1.0,12.0,,37,,,,,,,,,"""3"""
"""alva""","""https://www.crunchbase.com/organization/alva-2…","""Analytics, Artificial Intelligence, Business I…","""London, England, United Kingdom""","""alva provides Stakeholder Intelligence for Cor…","""138,150""","""Europe, Middle East, and Africa (EMEA)""","""$1M to $10M""","""Active""","""2010-03-01""","""day""",,,,,"""For Profit""","""http://www.alva-group.com/en/""","""https://twitter.com/alvareputation""","""https://www.facebook.com/alva.reputation.analy…","""https://www.linkedin.com/company/alva_858801""","""info@alva-group.com""","""+442037359780""","""8""",,"""alva provides Stakeholder Intelligence for Cor…",,,,,,,,,,,,,…,,,,,,,"""Private""",,,,,,,,,,,,,,"""140,044""",,-0.1,2.2,-0.2,24.0,,24,"""0""",,205809.0,"""USD""",205809.0,,,,"""4"""
"""Anand International College of Engineering""","""https://www.crunchbase.com/organization/anand-…","""Artificial Intelligence, Education, Higher Edu…","""Jaipur, Rajasthan, India""","""Anand International College of Engineering off…","""1,207,389""","""Asia-Pacific (APAC)""",,"""Active""","""2010-01-01""","""year""",,,,,"""For Profit""","""https://anandice.ac.in""","""https://twitter.com/anandcollegeeng""","""https://www.facebook.com/Anand-International-C…","""https://www.linkedin.com/school/anand-college-…","""info@anandice.ac.in""","""01429- 234994""",,,,,,,,,,,,,,,,…,,,,,,,"""Private""",,,,,,,,,,,,,,"""1,235,859""",,3.6,4.2,3.0,,,6,,,,,,,,,"""6"""
"""Argentum""","""https://www.crunchbase.com/organization/global…","""Advertising, Artificial Intelligence, Digital …","""Windsor, Windsor and Maidenhead, United Kingdo…","""Supplier of Innovative Marketing Tech Solution…","""291,440""","""Europe, Middle East, and Africa (EMEA)""",,"""Active""","""2010-01-01""","""year""","""2016-06-06""","""day""",,,"""For Profit""","""https://www.argq.io/""","""https://twitter.com/Argentum47inc""",,"""https://www.linkedin.com/company/argentum-47-i…","""IR@ARGQ.io""",,"""4""",,"""Argentum 47 Inc. is a diversified holding comp…",,,,,,,,,,,,,…,,,,,,,"""Public""","""2016-06-06""",,,,,,,,,"""ARGQ""","""https://www.crunchbase.com/ipo/global-equity-i…","""OTCQB""",,"""299,284""",,0.7,3.0,1.9,,,19,"""1""","""Medical Or Veterinary Science; Hygiene""",,,,,,,"""8"""


In [158]:
# select only data with US headquarters
if_us = ["United States"]
pattern = r"\bUnited States\b"
df = df.with_columns(pl.col("Headquarters Location").str.contains(pattern).alias("location_us"))
df = df.filter(pl.col("location_us") == True)

In [159]:
# output the combined csv file
df.write_csv(os.path.join(OUTPUT_DIR, "combine_crunchbase.csv"))

## High tech company data

In [15]:
df = pd.read_csv("/Users/satoshiido/Documents/programming/RA/ai/data/high-tech-company/ai-project-hightech-IT-2010-0101-0101-(1).csv")
df.head()

Unnamed: 0,Organization Name,Organization Name URL,Industries,Headquarters Location,Description,CB Rank (Company),Headquarters Regions,Estimated Revenue Range,Operating Status,Founded Date,...,BuiltWith - Active Tech Count,IPqwery - Patents Granted,IPqwery - Most Popular Patent Class,Aberdeen - IT Spend,Aberdeen - IT Spend Currency,Aberdeen - IT Spend Currency (in USD),Number of Diversity Investments,Last Leadership Hiring Date,Last Layoff Mention Date,Similar Companies
0,Domo,https://www.crunchbase.com/organization/domo,"Analytics, Business Intelligence, Enterprise S...","American Fork, Utah, United States",Domo designs and delivers an executive managem...,369,Western US,$10M to $50M,Active,2010-01-01,...,84.0,112.0,Computing; Calculating,,,,,2022-03-01,2020-04-10,10.0
1,ID.me,https://www.crunchbase.com/organization/id-me,"Cloud Data Services, Cyber Security, Fraud Det...","Mclean, Virginia, United States",ID.me is an online identity verification platf...,1690,"Washington DC Metro Area, East Coast, Southern US",,Active,2010-01-01,...,95.0,13.0,Computing; Calculating,2073852.0,USD,2073852.0,,2023-04-13,2022-06-21,7.0
2,Mapbox,https://www.crunchbase.com/organization/mapbox,"Automotive, Business Intelligence, Data Visual...","Washington, District of Columbia, United States",Mapbox is a location data platform for mobile ...,1894,"Washington DC Metro Area, Southern US",$100M to $500M,Active,2010-01-01,...,74.0,53.0,Computing; Calculating,,,,,2021-03-01,,8.0
3,OneStream Software,https://www.crunchbase.com/organization/onestr...,"Information Technology, Software","Rochester, Michigan, United States",OneStream Software is an independent software ...,2097,"Greater Detroit Area, Great Lakes, Midwestern US",$100M to $500M,Active,2010-01-01,...,36.0,0.0,,1044277.0,USD,1044277.0,,,,5.0
4,Arbitrum,https://www.crunchbase.com/organization/arbitrum,"Content, Digital Media, Information Technology...","Newark, Delaware, United States","Arbitrum is a text, image, and video moderatio...",2298,"Greater Philadelphia Area, East Coast, Souther...",Less than $1M,Active,2010-01-01,...,22.0,,,,,,,,,220.0
