# In this project we will be analyzing data from Crunchbase related to Start-Up Fundraising Deals

Because the dataset is large and we want to keep our memory usage within a fixed limit, we will read the dataset in chunks rather than loading it all at once.

In [1]:
import pandas as pd

# Let wide frames show up to 99 columns when displayed.
pd.set_option('display.max_columns', 99)

# Stream the CSV in 5000-row chunks instead of loading the whole file.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

# One Series of per-column null counts for every chunk.
mv_list = [piece.isnull().sum() for piece in chunk_iter]

# Stack the per-chunk counts, then add up the entries that share a column name.
combined_mv_counts = pd.concat(mv_list)
unique_combined_mv_counts = combined_mv_counts.groupby(combined_mv_counts.index).sum()
unique_combined_mv_counts.sort_values()

company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64

In [2]:
# Add up the deep memory usage (in bytes) of every column across all chunks.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

# Start from None rather than an empty, dtype-less Series placeholder; the
# first chunk's usage seeds the running total and later chunks are added to it.
memory_fp = None
for chunk in chunk_iter:
    chunk_usage = chunk.memory_usage(deep=True)
    memory_fp = chunk_usage if memory_fp is None else memory_fp + chunk_usage

memory_fp

Index                         920
company_permalink         4057788
company_name              3591326
company_category_code     3421104
company_country_code      3172176
company_state_code        3106051
company_region            3411585
company_city              3505926
investor_permalink        4980548
investor_name             3915666
investor_category_code     622424
investor_country_code     2647292
investor_state_code       2476607
investor_region           3396281
investor_city             2885083
funding_round_type        3410707
funded_at                 3542185
funded_month              3383584
funded_quarter            3383584
funded_year                422960
raised_amount_usd          422960
dtype: int64

In [3]:
# Remove the 'Index' entry so only real columns are counted.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError.
memory_fp = memory_fp.drop('Index', errors='ignore')
memory_fp

company_permalink         4057788
company_name              3591326
company_category_code     3421104
company_country_code      3172176
company_state_code        3106051
company_region            3411585
company_city              3505926
investor_permalink        4980548
investor_name             3915666
investor_category_code     622424
investor_country_code     2647292
investor_state_code       2476607
investor_region           3396281
investor_city             2885083
funding_round_type        3410707
funded_at                 3542185
funded_month              3383584
funded_quarter            3383584
funded_year                422960
raised_amount_usd          422960
dtype: int64

In [4]:
# Total footprint of all columns, converted from bytes to units of 2**20 bytes.
BYTES_PER_MEBIBYTE = 2**20
memory_fp.sum() / BYTES_PER_MEBIBYTE

56.9876070022583

The link columns (company_permalink and investor_permalink) don't add anything to our analysis.

The investor_category_code column has so many missing values (50,427) that it would hinder our analysis.

In [5]:
# Read only the header row to get the column names, instead of relying on the
# `chunk` variable left over from an earlier loop.
all_cols = pd.read_csv('crunchbase-investments.csv', nrows=0, encoding='ISO-8859-1').columns

# Keep every column except the two permalink columns and investor_category_code.
analysis_cols = all_cols.drop(['investor_permalink','company_permalink','investor_category_code']).tolist()

In [6]:
# For each analysis column, record the dtype pandas inferred in every chunk.
col_types = {}
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=analysis_cols)
for chunk in chunk_iter:
    for col, inferred in chunk.dtypes.items():
        # setdefault creates the list on first sight of the column.
        col_types.setdefault(col, []).append(str(inferred))

In [7]:
# Collapse each column's per-chunk dtype list to the set of distinct dtypes seen.
unique_col_types = {name: set(seen) for name, seen in col_types.items()}
unique_col_types

{'company_category_code': {'object'},
 'company_city': {'object'},
 'company_country_code': {'object'},
 'company_name': {'object'},
 'company_region': {'object'},
 'company_state_code': {'object'},
 'funded_at': {'object'},
 'funded_month': {'object'},
 'funded_quarter': {'object'},
 'funded_year': {'float64', 'int64'},
 'funding_round_type': {'object'},
 'investor_city': {'float64', 'object'},
 'investor_country_code': {'float64', 'object'},
 'investor_name': {'object'},
 'investor_region': {'object'},
 'investor_state_code': {'float64', 'object'},
 'raised_amount_usd': {'float64'}}

In [8]:
# Preview a few rows of the most recently read chunk rather than dumping the
# whole frame into the notebook output.
chunk.head(10)

Unnamed: 0,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_name,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
50000,NuORDER,fashion,USA,CA,Los Angeles,West Hollywood,Mortimer Singer,,,unknown,,series-a,2012-10-01,2012-10,2012-Q4,2012,3060000.0
50001,ChaCha,advertising,USA,IN,Indianapolis,Carmel,Morton Meyerson,,,unknown,,series-b,2007-10-01,2007-10,2007-Q4,2007,12000000.0
50002,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,Moshe Ariel,,,unknown,,angel,2008-04-18,2008-04,2008-Q2,2008,500000.0
50003,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,Moshe Ariel,,,unknown,,angel,2010-01-01,2010-01,2010-Q1,2010,750000.0
50004,Unified Color,software,USA,CA,SF Bay,South San Frnacisco,Mr. Andrew Oung,,,unknown,,angel,2010-01-01,2010-01,2010-Q1,2010,
50005,HItviews,advertising,USA,NY,New York,New York City,multiple parties,,,unknown,,angel,2007-11-29,2007-11,2007-Q4,2007,485000.0
50006,LockerDome,social,USA,MO,Saint Louis,St. Louis,multiple parties,,,unknown,,angel,2012-04-17,2012-04,2012-Q2,2012,300000.0
50007,ThirdLove,ecommerce,USA,CA,SF Bay,San Francisco,Munjal Shah,,,unknown,,series-a,2012-12-01,2012-12,2012-Q4,2012,5600000.0
50008,Hakia,search,USA,,TBD,,Murat Vargi,,,unknown,,series-a,2006-11-01,2006-11,2006-Q4,2006,16000000.0
50009,bookacoach,sports,USA,IN,Indianapolis,Indianapolis,Myles Grote,,,unknown,,angel,2012-11-01,2012-11,2012-Q4,2012,


### Numeric columns

- funded_year
- raised_amount_usd

In [9]:
# Collect value counts for every text (object-dtype) column, chunk by chunk.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=analysis_cols)
overall_vc = {}
for chunk in chunk_iter:
    str_data = chunk.select_dtypes('object')
    for col in str_data.columns:
        overall_vc.setdefault(col, []).append(str_data[col].value_counts())
print(overall_vc)

# Merge the per-chunk counts into one Series per column.
combined_vc = {}
for key, chunk_counts in overall_vc.items():
    # Use this column's own counts (`key`); the stale inner-loop variable `col`
    # would give every column the counts of the last column processed.
    # A value that occurs in several chunks appears once per chunk after the
    # concat, so sum by index label to get one total per distinct value.
    combined_vc[key] = (
        pd.concat(chunk_counts)
        .groupby(level=0)
        .sum()
        .sort_values(ascending=False)
    )
    print(key)
    print(combined_vc[key])
    print('--------')

{'investor_name': [Accel Partners                          322
Techstars                               267
500 Startups                            254
Y Combinator                            180
betaworks                                80
Advanced Technology Ventures             79
Cisco                                    67
Adams Street Partners                    63
Advantage Capital Partners               48
Allen & Company                          46
3i Group                                 46
Allegis Capital                          42
Betaspring                               41
Rock Health                              40
Entrepreneurs Roundtable Accelerator     38
Adams Capital Management                 36
Safeguard Scientifics                    36
.406 Ventures                            36
Aberdare Ventures                        34
5AM Ventures                             32
Salesforce                               30
Access Venture Partners                  28
WPP          

None of the text columns can be converted directly to a numeric type. The funded_month and funded_quarter columns could be cleaned and stored as numbers, since the year they embed is already available in the funded_year column, but that would add overhead when querying.

In [10]:
import sqlite3

# Load the CSV into a SQLite table, one 5000-row chunk at a time.
conn = sqlite3.connect('crunchbase.db')

# crunchbase.db persists on disk and the rows are appended, so re-running this
# cell would otherwise duplicate every row. Start from an empty table each time.
conn.execute('DROP TABLE IF EXISTS investments;')
conn.commit()

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

for chunk in chunk_iter:
    # 'append' is still required here: each chunk adds to the rows written by
    # the chunks before it within this run.
    chunk.to_sql('investments', conn, if_exists='append', index=False)

In [11]:
# Ask SQLite for the schema of the investments table and show it.
table_info = pd.read_sql('PRAGMA table_info(investments);', conn)
print(table_info)

    cid                    name     type  notnull dflt_value  pk
0     0       company_permalink     TEXT        0       None   0
1     1            company_name     TEXT        0       None   0
2     2   company_category_code     TEXT        0       None   0
3     3    company_country_code     TEXT        0       None   0
4     4      company_state_code     TEXT        0       None   0
5     5          company_region     TEXT        0       None   0
6     6            company_city     TEXT        0       None   0
7     7      investor_permalink     TEXT        0       None   0
8     8           investor_name     TEXT        0       None   0
9     9  investor_category_code     TEXT        0       None   0
10   10   investor_country_code     TEXT        0       None   0
11   11     investor_state_code     TEXT        0       None   0
12   12         investor_region     TEXT        0       None   0
13   13           investor_city     TEXT        0       None   0
14   14      funding_roun

The column data types seem to match what we expected.

In [14]:
# Number of distinct company names in the investments table.
query = '''
SELECT COUNT(DISTINCT company_name) FROM investments;
'''
distinct_companies = pd.read_sql(query, conn)
print(distinct_companies)

   COUNT(DISTINCT company_name)
0                         11573


Proportion of total funds raised by the top 10% and top 1%, and by the bottom 10% and bottom 1%, of companies

In [20]:
def _funds_share_sql(direction, n_companies, alias):
    """Build one SELECT-list expression for the funds-share query.

    The expression is the percentage of all raised_amount_usd that belongs to
    the first `n_companies` companies when companies are ordered by their
    summed raised_amount_usd in `direction` ('ASC' or 'DESC'); the result
    column is named `alias`.
    """
    template = '''
    SUM(CASE WHEN company_name IN (SELECT company_name FROM investments
                                    GROUP BY company_name
                                    ORDER BY SUM(raised_amount_usd) {direction}
                                    LIMIT {limit})
        THEN raised_amount_usd
        ELSE 0 END) * 100 / SUM(raised_amount_usd) {alias}'''
    # int() guarantees that only a plain integer is spliced into the SQL text.
    return template.format(direction=direction, limit=int(n_companies), alias=alias)


# Derive the group sizes from the data instead of hard-coding 1157 and 115.
company_count = pd.read_sql(
    'SELECT COUNT(DISTINCT company_name) AS n FROM investments;', conn
)['n'].iloc[0]
ten_percent = company_count // 10
one_percent = company_count // 100

# NOTE(review): SQLite sorts NULL before other values in ascending order, and
# SUM() over only-NULL amounts is NULL, so the "bottom" groups may consist of
# companies with no recorded amount. Confirm whether those should be excluded.
query = 'SELECT' + ','.join([
    _funds_share_sql('DESC', ten_percent, 'top_10_percent_funds_proportion'),
    _funds_share_sql('DESC', one_percent, 'top_1_percent_funds_proportion'),
    _funds_share_sql('ASC', ten_percent, 'bottom_10_percent_funds_proportion'),
    _funds_share_sql('ASC', one_percent, 'bottom_1_percent_funds_proportion'),
]) + '\nFROM investments;\n'
print(pd.read_sql(query, conn))

   top_10_percent_funds_proportion  top_1_percent_funds_proportion  \
0                        67.127727                       26.217737   

   bottom_10_percent_funds_proportion  bottom_1_percent_funds_proportion  
0                                 0.0                                0.0  


In [None]:
# Top category of company by number of investments.

In [26]:
# Company category with the largest number of investment rows.
query = '''
SELECT company_category_code, COUNT(*) num_of_investments
FROM investments
GROUP BY company_category_code
ORDER BY COUNT(*) DESC
LIMIT 1;
'''
top_category = pd.read_sql(query, conn)
print(top_category)

  company_category_code  num_of_investments
0              software               14486


Top contributing investor by amount invested

In [27]:
# Investor with the largest summed raised_amount_usd across its rows.
# NOTE(review): raised_amount_usd may be a per-round total repeated for every
# participating investor, in which case this sum overstates an individual
# investor's contribution - confirm against the data dictionary.
query = '''
SELECT investor_name, SUM(raised_amount_usd) amount_invested
FROM investments
GROUP BY investor_name
ORDER BY SUM(raised_amount_usd) DESC
LIMIT 1;
'''
top_investor = pd.read_sql(query, conn)
print(top_investor)

                      investor_name  amount_invested
0  Kleiner Perkins Caufield & Byers     2.243565e+10


Top investor and start-up pairs by amount contributed

In [30]:
# Ten (investor, company) pairs with the largest summed raised_amount_usd.
query = '''
SELECT investor_name, company_name, SUM(raised_amount_usd) amount_invested
FROM investments
GROUP BY investor_name, company_name
ORDER BY SUM(raised_amount_usd) DESC
LIMIT 10;
'''
top_pairs = pd.read_sql(query, conn)
print(top_pairs)

              investor_name company_name  amount_invested
0                   Comcast    Clearwire     1.124000e+10
1                     Intel    Clearwire     1.124000e+10
2               Time Warner    Clearwire     1.124000e+10
3               BrightHouse    Clearwire     9.400000e+09
4                    Google    Clearwire     6.400000e+09
5    Marlin Equity Partners    sigmacare     5.200000e+09
6             Sprint Nextel    Clearwire     5.000000e+09
7      Eagle River Holdings    Clearwire     4.840000e+09
8  Digital Sky Technologies     Facebook     3.400000e+09
9             Goldman Sachs     Facebook     3.000000e+09


Most popular funding round

In [31]:
# Funding round type that appears in the most rows.
query = '''
SELECT funding_round_type
FROM investments
GROUP BY funding_round_type
ORDER BY COUNT(*) DESC
LIMIT 1;
'''
most_popular_round = pd.read_sql(query, conn)
print(most_popular_round)

  funding_round_type
0           series-a


Least popular funding round

In [33]:
# Funding round type that appears in the fewest rows, ignoring rows where the
# round type is missing.
query = '''
SELECT funding_round_type
FROM investments
WHERE funding_round_type IS NOT NULL
GROUP BY funding_round_type
ORDER BY COUNT(*) ASC
LIMIT 1;
'''
least_popular_round = pd.read_sql(query, conn)
print(least_popular_round)

  funding_round_type
0       crowdfunding
