In [388]:
import pandas as pd

# Data snapshot

In [389]:
df = pd.read_csv('crunchbase-investments.csv', encoding='ISO-8859-1', low_memory=False)
df.head(5)

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,2012-10,2012-Q4,2012.0,2000000.0
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,2012-01,2012-Q1,2012.0,20000.0
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,2012-01,2012-Q1,2012.0,20000.0
3,/company/zoopshop,ZoopShop,software,USA,OH,Columbus,columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15,2012-02,2012-Q1,2012.0,20000.0
4,/company/efuneral,eFuneral,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08,2011-09,2011-Q3,2011.0,20000.0


In [390]:
df.isnull().sum()

company_permalink             1
company_name                  1
company_category_code       643
company_country_code          1
company_state_code          492
company_region                1
company_city                533
investor_permalink            2
investor_name                 2
investor_category_code    50427
investor_country_code     12001
investor_state_code       16809
investor_region               2
investor_city             12480
funding_round_type            3
funded_at                     3
funded_month                  3
funded_quarter                3
funded_year                   3
raised_amount_usd          3599
dtype: int64

In [391]:
df.describe()

Unnamed: 0,funded_year,raised_amount_usd
count,52867.0,49271.0
mean,2009.598615,13836380.0
std,2.677429,48939520.0
min,1987.0,1000.0
25%,2008.0,1875000.0
50%,2010.0,6500000.0
75%,2012.0,15000000.0
max,2013.0,3200000000.0


In [392]:
df.shape

(52870, 20)

# Unnecessary columns

We can drop/exclude the columns **`funded_month`** and **`funded_year`** because they contain redundant info that is already contained in **`funded_at`**.

# Missing value counts

In [393]:
use_cols = ['investor_region', 'investor_permalink', 'investor_name',
        'investor_country_code', 'investor_city', 'investor_category_code', 
        'funding_round_type', 'company_category_code', 'funded_at', 
        'company_state_code', 'company_region', 'company_permalink', 
        'company_name', 'company_country_code', 'company_city', 
        'investor_state_code', 'funded_quarter', 'raised_amount_usd']

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=use_cols)

for chunk in chunk_iter:    
    missing_value_counts = chunk.isnull().sum()
    print(missing_value_counts)
    
    print('='*50)

company_permalink            0
company_name                 0
company_category_code       52
company_country_code         0
company_state_code          53
company_region               0
company_city                64
investor_permalink           0
investor_name                0
investor_category_code    2557
investor_country_code      778
investor_state_code       1371
investor_region              0
investor_city              900
funding_round_type           0
funded_at                    0
funded_quarter               0
raised_amount_usd          653
dtype: int64
company_permalink            0
company_name                 0
company_category_code       51
company_country_code         0
company_state_code          43
company_region               0
company_city                45
investor_permalink           0
investor_name                0
investor_category_code    5000
investor_country_code      261
investor_state_code        714
investor_region              0
investor_city             

# Memory footprint

In [394]:
use_cols = ['investor_region', 'investor_permalink', 'investor_name',
        'investor_country_code', 'investor_city', 'investor_category_code', 
        'funding_round_type', 'company_category_code', 'funded_at', 
        'company_state_code', 'company_region', 'company_permalink', 
        'company_name', 'company_country_code', 'company_city', 
        'investor_state_code', 'funded_quarter', 'raised_amount_usd']

total_memory = 0
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=use_cols)
for chunk in chunk_iter:    
    chunk_memory = chunk.memory_usage(deep=True, index=False).sum()
    total_memory += chunk_memory    
    print('Chunk memory usage (MB) =', chunk_memory/(1024*1024))

print('-'*50)
print('Total memory usage across all chunks (MB) =', total_memory/(1024*1024))

Chunk memory usage (MB) = 5.235795974731445
Chunk memory usage (MB) = 5.184783935546875
Chunk memory usage (MB) = 5.191601753234863
Chunk memory usage (MB) = 5.184760093688965
Chunk memory usage (MB) = 5.180904388427734
Chunk memory usage (MB) = 5.210009574890137
Chunk memory usage (MB) = 5.188079833984375
Chunk memory usage (MB) = 5.166210174560547
Chunk memory usage (MB) = 5.052687644958496
Chunk memory usage (MB) = 4.296049118041992
Chunk memory usage (MB) = 2.4665212631225586
--------------------------------------------------
Total memory usage across all chunks (MB) = 53.35740375518799


# Column types (by reading entire file at once)

In [395]:
use_cols = ['investor_region', 'investor_permalink', 'investor_name',
        'investor_country_code', 'investor_city', 'investor_category_code', 
        'funding_round_type', 'company_category_code', 'funded_at', 
        'company_state_code', 'company_region', 'company_permalink', 
        'company_name', 'company_country_code', 'company_city', 
        'investor_state_code', 'funded_quarter', 'raised_amount_usd']

df = pd.read_csv('crunchbase-investments.csv', encoding='ISO-8859-1', low_memory=False, usecols=use_cols)
df.dtypes

company_permalink          object
company_name               object
company_category_code      object
company_country_code       object
company_state_code         object
company_region             object
company_city               object
investor_permalink         object
investor_name              object
investor_category_code     object
investor_country_code      object
investor_state_code        object
investor_region            object
investor_city              object
funding_round_type         object
funded_at                  object
funded_quarter             object
raised_amount_usd         float64
dtype: object

# Column types (by reading file in chunks)

In [396]:
# A column's datatype may not be consistent across all chunks.
# To get a column's datatype, we gather its datatype in each chunk, 
# then call max() to get its type with the highest precedence.
# Example: 'investor_category_code' is listed as an object type 
# in one chunk and as a float in another chunk because all the
# values in that chunk are NaN. Between object and float, the 
# object type has a higher precedence, so that's the datatype
# returned.

use_cols = ['investor_region', 'investor_permalink', 'investor_name',
        'investor_country_code', 'investor_city', 'investor_category_code', 
        'funding_round_type', 'company_category_code', 'funded_at', 
        'company_state_code', 'company_region', 'company_permalink', 
        'company_name', 'company_country_code', 'company_city', 
        'investor_state_code', 'funded_quarter', 'raised_amount_usd']

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=use_cols)
datatypes = pd.Series()
for chunk in chunk_iter:
    d_types = chunk.dtypes
    datatypes = datatypes.append(d_types)
    
colnames = datatypes.index
datatypes = datatypes.groupby(by=colnames).max().sort_values()
numeric_cols = datatypes[datatypes != 'object']
string_cols = datatypes[datatypes == 'object']

print('Numeric columns: ( n =', numeric_cols.size, ')\n')
print(numeric_cols)
print('\nString columns: ( n =', string_cols.size, ')\n')
print(string_cols)

Numeric columns: ( n = 1 )

raised_amount_usd    float64
dtype: object

String columns: ( n = 17 )

investor_region           object
investor_permalink        object
investor_name             object
investor_country_code     object
investor_city             object
investor_category_code    object
funding_round_type        object
company_category_code     object
funded_at                 object
company_state_code        object
company_region            object
company_permalink         object
company_name              object
company_country_code      object
company_city              object
investor_state_code       object
funded_quarter            object
dtype: object


# Optimize numeric columns

In [397]:
use_cols = ['investor_region', 'investor_permalink', 'investor_name',
        'investor_country_code', 'investor_city', 'investor_category_code', 
        'funding_round_type', 'company_category_code', 'funded_at', 
        'company_state_code', 'company_region', 'company_permalink', 
        'company_name', 'company_country_code', 'company_city', 
        'investor_state_code', 'funded_quarter', 'raised_amount_usd']

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=use_cols)
for chunk in chunk_iter:
    print(chunk['raised_amount_usd'].head(2)) 
    print('-'*50)

0    2000000.0
1      20000.0
Name: raised_amount_usd, dtype: float64
--------------------------------------------------
5000    25000000.0
5001    20000000.0
Name: raised_amount_usd, dtype: float64
--------------------------------------------------
10000    37400000.0
10001     6000000.0
Name: raised_amount_usd, dtype: float64
--------------------------------------------------
15000    3000000.0
15001    5000000.0
Name: raised_amount_usd, dtype: float64
--------------------------------------------------
20000    15000000.0
20001    14900000.0
Name: raised_amount_usd, dtype: float64
--------------------------------------------------
25000    29000000.0
25001     2000000.0
Name: raised_amount_usd, dtype: float64
--------------------------------------------------
30000     500000.0
30001    3000000.0
Name: raised_amount_usd, dtype: float64
--------------------------------------------------
35000    13000000.0
35001    12000000.0
Name: raised_amount_usd, dtype: float64
-------------------

From the results above, we can see that we should cast **`raised_amount_usd`** to an `int64`. The float type is overkill for this column.

Let's make that change.

In [398]:
use_cols = ['investor_region', 'investor_permalink', 'investor_name',
        'investor_country_code', 'investor_city', 'investor_category_code', 
        'funding_round_type', 'company_category_code', 'funded_at', 
        'company_state_code', 'company_region', 'company_permalink', 
        'company_name', 'company_country_code', 'company_city', 
        'investor_state_code', 'funded_quarter', 'raised_amount_usd']

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=use_cols)
for chunk in chunk_iter:
    print('dtype before optimization:', chunk['raised_amount_usd'].dtype)
    chunk['raised_amount_usd'] = chunk['raised_amount_usd'].fillna(0).astype('int')
    print('dtype after optimization:', chunk['raised_amount_usd'].dtype)
    print('-'*50)

dtype before optimization: float64
dtype after optimization: int64
--------------------------------------------------
dtype before optimization: float64
dtype after optimization: int64
--------------------------------------------------
dtype before optimization: float64
dtype after optimization: int64
--------------------------------------------------
dtype before optimization: float64
dtype after optimization: int64
--------------------------------------------------
dtype before optimization: float64
dtype after optimization: int64
--------------------------------------------------
dtype before optimization: float64
dtype after optimization: int64
--------------------------------------------------
dtype before optimization: float64
dtype after optimization: int64
--------------------------------------------------
dtype before optimization: float64
dtype after optimization: int64
--------------------------------------------------
dtype before optimization: float64
dtype after optimizat

# Optimize string columns

By looking at the string column **`funded_quarter`**, we can see that we could optimize it by extracting the number representing the quarter and then cast the column to an `int8` type.

- Example: Extract the `4` from `2012-Q4`	 

In [399]:
use_cols = ['investor_region', 'investor_permalink', 'investor_name',
        'investor_country_code', 'investor_city', 'investor_category_code', 
        'funding_round_type', 'company_category_code', 'funded_at', 
        'company_state_code', 'company_region', 'company_permalink', 
        'company_name', 'company_country_code', 'company_city', 
        'investor_state_code', 'funded_quarter', 'raised_amount_usd']

total_memory = 0
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=use_cols)
for chunk in chunk_iter:
    # Optimize numeric column
    chunk['raised_amount_usd'] = chunk['raised_amount_usd'].fillna(0).astype('int')

    # Optimize 'funded_quarter' column
    chunk['funded_quarter'] = chunk['funded_quarter'].str.extract(r'Q(\d)', expand=False)
    chunk['funded_quarter'] = chunk['funded_quarter'].fillna(-1)
    chunk['funded_quarter'] = chunk['funded_quarter'].astype('int8')

    chunk_memory = chunk.memory_usage(deep=True).sum()
    total_memory += chunk_memory
    print('Chunk memory usage (MB) =', chunk_memory/(1024*1024))

print('-'*50)
print('Total memory usage (MB) =', total_memory/(1024*1024))

Chunk memory usage (MB) = 4.935464859008789
Chunk memory usage (MB) = 4.884456634521484
Chunk memory usage (MB) = 4.891274452209473
Chunk memory usage (MB) = 4.884432792663574
Chunk memory usage (MB) = 4.880577087402344
Chunk memory usage (MB) = 4.909682273864746
Chunk memory usage (MB) = 4.887844085693359
Chunk memory usage (MB) = 4.865882873535156
Chunk memory usage (MB) = 4.7523603439331055
Chunk memory usage (MB) = 3.9957218170166016
Chunk memory usage (MB) = 2.2941675186157227
--------------------------------------------------
Total memory usage (MB) = 50.181864738464355


From the results above, we can see that we were able to lower the total memory usage from 53.35 MB to 49.98 MB by optimizing some of the columns.

# Load data into SQLite

In [400]:
import sqlite3

In [401]:
# Create and connect to a new database
conn = sqlite3.connect('crunchbase.db')
cur = conn.cursor()
cur.execute('''DROP TABLE IF EXISTS investments''')

use_cols = ['investor_region', 'investor_permalink', 'investor_name',
        'investor_country_code', 'investor_city', 'investor_category_code', 
        'funding_round_type', 'company_category_code', 'funded_at', 
        'company_state_code', 'company_region', 'company_permalink', 
        'company_name', 'company_country_code', 'company_city', 
        'investor_state_code', 'funded_quarter', 'raised_amount_usd']

total_memory = 0
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', usecols=use_cols)
for chunk in chunk_iter:
    # Optimize numeric column
    chunk['raised_amount_usd'] = chunk['raised_amount_usd'].fillna(0).astype('int')
    
    # Optimize 'funded_quarter' column
    chunk['funded_quarter'] = chunk['funded_quarter'].str.extract(r'Q(\d)', expand=False)
    chunk['funded_quarter'] = chunk['funded_quarter'].fillna(-1)
    chunk['funded_quarter'] = chunk['funded_quarter'].astype('int8')

    # Write chunk out to database table
    chunk.to_sql('investments', conn, if_exists='append', index=False)

# Data load verification

Let's now confirm that the data is in the database table.

In [402]:
verify_df = pd.read_sql('PRAGMA table_info(investments)', conn)
verify_df

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,company_permalink,TEXT,0,,0
1,1,company_name,TEXT,0,,0
2,2,company_category_code,TEXT,0,,0
3,3,company_country_code,TEXT,0,,0
4,4,company_state_code,TEXT,0,,0
5,5,company_region,TEXT,0,,0
6,6,company_city,TEXT,0,,0
7,7,investor_permalink,TEXT,0,,0
8,8,investor_name,TEXT,0,,0
9,9,investor_category_code,TEXT,0,,0


In [403]:
verify_df = pd.read_sql('SELECT COUNT(*) FROM investments', conn)
verify_df

Unnamed: 0,COUNT(*)
0,52870


In [404]:
verify_df = pd.read_sql('SELECT * FROM investments LIMIT 3', conn)
verify_df

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_quarter,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,4,2000000
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,1,20000
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,1,20000


In [405]:
# Testing
q = """
    select company_name, investor_name, raised_amount_usd 
    from investments 
    where company_name = '0xdata'
"""
verify_df = pd.read_sql(q, conn)
verify_df

Unnamed: 0,company_name,investor_name,raised_amount_usd
0,0xdata,Nexus Venture Partners,1700000


# Data analysis

In [406]:
query = '''
    SELECT 
        company_name, 
        company_category_code,
        investor_name,
        funding_round_type,
        raised_amount_usd
    FROM 
        investments
'''
results_df = pd.read_sql(query, conn)

In [407]:
grouped = results_df.groupby(by='company_name').raised_amount_usd.sum()
ncompanies = results_df.nunique().company_name 
ordered = grouped.sort_values()

##  What proportion of the total amount of funds did the top 10% raise?

In [408]:
top10 = ordered.tail(int(round(ncompanies*.10)))
print(top10.sum(), '/', grouped.sum())

457631322689 / 681732187973


## What proportion of the total amount of funds did the top 1% raise?

In [409]:
top1 = ordered.tail(int(round(ncompanies*.01)))
print(top1.sum(), '/', grouped.sum())

179373534008 / 681732187973


## What proportion of the total amount of funds did the bottom 10% raise?

In [410]:
bottom10 = ordered.head(int(round(ncompanies*.10)))
print(bottom10.sum(), '/', grouped.sum())

0 / 681732187973


## What proportion of the total amount of funds did the bottom 1% raise?

In [411]:
bottom1 = ordered.head(int(round(ncompanies*.01)))
print(bottom1.sum(), '/', grouped.sum())

0 / 681732187973


## Which category of company attracted the most investments?

In [412]:
results_df.groupby(by='company_category_code').raised_amount_usd.sum().sort_values().tail(1)

company_category_code
biotech    110396423062
Name: raised_amount_usd, dtype: int64

## Which investor contributed the most money (across all startups)?

In [413]:
results_df.groupby(by='investor_name').raised_amount_usd.sum().sort_values().tail(1)

investor_name
Kleiner Perkins Caufield & Byers    11217826376
Name: raised_amount_usd, dtype: int64

## Which investors contributed the most money per startup?

In [414]:
sums = results_df.groupby(by=['company_name', 'investor_name']).raised_amount_usd.sum()
ranked = sums.groupby(by='company_name').rank(method='dense', ascending=False)
maxes = ranked[ranked == 1]
maxes
#maxes.loc[['Clearwire', '#waywire', '0xdata']] # Get a sample subset

company_name       investor_name                                    
#waywire           First Round Capital                                  1.0
                   Innovation Endeavors                                 1.0
                   Jeff Weiner                                          1.0
                   Oprah Winfrey                                        1.0
                   Troy Carter                                          1.0
0xdata             Nexus Venture Partners                               1.0
1-800-DENTIST      Bain Capital Ventures                                1.0
                   Cue Ball                                             1.0
1000memories       Caterina Fake                                        1.0
                   Chris Sacca                                          1.0
                   FLOODGATE                                            1.0
                   Felicis Ventures                                     1.0
                   

## Which funding round was the most popular? Which was the least popular?

In [415]:
rounds = results_df.funding_round_type.value_counts()
rounds

series-a          13938
series-c+         10870
angel              8989
venture            8917
series-b           8794
other               964
private-equity      357
post-ipo             33
crowdfunding          5
Name: funding_round_type, dtype: int64

In [416]:
# Most popular
rounds.idxmax()

'series-a'

In [417]:
# Least popular
rounds.idxmin()

'crowdfunding'