# Analyzing Startup Fundraising Deals

In this Dataquest project, I analyze startup fundraising deals using data from Crunchbase, a website that gathers such information. To practice techniques for working with large datasets, I will assume that my computer has limited memory. To begin, I assume that only 10 MB of memory is available.

In [89]:
import pandas as pd
import chardet
import sqlite3
pd.options.display.max_columns = 99


## Exploring the Data

In [67]:
# Read only the first five rows so the full file is never loaded just to
# inspect the column layout.
first_five = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    nrows=5,
)
first_five

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,2012-10,2012-Q4,2012,2000000
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,2012-01,2012-Q1,2012,20000
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,2012-01,2012-Q1,2012,20000
3,/company/zoopshop,ZoopShop,software,USA,OH,Columbus,columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15,2012-02,2012-Q1,2012,20000
4,/company/efuneral,eFuneral,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08,2011-09,2011-Q3,2011,20000


### Finding the Memory Footprint of Each Chunk

In [68]:
# Stream the CSV in 5,000-row chunks and report each chunk's memory footprint
# in MB (bytes / 1024**2), accumulating the overall total in `tot_mem`.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
tot_mem = 0
for chunk in chunk_iter:
    # deep=True inspects the contents of object columns, which is the costly
    # part of the measurement, so compute it once per chunk and reuse it.
    chunk_mb = chunk.memory_usage(deep=True).sum() / 1024**2
    tot_mem += chunk_mb
    print(chunk_mb)

5.579240798950195
5.528232574462891
5.535050392150879
5.528193473815918
5.5243377685546875
5.553427696228027
5.531436920166016
5.5096588134765625
5.396121025085449
4.639497756958008
2.6637144088745117


In [72]:
tot_mem  # Total memory footprint of all chunks, in MB (sum of the per-chunk values)

56.988911628723145

### Finding the Missing Values for Each Column by Chunk 

In [77]:
# Count missing values per column for every 5,000-row chunk, one row per chunk,
# then add a 'Total Missing' row holding the column-wise sum over all chunks.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')
null_rows = []
for chunk in chunk_iter:
    # One single-row frame per chunk: columns are the CSV columns, values the null counts.
    null_rows.append(pd.DataFrame(chunk.isnull().sum()).transpose())
# Concatenate once at the end instead of growing the frame inside the loop,
# which would re-copy every previously collected row on each iteration.
# ignore_index=True numbers the rows 0..n-1, one label per chunk.
null_values_by_chunk = pd.concat(null_rows, ignore_index=True) if null_rows else pd.DataFrame()
null_values_by_chunk.loc['Total Missing'] = null_values_by_chunk.sum()

In [78]:
# Show the missing-value counts: one row per chunk, one column per CSV column.
null_values_by_chunk

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,0,0,52,0,53,0,64,0,0,2557,778,1371,0,900,0,0,0,0,0,653
1,0,0,51,0,43,0,45,0,0,5000,261,714,0,313,0,0,0,0,0,239
2,0,0,82,0,56,0,44,0,0,5000,271,808,0,314,0,0,0,0,0,221
3,0,0,57,0,36,0,44,0,0,5000,239,714,0,274,0,0,0,0,0,299
4,0,0,69,0,40,0,41,0,0,5000,352,1116,0,434,0,0,0,0,0,243
5,0,0,61,0,39,0,33,0,0,5000,218,732,0,238,0,0,0,0,0,206
6,1,1,88,1,35,1,42,2,2,5000,313,922,2,339,3,3,3,3,3,287
7,0,0,83,0,65,0,55,0,0,5000,267,775,0,334,0,0,0,0,0,368
8,0,0,75,0,50,0,50,0,0,5000,1432,1787,0,1464,0,0,0,0,0,324
9,0,0,15,0,46,0,70,0,0,5000,5000,5000,0,5000,0,0,0,0,0,469


## Selecting Column Datatypes

In [83]:
# Record the dtype pandas infers for every column in each 5,000-row chunk
# ('funded_at' is parsed as a date), so columns whose inferred type changes
# from chunk to chunk stand out.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', parse_dates=['funded_at'])
dtype_rows = []
for chunk in chunk_iter:
    # One single-row frame per chunk: columns are the CSV columns, values the dtypes.
    dtype_rows.append(pd.DataFrame(chunk.dtypes).transpose())
# Concatenate once at the end instead of growing the frame inside the loop,
# which would re-copy every previously collected row on each iteration.
# Row labels are left as produced (each row is labelled 0).
col_type_by_chunk = pd.concat(dtype_rows) if dtype_rows else pd.DataFrame()
col_type_by_chunk

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,datetime64[ns],object,object,int64,float64
0,object,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,datetime64[ns],object,object,int64,float64
0,object,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,datetime64[ns],object,object,int64,float64
0,object,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,datetime64[ns],object,object,int64,float64
0,object,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,datetime64[ns],object,object,int64,float64
0,object,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,datetime64[ns],object,object,int64,float64
0,object,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,datetime64[ns],object,object,float64,float64
0,object,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,datetime64[ns],object,object,int64,float64
0,object,object,object,object,object,object,object,object,object,float64,object,object,object,object,object,datetime64[ns],object,object,int64,float64
0,object,object,object,object,object,object,object,object,object,float64,float64,float64,object,float64,object,datetime64[ns],object,object,int64,float64


In [84]:
# `chunk` is the loop variable left over from the previous cell, i.e. the last
# chunk that was read; show its first rows.
chunk.head()

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
50000,/company/nuorder,NuORDER,fashion,USA,CA,Los Angeles,West Hollywood,/person/mortimer-singer,Mortimer Singer,,,,unknown,,series-a,2012-10-01,2012-10,2012-Q4,2012,3060000.0
50001,/company/chacha,ChaCha,advertising,USA,IN,Indianapolis,Carmel,/person/morton-meyerson,Morton Meyerson,,,,unknown,,series-b,2007-10-01,2007-10,2007-Q4,2007,12000000.0
50002,/company/binfire,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,/person/moshe-ariel,Moshe Ariel,,,,unknown,,angel,2008-04-18,2008-04,2008-Q2,2008,500000.0
50003,/company/binfire,Binfire,software,USA,FL,Bocat Raton,Bocat Raton,/person/moshe-ariel,Moshe Ariel,,,,unknown,,angel,2010-01-01,2010-01,2010-Q1,2010,750000.0
50004,/company/unified-color,Unified Color,software,USA,CA,SF Bay,South San Frnacisco,/person/mr-andrew-oung,Mr. Andrew Oung,,,,unknown,,angel,2010-01-01,2010-01,2010-Q1,2010,


Above, we see that most columns are object columns. Additionally, there are several date columns that could be converted to datetime. 

## Dropping Unnecessary Columns

Above, we can see that almost all values in the 'investor_category_code' column are missing, so we will drop that column as it is not helpful for the analysis. Additionally, we can drop all the date columns except for 'funded_at', as all the other date columns are just subsets of the funded date. 

In [88]:
# Columns to discard: 'investor_category_code' plus the date columns other
# than 'funded_at'.
redundant_cols = ['investor_category_code', 'funded_month', 'funded_quarter', 'funded_year']

chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    chunksize=5000,
    encoding='ISO-8859-1',
    parse_dates=['funded_at'],
)
# Each chunk is trimmed in place; the trimmed chunks are not stored anywhere,
# so after the loop only the last one remains (as `chunk`).
for chunk in chunk_iter:
    chunk.drop(columns=redundant_cols, inplace=True)

## Loading the Data into SQLite

In [90]:
# Open the SQLite database file 'crunchbase.db' (sqlite3 creates it if it does not exist).
conn = sqlite3.connect('crunchbase.db')

In [92]:
# Load the CSV into the SQLite table 'investment_data' one 5,000-row chunk at a
# time, dropping the unneeded columns from each chunk before it is written.
chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1', parse_dates=['funded_at'])
for chunk_number, chunk in enumerate(chunk_iter):
    chunk.drop(['investor_category_code', 'funded_month', 'funded_quarter', 'funded_year'], axis=1, inplace=True)
    # The first chunk replaces any existing table and later chunks append to it.
    # Appending every chunk would duplicate all rows whenever this cell is
    # re-run or 'crunchbase.db' already holds the table, inflating every
    # SUM/COUNT computed from it.
    write_mode = 'replace' if chunk_number == 0 else 'append'
    chunk.to_sql('investment_data', conn, if_exists=write_mode, index=False)

In [96]:
# Ask SQLite for the schema of 'investment_data': one row per column with its name and declared type.
pd.read_sql('PRAGMA table_info(investment_data);', conn)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,company_permalink,TEXT,0,,0
1,1,company_name,TEXT,0,,0
2,2,company_category_code,TEXT,0,,0
3,3,company_country_code,TEXT,0,,0
4,4,company_state_code,TEXT,0,,0
5,5,company_region,TEXT,0,,0
6,6,company_city,TEXT,0,,0
7,7,investor_permalink,TEXT,0,,0
8,8,investor_name,TEXT,0,,0
9,9,investor_country_code,TEXT,0,,0


Above, we see that the data was loaded into SQLite without the dropped columns and with the appropriate datatypes.

## Analyzing Data in SQLite

### Top Company Categories

In [103]:
# Sum raised_amount_usd per company category and keep the ten largest totals.
# NOTE(review): the table appears to hold one row per investor per round; if
# raised_amount_usd is the size of the whole round, these sums count a round
# once per participating investor. Confirm against the dataset's schema.
pd.read_sql("""
SELECT company_category_code, SUM(raised_amount_usd) amount_raised
FROM investment_data
GROUP BY company_category_code
ORDER BY amount_raised DESC
LIMIT 10;
""", conn)


Unnamed: 0,company_category_code,amount_raised
0,biotech,110396400000.0
1,software,73084520000.0
2,mobile,64777380000.0
3,cleantech,52705230000.0
4,enterprise,45860930000.0
5,web,40143260000.0
6,medical,25367110000.0
7,advertising,25076660000.0
8,ecommerce,22567220000.0
9,network_hosting,22419680000.0


### Top Investors

In [149]:
# Per investor: summed raised_amount_usd and number of rows, top ten by summed amount.
# COUNT(*) counts every row for the investor, including rows with a NULL amount.
# NOTE(review): if raised_amount_usd is the size of the whole round rather than
# this investor's share, amount_raised credits each investor with the full
# round. Confirm before reading it as "amount invested".
pd.read_sql("""
SELECT investor_name, SUM(raised_amount_usd) amount_raised, COUNT(*) number_of_investments
FROM investment_data
GROUP BY investor_name
ORDER BY amount_raised DESC
LIMIT 10;
""", conn)

Unnamed: 0,investor_name,amount_raised,number_of_investments
0,Kleiner Perkins Caufield & Byers,11217830000.0,393
1,New Enterprise Associates,9692542000.0,445
2,Accel Partners,6472126000.0,322
3,Goldman Sachs,6375459000.0,123
4,Sequoia Capital,6039402000.0,369
5,Intel,5969200000.0,18
6,Google,5808800000.0,22
7,Time Warner,5730000000.0,12
8,Comcast,5669000000.0,9
9,Greylock Partners,4960983000.0,251


In [152]:
# Per investor: summed raised_amount_usd and number of rows, top ten by number of rows.
# COUNT(*) counts every row for the investor, including rows with a NULL amount.
pd.read_sql("""
SELECT investor_name, SUM(raised_amount_usd) amount_raised, COUNT(*) number_of_investments
FROM investment_data
GROUP BY investor_name
ORDER BY number_of_investments DESC
LIMIT 10;
""", conn)

Unnamed: 0,investor_name,amount_raised,number_of_investments
0,New Enterprise Associates,9692542000.0,445
1,SV Angel,1786013000.0,436
2,Kleiner Perkins Caufield & Byers,11217830000.0,393
3,Sequoia Capital,6039402000.0,369
4,Draper Fisher Jurvetson (DFJ),4501461000.0,360
5,Intel Capital,4695617000.0,331
6,First Round Capital,1915207000.0,326
7,Accel Partners,6472126000.0,322
8,Techstars,70003550.0,267
9,500 Startups,436995000.0,254


### Investment by Funding Round Type

In [147]:
# Sum raised_amount_usd per funding round type, largest first; no LIMIT, so every type is listed.
# Rows whose funding_round_type is NULL are grouped together and show up as their own row.
pd.read_sql("""
SELECT funding_round_type, SUM(raised_amount_usd) amount_raised
FROM investment_data
GROUP BY funding_round_type
ORDER BY amount_raised DESC
;""", conn)

Unnamed: 0,funding_round_type,amount_raised
0,series-c+,265753500000.0
1,venture,130556500000.0
2,series-b,128326800000.0
3,series-a,86542150000.0
4,post-ipo,30917600000.0
5,other,18507260000.0
6,private-equity,16159880000.0
7,angel,4962075000.0
8,crowdfunding,6491500.0
9,,


### Top Companies Analysis

In [160]:
# Per company: summed raised_amount_usd, number of distinct investor names, and
# the ratio of the two; top ten by summed amount.
# NOTE(review): if raised_amount_usd is the size of the whole round and a round
# has one row per investor, amount_raised counts each round once per investor
# and overstates what the company raised. Confirm against the dataset's schema.
pd.read_sql("""
SELECT 
    company_name, 
    SUM(raised_amount_usd) amount_raised, 
    COUNT(DISTINCT(investor_name)) number_of_investors, 
    CAST(SUM(raised_amount_usd) AS FLOAT) / COUNT(DISTINCT(investor_name)) funding_per_investor
FROM investment_data
GROUP BY company_name
ORDER BY amount_raised DESC
LIMIT 10
;""", conn)

Unnamed: 0,company_name,amount_raised,number_of_investors,funding_per_investor
0,Clearwire,29680000000.0,7,4240000000.0
1,Groupon,10185400000.0,18,565855600.0
2,Nanosolar,4505000000.0,23,195869600.0
3,Facebook,4154100000.0,16,259631200.0
4,SurveyMonkey,3250000000.0,11,295454500.0
5,Zynga,2886013000.0,22,131182400.0
6,Fisker Automotive,2788000000.0,10,278800000.0
7,Dropbox,2764400000.0,13,212646200.0
8,LivingSocial,2685000000.0,8,335625000.0
9,sigmacare,2600000000.0,1,2600000000.0


In [162]:
# Per company: summed raised_amount_usd, number of distinct investor names, and
# the ratio of the two; top ten by that ratio (funding_per_investor).
# NOTE(review): if raised_amount_usd is the size of the whole round and a round
# has one row per investor, amount_raised (and so the ratio) counts each round
# once per investor. Confirm against the dataset's schema.
pd.read_sql("""
SELECT 
    company_name, 
    SUM(raised_amount_usd) amount_raised, 
    COUNT(DISTINCT(investor_name)) number_of_investors, 
    CAST(SUM(raised_amount_usd) AS FLOAT) / COUNT(DISTINCT(investor_name)) funding_per_investor
FROM investment_data
GROUP BY company_name
ORDER BY funding_per_investor DESC
LIMIT 10
;""", conn)

Unnamed: 0,company_name,amount_raised,number_of_investors,funding_per_investor
0,Clearwire,29680000000.0,7,4240000000.0
1,sigmacare,2600000000.0,1,2600000000.0
2,Wave Broadband,2100000000.0,2,1050000000.0
3,AOL,1000000000.0,1,1000000000.0
4,University of Maryland,750000000.0,1,750000000.0
5,Groupon,10185400000.0,18,565855600.0
6,"Vivint, Inc.",565000000.0,1,565000000.0
7,LivingSocial,2685000000.0,8,335625000.0
8,Kosmos Biotherapeutics,1595000000.0,5,319000000.0
9,SunGard,316000000.0,1,316000000.0


### Top States Analysis

In [167]:
# Sum raised_amount_usd per company state code and keep the ten largest totals.
# Rows with a NULL company_state_code are grouped together under NULL.
pd.read_sql("""
SELECT company_state_code, SUM(raised_amount_usd) amount_raised
FROM investment_data
GROUP BY company_state_code
ORDER BY amount_raised DESC
LIMIT 10
;""", conn)

Unnamed: 0,company_state_code,amount_raised
0,CA,352920700000.0
1,MA,63025920000.0
2,WA,50125760000.0
3,NY,46002920000.0
4,TX,21448480000.0
5,IL,19071990000.0
6,CO,13227100000.0
7,NJ,11868860000.0
8,MD,11229790000.0
9,PA,10328840000.0
