In [1]:
import pandas as pd

# Data exploration

## Missing value counts

In [2]:
# Read the CSV as an iterator of 5000-row chunks and report, for every chunk,
# how many missing values each column contains.
chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)
separator = '=' * 50

for chunk in chunk_iter:
    # isna() flags missing cells; summing the flags yields one count per column.
    missing_value_counts = chunk.isna().sum()
    print(missing_value_counts)
    print(separator)

company_permalink            0
company_name                 0
company_category_code       52
company_country_code         0
company_state_code          53
company_region               0
company_city                64
investor_permalink           0
investor_name                0
investor_category_code    2557
investor_country_code      778
investor_state_code       1371
investor_region              0
investor_city              900
funding_round_type           0
funded_at                    0
funded_month                 0
funded_quarter               0
funded_year                  0
raised_amount_usd          653
dtype: int64
company_permalink            0
company_name                 0
company_category_code       51
company_country_code         0
company_state_code          43
company_region               0
company_city                45
investor_permalink           0
investor_name                0
investor_category_code    5000
investor_country_code      261
investor_state_code       

## Memory footprint

In [33]:
# Measure the deep memory usage of every column chunk by chunk, and keep a
# running total (in bytes) across the whole file.
chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)
bytes_per_mb = 1024 * 1024
separator = '=' * 50

total_memory = 0
for chunk in chunk_iter:
    # deep=True inspects the actual object (string) values; the index is excluded.
    memory_per_col = chunk.memory_usage(index=False, deep=True)
    chunk_memory = memory_per_col.sum()
    total_memory = total_memory + chunk_memory

    print(memory_per_col / bytes_per_mb)
    print('\nChunk memory usage (MB) =', chunk_memory / bytes_per_mb)
    print(separator)

print('Total memory usage across all chunks (MB) =', total_memory / bytes_per_mb)

company_permalink         0.365729
company_name              0.323832
company_category_code     0.308253
company_country_code      0.286102
company_state_code        0.279969
company_region            0.308153
company_city              0.315244
investor_permalink        0.406121
investor_name             0.340663
investor_category_code    0.228371
investor_country_code     0.265327
investor_state_code       0.246032
investor_region           0.306885
investor_city             0.286545
funding_round_type        0.305766
funded_at                 0.319481
funded_month              0.305176
funded_quarter            0.305176
funded_year               0.038147
raised_amount_usd         0.038147
dtype: float64

Chunk memory usage (MB) = 5.579118728637695
company_permalink         0.367855
company_name              0.326042
company_category_code     0.309870
company_country_code      0.286102
company_state_code        0.280227
company_region            0.307571
company_city              0.31

## Column types

In [34]:
# Load the whole file in one pass to inspect each column's dtype, non-null
# count and deep memory usage.
df = pd.read_csv('crunchbase-investments.csv', encoding='ISO-8859-1', low_memory=False)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info(verbose=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52870 entries, 0 to 52869
Data columns (total 20 columns):
company_permalink         52869 non-null object
company_name              52869 non-null object
company_category_code     52227 non-null object
company_country_code      52869 non-null object
company_state_code        52378 non-null object
company_region            52869 non-null object
company_city              52337 non-null object
investor_permalink        52868 non-null object
investor_name             52868 non-null object
investor_category_code    2443 non-null object
investor_country_code     40869 non-null object
investor_state_code       36061 non-null object
investor_region           52868 non-null object
investor_city             40390 non-null object
funding_round_type        52867 non-null object
funded_at                 52867 non-null object
funded_month              52867 non-null object
funded_quarter            52867 non-null object
funded_year               52

From the results above, we can see that there are two numeric columns:
1. `funded_year` 
2. `raised_amount_usd` 

Let's take a look at the numeric columns.

## Numeric columns

In [41]:
# Preview the two numeric columns in every chunk: the first two rows and the
# dtype pandas inferred for each column.
chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)
separator = '=' * 50

numeric_cols = ['funded_year', 'raised_amount_usd']
for chunk in chunk_iter:
    numeric_chunk = chunk[numeric_cols]
    print(numeric_chunk.head(2))
    print(numeric_chunk.dtypes)
    print(separator)

   funded_year  raised_amount_usd
0         2012          2000000.0
1         2012            20000.0
funded_year            int64
raised_amount_usd    float64
dtype: object
      funded_year  raised_amount_usd
5000         2008         25000000.0
5001         2008         20000000.0
funded_year            int64
raised_amount_usd    float64
dtype: object
       funded_year  raised_amount_usd
10000         2009         37400000.0
10001         2009          6000000.0
funded_year            int64
raised_amount_usd    float64
dtype: object
       funded_year  raised_amount_usd
15000         2011          3000000.0
15001         2011          5000000.0
funded_year            int64
raised_amount_usd    float64
dtype: object
       funded_year  raised_amount_usd
20000         2010         15000000.0
20001         2010         14900000.0
funded_year            int64
raised_amount_usd    float64
dtype: object
       funded_year  raised_amount_usd
25000         2009         29000000.0
25001    

## Optimize numeric columns

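From the results above, we can see that we should downcast both `funded_year` and `raised_amount_usd` to a more space-efficient subtype - specifically, an integer subtype. Even though `raised_amount_usd` is currently a float column, the sampled values are whole numbers, so the underlying data is best represented as integers. Because `raised_amount_usd` contains missing values, which cannot be stored in a plain integer column, we first replace them with the sentinel value `-1`.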

Let's make those changes.

In [63]:
# Downcast the numeric columns of every chunk to the smallest integer subtype
# that can hold that chunk's values, then show the resulting dtypes.
chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)
separator = '=' * 50

numeric_cols = ['funded_year', 'raised_amount_usd']
for chunk in chunk_iter:
    # Missing values are replaced by the sentinel -1 before the integer downcast.
    filled_numeric = chunk[numeric_cols].fillna(-1)
    # NOTE: the subtype is chosen per chunk, so the same column can end up with
    # different dtypes in different chunks (e.g. int32 in one, int64 in another).
    chunk[numeric_cols] = filled_numeric.apply(pd.to_numeric, downcast='integer')

    print(chunk[numeric_cols].dtypes)
    print(separator)

funded_year          int16
raised_amount_usd    int64
dtype: object
funded_year          int16
raised_amount_usd    int32
dtype: object
funded_year          int16
raised_amount_usd    int32
dtype: object
funded_year          int16
raised_amount_usd    int32
dtype: object
funded_year          int16
raised_amount_usd    int32
dtype: object
funded_year          int16
raised_amount_usd    int64
dtype: object
funded_year          int16
raised_amount_usd    int32
dtype: object
funded_year          int16
raised_amount_usd    int32
dtype: object
funded_year          int16
raised_amount_usd    int32
dtype: object
funded_year          int16
raised_amount_usd    int32
dtype: object
funded_year          int16
raised_amount_usd    int32
dtype: object


## String columns

In [76]:
# For every chunk, list the names of the object-dtype (string) columns and
# print the three date-like string columns.
chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)
date_like_cols = ['funded_at', 'funded_month', 'funded_quarter']
separator = '=' * 50

for chunk in chunk_iter:
    string_types_df = chunk.select_dtypes(include='object')
    string_types = string_types_df.columns.values

    print(string_types)
    print(chunk[date_like_cols])
    print(separator)

['company_permalink' 'company_name' 'company_category_code'
 'company_country_code' 'company_state_code' 'company_region'
 'company_city' 'investor_permalink' 'investor_name'
 'investor_category_code' 'investor_country_code' 'investor_state_code'
 'investor_region' 'investor_city' 'funding_round_type' 'funded_at'
 'funded_month' 'funded_quarter']
       funded_at funded_month funded_quarter
0     2012-10-30      2012-10        2012-Q4
1     2012-01-23      2012-01        2012-Q1
2     2012-01-01      2012-01        2012-Q1
3     2012-02-15      2012-02        2012-Q1
4     2011-09-08      2011-09        2011-Q3
5     2012-02-01      2012-02        2012-Q1
6     2012-06-01      2012-06        2012-Q2
7     2012-08-07      2012-08        2012-Q3
8     2010-04-01      2010-04        2010-Q2
9     2011-09-01      2011-09        2011-Q3
10    2012-10-01      2012-10        2012-Q4
11    2010-10-28      2010-10        2010-Q4
12    2012-01-10      2012-01        2012-Q1
13    2010-05-01     