In [1]:
import pandas as pd

# Data snapshot

In [112]:
# Peek at the data: parse only the first five rows so the preview stays cheap.
# The file is not UTF-8, hence the explicit ISO-8859-1 encoding.
df = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    nrows=5,
)
df

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,2012-10,2012-Q4,2012,2000000
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,2012-01,2012-Q1,2012,20000
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,2012-01,2012-Q1,2012,20000
3,/company/zoopshop,ZoopShop,software,USA,OH,Columbus,columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15,2012-02,2012-Q1,2012,20000
4,/company/efuneral,eFuneral,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08,2011-09,2011-Q3,2011,20000


# Missing value counts

In [2]:
# Stream the file in 5000-row chunks and report, for each chunk, how many
# values are missing in every column.
chunk_reader = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)

for chunk in chunk_reader:
    nulls_per_column = chunk.isnull().sum()
    print(nulls_per_column)

    # Visual separator between consecutive chunk reports.
    print('='*50)

company_permalink            0
company_name                 0
company_category_code       52
company_country_code         0
company_state_code          53
company_region               0
company_city                64
investor_permalink           0
investor_name                0
investor_category_code    2557
investor_country_code      778
investor_state_code       1371
investor_region              0
investor_city              900
funding_round_type           0
funded_at                    0
funded_month                 0
funded_quarter               0
funded_year                  0
raised_amount_usd          653
dtype: int64
company_permalink            0
company_name                 0
company_category_code       51
company_country_code         0
company_state_code          43
company_region               0
company_city                45
investor_permalink           0
investor_name                0
investor_category_code    5000
investor_country_code      261
investor_state_code       

# Memory footprint

In [3]:
# Measure the deep (string contents included) memory footprint of every
# column, chunk by chunk, and keep a running total for the whole file.
# The index is excluded from the per-column figures.
total_memory = 0

chunk_reader = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)

for chunk in chunk_reader:
    column_bytes = chunk.memory_usage(index=False, deep=True)
    chunk_bytes = column_bytes.sum()
    total_memory += chunk_bytes

    # Report in megabytes (bytes / 1024**2).
    print(column_bytes/(1024*1024))
    print('\nChunk memory usage (MB) =', chunk_bytes/(1024*1024))
    print('='*50)

print('Total memory usage across all chunks (MB) =', total_memory/(1024*1024))

company_permalink         0.365729
company_name              0.323832
company_category_code     0.308253
company_country_code      0.286102
company_state_code        0.279969
company_region            0.308153
company_city              0.315244
investor_permalink        0.406121
investor_name             0.340663
investor_category_code    0.228371
investor_country_code     0.265327
investor_state_code       0.246032
investor_region           0.306885
investor_city             0.286545
funding_round_type        0.305766
funded_at                 0.319481
funded_month              0.305176
funded_quarter            0.305176
funded_year               0.038147
raised_amount_usd         0.038147
dtype: float64

Chunk memory usage (MB) = 5.579118728637695
company_permalink         0.367855
company_name              0.326042
company_category_code     0.309870
company_country_code      0.286102
company_state_code        0.280227
company_region            0.307571
company_city              0.31

# Column types (by reading entire file at once)

In [4]:
# Parse the entire file in one pass; low_memory=False makes pandas infer each
# column's dtype from all rows at once instead of from internal sub-chunks.
df = pd.read_csv(
    'crunchbase-investments.csv',
    low_memory=False,
    encoding='ISO-8859-1',
)
df.dtypes

company_permalink          object
company_name               object
company_category_code      object
company_country_code       object
company_state_code         object
company_region             object
company_city               object
investor_permalink         object
investor_name              object
investor_category_code     object
investor_country_code      object
investor_state_code        object
investor_region            object
investor_city              object
funding_round_type         object
funded_at                  object
funded_month               object
funded_quarter             object
funded_year               float64
raised_amount_usd         float64
dtype: object

# Column types (by reading file in chunks)

In [39]:
# A column's datatype may not be consistent across all chunks.
# To get a column's datatype, we gather its datatype in each chunk,
# then call max() to get its type with the highest precedence.
# Example: 'investor_category_code' is listed as an object type
# in one chunk and as a float in another chunk because all the
# values in that chunk are NaN. Between object and float, the
# object type has a higher precedence (NumPy dtypes compare by
# safe-castability: float64 can be cast to object, not the reverse),
# so that's the datatype returned.

chunk_iter = pd.read_csv('crunchbase-investments.csv', chunksize=5000, encoding='ISO-8859-1')

# Series.append() was removed in pandas 2.0, and growing a Series inside a
# loop copies it on every iteration. Collect the per-chunk dtype Series and
# concatenate them once; the result has the same entries in the same order.
datatypes = pd.concat([chunk.dtypes for chunk in chunk_iter])

# The concatenated index repeats every column name once per chunk; group on it
# to reduce each column to a single dtype.
colnames = datatypes.index
datatypes = datatypes.groupby(by=colnames).max().sort_values()
numeric_cols = datatypes[datatypes != 'object']
string_cols = datatypes[datatypes == 'object']
print('Numeric columns: ( n =', numeric_cols.size, ')\n')
print(numeric_cols)
print('\nString columns: ( n =', string_cols.size, ')\n')
print(string_cols)

Numeric columns: ( n = 2 )

raised_amount_usd    float64
funded_year          float64
dtype: object

String columns: ( n = 18 )

investor_region           object
investor_permalink        object
investor_name             object
investor_country_code     object
investor_city             object
investor_category_code    object
funding_round_type        object
company_category_code     object
funded_month              object
funded_at                 object
company_state_code        object
company_region            object
company_permalink         object
company_name              object
company_country_code      object
company_city              object
investor_state_code       object
funded_quarter            object
dtype: object


# Optimize numeric columns

In [6]:
# For every chunk, show a two-row sample and the inferred dtypes of the two
# numeric columns, and accumulate the deep memory usage of the whole chunk.
numeric_cols = ['funded_year', 'raised_amount_usd']
total_memory = 0

chunk_reader = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)

for chunk in chunk_reader:
    numeric_part = chunk[numeric_cols]

    print(numeric_part.head(2))
    print()

    print(numeric_part.dtypes)

    total_memory += chunk.memory_usage(deep=True).sum()

    print('='*50)

print('Total memory usage (MB) =', total_memory/(1024*1024))

   funded_year  raised_amount_usd
0         2012          2000000.0
1         2012            20000.0

funded_year            int64
raised_amount_usd    float64
dtype: object
      funded_year  raised_amount_usd
5000         2008         25000000.0
5001         2008         20000000.0

funded_year            int64
raised_amount_usd    float64
dtype: object
       funded_year  raised_amount_usd
10000         2009         37400000.0
10001         2009          6000000.0

funded_year            int64
raised_amount_usd    float64
dtype: object
       funded_year  raised_amount_usd
15000         2011          3000000.0
15001         2011          5000000.0

funded_year            int64
raised_amount_usd    float64
dtype: object
       funded_year  raised_amount_usd
20000         2010         15000000.0
20001         2010         14900000.0

funded_year            int64
raised_amount_usd    float64
dtype: object
       funded_year  raised_amount_usd
25000         2009         29000000.0
2500

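From the results above, we can see that we should downcast both **`funded_year`** and **`raised_amount_usd`** to an integer type, since the underlying data is best represented as integers. **`raised_amount_usd`** is only read as `float64` because it contains missing values, and `NaN` cannot be stored in a NumPy integer column, so we first replace missing values with the sentinel `-1` and then downcast.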

Let's make those changes.

In [129]:
numeric_cols = ['funded_year', 'raised_amount_usd']
total_memory = 0

chunk_reader = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)

for chunk in chunk_reader:
    # Missing values are replaced with the sentinel -1 so the columns can hold
    # integers; pd.to_numeric then picks the smallest integer type that fits
    # the values found in this chunk.
    filled = chunk[numeric_cols].fillna(-1)
    chunk[numeric_cols] = filled.apply(pd.to_numeric, downcast='integer')

    chunk_bytes = chunk.memory_usage(deep=True).sum()
    total_memory += chunk_bytes
    print('Chunk memory usage (MB) =', chunk_bytes/(1024*1024))

print('-'*50)
print('Total memory usage (MB) =', total_memory/(1024*1024))

Chunk memory usage (MB) = 5.55058479309082
Chunk memory usage (MB) = 5.480503082275391
Chunk memory usage (MB) = 5.487320899963379
Chunk memory usage (MB) = 5.4804792404174805
Chunk memory usage (MB) = 5.47662353515625
Chunk memory usage (MB) = 5.524802207946777
Chunk memory usage (MB) = 5.483707427978516
Chunk memory usage (MB) = 5.4619293212890625
Chunk memory usage (MB) = 5.348406791687012
Chunk memory usage (MB) = 4.591768264770508
Chunk memory usage (MB) = 2.636298179626465
--------------------------------------------------
Total memory usage (MB) = 56.52242374420166


From the results above, we can see that we've reduced our total memory usage by about 0.5 MB (56.98 --> 56.52) by optimizing the numeric columns.

# Optimize string columns

In [130]:
# Columns to load. 'funded_month' is deliberately left out because it
# contains redundant info.
keep = [
    'investor_region', 'investor_permalink', 'investor_name',
    'investor_country_code', 'investor_city', 'investor_category_code',
    'funding_round_type', 'company_category_code', 'funded_at',
    'company_state_code', 'company_region', 'company_permalink',
    'company_name', 'company_country_code', 'company_city',
    'investor_state_code', 'funded_quarter', 'funded_year', 'raised_amount_usd',
]
numeric_cols = ['funded_year', 'raised_amount_usd']
total_memory = 0

chunk_reader = pd.read_csv(
    'crunchbase-investments.csv',
    usecols=keep,
    encoding='ISO-8859-1',
    chunksize=5000,
)

for chunk in chunk_reader:
    # Numeric columns: fill gaps with the sentinel -1, then downcast to the
    # smallest integer type that fits this chunk's values.
    filled = chunk[numeric_cols].fillna(-1)
    chunk[numeric_cols] = filled.apply(pd.to_numeric, downcast='integer')

    # 'funded_quarter': keep only the digit that follows 'Q', use -1 where no
    # quarter could be extracted, and store the result as an int8.
    chunk['funded_quarter'] = (
        chunk['funded_quarter']
        .str.extract(r'Q(\d)', expand=False)
        .fillna(-1)
        .astype('int8')
    )

    chunk_bytes = chunk.memory_usage(deep=True).sum()
    total_memory += chunk_bytes
    print('Chunk memory usage (MB) =', chunk_bytes/(1024*1024))

print('-'*50)
print('Total memory usage (MB) =', total_memory/(1024*1024))

Chunk memory usage (MB) = 4.945001602172852
Chunk memory usage (MB) = 4.874919891357422
Chunk memory usage (MB) = 4.88173770904541
Chunk memory usage (MB) = 4.874896049499512
Chunk memory usage (MB) = 4.871040344238281
Chunk memory usage (MB) = 4.919219017028809
Chunk memory usage (MB) = 4.878307342529297
Chunk memory usage (MB) = 4.856346130371094
Chunk memory usage (MB) = 4.742823600769043
Chunk memory usage (MB) = 3.986185073852539
Chunk memory usage (MB) = 2.288693428039551
--------------------------------------------------
Total memory usage (MB) = 50.11917018890381


From the results above, we can see that we were able to lower the total memory usage even further (56.52 --> 50.12) by optimizing the string columns. To optimize the string columns, we dropped the column **`funded_month`**, whose value is already contained in **`funded_at`**, and cast the column **`funded_quarter`** to an `int8` after extracting the numeric value for the quarter, which is the only portion of the value that really matters.