In [1]:
import os

import numpy as np
import pandas as pd

def first_chunk(filename, chunksize, **kwargs):
    """Return the first ``chunksize`` rows of a CSV file as a DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Passed straight through to ``pd.read_csv``.
    chunksize : int
        Number of rows in the returned chunk.
    **kwargs
        Extra keyword arguments forwarded to ``pd.read_csv``
        (for example ``dtype`` or ``parse_dates``).

    Returns
    -------
    pandas.DataFrame
        The first chunk produced by the chunked reader.
    """
    chunker = pd.read_csv(filename, chunksize=chunksize, **kwargs)
    try:
        return next(chunker)
    finally:
        # Only one chunk is needed; close the reader so its file handle
        # is not left open for the lifetime of the kernel.
        chunker.close()

def memory_cost(df, deep=False):
    """Summarise per-column memory usage (in MB) alongside each column's dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    deep : bool, default False
        Forwarded to ``DataFrame.memory_usage``. With the default, object
        columns are reported by pointer size only; pass ``True`` to include
        the memory of the Python objects they reference.

    Returns
    -------
    pandas.DataFrame
        Columns ``'MB'`` and ``'dtype'``, one row per entry returned by
        ``memory_usage`` (which includes the index), sorted by ``'MB'``
        descending, followed by a final ``'Total'`` row holding the sum of
        ``'MB'`` (its ``'dtype'`` is NaN).
    """
    memory_mb = df.memory_usage(deep=deep) / 1024**2
    result = pd.concat([pd.DataFrame(memory_mb), df.dtypes], axis=1)
    result.columns = ['MB', 'dtype']
    result = result.sort_values(by='MB', ascending=False)

    # DataFrame.append was removed in pandas 2.0; build the summary row as a
    # one-row frame and concatenate it instead. The missing 'dtype' value is
    # filled with NaN, exactly as append did.
    total_row = pd.DataFrame({'MB': [result['MB'].sum()]}, index=['Total'])
    return pd.concat([result, total_row])

def range_vals(df):
    """Return a frame with the maximum and minimum of every column of ``df``.

    The result is indexed by the column labels of ``df`` and has two
    columns, ``'max'`` and ``'min'``, in that order.
    """
    # ``keys`` labels the two concatenated Series as the output columns.
    return pd.concat([df.max(), df.min()], axis=1, keys=['max', 'min'])

def count_nulls(df):
    """Count missing values per column.

    Returns a single-column DataFrame named ``'nulls'``, indexed by the
    column labels of ``df``.
    """
    null_counts = df.isnull().sum()
    return null_counts.to_frame(name='nulls')

In [4]:
# Load the transactions CSV with an explicit dtype for each listed column
# and the two date columns parsed by pandas.
date_cols = ['transaction_date', 'membership_expire_date']
types = dict(
    msno=str,
    is_auto_renew=np.int8,
    is_cancel=np.int8,
    payment_method_id=np.int8,
    payment_plan_days=np.int16,
    plan_list_price=np.int16,
)
tx = pd.read_csv(
    'data/transactions.csv',
    parse_dates=date_cols,
    dtype=types,
)

In [2]:
# Load the members CSV with an explicit dtype for each listed column.
# No columns are date-parsed here: 'registration_init_time' is read as a
# plain int32.
types = {
    'city': np.int8,
    'bd': np.int16,
    'gender': str,
    'msno': str,
    'registered_via': np.int8,
    'registration_init_time': np.int32,
}
memb = pd.read_csv('data/members_v3.csv', dtype=types)

In [None]:
# Read only the first chunk of the user-logs CSV, with an explicit dtype for
# each listed column and the 'date' column parsed by pandas.
types = {
    'msno': str,
    'num_100': np.int16,
    'num_25': np.int16,
    'num_50': np.int16,
    'num_75': np.int16,
    'num_985': np.int16,
    'num_unq': np.int16,
    'total_secs': np.float64
}
# read_csv documents chunksize as an int, so use an integer literal rather
# than the float 1e6.
chunksize = 10**6
chunker = pd.read_csv('data/user_logs.csv', chunksize=chunksize, dtype=types, parse_dates=['date'])
# `chunker` is left open here; presumably further chunks are pulled from it
# later — verify against the rest of the notebook.
chunk = next(chunker)

In [5]:
# Persist the loaded frames in Feather format. Create the output directory
# first so the export also works when 'feather/' does not exist yet.
os.makedirs('feather', exist_ok=True)
memb.to_feather('feather/members.feather')
tx.to_feather('feather/transactions.feather')