In [9]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
# Directory holding the input file, and the name of the CSV inside it.
data_path = "../../inputs/ubiquant-market-prediction"
train_csv = "train.csv"

# Build the full path once, then read the whole table with pandas' defaults.
train_csv_path = os.path.join(data_path, train_csv)
train = pd.read_csv(train_csv_path)

In [12]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are considered; every other column (object, category, int8,
    unsigned ints, ...) is left untouched.

    * Integer columns are cast to the narrowest of int8/int16/int32 whose
      range contains the column's min and max (bounds inclusive). A column is
      never widened.
    * float64 columns are cast to float32 when min and max lie strictly inside
      the finite float32 range. This loses precision. Columns containing
      +/-inf, or only NaN, stay float64.
    * float16 and float32 columns are left as they are: the function has no
      narrower float target, so touching them could only increase memory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print memory usage before and after.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ["int16", "int32", "int64", "float16", "float32", "float64"]
    # Integer targets, narrowest first, so the first fit is the smallest.
    int_targets = (np.int8, np.int16, np.int32)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        print("Mem. usage {:5.2f} Mb".format(start_mem))

    for col in tqdm(df.columns):
        col_type = df[col].dtypes

        if col_type not in numerics:
            continue

        if str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            for target in int_targets:
                # Stop before reaching the current width: never upcast.
                if np.dtype(target).itemsize >= col_type.itemsize:
                    break
                bounds = np.iinfo(target)
                # Inclusive comparison: the extreme values are representable.
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(target)
                    break
        elif col_type == "float64":
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN min/max (all-NaN or empty column) compare False and so
            # leave the column as float64.
            if c_min > float32_info.min and c_max < float32_info.max:
                df[col] = df[col].astype(np.float32)
        # float16 / float32: already at or below the float32 target; skip.

    end_mem = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by 0.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print("Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(end_mem, reduction))

    return df

In [13]:
# Shrink the numeric columns; the helper modifies the frame and returns it.
train = reduce_mem_usage(df=train, verbose=True)

  3%|██▍                                                                               | 9/304 [00:00<00:03, 75.04it/s]

Mem. usage 6842.59 Mb


100%|████████████████████████████████████████████████████████████████████████████████| 304/304 [04:21<00:00,  1.16it/s]

Mem. usage decreased to 3642.99 Mb (46.8% reduction)





In [15]:
# Write the frame as a pickle into the same directory as the input CSV.
train_pkl_path = os.path.join(data_path, "train.pkl")
train.to_pickle(train_pkl_path)