## A.1 피쳐 요약표

In [7]:
import pandas as pd

# Load the training CSV, using its `id` column as the row index.
train = pd.read_csv('data/cat-in-the-dat/train.csv', index_col='id')
# Display the dtypes of the first five columns.
train.dtypes[:5]

bin_0     int64
bin_1     int64
bin_2     int64
bin_3    object
bin_4    object
dtype: object

In [9]:
# Start the summary table: one row per column of `train`, holding its dtype
# in a single column named '데이터 타입' ("data type").
summary = pd.DataFrame(train.dtypes, columns=['데이터 타입'])
summary.head()

Unnamed: 0,데이터 타입
bin_0,int64
bin_1,int64
bin_2,int64
bin_3,object
bin_4,object


In [10]:
# Move the feature names out of the index into a regular column (named
# 'index' by default), leaving a 0..n-1 integer index.
summary = summary.reset_index()
summary.head()

Unnamed: 0,index,데이터 타입
0,bin_0,int64
1,bin_1,int64
2,bin_2,int64
3,bin_3,object
4,bin_4,object


In [11]:
# Give the former index column a descriptive name: '피쳐' ("feature").
summary = summary.rename(columns={'index':'피쳐'})
summary.head()

Unnamed: 0,피쳐,데이터 타입
0,bin_0,int64
1,bin_1,int64
2,bin_2,int64
3,bin_3,object
4,bin_4,object


In [13]:
# Per-feature statistics: number of missing values and number of distinct
# values. Plain arrays are assigned so that values line up by position with
# the rows of `summary` instead of being aligned on index labels.
summary['결측값 개수'] = train.isna().sum().to_numpy()
summary['고윳값 개수'] = train.nunique().to_numpy()

# Sample values: `train` is indexed by `id`, so `.loc[k]` selects the row
# whose id equals k (0, 1 and 2 here).
summary['첫 번째 값'] = train.loc[0].to_numpy()
summary['두 번째 값'] = train.loc[1].to_numpy()
summary['세 번째 값'] = train.loc[2].to_numpy()

summary.head()

Unnamed: 0,피쳐,데이터 타입,결측값 개수,고윳값 개수,첫 번째 값,두 번째 값,세 번째 값
0,bin_0,int64,0,2,0,0,0
1,bin_1,int64,0,2,0,1,0
2,bin_2,int64,0,2,0,0,0
3,bin_3,object,0,2,T,T,F
4,bin_4,object,0,2,Y,Y,Y


In [14]:
def resumetable(df):
    """Build a per-feature summary table for a DataFrame.

    Args:
        df: The pandas DataFrame to summarize. It must contain at least
            three rows, because the first three rows are used as samples.

    Returns:
        A new DataFrame with one row per column of ``df`` and these columns:
        '피쳐' (feature name), '데이터 타입' (dtype), '결측값 개수' (number of
        missing values), '고윳값 개수' (number of unique values), and
        '첫 번째 값' / '두 번째 값' / '세 번째 값' (the values found in the
        first, second and third rows).

    Raises:
        IndexError: If ``df`` has fewer than three rows.
    """
    # Summarize the DataFrame that was passed in, not a module-level one.
    summary = pd.DataFrame(df.dtypes, columns=['데이터 타입'])
    summary = summary.reset_index()
    summary = summary.rename(columns={'index': '피쳐'})

    # `.values` strips the index so assignment is positional, matching the
    # row order of `summary` (one row per column of `df`).
    summary['결측값 개수'] = df.isnull().sum().values
    summary['고윳값 개수'] = df.nunique().values

    # Positional access: the sample rows must not depend on index labels.
    summary['첫 번째 값'] = df.iloc[0].values
    summary['두 번째 값'] = df.iloc[1].values
    summary['세 번째 값'] = df.iloc[2].values

    return summary

## A.2 메모리 절약을 위한 데이터 다운캐스팅

In [15]:
import pandas as pd

# Directory that holds the "predict future sales" competition CSV files.
path = 'data/competitive-data-science-predict-future-sales/'

# Read the four source tables.
sales_train = pd.read_csv(path + 'sales_train.csv')
shops = pd.read_csv(path + 'shops.csv')
items = pd.read_csv(path + 'items.csv')
item_categories = pd.read_csv(path + 'item_categories.csv')

# Attach shop, item and item-category columns to every sales record.
# Left joins keep every row of `sales_train`.
train = (
    sales_train
    .merge(shops, on='shop_id', how='left')
    .merge(items, on='item_id', how='left')
    .merge(item_categories, on='item_category_id', how='left')
)

train.dtypes

date                   object
date_block_num          int64
shop_id                 int64
item_id                 int64
item_price            float64
item_cnt_day          float64
shop_name              object
item_name              object
item_category_id        int64
item_category_name     object
dtype: object

In [16]:
# Per-column memory footprint of the merged frame before downcasting.
train.memory_usage()

Index                 23486792
date                  23486792
date_block_num        23486792
shop_id               23486792
item_id               23486792
item_price            23486792
item_cnt_day          23486792
shop_name             23486792
item_name             23486792
item_category_id      23486792
item_category_name    23486792
dtype: int64

In [17]:
# Total footprint before downcasting, divided by 1024**2 to express it in MiB.
start_mem = train.memory_usage().sum() / 1024 ** 2
start_mem

246.3862533569336

In [18]:
# Shrink every non-object column of `train` to the narrowest dtype that fits.
for col in train.columns:
    dtype_name = train[col].dtype.name

    # Object (text) columns are left untouched.
    if dtype_name == 'object':
        continue

    # Booleans become 1-byte integers.
    if dtype_name == 'bool':
        train[col] = train[col].astype('int8')
        continue

    # Integer columns, and float columns holding only whole numbers, go to
    # the smallest integer type; everything else to the smallest float type.
    if dtype_name.startswith('int') or (train[col].round() == train[col]).all():
        train[col] = pd.to_numeric(train[col], downcast='integer')
    else:
        train[col] = pd.to_numeric(train[col], downcast='float')

train.dtypes

date                   object
date_block_num           int8
shop_id                  int8
item_id                 int16
item_price            float32
item_cnt_day            int16
shop_name              object
item_name              object
item_category_id         int8
item_category_name     object
dtype: object

In [19]:
# Per-column memory footprint after downcasting, for comparison.
train.memory_usage()

Index                 23486792
date                  23486792
date_block_num         2935849
shop_id                2935849
item_id                5871698
item_price            11743396
item_cnt_day           5871698
shop_name             23486792
item_name             23486792
item_category_id       2935849
item_category_name    23486792
dtype: int64

In [20]:
# Total footprint after downcasting, divided by 1024**2 to express it in MiB.
end_mem = train.memory_usage().sum() / 1024 ** 2
end_mem

142.7920331954956

In [21]:
# Report the relative memory saving as a percentage ("압축됨" = "compressed").
print("{:.1f}% 압축됨".format(100 * (start_mem - end_mem)/start_mem))

42.0% 압축됨


In [24]:
def downcast(df, verbose=True):
    """Downcast the numeric columns of ``df`` to smaller dtypes, in place.

    Conversion rules, applied to each column independently:
      * bool columns become ``int8``;
      * integer columns, and float columns that contain only whole numbers,
        are downcast to the smallest integer dtype that holds their values;
      * remaining numeric columns are downcast to the smallest float dtype;
      * non-numeric columns (object/text, datetime, category, ...) are left
        untouched.

    Args:
        df: The pandas DataFrame to shrink. Its columns are reassigned, so
            the caller's object is modified.
        verbose: When true, print the percentage of memory saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2

    # A single pass over the columns: each one is inspected and converted
    # according to its own dtype.
    for col in df.columns:
        # Only bool / numeric data can be shrunk; skipping everything else
        # also keeps `.round()` below from being called on unsupported dtypes.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        dtype_name = df[col].dtype.name
        if dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            # The equality test fails for NaN, so a float column with missing
            # values falls through to the float branch.
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')

    end_mem = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        print("{:.1f}% 압축됨".format(100 * (start_mem - end_mem)/start_mem))

    return df