# Separate dataset by Store

## Summary
- 문제 : train_use dataset의 사용 메모리가 너무 커서 컴퓨팅 파워가 따라가지 못하고 퍼지는 현상 발생.
- 해결-1 : Memory Optimization을 해도 약 10GB(측정값 10,596MB)에 달하는 메모리를 차지함. 당연히 연산은 불가능함. AWS를 사용하는 방법도 있지만 금전적 한계가 있음.
- 해결-2 : 54개의 store 중 1개만 사용하기로 함. 사용 메모리 227MB로 감소.

In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
%%time

# Memory optimization: request compact dtypes for the listed columns while
# parsing; columns not named here keep whatever dtype pandas infers.
dtypes = {
    'id': 'int32',
    'store_nbr': 'int8',
    'item_nbr': 'int32',
    'class': 'int16',
    'perishable': 'bool',
    'cluster': 'int8',
    'earthquake': 'bool',
    'year': 'int16',
    'month': 'int8',
    'day': 'int8',
}

df_train = pd.read_csv('./data/train_use.csv', dtype=dtypes, low_memory=True)

# Discard the first column of the file (positional index 0).
# NOTE(review): presumably a row index written by an earlier to_csv -- confirm.
df_train = df_train.drop(labels=df_train.columns[[0]], axis='columns')

CPU times: user 4min 31s, sys: 1min 21s, total: 5min 52s
Wall time: 6min 2s


In [3]:
# Add up the per-column byte counts (row index included) and print them in MB.
mem_test = df_train.memory_usage(index=True).sum()
print('df_train uses', mem_test / (1024 * 1024), 'MB')

df_train uses 10596.098616600037 MB


In [4]:
# Raise the row/column display limits to 100 and turn on the memory-usage
# display option.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.memory_usage = True

In [6]:
# Describe datasets
def desc_dataset(dataset):
    """Build a per-column summary table for ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table to summarise. Its column labels are taken from
        ``dataset.keys()``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataset`` with three columns:
        ``column_name`` (the label), ``dtype`` (the column's dtype) and
        ``n_unique`` (the result of ``nunique()`` on that column).
    """
    labels = dataset.keys()

    column_dtypes = [dataset[label].dtypes for label in labels]
    unique_counts = [dataset[label].nunique() for label in labels]

    # Columns are attached one by one to an initially empty frame, so the
    # resulting column order is exactly the assignment order below.
    summary = pd.DataFrame()
    summary['column_name'] = labels
    summary['dtype'] = column_dtypes
    summary['n_unique'] = unique_counts

    return summary

In [7]:
%%time
# Summarise every column of the full training frame (name, dtype, number of
# distinct values); the bare name on the last line is the cell's display value.
desc_train = desc_dataset(df_train)
desc_train

Unnamed: 0,column_name,dtype,n_unique
0,id,int32,103839389
1,store_nbr,int8,54
2,item_nbr,int32,4036
3,unit_sales,float64,214925
4,onpromotion,bool,2
5,family,object,33
6,class,int16,334
7,perishable,bool,2
8,city,object,22
9,state,object,16


In [8]:
%%time
# Keep only the rows whose store_nbr equals 1.
df_train_store_1 = df_train.loc[df_train['store_nbr'] == 1]

CPU times: user 558 ms, sys: 675 ms, total: 1.23 s
Wall time: 1.77 s


In [8]:
# Add up the per-column byte counts (row index included) of the store-1
# subset and print them in MB.
mem_test = df_train_store_1.memory_usage(index=True).sum()
print('df_train_store_1 uses', mem_test / (1024 * 1024), 'MB')

df_train_store_1 uses 227.822780609 MB


In [10]:
# Summarise every column of the store-1 subset (name, dtype, number of
# distinct values); the bare name on the last line is the cell's display value.
desc_train_store_1 = desc_dataset(df_train_store_1)
desc_train_store_1

Unnamed: 0,column_name,dtype,n_unique
0,id,int32,2077300
1,store_nbr,int8,1
2,item_nbr,int32,3557
3,unit_sales,float64,40653
4,onpromotion,bool,2
5,family,object,32
6,class,int16,313
7,perishable,bool,2
8,city,object,1
9,state,object,1


In [6]:
%%time
# Write the store-1 subset to CSV; index=False leaves the row index out of
# the file.
df_train_store_1.to_csv('./data/train_store_1.csv', index=False)

CPU times: user 14.2 s, sys: 98.8 ms, total: 14.3 s
Wall time: 14.4 s
