# 0. 개요

- 시계열 데이터

# 3. 피쳐 엔지니어링

## 3.1 train, test

Prepare the training and test data.
- 2011-01-29 ~ 2016-04-24 : d_1    ~ d_1913
- 2016-04-25 ~ 2016-05-22 : d_1914 ~ d_1941 (public)
- 2016-05-23 ~ 2016-06-19 : d_1942 ~ d_1969 (private)

In [None]:
import pandas as pd
import numpy as np 
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit, KFold

import matplotlib.pyplot as plt
import seaborn as sns

import time
import datetime
import os

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Optional display settings, currently disabled:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive).  Float columns
    are cast to float16 or float32 when their min and max fit, otherwise to
    float64.  Only the value *range* is checked for floats, so the narrower
    float types can lose precision.  Columns whose dtype is not one of the
    listed numeric dtypes are left untouched.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When true, print the resulting size and the reduction.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the type's min/max still fits,
            # so e.g. a column peaking at 127 becomes int8 and an int16 column
            # peaking at 32767 is not widened to int32.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            for float_type in (np.float16, np.float32):
                limits = np.finfo(float_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(float_type)
                    break
            else:
                # Range does not fit (or min/max are NaN/inf): keep full width.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def write_record(features, params):
    """Append a timestamped entry listing features and parameters to the record file.

    The entry is appended to "record model and features.txt" in the current
    working directory: a blank line, the current timestamp, the feature names
    separated by two spaces (with a line break inserted before every fifth
    name), one line per ``params`` item, and a dashed separator.

    Args:
        features: Iterable of feature names (strings).
        params: Mapping of parameter names to values; each ``(key, value)``
            pair is written as its ``str()`` on its own line.
    """
    # The context manager closes the file even if one of the writes raises
    # (for example when a feature name is not a string).
    with open("record model and features.txt", 'a') as record:
        record.write("\n")
        record.write(str(datetime.datetime.now())+"\n")

        for position, feature in enumerate(features, start=1):
            if position % 5 == 0:
                record.write("\n")
            record.write(feature+"  ")
        record.write("\n")
        for item in params.items():
            record.write(str(item) + "\n")

        record.write('--------------------------------\n')

In [2]:
# Load the wide sales table and reshape it to long format: the six id columns
# are kept, every other column name goes into 'd' and its value into 'sales'.
train = pd.read_csv('inputs/sales_train_validation.csv')
train = pd.melt(train, id_vars=['id','item_id','dept_id','cat_id','store_id','state_id'], var_name='d', value_name='sales')
# Downcast numeric columns to reduce the memory footprint of the long frame.
train = reduce_mem_usage(train)

Mem. usage decreased to 3226.27 Mb (9.4% reduction)


In [3]:
# Build the test frame from the sample submission so it has the same columns
# as `train`.
test = pd.read_csv('inputs/sample_submission.csv')
# Keep only the first 30490 rows.
# NOTE(review): presumably these are the `_validation` rows — confirm.
test = test[:30490]
test = pd.melt(test, id_vars=['id'], var_name='d', value_name='sales')

# Rename the horizon labels F1..F28 to the day labels d_1914..d_1941 with one
# mapping applied to the 'd' column, instead of 28 replace passes over the
# whole frame.
horizon_to_day = {f'F{i}': f'd_{1913+i}' for i in range(1, 29)}
test['d'] = test['d'].replace(horizon_to_day)

# Recover the hierarchy columns from the id string.  The split on '_' yields
# six parts; the last one is stored as 'tmp' and discarded.
test[['cat_id', 'dept_id', 'item_id', 'state_id', 'store_id', 'tmp']] = pd.DataFrame(test['id'].str.split('_').tolist())
del test['tmp']
# Re-assemble the composite identifiers from their parts (dept_id must be
# rebuilt before item_id, which is prefixed with it).
test['store_id'] = test['state_id'] + '_' + test['store_id']
test['dept_id'] = test['cat_id'] + '_' + test['dept_id']
test['item_id'] = test['dept_id'] + '_' + test['item_id']

# Same columns, same order as the training frame.
test = test[train.columns]
test = reduce_mem_usage(test)

Mem. usage decreased to 46.41 Mb (10.9% reduction)


In [4]:
# Inspect the last rows of the long-format training frame.
train.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
58327365,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1913,1
58327366,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1913,0
58327367,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1913,0
58327368,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1913,3
58327369,FOODS_3_827_WI_3_validation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1913,0


## 3.2 calendar (date 데이터)

In [5]:
# Load the calendar table and derive day-of-month and week-of-year columns.
calendar = pd.read_csv('inputs/calendar.csv')

calendar = calendar.drop(['event_name_2', 'event_type_2'], axis=1)
# calendar['is_event'] = calendar['event_name_1'].notna().astype('int8')
del calendar['wday']  # carries the same information as the 'weekday' column
# Parse the date strings once; the 'date' column itself stays as text.
# pd.to_datetime replaces the unit-less astype('datetime64'), and
# dt.isocalendar().week replaces the removed dt.week accessor (needs
# pandas >= 1.1).  The cast keeps the plain int64 dtype dt.week used to return.
calendar_dates = pd.to_datetime(calendar['date'])
calendar['day'] = calendar_dates.dt.day
calendar['week'] = calendar_dates.dt.isocalendar().week.astype('int64')

In [6]:
# Attach the calendar columns to every sales row.  The join key is named
# explicitly so the merge cannot silently start joining on additional columns
# if the two frames ever come to share more names than 'd'.
train = train.merge(calendar, on='d', how='left')
test = test.merge(calendar, on='d', how='left')

## 3.3 sell_prices

In [None]:
# Load the price table.  The same file is read again later as `prices_df`.
sell_prices = pd.read_csv('inputs/sell_prices.csv')

In [None]:
# Attach the price table to train and test by store, item and week id.
# NOTE(review): this cell is marked In [None], and the train.head() output that
# follows shows no sell_price column, so it appears not to have been executed
# in the recorded run.  sell_price is merged again later from prices_df;
# running both merges would presumably produce suffixed sell_price_x /
# sell_price_y columns — confirm and keep only one of the two merges.
train = train.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
test = test.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

In [7]:
# Inspect the first rows of the training frame after the calendar merge.
train.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,weekday,month,year,event_name_1,event_type_1,snap_CA,snap_TX,snap_WI,day,week
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4


In [8]:
# Stack the train rows followed by the test rows into one frame so features
# can be computed across both.  Each frame keeps its own index labels here.
all_df = pd.concat([train, test])

In [9]:
# Build price features from the raw price table and merge them onto all_df
# by (store_id, item_id, wm_yr_wk).
prices_df = pd.read_csv('inputs/sell_prices.csv')
calendar_df = pd.read_csv('inputs/calendar.csv')

# Summary statistics of each item's price within a store; one grouped view is
# reused for all of them.
item_price = prices_df.groupby(['store_id','item_id'])['sell_price']
prices_df['price_max'] = item_price.transform('max')
prices_df['price_min'] = item_price.transform('min')
prices_df['price_std'] = item_price.transform('std')
prices_df['price_mean'] = item_price.transform('mean')

# Price normalization: current price relative to the item's maximum price
# (scaling by the max only, not a min/max rescale).
prices_df['price_norm'] = prices_df['sell_price']/prices_df['price_max']

# Some items may be inflation dependent while others are very "stable":
# count the distinct prices an item has had, and the distinct items sharing a
# given price within a store.
prices_df['price_nunique'] = item_price.transform('nunique')
prices_df['item_nunique'] = prices_df.groupby(['store_id','sell_price'])['item_id'].transform('nunique')

# "Rolling"-style aggregations that use months and years as the window:
# bring month/year in from the calendar, one row per week id.
calendar_prices = calendar_df[['wm_yr_wk','month','year']]
calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
prices_df = prices_df.merge(calendar_prices[['wm_yr_wk','month','year']], on=['wm_yr_wk'], how='left')
del calendar_prices

# Price "momentum": current price relative to
#   - the previous row's price for the same store/item (built-in grouped shift
#     instead of a per-group Python lambda),
#   - the item's mean price over rows with the same month number,
#   - the item's mean price over rows of the same year.
# NOTE(review): the shift assumes the rows of each store/item are in
# chronological order in the file — confirm.
prices_df['price_momentum'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id'])['sell_price'].shift(1)
prices_df['price_momentum_m'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
prices_df['price_momentum_y'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

del prices_df['month'], prices_df['year']

# Drop any non-key column all_df already has (e.g. sell_price, if the raw
# price table was merged earlier) so the merge cannot create *_x / *_y
# duplicates that would break the later column selection.
price_merge_keys = ['store_id','item_id','wm_yr_wk']
already_present = [c for c in prices_df.columns if c in all_df.columns and c not in price_merge_keys]
all_df = all_df.merge(prices_df.drop(columns=already_present), on=price_merge_keys, how='left')

In [10]:
# Inspect the last rows of the combined frame after the price features were merged.
all_df.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,weekday,month,year,event_name_1,event_type_1,snap_CA,snap_TX,snap_WI,day,week,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique,price_momentum,price_momentum_m,price_momentum_y
59181085,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,2016,,,0,0,0,22,20,2.98,2.98,2.48,0.1716,2.80156,1.0,5.0,206.0,1.0,1.029224,1.02279
59181086,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,2016,,,0,0,0,22,20,2.48,2.68,2.0,0.253165,2.507979,0.925373,4.0,135.0,1.0,0.996908,1.111908
59181087,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,2016,,,0,0,0,22,20,3.98,4.38,3.98,0.188591,4.115957,0.908676,3.0,150.0,1.0,0.965839,1.0
59181088,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,2016,,,0,0,0,22,20,1.28,1.28,1.28,0.0,1.28,1.0,1.0,44.0,1.0,1.0,1.0
59181089,FOODS_3_827_WI_3_validation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,Sunday,5,2016,,,0,0,0,22,20,1.0,1.0,1.0,0.0,1.0,1.0,1.0,142.0,1.0,1.0,1.0


In [11]:
# Column groupings for which sales statistics are attached as features.
icols =  [
            ['state_id'],
            ['store_id'],
            ['cat_id'],
            ['dept_id'],
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
            ]

# For every grouping add two float16 columns, enc_<cols>_mean and
# enc_<cols>_std, holding the mean and standard deviation of 'sales' within
# the row's group.
# NOTE(review): the statistics are taken over all rows of all_df, which also
# contains the test rows built from the sample submission; their 'sales'
# values look like placeholders (0 in the displayed tail) — confirm whether
# they should be excluded.
# NOTE(review): only five of these columns are kept by the later column
# selection.
for group_cols in icols:
    print('Encoding', group_cols)
    col_name = '_'+'_'.join(group_cols)+'_'
    sales_by_group = all_df.groupby(group_cols)['sales']
    all_df[f'enc{col_name}mean'] = sales_by_group.transform('mean').astype(np.float16)
    all_df[f'enc{col_name}std'] = sales_by_group.transform('std').astype(np.float16)

Encoding ['state_id']
Encoding ['store_id']
Encoding ['cat_id']
Encoding ['dept_id']
Encoding ['state_id', 'cat_id']
Encoding ['state_id', 'dept_id']
Encoding ['store_id', 'cat_id']
Encoding ['store_id', 'dept_id']
Encoding ['item_id']
Encoding ['item_id', 'state_id']
Encoding ['item_id', 'store_id']


In [12]:
# Inspect the first rows after the encoding columns were added.
all_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,weekday,month,year,event_name_1,event_type_1,snap_CA,snap_TX,snap_WI,day,week,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique,price_momentum,price_momentum_m,price_momentum_y,enc_state_id_mean,enc_state_id_std,enc_store_id_mean,enc_store_id_std,enc_cat_id_mean,enc_cat_id_std,enc_dept_id_mean,enc_dept_id_std,enc_state_id_cat_id_mean,enc_state_id_cat_id_std,enc_state_id_dept_id_mean,enc_state_id_dept_id_std,enc_store_id_cat_id_mean,enc_store_id_cat_id_std,enc_store_id_dept_id_mean,enc_store_id_dept_id_std,enc_item_id_mean,enc_item_id_std,enc_item_id_state_id_mean,enc_item_id_state_id_std,enc_item_id_store_id_mean,enc_item_id_store_id_std
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4,,,,,,,,,,,,1.210938,4.078125,1.300781,4.03125,0.558594,2.023438,0.692871,2.300781,0.692871,2.503906,0.875488,2.863281,0.799316,2.814453,1.017578,3.220703,0.210815,0.572266,0.326416,0.723633,0.309082,0.681152
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4,,,,,,,,,,,,1.210938,4.078125,1.300781,4.03125,0.558594,2.023438,0.692871,2.300781,0.692871,2.503906,0.875488,2.863281,0.799316,2.814453,1.017578,3.220703,0.260742,0.590332,0.205566,0.497559,0.253906,0.565918
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4,,,,,,,,,,,,1.210938,4.078125,1.300781,4.03125,0.558594,2.023438,0.692871,2.300781,0.692871,2.503906,0.875488,2.863281,0.799316,2.814453,1.017578,3.220703,0.073914,0.320801,0.094177,0.371582,0.148438,0.483398
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4,,,,,,,,,,,,1.210938,4.078125,1.300781,4.03125,0.558594,2.023438,0.692871,2.300781,0.692871,2.503906,0.875488,2.863281,0.799316,2.814453,1.017578,3.220703,2.017578,2.660156,2.939453,3.314453,1.694336,1.986328
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,Saturday,1,2011,,,0,0,0,29,4,,,,,,,,,,,,1.210938,4.078125,1.300781,4.03125,0.558594,2.023438,0.692871,2.300781,0.692871,2.503906,0.875488,2.863281,0.799316,2.814453,1.017578,3.220703,0.753418,1.219727,0.963379,1.341797,0.952637,1.290039


In [13]:
# List the columns available before selecting the ones to keep.
all_df.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'month', 'year', 'event_name_1',
       'event_type_1', 'snap_CA', 'snap_TX', 'snap_WI', 'day', 'week',
       'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean',
       'price_norm', 'price_nunique', 'item_nunique', 'price_momentum',
       'price_momentum_m', 'price_momentum_y', 'enc_state_id_mean',
       'enc_state_id_std', 'enc_store_id_mean', 'enc_store_id_std',
       'enc_cat_id_mean', 'enc_cat_id_std', 'enc_dept_id_mean',
       'enc_dept_id_std', 'enc_state_id_cat_id_mean',
       'enc_state_id_cat_id_std', 'enc_state_id_dept_id_mean',
       'enc_state_id_dept_id_std', 'enc_store_id_cat_id_mean',
       'enc_store_id_cat_id_std', 'enc_store_id_dept_id_mean',
       'enc_store_id_dept_id_std', 'enc_item_id_mean', 'enc_item_id_std',
       'enc_item_id_state_id_mean', 'enc_item_id_state_id_std',
       'enc_item_id_store_id_mean', 'enc_item_id

In [14]:
# Keep the identifier, calendar and price columns, plus only five of the
# encoding columns (category, department and state/category statistics);
# every other encoding column is dropped.
selected_columns = [
    'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
    'sales', 'date', 'wm_yr_wk', 'weekday', 'month', 'year', 'event_name_1',
    'event_type_1', 'snap_CA', 'snap_TX', 'snap_WI', 'day', 'week',
    'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean',
    'price_norm', 'price_nunique', 'item_nunique', 'price_momentum',
    'price_momentum_m', 'price_momentum_y',
    'enc_cat_id_mean', 'enc_cat_id_std', 'enc_dept_id_mean',
    'enc_dept_id_std', 'enc_state_id_cat_id_mean',
]
all_df = all_df[selected_columns]

## 3.4 라벨인코딩

In [15]:

# all_df['revenue'] = all_df['sales'] * all_df['sell_price']

from sklearn.preprocessing import LabelEncoder
# Integer-encode every object-dtype column except 'id' and 'date'.
# A single encoder is refit for each column; the classes it learned are
# appended to le_classes in column order, so a code can be mapped back to its
# label by the column's position in that list.
le = LabelEncoder()
le_classes = []
for i in all_df.columns[all_df.dtypes == 'object']:
    if i == 'id' or i == 'date':
        continue
    # NOTE(review): 'd' is encoded here as well, so its codes follow the sort
    # order of the labels, presumably not the day order — confirm before using
    # 'd' as an ordinal.
    # NOTE(review): missing values in the event columns appear to end up as a
    # class of their own (codes 30 and 4 in the later head() output) —
    # presumably a result of passing a plain list; confirm.
    all_df[i] = le.fit_transform(list(all_df[i]))
    le_classes.append(le.classes_.tolist())



In [32]:
# Classes of the seventh encoded column (the weekday names), listed in code order.
le_classes[6]

['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']

In [16]:
# Downcast the freshly label-encoded (integer) columns.
all_df = reduce_mem_usage(all_df)

Mem. usage decreased to 4458.72 Mb (69.6% reduction)


## 3.5 lag 데이터 제작

In [17]:
# Lag features: lag_t{i} holds the 'sales' value found i rows earlier within
# the same id, for i = 28..42.
# NOTE(review): "rows earlier" equals "days earlier" only if each id has one
# row per day in chronological order — confirm.
# The grouped view is built once, and the built-in grouped shift replaces the
# per-group Python lambda with the same result.
sales_by_id = all_df.groupby(['id'])['sales']
for i in range(28, 43):
    all_df[f'lag_t{i}'] = sales_by_id.shift(i)



In [18]:
# Downcast the new lag columns.
all_df = reduce_mem_usage(all_df)

Mem. usage decreased to 6151.90 Mb (45.2% reduction)


In [19]:
# Rolling window lengths, counted in rows per id (original note: 7 30 60 120 180).
weeks = [7, 28, 56, 84, 112, 168]
# Every window is taken over sales shifted by this many rows within the id.
lag_day = 28

# The shift does not depend on the window length, so compute it once instead
# of once per window and statistic, then group the shifted values by id again
# for the rolling statistics.
shifted_sales = all_df.groupby(['id'])['sales'].shift(lag_day)
shifted_sales_by_id = shifted_sales.groupby(all_df['id'])
for i in weeks:
    all_df[f'rolling_mean_t{i}'] = shifted_sales_by_id.transform(lambda x: x.rolling(i).mean())
    all_df[f'rolling_std_t{i}'] = shifted_sales_by_id.transform(lambda x: x.rolling(i).std())
# Release the temporary full-length series before the next large allocation.
del shifted_sales, shifted_sales_by_id

all_df = reduce_mem_usage(all_df)

Mem. usage decreased to 7506.45 Mb (35.1% reduction)


In [20]:
# Repeat of the downcast already done at the end of the previous cell; the
# recorded output reports a 0.0% reduction.
all_df = reduce_mem_usage(all_df)

Mem. usage decreased to 7506.45 Mb (0.0% reduction)


In [21]:
# List the columns after the lag and rolling features were added.
all_df.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'month', 'year', 'event_name_1',
       'event_type_1', 'snap_CA', 'snap_TX', 'snap_WI', 'day', 'week',
       'sell_price', 'price_max', 'price_min', 'price_std', 'price_mean',
       'price_norm', 'price_nunique', 'item_nunique', 'price_momentum',
       'price_momentum_m', 'price_momentum_y', 'enc_cat_id_mean',
       'enc_cat_id_std', 'enc_dept_id_mean', 'enc_dept_id_std',
       'enc_state_id_cat_id_mean', 'lag_t28', 'lag_t29', 'lag_t30', 'lag_t31',
       'lag_t32', 'lag_t33', 'lag_t34', 'lag_t35', 'lag_t36', 'lag_t37',
       'lag_t38', 'lag_t39', 'lag_t40', 'lag_t41', 'lag_t42',
       'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t28',
       'rolling_std_t28', 'rolling_mean_t56', 'rolling_std_t56',
       'rolling_mean_t84', 'rolling_std_t84', 'rolling_mean_t112',
       'rolling_std_t112', 'rolling_mean_t168', 'rolling_std_t168'],
      dtype='object'

## 3.7 price 통계량 피쳐

In [22]:
# Price-change and price-volatility features per id.  The two helper series
# (previous price, trailing maximum) are kept as local variables instead of
# being added to the frame and dropped again.
price_by_id = all_df.groupby(['id'])['sell_price']

# Relative difference between the previous row's price and the current price.
lag_price_t1 = price_by_id.shift(1)
all_df['price_change_t1'] = (lag_price_t1 - all_df['sell_price']) / lag_price_t1

# Relative difference between the maximum price over the previous 365 rows
# (current row excluded by the shift) and the current price.
rolling_price_max_t365 = price_by_id.transform(lambda x: x.shift(1).rolling(365).max())
all_df['price_change_t365'] = (rolling_price_max_t365 - all_df['sell_price']) / rolling_price_max_t365

# Standard deviation of the price over the last 7 rows, current row included.
all_df['rolling_price_std_t7'] = price_by_id.transform(lambda x: x.rolling(7).std())

# Feature still to be created:
# all_df['rolling_price_std_t32'] = all_df.groupby(['id'])['sell_price'].transform(lambda x: x.rolling(32).std())

# Standard deviation of the price over the last 28 rows, current row included.
all_df['rolling_price_std_t28'] = price_by_id.transform(lambda x: x.rolling(28).std())

del lag_price_t1, rolling_price_max_t365
# all_df = reduce_mem_usage(all_df)


In [23]:
# Downcast pass after the price features were added.
all_df = reduce_mem_usage(all_df)

Mem. usage decreased to 7957.97 Mb (0.0% reduction)


In [24]:
# Inspect the first rows of the final feature frame.
all_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,weekday,month,year,event_name_1,event_type_1,snap_CA,snap_TX,snap_WI,day,week,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique,price_momentum,price_momentum_m,price_momentum_y,enc_cat_id_mean,enc_cat_id_std,enc_dept_id_mean,enc_dept_id_std,enc_state_id_cat_id_mean,lag_t28,lag_t29,lag_t30,lag_t31,lag_t32,lag_t33,lag_t34,lag_t35,lag_t36,lag_t37,lag_t38,lag_t39,lag_t40,lag_t41,lag_t42,rolling_mean_t7,rolling_std_t7,rolling_mean_t28,rolling_std_t28,rolling_mean_t56,rolling_std_t56,rolling_mean_t84,rolling_std_t84,rolling_mean_t112,rolling_std_t112,rolling_mean_t168,rolling_std_t168,price_change_t1,price_change_t365,rolling_price_std_t7,rolling_price_std_t28
0,HOBBIES_1_001_CA_1_validation,1437,3,1,0,0,0,0,2011-01-29,11101,2,1,2011,30,4,0,0,0,29,4,,,,,,,,,,,,0.558594,2.023438,0.692871,2.300781,0.692871,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HOBBIES_1_002_CA_1_validation,1438,3,1,0,0,0,0,2011-01-29,11101,2,1,2011,30,4,0,0,0,29,4,,,,,,,,,,,,0.558594,2.023438,0.692871,2.300781,0.692871,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,HOBBIES_1_003_CA_1_validation,1439,3,1,0,0,0,0,2011-01-29,11101,2,1,2011,30,4,0,0,0,29,4,,,,,,,,,,,,0.558594,2.023438,0.692871,2.300781,0.692871,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,HOBBIES_1_004_CA_1_validation,1440,3,1,0,0,0,0,2011-01-29,11101,2,1,2011,30,4,0,0,0,29,4,,,,,,,,,,,,0.558594,2.023438,0.692871,2.300781,0.692871,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,HOBBIES_1_005_CA_1_validation,1441,3,1,0,0,0,0,2011-01-29,11101,2,1,2011,30,4,0,0,0,29,4,,,,,,,,,,,,0.558594,2.023438,0.692871,2.300781,0.692871,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [25]:
import pickle

# Persist the full feature frame with pickle protocol 4.
with open('inputs/all_df_val2.pickle', 'wb') as f:
    pickle.dump(all_df, f, protocol=4)
