In [1]:
import pandas as pd


In [2]:
# Read the object stored under the 'table' key of data_feat.h5.
data = pd.read_hdf('data_feat.h5', key='table')

In [3]:
# Preview the first five rows, transposed so each column is shown as a row.
data.head().transpose()

Unnamed: 0,1497465,1497466,1497467,1497468,1497469
date_block_num,4.0,4.0,4.0,4.0,4.0
shop_id,2.0,2.0,2.0,2.0,2.0
item_id,27.0,28.0,29.0,30.0,31.0
item_cnt_month,0.0,0.0,0.0,0.0,0.0
shop_category,4.0,4.0,4.0,4.0,4.0
shop_city,0.0,0.0,0.0,0.0,0.0
item_category_id,19.0,30.0,23.0,40.0,37.0
name2,76.0,107.0,123.0,4.0,4.0
name3,42.0,42.0,42.0,42.0,562.0
subtype_code,10.0,55.0,16.0,4.0,1.0


In [4]:
# Per-column count of missing entries.
missing_val_count_by_column = data.isna().sum()

In [5]:
# Show the per-column missing-value counts.
missing_val_count_by_column

date_block_num                                  0
shop_id                                         0
item_id                                         0
item_cnt_month                                  0
shop_category                                   0
shop_city                                       0
item_category_id                                0
name2                                           0
name3                                           0
subtype_code                                    0
type_code                                       0
item_cnt_month_lag_1                      1957843
item_cnt_month_lag_2                      2252408
item_cnt_month_lag_3                      2539771
date_avg_item_cnt_lag_1                   1957843
date_item_avg_item_cnt_lag_1              1957843
date_item_avg_item_cnt_lag_2              2252408
date_item_avg_item_cnt_lag_3              2539771
date_shop_avg_item_cnt_lag_1              1957843
date_shop_avg_item_cnt_lag_2              2252408


In [6]:
# Number of distinct values in each column.
data.nunique(axis=0)

date_block_num                               31
shop_id                                      55
item_id                                   20883
item_cnt_month                              436
shop_category                                 5
shop_city                                    31
item_category_id                             82
name2                                       166
name3                                      1583
subtype_code                                 65
type_code                                    10
item_cnt_month_lag_1                        427
item_cnt_month_lag_2                        418
item_cnt_month_lag_3                        415
date_avg_item_cnt_lag_1                      30
date_item_avg_item_cnt_lag_1               2530
date_item_avg_item_cnt_lag_2               2523
date_item_avg_item_cnt_lag_3               2510
date_shop_avg_item_cnt_lag_1               1121
date_shop_avg_item_cnt_lag_2               1113
date_shop_avg_item_cnt_lag_3            

In [4]:
# Split by date_block_num: blocks below 33 are the training set, block 33 is
# the validation set and block 34 is the test set. The target column
# item_cnt_month is removed from the features, and missing feature values
# are replaced with -1.
is_train = data['date_block_num'] < 33
is_valid = data['date_block_num'] == 33
is_test = data['date_block_num'] == 34

X_train = data.loc[is_train].drop(columns=['item_cnt_month']).fillna(-1)
Y_train = data.loc[is_train, 'item_cnt_month']
X_valid = data.loc[is_valid].drop(columns=['item_cnt_month']).fillna(-1)
Y_valid = data.loc[is_valid, 'item_cnt_month']
X_test = data.loc[is_test].drop(columns=['item_cnt_month']).fillna(-1)

In [5]:
# Cap both targets to the range [0, 20].
Y_train = Y_train.clip(lower=0, upper=20)
Y_valid = Y_valid.clip(lower=0, upper=20)

In [6]:
import time
ts = time.time()

from sklearn import linear_model

# Baseline: ordinary least-squares regression fitted on the training split.
regr = linear_model.LinearRegression().fit(X_train, Y_train)

# Predictions are clipped to [0, 20], the same range as the clipped targets.
y_pred_train, y_pred_valid = (
    regr.predict(features).clip(min=0, max=20)
    for features in (X_train, X_valid)
)

# Seconds elapsed since ts; as the last expression it is the cell's output.
time.time() - ts

16.924362182617188

In [7]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the clipped predictions on each split.
rmse_train, rmse_valid = (
    sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((Y_train, y_pred_train), (Y_valid, y_pred_valid))
)

print("RMSE for train: {}, RMSE for valid: {}".format(rmse_train, rmse_valid))

RMSE for train: 1.0361586989064262, RMSE for valid: 1.0355244683396088


In [9]:
# Use only numeric features: rebuild the splits without the columns listed in
# cats_cols, then refit and re-evaluate the linear model.

cats_cols = [
    'date_block_num', 'shop_id', 'item_id', 'shop_category', 'shop_city',
    'item_category_id', 'name2', 'name3', 'subtype_code', 'type_code',
    'month', 'days',
]
dropped_cols = ['item_cnt_month'] + cats_cols

is_train = data['date_block_num'] < 33
is_valid = data['date_block_num'] == 33
is_test = data['date_block_num'] == 34

# Missing feature values are replaced with -1, as in the earlier split.
X_train = data.loc[is_train].drop(columns=dropped_cols).fillna(-1)
X_valid = data.loc[is_valid].drop(columns=dropped_cols).fillna(-1)
X_test = data.loc[is_test].drop(columns=dropped_cols).fillna(-1)

# Targets are capped to [0, 20].
Y_train = data.loc[is_train, 'item_cnt_month'].clip(lower=0, upper=20)
Y_valid = data.loc[is_valid, 'item_cnt_month'].clip(lower=0, upper=20)

# Train the model using the training sets
regr = linear_model.LinearRegression().fit(X_train, Y_train)

y_pred_train, y_pred_valid = (
    regr.predict(features).clip(min=0, max=20)
    for features in (X_train, X_valid)
)

rmse_train, rmse_valid = (
    sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((Y_train, y_pred_train), (Y_valid, y_pred_valid))
)

print("RMSE for train: {}, RMSE for valid: {}".format(rmse_train, rmse_valid))


RMSE for train: 1.0398918799905654, RMSE for valid: 1.0330248500423664


In [10]:
# Standardize the features, then refit and re-evaluate the linear model.
#
# NOTE: this cell overwrites X_train, X_valid and X_test with the scaled
# arrays returned by the scaler. Re-create them from `data` before re-running
# it if the raw-scale features are needed again.

from sklearn import preprocessing

# Fit the scaler on the training features only, then apply that same fitted
# transform to every split so all model inputs share one scale.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
# X_test has to be scaled as well: the model below is fitted on standardized
# features, so predicting on raw-scale test features would be inconsistent.
X_test = scaler.transform(X_test)

regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, Y_train)

# Predictions are clipped to [0, 20].
y_pred_train = regr.predict(X_train).clip(0, 20)
y_pred_valid = regr.predict(X_valid).clip(0, 20)


rmse_train = sqrt(mean_squared_error(Y_train, y_pred_train))
rmse_valid = sqrt(mean_squared_error(Y_valid, y_pred_valid))

print("RMSE for train: {}, RMSE for valid: {}".format(rmse_train, rmse_valid))

RMSE for train: 1.039902311835593, RMSE for valid: 1.0330217919834546


In [14]:
# (rows, columns) of data.
data.shape

(9630539, 34)