In [1]:
import pandas as pd
import load_datasets
import os
from sklearn.linear_model import HuberRegressor, LinearRegression, SGDRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np

# Load Data

In [2]:
def fill_missing_add_indicator(df, cols):
    for c in cols:
        indicator_name = c + '_is_none'
        df[indicator_name] = df[c].isna().astype(np.int8)
    df = df.fillna(-1)
    return df
    

In [3]:
training_name = 'linear'

# cat_feature_names = ['shop_category', 'type_code', 'subtype_code', 'shop_city']
cat_feature_names = None

cols_to_drop = ['shop_id', 'item_id', 'item_category_id' ]
X_train, y_train = load_datasets.train(one_hot=False, cat_feature_names=cat_feature_names)
X_val, y_val = load_datasets.val(one_hot=False, cat_feature_names=cat_feature_names)
X_test = load_datasets.test(one_hot=False, cat_feature_names=cat_feature_names)

X_train = X_train.drop(cols_to_drop, axis=1)
X_val = X_val.drop(cols_to_drop, axis=1)
X_test = X_test.drop(cols_to_drop, axis=1)

missing_cols = ['item_cnt_month_lag_1',
 'item_cnt_month_lag_2',
 'item_cnt_month_lag_3',
 'date_avg_item_cnt_lag_1',
 'date_avg_item_cnt_lag_2',
 'date_avg_item_cnt_lag_3',
 'date_item_avg_item_cnt_lag_1',
 'date_item_avg_item_cnt_lag_2',
 'date_item_avg_item_cnt_lag_3',
 'date_shop_avg_item_cnt_lag_1',
 'date_shop_avg_item_cnt_lag_2',
 'date_shop_avg_item_cnt_lag_3',
 'date_city_avg_item_cnt_lag_1',
 'date_city_avg_item_cnt_lag_2',
 'date_city_avg_item_cnt_lag_3',
 'date_item_city_avg_item_cnt_lag_1',
 'date_item_city_avg_item_cnt_lag_2',
 'date_item_city_avg_item_cnt_lag_3',
 'delta_revenue_lag_1',
 'item_cnt_month_lag_1_adv',
 'item_cnt_month_lag_2_adv',
 'item_cnt_month_lag_3_adv']

X_train = fill_missing_add_indicator(X_train, missing_cols)
X_val = fill_missing_add_indicator(X_val, missing_cols)
X_test = fill_missing_add_indicator(X_test, missing_cols)

In [4]:
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Train the Model

In [5]:
huber = HuberRegressor(max_iter=200).fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [6]:
y_pred_train = huber.predict(X_train).clip(0, 20)
y_pred_val = huber.predict(X_val).clip(0, 20)



In [7]:

rmse_train = sqrt(mean_squared_error(y_train, y_pred_train))
rmse_val= sqrt(mean_squared_error(y_val, y_pred_val))

print("RMSE for train: {}, RMSE for valid: {}".format(rmse_train, rmse_val))

RMSE for train: 1.0510215362879711, RMSE for valid: 1.0301744921963345


# Make Predictions

In [9]:
DATA_DIR = "data"
OUTPUT_DIR = 'output'


pred_test = huber.predict(X_test)


pred = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
pred['item_cnt_month'] = pred_test
pred['item_cnt_month'] = pred['item_cnt_month'].clip(lower=0, upper=20)
pred.to_csv(os.path.join(OUTPUT_DIR, training_name+'.csv'), index=False)

Your public LB scores are: 1.089920

In [11]:
# predict for val, and test for emsembling
pred_val = huber.predict(X_val)
pd.DataFrame({training_name + 'val': pred_val}).to_hdf(os.path.join(OUTPUT_DIR, training_name+'.h5'), key='val', mode='a')
pd.DataFrame({training_name + 'test': pred_test}).to_hdf(os.path.join(OUTPUT_DIR, training_name+'.h5'), key='test', mode='a')

In [13]:
from joblib import dump, load
dump(huber, os.path.join(OUTPUT_DIR, 'huber_model.joblib')) 

['output/huber_model.joblib']