In [17]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_score 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import clone
from sklearn import linear_model
#silence future warning message
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

In [18]:
item = pd.read_csv('items.csv')

In [19]:
df = pd.read_csv('sales_train.csv')

In [20]:
df = df.merge(item, left_on=df['item_id'], right_on = item['item_id'])

In [21]:
df.head()

Unnamed: 0,key_0,date,date_block_num,shop_id,item_id_x,item_price,item_cnt_day,item_name,item_id_y,item_category_id
0,22154,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),22154,37
1,22154,23.01.2013,0,24,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),22154,37
2,22154,20.01.2013,0,27,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),22154,37
3,22154,02.01.2013,0,25,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),22154,37
4,22154,03.01.2013,0,25,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),22154,37


In [22]:
df.drop(['item_name','item_id_y'], axis =1, inplace = True)
df = df.rename(columns={'item_id_x': 'item_id'})

In [23]:
df = df.drop(['date'], axis = 1)

In [24]:
df.head()

Unnamed: 0,key_0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
0,22154,0,59,22154,999.0,1.0,37
1,22154,0,24,22154,999.0,1.0,37
2,22154,0,27,22154,999.0,1.0,37
3,22154,0,25,22154,999.0,1.0,37
4,22154,0,25,22154,999.0,1.0,37


In [25]:
is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]

rows_with_NaN.head(50)

Unnamed: 0,key_0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id


In [26]:
def overview(dataframe):
    
    data_resumen = {'Valores Unicos':dataframe.nunique(),'No-Nulos': dataframe.notnull().sum(), 'Nulos': dataframe.isnull().sum(), 'Formato': dataframe.dtypes, 'Min': dataframe.min(), 'Max': dataframe.max()}
    resumen = pd.DataFrame(data=data_resumen)
    return resumen

In [27]:
overview(df)

Unnamed: 0,Valores Unicos,No-Nulos,Nulos,Formato,Min,Max
key_0,21807,2935849,0,int64,0.0,22169.0
date_block_num,34,2935849,0,int64,0.0,33.0
shop_id,60,2935849,0,int64,0.0,59.0
item_id,21807,2935849,0,int64,0.0,22169.0
item_price,19993,2935849,0,float64,-1.0,307980.0
item_cnt_day,198,2935849,0,float64,-22.0,2169.0
item_category_id,84,2935849,0,int64,0.0,83.0


In [28]:
#Get rid of outliers
df = df[(df['item_price']<100000) & (df['item_price']>=0)]
df = df[(df['item_cnt_day']<1000) & (df['item_cnt_day']>=0)]

# plot after outliers removal
#plot_features = ['item_price','item_cnt_day']
#for f in plot_features:
#   df.boxplot(f,f)


In [29]:
df2 = df.groupby(by = ['date_block_num','shop_id','item_category_id','item_id','item_price'])['item_cnt_day'].sum()

In [30]:
df2 = df2.reset_index(drop=False)
df2['total_sales'] = df2['item_price'] * df2['item_cnt_day']
df2.head()

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,item_price,item_cnt_day,total_sales
0,0,0,2,5572,1322.0,10.0,13220.0
1,0,0,2,5573,560.0,1.0,560.0
2,0,0,2,5575,806.0,4.0,3224.0
3,0,0,2,5576,2231.0,5.0,11155.0
4,0,0,2,5609,2381.0,1.0,2381.0


In [31]:
df2['item_id_avg_item_price'] = df2.groupby(['date_block_num','shop_id','item_id'])['total_sales'].transform('mean')


In [32]:
df2.head(30)

Unnamed: 0,date_block_num,shop_id,item_category_id,item_id,item_price,item_cnt_day,total_sales,item_id_avg_item_price
0,0,0,2,5572,1322.0,10.0,13220.0,13220.0
1,0,0,2,5573,560.0,1.0,560.0,560.0
2,0,0,2,5575,806.0,4.0,3224.0,3224.0
3,0,0,2,5576,2231.0,5.0,11155.0,11155.0
4,0,0,2,5609,2381.0,1.0,2381.0,2381.0
5,0,0,2,5612,3623.0,1.0,3623.0,3623.0
6,0,0,2,5623,294.0,1.0,294.0,294.0
7,0,0,2,5627,2060.0,2.0,4120.0,4120.0
8,0,0,2,5629,1925.0,9.0,17325.0,17325.0
9,0,0,2,5630,2060.0,1.0,2060.0,2060.0


In [33]:
df3 = df2.groupby(by = ['date_block_num','shop_id','item_id'])['item_id_avg_item_price'].mean()

In [34]:
df3 = df3.reset_index(drop=False)
df3.head(6)

Unnamed: 0,date_block_num,shop_id,item_id,item_id_avg_item_price
0,0,0,32,1326.0
1,0,0,33,1041.0
2,0,0,35,247.0
3,0,0,43,221.0
4,0,0,51,128.5
5,0,0,61,195.0


In [35]:
df3['item_id_sum_item_cnt_day'] = df2.groupby(by = ['date_block_num','shop_id','item_id'])['item_cnt_day'].transform('sum')
df3['item_id_avg_item_cnt_day'] = df2.groupby(by = ['date_block_num','shop_id','item_id'])['item_cnt_day'].transform('mean')

In [36]:
df3.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_id_avg_item_price,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day
0,0,0,32,1326.0,10.0,10.0
1,0,0,33,1041.0,1.0,1.0
2,0,0,35,247.0,4.0,4.0
3,0,0,43,221.0,5.0,5.0
4,0,0,51,128.5,1.0,1.0


In [37]:
df3['shop_id_avg_item_price'] = df2.groupby(['date_block_num','shop_id'])['total_sales'].transform('mean')

df3['shop_id_avg_item_price'] = round(df3['shop_id_avg_item_price'],2)

In [38]:
df3.head(10)

Unnamed: 0,date_block_num,shop_id,item_id,item_id_avg_item_price,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day,shop_id_avg_item_price
0,0,0,32,1326.0,10.0,10.0,1228.84
1,0,0,33,1041.0,1.0,1.0,1228.84
2,0,0,35,247.0,4.0,4.0,1228.84
3,0,0,43,221.0,5.0,5.0,1228.84
4,0,0,51,128.5,1.0,1.0,1228.84
5,0,0,61,195.0,1.0,1.0,1228.84
6,0,0,75,76.0,1.0,1.0,1228.84
7,0,0,88,76.0,2.0,2.0,1228.84
8,0,0,95,193.0,9.0,9.0,1228.84
9,0,0,96,70.0,1.0,1.0,1228.84


In [39]:
df3['shop_id_sum_item_cnt_day'] = df2.groupby(['date_block_num','shop_id'])['item_cnt_day'].transform('sum')

In [40]:
df3.sample(20)

Unnamed: 0,date_block_num,shop_id,item_id,item_id_avg_item_price,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day,shop_id_avg_item_price,shop_id_sum_item_cnt_day
1126488,21,30,6492,499.0,1.0,1.0,1973.44,3776.0
268780,4,31,12836,149.0,1.0,1.0,899.35,2266.0
81436,1,19,11258,149.0,1.0,1.0,1485.88,2756.0
639319,11,22,3159,1580.0,2.0,1.0,2504.16,2441.0
1469272,29,42,11575,198.0,4.0,4.0,1920.4,7311.0
296631,5,4,21894,149.0,1.0,1.0,1306.18,3602.0
45033,0,42,9650,199.0,1.0,1.0,1809.38,1717.0
1255851,24,4,5820,4465.0,1.0,1.0,3389.09,5495.0
263255,4,28,9256,198.0,1.0,1.0,1151.71,3027.0
547030,9,31,19601,199.0,2.0,2.0,1828.35,7837.0


In [41]:
df3['shop_id_mean_item_cnt_day'] = df2.groupby(['date_block_num','shop_id'])['item_cnt_day'].transform('mean')
df3['shop_id_mean_item_cnt_day'] = round(df3['shop_id_mean_item_cnt_day'],2)
df3.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_id_avg_item_price,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day,shop_id_avg_item_price,shop_id_sum_item_cnt_day,shop_id_mean_item_cnt_day
0,0,0,32,1326.0,10.0,10.0,1228.84,5578.0,2.31
1,0,0,33,1041.0,1.0,1.0,1228.84,5578.0,2.31
2,0,0,35,247.0,4.0,4.0,1228.84,5578.0,2.31
3,0,0,43,221.0,5.0,5.0,1228.84,5578.0,2.31
4,0,0,51,128.5,1.0,1.0,1228.84,5578.0,2.31


In [42]:
df3['item_category_id_avg_item_price'] = df2.groupby(['date_block_num','shop_id','item_category_id'])['total_sales'].transform('mean')
df3['item_category_id_avg_item_price'] = round(df3['item_category_id_avg_item_price'],2)
df3.sample(20)

Unnamed: 0,date_block_num,shop_id,item_id,item_id_avg_item_price,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day,shop_id_avg_item_price,shop_id_sum_item_cnt_day,shop_id_mean_item_cnt_day,item_category_id_avg_item_price
1354843,26,25,9994,1796.0,2.0,2.0,1889.02,1408.0,1.68,1058.84
1564186,32,37,15402,1129.0,1.0,1.0,2151.11,807.0,1.56,1153.0
11158,0,12,5594,1190.0,1.0,1.0,903.75,935.0,1.46,190.28
1549427,32,6,11159,399.0,1.0,1.0,2929.8,980.0,1.96,1381.35
1459065,29,25,13672,399.0,2.0,1.0,1781.46,1688.0,1.59,1353.37
1399467,27,31,5638,3290.0,1.0,1.0,1564.89,2449.0,1.74,383.46
1163298,22,25,10343,149.0,1.0,1.0,3119.1,4410.0,2.75,407.24
1184568,22,49,14229,99.0,1.0,1.0,3311.23,2249.0,4.15,2523.6
1448264,29,2,3446,2999.0,1.0,1.0,1728.67,1209.0,1.81,398.83
6358,0,6,1621,349.0,4.0,4.0,1260.57,2117.0,1.86,1133.33


In [43]:
df3['item_category_id_sum_item_cnt_day'] = df2.groupby(['date_block_num','shop_id','item_category_id'])['item_cnt_day'].transform('sum')
df3['item_category_id_avg_item_cnt_day'] = df2.groupby(['date_block_num','shop_id','item_category_id'])['item_cnt_day'].transform('mean')
df3['item_category_id_sum_item_cnt_day'] = round(df3['item_category_id_sum_item_cnt_day'],2)
df3['item_category_id_avg_item_cnt_day'] = round(df3['item_category_id_avg_item_cnt_day'],2)
df3.sample(20)

Unnamed: 0,date_block_num,shop_id,item_id,item_id_avg_item_price,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day,shop_id_avg_item_price,shop_id_sum_item_cnt_day,shop_id_mean_item_cnt_day,item_category_id_avg_item_price,item_category_id_sum_item_cnt_day,item_category_id_avg_item_cnt_day
1240877,23,48,19900,169.0,2.0,2.0,7143.61,3735.0,4.49,3803.0,23.0,1.64
99435,1,31,13854,5391.0,1.0,1.0,1632.48,5449.0,2.4,1030.7,1936.0,3.2
238463,3,57,14826,2985.0,1.0,1.0,1370.78,3986.0,1.96,11740.0,6.0,1.0
644081,11,25,16311,4596.0,1.0,1.0,3674.03,4522.0,2.77,492.34,406.0,1.55
352435,6,4,8859,199.0,1.0,1.0,1545.75,1544.0,1.74,362.38,146.0,1.32
1202377,23,12,5827,349.0,5.0,1.666667,2680.28,1633.0,2.46,5156.94,83.0,2.68
1572200,32,52,13958,299.0,1.0,1.0,1798.23,1304.0,1.67,4964.44,18.0,2.0
122562,1,59,8579,399.0,1.0,1.0,1582.38,3057.0,1.9,344.53,311.0,1.23
718697,12,38,10007,399.0,1.0,1.0,3255.49,4772.0,2.57,423.36,501.0,1.45
593014,10,29,11707,699.0,1.0,1.0,1175.12,1685.0,1.8,392.03,340.0,1.48


In [44]:
df_test = df3

In [51]:
df_test.count()

date_block_num                       1608224
shop_id                              1608224
item_id                              1608224
item_id_avg_item_price               1608224
item_id_sum_item_cnt_day             1608224
item_id_avg_item_cnt_day             1608224
shop_id_avg_item_price               1608224
shop_id_sum_item_cnt_day             1608224
shop_id_mean_item_cnt_day            1608224
item_category_id_avg_item_price      1608224
item_category_id_sum_item_cnt_day    1608224
item_category_id_avg_item_cnt_day    1608224
dtype: int64

In [47]:
temps = pd.DataFrame(df_test.values)
dataframe = pd.concat([temps.shift(3), temps.shift(2), temps.shift(1), temps], axis=1)
dataframe.columns = ['t-3', 't-2', 't-1', 't+1']
print(dataframe.head(5))

ValueError: Length mismatch: Expected axis has 48 elements, new values have 4 elements

In [54]:
df_test.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_id_avg_item_price,item_id_sum_item_cnt_day,item_id_avg_item_cnt_day,shop_id_avg_item_price,shop_id_sum_item_cnt_day,shop_id_mean_item_cnt_day,item_category_id_avg_item_price,item_category_id_sum_item_cnt_day,item_category_id_avg_item_cnt_day,item_id_avg_item_price(t-1),item_id_sum_item_cnt_day(t-1),item_id_avg_item_cnt_day(t-1),shop_id_avg_item_price(t-1),shop_id_sum_item_cnt_day(t-1)
0,0,0,32,1326.0,10.0,10.0,1228.84,5578.0,2.31,6211.25,53.0,3.31,,,,,
1,0,0,33,1041.0,1.0,1.0,1228.84,5578.0,2.31,6211.25,53.0,3.31,1326.0,10.0,10.0,1228.84,5578.0
2,0,0,35,247.0,4.0,4.0,1228.84,5578.0,2.31,6211.25,53.0,3.31,1041.0,1.0,1.0,1228.84,5578.0
3,0,0,43,221.0,5.0,5.0,1228.84,5578.0,2.31,6211.25,53.0,3.31,247.0,4.0,4.0,1228.84,5578.0
4,0,0,51,128.5,1.0,1.0,1228.84,5578.0,2.31,6211.25,53.0,3.31,221.0,5.0,5.0,1228.84,5578.0


In [58]:
for i in range(1,4):

    
    df_test[f'item_id_avg_item_price(t-{i})'] = df_test['item_id_avg_item_price'].shift(i)
    df_test[f'item_id_sum_item_cnt_day(t-{i})'] = df_test['item_id_sum_item_cnt_day'].shift(i)
    df_test[f'item_id_avg_item_cnt_day(t-{i})'] = df_test['item_id_avg_item_cnt_day'].shift(i)
    df_test[f'shop_id_avg_item_price(t-{i})'] = df_test['shop_id_avg_item_price'].shift(i)
    df_test[f'shop_id_sum_item_cnt_day(t-{i})'] = df_test['shop_id_sum_item_cnt_day'].shift(i)
    df_test[f'shop_id_mean_item_cnt_day(t-{i})'] = df_test['shop_id_mean_item_cnt_day'].shift(i)
    df_test[f'item_category_id_avg_item_price(t-{i})'] = df_test['item_category_id_avg_item_price'].shift(i)
    df_test[f'item_category_id_sum_item_cnt_day(t-{i})'] = df_test['item_category_id_sum_item_cnt_day'].shift(i)
    df_test[f'item_category_id_avg_item_cnt_day(t-{i})'] = df_test['item_category_id_avg_item_cnt_day'].shift(i)
    

In [61]:
df_test = df_test.fillna(0)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1608224 entries, 0 to 1608223
Data columns (total 39 columns):
 #   Column                                  Non-Null Count    Dtype  
---  ------                                  --------------    -----  
 0   date_block_num                          1608224 non-null  int64  
 1   shop_id                                 1608224 non-null  int64  
 2   item_id                                 1608224 non-null  int64  
 3   item_id_avg_item_price                  1608224 non-null  float64
 4   item_id_sum_item_cnt_day                1608224 non-null  float64
 5   item_id_avg_item_cnt_day                1608224 non-null  float64
 6   shop_id_avg_item_price                  1608224 non-null  float64
 7   shop_id_sum_item_cnt_day                1608224 non-null  float64
 8   shop_id_mean_item_cnt_day               1608224 non-null  float64
 9   item_category_id_avg_item_price         1608224 non-null  float64
 10  item_category_id_sum_item_cnt_

In [62]:
X, y = df_test.drop(['item_id_sum_item_cnt_day'], axis = 1) , df_test['item_id_sum_item_cnt_day']

In [63]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler


class XyScaler(BaseEstimator, TransformerMixin):
    """Standardize a training set of data along with a vector of targets."""

    def __init__(self):
        self.X_scaler = StandardScaler()
        self.y_scaler = StandardScaler()
        
    def fit(self, X, y, *args, **kwargs):
        """Fit the scaler to data and a target vector."""
        self.X_scaler.fit(X)
        self.y_scaler.fit(y.values.reshape(-1, 1))
        return self
    
    def transform(self, X, y, *args, **kwargs):
        """Transform a new set of data and target vector."""
        return (self.X_scaler.transform(X),
                self.y_scaler.transform(y.values.reshape(-1, 1)))

    def inverse_transform(self, X, y, *args, **kwargs):
        """Tranform from a scaled representation back to the original scale."""
        return (self.X_scaler.inverse_transform(X),
                self.y_scaler.inverse_transform(y.values.reshape(-1, 1)))

In [64]:
mdl1 = Ridge(alpha =0.05).fit(X, y)
y_pred = mdl1.predict(X)
mean_squared_error(y, y_pred), r2_score(y, y_pred)

(17.826223090842223, 0.892532640514687)

In [65]:
def cv(X_train, y_train, base_estimator, n_folds, random_seed=154):
    """Estimate the in- and out-of-sample error of a model using cross
    validation.
    
    Parameters
    ----------
    
    X_train: np.array
      Matrix of predictors.
      
    y_train: np.array
      Target array.
      
    base_estimator: sklearn model object.
      The estimator to fit.  Must have fit and predict methods.
      
    n_folds: int
      The number of folds in the cross validation.
      
    random_seed: int
      A seed for the random number generator, for repeatability.
    
    Returns
    -------
      
    train_cv_errors, test_cv_errors: tuple of arrays
      The training and testing errors for each fold of cross validation.
    """
    kf = KFold(n_splits=n_folds, random_state=random_seed)
    train_cv_errors, valid_cv_errors = np.empty(n_folds), np.empty(n_folds)

    for idx, (train_idx, valid_idx) in enumerate(kf.split(X_train)):
        # Split into train and test
        X_cv_train, y_cv_train = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_cv_valid, y_cv_valid = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

        # Standardize data, fit on training set, transform training and test.
        scaler = XyScaler()
        scaler.fit(X_cv_train, y_cv_train)
        X_cv_train_std, y_cv_train_std = scaler.transform(X_cv_train, y_cv_train)
        X_cv_valid_std, y_cv_valid_std = scaler.transform(X_cv_valid, y_cv_valid)

        # Fit ridge regression to training data.
        mdl = clone(base_estimator)
        mdl.fit(X_cv_train_std, y_cv_train_std)

        # Make predictions.
        y_cv_train_pred = mdl.predict(X_cv_train_std)
        y_cv_valid_pred = mdl.predict(X_cv_valid_std)

        # Calculate MSE.
        train_cv_errors[idx] = mean_squared_error(y_cv_train_std, y_cv_train_pred)
        valid_cv_errors[idx] = mean_squared_error(y_cv_valid_std, y_cv_valid_pred)
    return train_cv_errors, valid_cv_errors

In [66]:
train_cv_errors, valid_cv_errors = cv(X, y, base_estimator=Ridge(alpha=0.5), n_folds=5, random_seed=154)

In [67]:
print(f"Training CV error: {train_cv_errors.mean():.2f}")
print(f"Test CV error: {valid_cv_errors.mean():.2f}")

Training CV error: 0.11
Test CV error: 0.12


In [69]:
def train_at_various_alphas(X_train, y_train, model, alphas, n_folds, random_seed, **kwargs):
    """Train a regularized regression model using cross validation at various
    values of alpha.
    
    Parameters
    ----------
    
    X_train: np.array
      Matrix of predictors.
      
    y_train: np.array
      Target array.
      
    model: sklearn model class
      A class in sklearn that can be used to create a regularized regression
      object.  Options are `Ridge` and `Lasso`.
      
    alphas: numpy array
      An array of regularization parameters.
      
    n_folds: int
      Number of cross validation folds.
      
    Returns
    -------
    
    cv_errors_train, cv_errors_valid: tuple of DataFrame
      DataFrames containing the training and validating errors for each value of
      alpha and each cross validation fold.  Each row represents a CV fold, and
      each column a value of alpha.
    """
    cv_errors_train = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),
                                     columns=alphas)
    cv_errors_valid = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),
                                        columns=alphas)
    for alpha in alphas:
        train_cv_errors, valid_cv_errors = cv(X_train, y_train, base_estimator=model(alpha=alpha, **kwargs), n_folds=n_folds, random_seed=random_seed)
        cv_errors_train.loc[:,alpha] = train_cv_errors
        cv_errors_valid.loc[:,alpha] = valid_cv_errors
    return cv_errors_train, cv_errors_valid

In [70]:
alphas=np.logspace(-2,4, num=50)
cv_errors_train, cv_errors_valid = train_at_various_alphas(X, y, model=Lasso, alphas=alphas, n_folds=5, random_seed=154)

KeyboardInterrupt: 

In [None]:
train_means, valid_means = cv_errors_train.mean(axis=0), cv_errors_valid.mean(axis=0)

In [None]:
alphas = valid_means.index
optimal_idx = np.argmin(valid_means.values)
optimal_alpha = alphas[optimal_idx]

In [None]:
#method 2:
optimal_alpha = valid_means.idxmin()
optimal_alpha

In [None]:
fig, ax = plt.subplots()
ax.plot(np.log10(alphas), train_means, label='Train cv error')
ax.plot(np.log10(alphas), valid_means, label='Valid cv error')
ax.axvline(np.log10(valid_means.idxmin()), color='red')
ax.legend()
ax.set_title('Ridge Regression Train and Valid MSE')
ax.set_xlabel('log(alpha)')
ax.set_ylabel('MSE');