In [102]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [103]:
# Core competition tables: training rows, rows to predict, and the submission template.
df = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/train.csv")
valid = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/test.csv")
sample = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv")

# Auxiliary tables that are joined onto the sales rows further down.
stores = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/stores.csv")
holidays = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv")
oil = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/oil.csv")
transactions = pd.read_csv("/kaggle/input/store-sales-time-series-forecasting/transactions.csv")

In [104]:
# Display the table read from oil.csv.
oil

In [105]:
# Display the table read from holidays_events.csv.
holidays

In [106]:
# Display the table read from test.csv (held in `valid`).
valid

In [107]:
# Display the table read from train.csv (held in `df`).
df

In [108]:
# Attach each store's attributes to its sales rows (lookup keyed on store_nbr).
df = df.join(stores.set_index('store_nbr'), on='store_nbr')
# A left merge repeats a sales row once per matching right-hand row, so keep a
# single holidays row per date first; otherwise any date listed more than once
# in `holidays` would silently duplicate training rows.
df = df.merge(holidays.drop_duplicates(subset='date'), how='left', on='date', validate='many_to_one')
# validate= makes pandas raise instead of duplicating rows if `oil` ever
# contains the same date twice.
df = df.merge(oil, how='left', on='date', validate='many_to_one')

In [109]:
# Display df after joining stores and merging holidays and oil.
df

In [110]:
# Remove the `description` column from the merged frame.
df = df.drop(columns=['description'])

In [111]:
# (rows, columns) of the merged frame.
df.shape

In [112]:
# Print the column dtypes and non-null counts of the merged frame.
df.info()

In [113]:
# Summary statistics of the merged frame.
df.describe()

Data Analysis

In [114]:
# Number of missing values in each column.
df.isnull().sum()

In [115]:
# Percentage of missing values per column, largest first, keeping only the
# columns that actually have missing values.
missing = (
    df.isnull().sum()
    .div(len(df))
    .mul(100)
    .sort_values(ascending=False)
    .loc[lambda pct: pct > 0]
)
missing

In [116]:
# Remove the listed columns (type_y, locale, locale_name, transferred) from df.
df = df.drop(columns=['type_y', 'locale', 'locale_name', 'transferred'])

In [117]:
# Re-check the missing-value counts after the column drop above.
df.isnull().sum()

In [118]:
# Remove the `id` column from the training frame.
df = df.drop(columns=['id'])

In [119]:
# Display df after dropping the id column.
df

In [120]:
# Categorical features: every column except `date` whose dtype is neither int nor float.
cat_features = df.drop(columns=['date']).select_dtypes(exclude=['int', 'float']).columns
# Numeric features: every column except the target `sales` whose dtype is not object.
num_features = df.drop(columns=['sales']).select_dtypes(exclude=['object']).columns

# Keep each group (plus the raw date and the target) in its own frame so they
# can be processed separately and concatenated again later.
df_cat, df_num = df[cat_features], df[num_features]
date_f, df_sales = df[['date']], df[['sales']]

In [121]:
from sklearn import preprocessing
# Fit one LabelEncoder per categorical column and keep them all in
# `label_encoders`. Refitting a single shared encoder for every column (as
# `df_cat.apply(le.fit_transform)` does) throws away each column's mapping as
# soon as the next column is fitted, so new data could not be encoded the same way.
le = preprocessing.LabelEncoder()
label_encoders = {}
for column in df_cat.columns:
    # `le` is rebound each iteration, so after the loop it is the encoder of
    # the last column, which is the state the shared encoder used to end in.
    le = preprocessing.LabelEncoder().fit(df_cat[column])
    label_encoders[column] = le
df_cat = df_cat.apply(lambda values: label_encoders[values.name].transform(values))

In [122]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the numeric features and rebuild a DataFrame from
# the scaled array. The new frame gets a default integer index.
mms = MinMaxScaler()
df_num = pd.DataFrame(mms.fit_transform(df_num), columns=num_features)

In [123]:
# Put the encoded categorical columns and the scaled numeric columns side by side.
df = pd.concat((df_cat, df_num), axis="columns")

In [124]:
# Missing values per column in the encoded and scaled frame.
df.isnull().sum()

In [125]:
# A KNN-based imputation is left commented out here; it is not executed.
#from sklearn.impute import KNNImputer
#imputer = KNNImputer(n_neighbors=30000)
#df = pd.DataFrame(imputer.fit_transform(df),columns = df.columns)
# Active imputation: replace every NaN with the median of its own column.
df = df.fillna(df.median())

In [126]:
# Display df after the median fill.
df

In [127]:
# Display the date column that was set aside before encoding and scaling.
date_f

In [128]:
from datetime import datetime
# Parse the dates in one vectorised call instead of a per-row strptime loop.
# `assign` returns a new frame, which avoids writing columns into a frame that
# was sliced out of df (that pattern can raise SettingWithCopyWarning), and
# pd.to_datetime accepts values that are already datetimes, so the cell can be
# re-run safely.
date_f = date_f.assign(date=pd.to_datetime(date_f['date'], format="%Y-%m-%d"))
# Calendar components used as model features.
date_f = date_f.assign(
    year=date_f['date'].dt.year,
    month=date_f['date'].dt.month,
    day=date_f['date'].dt.day,
)

In [129]:
# Final modelling frame: target first, then the processed features, then the
# year/month/day columns derived from the date.
df = pd.concat((df_sales, df, date_f[['year', 'month', 'day']]), axis="columns")

In [130]:
# Display df with the target, the processed features and the date parts.
df

In [131]:
# Despite their names, q1 and q3 are the 5th and 95th percentiles of the
# numeric features, and intr_qr is the distance between them.
q1, q3 = df[num_features].quantile(0.05), df[num_features].quantile(0.95)
intr_qr = q3 - q1
# Flag values lying more than 1.5 * intr_qr beyond either percentile ...
too_low = df[num_features].lt(q1 - 1.5*intr_qr)
too_high = df[num_features].gt(q3 + 1.5*intr_qr)
# ... and drop every row that has at least one flagged value.
df = df[~(too_low | too_high).any(axis=1)]

In [132]:
# Display df after the outlier filter.
df

In [133]:
from matplotlib import pyplot as plt
# One histogram per column on a single large figure with small tick labels.
hist_options = {"figsize": (25, 30), "xlabelsize": 8, "ylabelsize": 8}
df.hist(**hist_options)
plt.show()

Train/test split

In [134]:
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the target.
X = df.drop(columns='sales').values
# 1-D target vector. Selecting with df[['sales']] yields an (n, 1) column
# vector, which scikit-learn's ensemble regressors only accept with a
# DataConversionWarning asking for shape (n_samples,).
y = df['sales'].values
# Random 70/30 split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Prepare model

In [135]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_score, recall_score

import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_absolute_percentage_error as mae
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse

In [143]:
# Candidate regressors as (label, estimator) pairs. The stochastic ones are
# seeded with the same value used for KFold and train_test_split so repeated
# runs give the same scores.
models = [
    ('LinR', LinearRegression()),
    ('RF', RandomForestRegressor(random_state=42)),
    ('GB', GradientBoostingRegressor(random_state=42)),
    ('DT', DecisionTreeRegressor(random_state=42)),
    ('XGB', XGBRegressor(random_state=42)),
]

In [146]:
# K-fold Cross Validation
def model_performance(X, y, kf_model, n_splits=5):
    """Print the mean MSE, MAPE and R^2 of ``kf_model`` over shuffled K-fold CV.

    Parameters
    ----------
    X : array indexable by integer arrays (e.g. ``numpy.ndarray``)
        Feature matrix, one row per sample.
    y : array indexable by integer arrays
        Target values aligned with ``X``.
    kf_model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted on
        the training part of every fold.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    None
        The averaged scores are printed, not returned.
    """
    kfold = KFold(n_splits=n_splits, random_state=42, shuffle=True)
    fold_mse, fold_mape, fold_r2 = [], [], []
    for train_idx, test_idx in kfold.split(X):
        kf_model.fit(X[train_idx], y[train_idx])
        kf_y_test = y[test_idx]
        kf_y_pred = kf_model.predict(X[test_idx])
        # scikit-learn metrics take (y_true, y_pred). MSE is symmetric, but the
        # percentage error and R^2 are not, so the order matters here.
        fold_mse.append(mse(kf_y_test, kf_y_pred))
        # `mae` is this notebook's alias for mean_absolute_percentage_error,
        # hence the "mape" label below.
        fold_mape.append(mae(kf_y_test, kf_y_pred))
        fold_r2.append(r2(kf_y_test, kf_y_pred))

    # Average over however many folds were actually run.
    print("mse: ", np.mean(fold_mse))
    print("mape: ", np.mean(fold_mape))
    print("r2: ", np.mean(fold_r2))
    print()

In [147]:
# Cross-validate each candidate in turn, printing its label before its scores.
for label, estimator in models:
    print(label)
    model_performance(X, y, estimator)

Model fitting

In [None]:
def best_model(model):
    """Print the ``best_score_`` and ``best_params_`` attributes of ``model``.

    ``model`` is any object exposing those two attributes (the notebook
    imports ``GridSearchCV``, whose fitted instances do). Nothing is returned.
    """
    report_lines = (
        ("Best Model Performance Score: ", "best_score_"),
        ("Best Model Parameters: ", "best_params_"),
    )
    for label, attribute in report_lines:
        # Attributes are looked up one at a time, right before printing.
        print(label, getattr(model, attribute))
    
    
def AUC_scores(y, y_pred, y_prob):
    """Return ``(auc, fpr, tpr)`` for a set of predictions.

    The AUC comes from ``roc_auc_score(y, y_pred)``; the false and true
    positive rates come from ``roc_curve(y, y_prob)``, whose third return
    value is discarded.
    """
    area = roc_auc_score(y, y_pred)
    false_pos_rate, true_pos_rate, _unused = roc_curve(y, y_prob)
    return area, false_pos_rate, true_pos_rate

Validation data

In [None]:
# Attach each store's attributes to the rows to be predicted.
valid = valid.join(stores.set_index('store_nbr'), on='store_nbr')
# Keep one holidays row per date before merging: a left merge repeats a row
# once per matching right-hand row, which would duplicate rows (and their ids)
# for any date listed more than once in `holidays`.
valid = valid.merge(holidays.drop_duplicates(subset='date'), how='left', on='date', validate='many_to_one')
# validate= makes pandas raise instead of duplicating rows if `oil` ever
# contains the same date twice.
valid = valid.merge(oil, how='left', on='date', validate='many_to_one')

# Drop the holiday columns, then set the ids aside before removing them from
# the feature frame.
valid = valid.drop(columns=['description', 'type_y', 'locale', 'locale_name', 'transferred'])
valid_id = valid[['id']]
valid = valid.drop(columns=['id'])

In [None]:
# NOTE(review): this cell sits under the validation heading, right after
# `valid` is prepared, but it reads from `df` (the already processed training
# frame) and overwrites `df`. It looks like a copy of the training
# preprocessing that was never switched to `valid` — confirm the intent.
df_cat = df[cat_features]
df_num = df[num_features]
# NOTE(review): `le` is a single LabelEncoder; after the training cell it only
# holds the classes of the last categorical column it was fitted on, so
# applying le.transform to every column will not reproduce the training
# encoding for the other columns.
df_cat = df_cat.apply(le.transform)
# Reuse the MinMaxScaler fitted on the training numeric features.
df_num = mms.transform(df_num)
# The rebuilt frame gets a default integer index while df_cat keeps the index
# of its source frame; if rows were removed from that frame earlier, the
# concat below will not line the two up row by row — TODO confirm.
df_num = pd.DataFrame(df_num, columns=num_features)
df = pd.concat([df_cat, df_num], axis=1)
# Replace every remaining NaN with the median of its column.
df = df.fillna(df.median())

In [None]:
# NOTE(review): `date_f` and `df_sales` used below were built from the training
# frame; nothing in this cell derives dates from `valid`. Confirm whether the
# validation dates were meant to be used here.
# NOTE(review): the earlier date cell already replaced date_f['date'] with
# parsed datetimes, so calling datetime.strptime on those values (it expects
# strings) will likely raise a TypeError.
date_f['date'] = [datetime.strptime(d, "%Y-%m-%d") for d in date_f['date']]
date_f['year'] = [d.year for d in date_f['date']]
date_f['month'] = [d.month for d in date_f['date']]
date_f['day'] = [d.day for d in date_f['date']]
# Prepend the training target and append the year/month/day columns.
df = pd.concat([df_sales, df, date_f[['year', 'month', 'day']]], axis=1)

# Same filter as in the training section: q1/q3 are the 5th and 95th
# percentiles of the numeric features, and any row with a value more than
# 1.5 * (q3 - q1) outside them is dropped.
q1 = df[num_features].quantile(0.05)
q3 = df[num_features].quantile(0.95)
intr_qr = q3 - q1
df = df[~((df[num_features] < (q1-1.5*intr_qr)) | (df[num_features] > (q3+1.5*intr_qr))).any(axis=1)]