In [2]:
import pandas as pd
import numpy as np
import io
import dvc.api
# from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os

import warnings; warnings.simplefilter('ignore')

In [3]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))

In [4]:
df_train = pd.read_csv('../data/df_train.csv')
df_test = pd.read_csv('../data/df_test.csv')

In [5]:
from data_cleaner import Clean_df

In [6]:
train = Clean_df(df_train)
test = Clean_df(df_test)

In [7]:
train.data_pipeline();
test.data_pipeline();

In [8]:
train.df.to_csv('../data/df_train_prep.csv')
test.df.to_csv('../data/df_test_prep.csv')

In [9]:
train.df.sample(5)

Unnamed: 0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,...,Quarter,Week,Day,WeekOfYear,DayOfYear,IsWeekDay,CompetitionOpenMonthDuration,PromoOpenMonthDuration,Season,Month_Status
518128,439,1,5132,753,1,0,0,0,a,a,...,1,13,24,13,83,1,54.0,24171.25,Spring,End
619974,820,1,21485,1985,1,0,0,1,a,c,...,4,52,23,52,357,1,1367.0,-9.0,Winter,End
1000797,313,2,5147,591,1,0,0,0,d,c,...,1,3,15,3,15,1,1356.0,24156.75,Winter,Mid
637742,748,6,5674,506,1,0,0,0,d,a,...,4,49,7,49,341,0,45.0,32.75,Winter,Beginning
374580,907,3,7355,1027,1,1,0,1,a,c,...,3,32,6,32,218,1,1375.0,24176.0,Summer,Beginning


In [10]:
test.df.dtypes

Store                             int64
DayOfWeek                         int64
Open                            float64
Promo                             int64
StateHoliday                     object
SchoolHoliday                     int64
StoreType                        object
Assortment                       object
CompetitionDistance             float64
Promo2                            int64
PromoInterval                    object
Until_Holiday                     int64
Since_Holiday                   float64
Year                              int64
Month                             int64
Quarter                           int64
Week                              int64
Day                               int64
WeekOfYear                        int64
DayOfYear                         int64
IsWeekDay                         int64
CompetitionOpenMonthDuration    float64
PromoOpenMonthDuration          float64
Season                           object
Month_Status                     object


In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

cont_pipeline = make_pipeline(
    SimpleImputer(strategy = 'median'),
    PowerTransformer(method = 'yeo-johnson', standardize = False),
    StandardScaler()
)
disc_pipeline = make_pipeline(
    SimpleImputer(strategy = 'constant', fill_value = -1),
    StandardScaler()
)
cat_pipeline = make_pipeline(
    SimpleImputer(strategy = 'constant', fill_value = 'unknown'),
    OneHotEncoder()
)

In [12]:
X_train = train.df.drop(columns=['Sales','Customers'],axis=1).copy(deep=True)
Y_train = train.df['Sales'].copy(deep=True)
X_test = test.df.copy(deep=True)

In [13]:
cont_cols = ['DayOfWeek', 'Until_Holiday', 'Since_Holiday','CompetitionOpenMonthDuration','CompetitionDistance','PromoOpenMonthDuration']
num_cols = list(X_train.select_dtypes(include = 'number').columns)
cat_cols = list(set(X_train.columns)-set(num_cols))
disc_cols = list(set(num_cols) - set(cont_cols))


In [14]:
preprocessor = ColumnTransformer(
    transformers = [
        ('continuous', cont_pipeline, cont_cols),
        ('discrete', disc_pipeline, disc_cols),
        ('categorical', cat_pipeline, cat_cols)
    ]
)




In [15]:
preprocessor.fit(X_train)
preprocessor.fit(X_test)


ColumnTransformer(transformers=[('continuous',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('powertransformer',
                                                  PowerTransformer(standardize=False)),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['DayOfWeek', 'Until_Holiday', 'Since_Holiday',
                                  'CompetitionOpenMonthDuration',
                                  'CompetitionDistance',
                                  'PromoOpenMonthDuration']),
                                ('discrete',
                                 Pipel...
                                 ['Open', 'Store', 'Year', 'Day', 'Month',
                                  'DayOfYear', 'Promo', 'IsWeekDay', 'Week',

In [1]:
def quick_eval(pipeline, X,Y, verbose=True):
    """
    Quickly trains modeling pipeline and evaluates on test data. Returns original model, training RMSE, and testing
    RMSE as a tuple.
    """
    
    X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)
    
    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    
    train_score = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_score = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    if verbose:
        print(f"Regression algorithm: {pipeline.named_steps['regressor'].__class__.__name__}")
        print(f"Train RMSE: {train_score}")
        print(f"Test RMSE: {test_score}")
    
    return pipeline.named_steps['regressor'], train_score, test_score



In [17]:
regressors = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(n_estimators=200,max_features=20),
]

for r in regressors:
    pipe = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('regressor', r)
    ])

    quick_eval(pipe, X_train,Y_train)
    print()

Regression algorithm: LinearRegression
Train RMSE: 2484.003965585784
Test RMSE: 2487.3412518547602

Regression algorithm: DecisionTreeRegressor
Train RMSE: 0.0
Test RMSE: 1194.0668787443592



: 

: 

## Feature Importance Analysis

In [None]:
from sklearn.model_selection import KFold

# pipeline to estimate feature importances
total_features = preprocessor.fit_transform(X_train).shape[1]
rf = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(max_features=np.int(total_features/3)))
])

# get variable ranks for each fold
var_ranks = []
kf = KFold(n_splits=5, random_state=42, shuffle=True)
for tr_idx, val_idx in kf.split(X_train):
    X_train_cv, y_train_cv = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_val_cv, y_val_cv = X_train.iloc[val_idx], y_train.iloc[val_idx]
    
    output = quick_eval(rf, X_train_cv, y_train_cv, X_val_cv, y_val_cv, verbose=False) 
    var_ranks.append(list(output[0].feature_importances_))