In [3]:
import pandas as pd
import numpy as np
import io
import dvc.api
# from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os

import warnings; warnings.simplefilter('ignore')

In [1]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error



In [4]:
# Coordinates of the DVC-tracked file: path inside the repo, repo URL, git revision.
path = "data/df_test.csv"
repo = 'https://github.com/isaaclucky/sales-prediction'
version = "V1.0"

# NOTE(review): per the DVC API docs, dvc.api.read() returns the file's
# *contents*, not a URL (that would be dvc.api.get_url) -- confirm the
# variable name matches how `data_url` is consumed downstream.
data_url = dvc.api.read(path, repo=repo, rev=version)

In [69]:
# Load the pre-processed train / test splits (paths are relative to the notebook's directory).
df_train, df_test = map(
    pd.read_csv,
    ('../data/df_train_prep.csv', '../data/df_test_prep.csv'),
)

In [77]:
# Cast StateHoliday to pandas' string dtype in both frames so it is not
# selected as a numeric column by the dtype-based column grouping below.
for _frame in (df_train, df_test):
    _frame['StateHoliday'] = _frame['StateHoliday'].astype('string')

In [78]:
# Features: everything except the target ('Sales') and the listed non-feature columns.
# `axis` is omitted because it is ignored when `columns=` is given; `.copy()` is deep by default.
_train_excluded = ['Sales', 'Customers', 'Date', 'Unnamed: 0']
X_train = df_train.drop(columns=_train_excluded).copy()

# Target: the 'Sales' column of the training frame.
Y_train = df_train['Sales'].copy()

# Test features: only the CSV index artifact is removed here.
# NOTE(review): unlike X_train, 'Date' is not dropped -- confirm df_test's columns.
X_test = df_test.drop(columns=['Unnamed: 0']).copy()

In [79]:
# Numeric columns treated as continuous (they get the power transform + scaling).
cont_cols = [
    'DayOfWeek',
    'Until_Holiday',
    'Since_Holiday',
    'CompetitionOpenMonthDuration',
    'CompetitionDistance',
    'PromoOpenMonthDuration',
]

# Fail early, with a clear message, if a hand-listed column is absent from the features.
assert set(cont_cols) <= set(X_train.columns), (
    f"cont_cols not found in X_train: {sorted(set(cont_cols) - set(X_train.columns))}"
)

# All numeric-dtype feature columns, in DataFrame order.
num_cols = X_train.select_dtypes(include='number').columns.tolist()

# Build the remaining groups with order-preserving comprehensions rather than
# set differences: set iteration order for strings varies between kernel
# sessions (hash randomisation), which would shuffle the transformed feature
# order and make model results non-reproducible.
cat_cols = [col for col in X_train.columns if col not in set(num_cols)]
disc_cols = [col for col in num_cols if col not in set(cont_cols)]


In [80]:
# Continuous features: median-impute, apply an unstandardised Yeo-Johnson
# power transform, then standardise.
cont_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    PowerTransformer(method='yeo-johnson', standardize=False),
    StandardScaler(),
)

# Discrete numeric features: fill missing values with the sentinel -1, then standardise.
disc_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value=-1),
    StandardScaler(),
)

# Categorical features: fill missing values with 'unknown', then one-hot encode.
# handle_unknown='ignore' encodes a category that was absent at fit time as an
# all-zero row instead of raising at predict time (e.g. a rare level that lands
# only in the hold-out split or in the test set).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='unknown'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Route each column group through its own pipeline; columns not listed in any
# group are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('continuous', cont_pipeline, cont_cols),
        ('discrete', disc_pipeline, disc_cols),
        ('categorical', cat_pipeline, cat_cols),
    ]
)




In [82]:
# Fit the preprocessing statistics (imputation values, power-transform lambdas,
# scaling parameters, one-hot categories) on the TRAINING data only.
# A second `fit` on X_test would discard the train-fitted state and learn
# from test data instead, which is leakage; test data must only ever be
# passed to `transform` / `predict`.
preprocessor.fit(X_train)


ColumnTransformer(transformers=[('continuous',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('powertransformer',
                                                  PowerTransformer(standardize=False)),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['DayOfWeek', 'Until_Holiday', 'Since_Holiday',
                                  'CompetitionOpenMonthDuration',
                                  'CompetitionDistance',
                                  'PromoOpenMonthDuration']),
                                ('discrete',
                                 Pipel...
                                 ['Quarter', 'SchoolHoliday', 'Store',
                                  'IsWeekDay', 'Promo', 'Month', 'DayOfYear',
  

In [83]:
def eval_models(pipeline, X, Y, verbose=True, test_size=0.33, random_state=42):
    """
    Fit a modelling pipeline on a random train split of (X, Y) and score it on the hold-out split.

    Parameters
    ----------
    pipeline : sklearn Pipeline
        Must contain a step named ``'regressor'``; it is fitted in place.
    X, Y : features and target to be split.
    verbose : bool, default True
        When true, print the regressor class name and both RMSE values.
    test_size : float, default 0.33
        Fraction of the data held out for evaluation (passed to ``train_test_split``).
    random_state : int, default 42
        Seed for the split, so repeated calls compare models on the same rows.

    Returns
    -------
    tuple
        ``(fitted regressor step, train RMSE, hold-out RMSE)``.

    Notes
    -----
    The split is random, not chronological.
    NOTE(review): if rows are time-ordered sales records, a random split lets
    the model see the future -- consider a time-based split.
    """
    # Local names deliberately differ from the notebook-level X_train / X_test
    # so the globals are not shadowed inside the function.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    pipeline.fit(X_fit, y_fit)
    regressor = pipeline.named_steps['regressor']

    # RMSE = sqrt(MSE), on the rows used for fitting and on the held-out rows.
    train_score = np.sqrt(mean_squared_error(y_fit, pipeline.predict(X_fit)))
    test_score = np.sqrt(mean_squared_error(y_holdout, pipeline.predict(X_holdout)))

    if verbose:
        print(f"Regression algorithm: {regressor.__class__.__name__}")
        print(f"Train RMSE: {train_score}")
        print(f"Test RMSE: {test_score}")

    return regressor, train_score, test_score



In [None]:
# Candidate models to compare. The tree-based estimators are stochastic, so
# they are seeded: without a random_state the reported RMSEs change on every
# run and cannot be reproduced.
regressors = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(n_estimators=200, max_features=20, random_state=42),
]

for r in regressors:
    # Every pipeline shares the same `preprocessor` object; Pipeline.fit
    # refits it in place on the split chosen inside eval_models.
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', r),
    ])

    # Prints the train / hold-out RMSE for this model (verbose=True by default).
    eval_models(pipe, X_train, Y_train)
    print()