# Test set performance

In [7]:
# Define classes for preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import FeatureUnion
from pandas.tseries.offsets import DateOffset
import numpy as np

# select specific columns for pipeline
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls a fixed set of columns out of a DataFrame.

    The selected columns are handed on as a plain array (``.values``), so
    that array-based steps can follow it in a ``Pipeline``.
    """

    def __init__(self, attribute_names):
        # Column label(s) used to index the frame in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

# preprocessing:
# - merge in store data
# - add additional columns
# - remove unneeded rows and columns
class AttributeAdder(BaseEstimator, TransformerMixin):
    """Merge store metadata into the sales rows and derive model features.

    ``fit`` remembers, per store, the sales of the last three months found in
    the fitting data. ``transform`` filters rows, merges the store file, adds
    the per-store mean/median of those remembered sales, derives date,
    competition and promo features, and drops columns that are no longer
    needed.

    Parameters
    ----------
    store_path : str, default 'data/store.csv'
        CSV file with the per-store attributes; it is read once in the
        constructor and merged on the ``Store`` column.
    """

    def __init__(self, store_path='data/store.csv'):
        self.store_path = store_path
        self.storefile = pd.read_csv(store_path)

    def fit(self, X, y=None):
        """Collect the per-store sales of the last three months of ``X``.

        ``X`` needs the columns ``Date``, ``Store`` and ``Sales``. The input
        frame is left untouched. Returns ``self``.
        """
        # Parse into a local Series instead of overwriting the caller's column.
        dates = pd.to_datetime(X['Date'], errors='coerce')
        # Series.max() skips NaT values produced by errors='coerce'; the
        # builtin max() compares element by element and NaT comparisons are
        # always False, which makes its result order-dependent.
        last_date = dates.max()
        interval_start = last_date - DateOffset(months=3)
        recent = (X['Store'] > 0) & (X['Sales'] > 0) & (dates > interval_start)
        self.stores = X.loc[recent, ['Store', 'Sales']].groupby(['Store'])

        return self

    def transform(self, X, y=None):
        """Return a new, feature-enriched frame built from ``X``.

        ``X`` needs at least ``Sales``, ``Store``, ``Date`` and
        ``StateHoliday``; the store file must provide the competition and
        promo columns used below. ``fit`` must have been called before.
        As a side effect the resulting column index is stored in
        ``self.attribute_names``.
        """
        # remove Sales =0
        X = X[X['Sales'] > 0]

        # remove Stores without id
        X = X[X['Store'] > 0]

        # merge in store data (this also yields a fresh frame, so the
        # assignments below never write into the caller's data)
        X = pd.merge(X, self.storefile, how='left', on='Store')

        # create features mean and median of Sales per store over last 3 months
        # Aggregating first and mapping by store id avoids iterating the
        # groupby object (whose keys are 1-tuples for a list key on
        # pandas >= 2) and replaces one full-frame scan per store by a
        # single lookup. Stores unseen during fit get NaN.
        store_sales = self.stores['Sales']
        X['Store_mean'] = X['Store'].map(store_sales.mean())
        X['Store_median'] = X['Store'].map(store_sales.median())

        # Create more date columns
        X['Date'] = pd.to_datetime(X['Date'], errors='coerce')
        X['Year'] = X.Date.dt.year
        X['Month'] = X.Date.dt.month
        # Monday=0 ... Sunday=6; overwrites any DayOfWeek column in the input
        X['DayOfWeek'] = X.Date.dt.dayofweek

        # Create feature Competition since months, cap for high values
        X["CompetitionSinceMonths"] = ((X['Year'] - X['CompetitionOpenSinceYear']) * 12 +
                                       (X['Month'] - X['CompetitionOpenSinceMonth']))
        X.loc[X["CompetitionSinceMonths"] < 0, "CompetitionSinceMonths"] = 0
        X.loc[X["CompetitionSinceMonths"] > 24, "CompetitionSinceMonths"] = 24

        # Set competition distance at cap if competition is not yet open
        max_distance = 10000.
        X.loc[X["CompetitionSinceMonths"] == 0, 'CompetitionDistance'] = max_distance
        X.loc[X['CompetitionDistance'] > max_distance, 'CompetitionDistance'] = max_distance

        # Create promo since column, cap for high values
        # NOTE(review): the second term subtracts a week number from a month
        # number, which looks like a unit mismatch. Left unchanged on purpose:
        # a model trained on this feature would presumably see different
        # inputs after a fix - confirm and retrain before changing it.
        X["Promo2SinceWeeks"] = ((X['Year'] - X['Promo2SinceYear']) * 52 +
                                 (X['Month'] - X['Promo2SinceWeek']))
        X.loc[X["Promo2SinceWeeks"] < 0, "Promo2SinceWeeks"] = 0
        X.loc[X["Promo2SinceWeeks"] > 12, "Promo2SinceWeeks"] = 12

        # Clean up state holiday
        # The first two replacements funnel every "no holiday" spelling
        # (0, 0.0, "0") into the integer 0.
        X['StateHoliday'] = X['StateHoliday'].replace(0.0, "0")
        X['StateHoliday'] = X['StateHoliday'].replace("0", 0)
        X['StateHoliday'] = X['StateHoliday'].replace("a", "Public")
        X['StateHoliday'] = X['StateHoliday'].replace("b", "Easter")
        X['StateHoliday'] = X['StateHoliday'].replace("c", "Christmas")

        # drop columns
        # errors='ignore' keeps this working for inputs that lack some of
        # the raw columns (for example data without 'Customers').
        X = X.drop(
            columns=['Customers', 'Open', "CompetitionOpenSinceMonth",
                     'CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear',
                     'PromoInterval', 'Date'],
            errors='ignore',
        )
        self.attribute_names = X.columns
        return X


# define evaluation metrics RMSPE
def metric(preds, actuals):
    """Return the root mean squared percentage error (RMSPE) in percent.

    Parameters
    ----------
    preds, actuals : array-like
        Predicted and true values. Anything ``np.asarray`` accepts (lists,
        NumPy arrays, pandas Series) is allowed; both are flattened to 1-D
        and must then have the same length.

    Returns
    -------
    float
        ``100 * sqrt(mean(((actuals - preds) / actuals) ** 2))``.

    Notes
    -----
    Entries of ``actuals`` equal to zero make the result ``inf`` or ``nan``
    because of the division; callers must filter them out beforehand.
    """
    # np.asarray lets lists and pandas Series through; they have no usable
    # ``reshape`` of their own.
    preds = np.asarray(preds).reshape(-1)
    actuals = np.asarray(actuals).reshape(-1)
    assert preds.shape == actuals.shape, (
        f"preds and actuals differ in size: {preds.shape} vs {actuals.shape}"
    )
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])


# full pipeline including attribute adding, missing value imputing
# one-hot encoding
def full_pipeline(data, fit=True):
    """Run the complete preprocessing on ``data`` and return the result.

    Steps: feature creation through the module-level ``pp`` transformer,
    constant imputation of the holiday/promo columns, median imputation of
    the numerical columns (including the ``Sales`` target) and one-hot
    encoding of the categorical columns.

    Parameters
    ----------
    data : DataFrame
        Raw sales data passed to ``pp``.
    fit : bool, default True
        When True ``pp`` is fitted on ``data`` before transforming it;
        when False the already fitted ``pp`` is only applied.

    NOTE(review): both imputers below are created and fitted anew on every
    call, so ``fit`` only controls ``pp``; the imputation medians always
    come from the frame being processed.
    """
    # `pp` is looked up as a global and must exist before this is called.
    transformed = pp.fit_transform(data) if fit else pp.transform(data)

    # Missing holiday / promo flags are filled with the constant 0.
    categorical_cols = ['SchoolHoliday', 'StateHoliday', "Promo"]
    constant_imputer = SimpleImputer(strategy="constant", fill_value=0)
    categorical_values = DataFrameSelector(categorical_cols).fit_transform(transformed)
    transformed.loc[:, categorical_cols] = constant_imputer.fit_transform(categorical_values)

    # Numerical columns and the target get their gaps filled with the median.
    num_columns = ['CompetitionDistance', "CompetitionSinceMonths", "Promo2SinceWeeks",
                   'Sales']
    median_steps = [
        ('selector', DataFrameSelector(num_columns)),
        ('imputer', SimpleImputer(strategy="median")),
    ]
    num_pipeline = Pipeline(median_steps)
    transformed.loc[:, num_columns] = num_pipeline.fit_transform(transformed)

    # One dummy column per category value of these columns.
    one_hot_cat = ['StateHoliday', 'StoreType', 'Assortment', 'Month', 'DayOfWeek']
    return pd.get_dummies(transformed, columns=one_hot_cat)

In [2]:
# Load data
import pandas as pd
# Training data and store attributes are read from the local data/ directory.
train_data = pd.read_csv("data/train.csv")
# NOTE(review): `store` is not referenced by the code visible here;
# AttributeAdder reads data/store.csv on its own - confirm it is still needed.
store = pd.read_csv('data/store.csv')


# Insert test data link here
# The test set is fetched from the URL each time this cell runs, so network access is required.
test = pd.read_csv("https://raw.githubusercontent.com/igivis7/dsr28_minicomp_team1/main/data/holdout.csv")


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# run preprocessing on training to fit some values
# The name `pp` is mandatory: full_pipeline() looks the transformer up as a global.
pp = AttributeAdder()
# fit defaults to True, so pp is fitted on the training data before transforming it.
train_transformed = full_pipeline(train_data)

In [4]:
# run preprocessing on test set
# fit=False: the transformer fitted on the training data is only applied here.
test_trans = full_pipeline(test, fit=False)
# Columns of the training frame that the test frame lacks (e.g. dummy
# columns of categories that do not occur in the test data) are added
# with the value 0 in every row.
absent_columns = [name for name in train_transformed.columns
                  if name not in test_trans.columns]
for name in absent_columns:
    test_trans[name] = 0


In [5]:
# columns to keep
keep_columns = [
    "Promo",
    "Promo2SinceWeeks",
    "CompetitionSinceMonths",
    "Store_mean",
    "CompetitionDistance",
    "Month_12",
    "DayOfWeek_0",
    "DayOfWeek_1",
    "DayOfWeek_5",
]

# Feature matrix and target for the training data ...
X_train = train_transformed[keep_columns]
y_train = train_transformed['Sales']
# ... and for the test data (kept under the *_valid names).
X_valid = test_trans[keep_columns]
y_valid = test_trans['Sales']

In [8]:
from sklearn.tree import DecisionTreeRegressor
import pickle

# Load the previously trained decision tree. The `with` block closes the
# file handle again, which a bare pickle.load(open(...)) never does.
# NOTE: unpickling can execute arbitrary code - only load model files from
# a trusted source.
with open("model/tree_26.5.sav", "rb") as model_file:
    tree_reg_trained = pickle.load(model_file)

# Predict on the test features and report the RMSPE in percent.
y_pred = tree_reg_trained.predict(X_valid)
rmspe = metric(y_pred, np.array(y_valid))
print(f'Prediction: decision tree, RMSPE={rmspe:.2f}%')

Prediction: decision tree, RMSPE=26.51%
