In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

#### Load Datasets

In [8]:
# Read both CSV files in one pass: the first becomes the training frame,
# the second the frame that predictions are later made for.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/sales.csv', '../data/ironkaggle_notarget.csv')
)

#### Clean Data

In [9]:
# clean the raw datasets
def clean_dataset(df):
    """Turn a raw sales frame into the feature columns used for modelling.

    Steps:
      1. Drop the ``True_index``, ``Store_ID`` and ``School_holiday`` columns.
      2. One-hot encode ``State_holiday`` and drop the ``State_holiday_0``
         dummy.
      3. Replace ``Date`` with a single ``Month_12`` indicator that marks the
         rows dated in December.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``True_index``, ``Store_ID``,
        ``School_holiday``, ``State_holiday`` and ``Date`` (parsed with the
        ``%Y-%m-%d`` format). Every other column is passed through unchanged.
        The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the pass-through columns, followed by the
        remaining ``State_holiday_*`` dummies, followed by ``Month_12``.
        ``Month_12`` is always present, even when no row falls in December.

    Raises
    ------
    KeyError
        If one of the required columns is missing, or if the encoding of
        ``State_holiday`` produces no ``State_holiday_0`` column.
    ValueError
        If a ``Date`` value does not match ``%Y-%m-%d``.
    """
    # ``drop`` returns a new frame, so the assignments below never touch the
    # frame that was passed in.
    df = df.drop(columns=['True_index', 'Store_ID', 'School_holiday'])
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

    # NOTE(review): the State_holiday dummies depend on the values present in
    # *this* frame; two frames with different holiday codes end up with
    # different columns. Callers must align the columns afterwards.
    df = pd.get_dummies(df, columns=['State_holiday']).drop(columns=['State_holiday_0'])

    # Declaring all twelve months as categories makes get_dummies emit
    # Month_1..Month_12 for every input. Without this, a frame lacking some
    # month made the drop below raise KeyError, and a frame without December
    # rows had no Month_12 column at all.
    df['Month'] = pd.Categorical(df['Date'].dt.month, categories=list(range(1, 13)))
    df = df.drop(columns=['Date'])
    df = pd.get_dummies(df, columns=['Month'])

    # Only the December indicator is kept as a feature.
    return df.drop(columns=[f'Month_{month}' for month in range(1, 12)])
# Rebinds both names to their cleaned versions. clean_dataset drops columns
# that only the raw frames contain, so re-run the load cell before running
# this line a second time.
df_train, df_test = clean_dataset(df_train), clean_dataset(df_test)

#### Prepare Data

In [10]:
# split the training frame into features and target
X_train = df_train.drop(columns=['Sales'])
y_train = df_train['Sales']

# The train and test frames were one-hot encoded independently, so their
# dummy columns (and column order) are not guaranteed to match. Align the
# test features to the training columns: a dummy that never occurred in the
# test data is filled with 0, and a column the model was not trained on is
# dropped.
X_test = df_test.reindex(columns=X_train.columns, fill_value=0)

# standardize features: the scaler is fitted on the training data only and
# the same transformation is then applied to the test data
normalizer = StandardScaler()
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

#### Model Training

In [11]:
# random forest regressor with a fixed seed so repeated runs give the same fit
model = RandomForestRegressor(
    random_state=42,
    n_estimators=500,     # number of trees in the forest
    max_depth=8,          # maximum depth of each tree
    min_samples_leaf=3,   # minimum samples required in a leaf
    min_samples_split=2,  # minimum samples required to split a node
)

# fit the forest on the scaled training features
model.fit(X_train, y_train)

#### Model Application

In [None]:
# predict sales for the test set
# NOTE(review): the test file presumably ships without a target column
# (file name "ironkaggle_notarget") - confirm.
pred = model.predict(X_test)

# NOTE(review): `y_test` is not defined anywhere in the visible notebook, so
# scoring against it raised NameError. Until a labelled hold-out split
# exists, the scores below are computed on the training data. They are
# optimistic in-sample figures, not an estimate of generalisation error.
train_pred = model.predict(X_train)

# Results get their own names: assigning to `r2_score` would shadow the
# imported metric function for the rest of the session.
# Metric functions take (y_true, y_pred) in that order.
r2 = r2_score(y_train, train_pred)
mae = mean_absolute_error(y_train, train_pred)
# Root of the MSE. Taking the square root explicitly avoids the
# `squared=False` keyword, which newer scikit-learn releases no longer accept.
rmse = mean_squared_error(y_train, train_pred) ** 0.5

# print scores
print("Model: tbd")
print(f"R2 score (train): {r2}")
print(f"MAE (train): {mae}")
print(f"RMSE (train): {rmse}")