In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
import pickle
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import time
import multiprocessing
from parallelization import n_neighbours_test
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost
from sklearn.ensemble import GradientBoostingRegressor

In [9]:
# Load the pickled dataset used by every cell below (path is relative to the working directory).
# NOTE(review): the backslash separators only resolve on Windows — confirm that is intended.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it comes from a trusted source.
data = pd.read_pickle(r".\data\Sales_Clean.p")

In [51]:
# Target: the "Sales" column.
label = data["Sales"]

# Predictors: every column except the target, "True_index" and "State_holiday".
# NOTE(review): the reason the two extra columns are excluded is not visible here — confirm.
features = data.drop(columns=["True_index", "State_holiday", "Sales"])

# Shuffle and split into train/test with scikit-learn's default proportions.
# The seed is pinned so that the split (and therefore every score reported and
# every model saved below) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(features, label, random_state=42)

In [52]:
# Min-max scaling.
# The scaler learns its per-feature ranges from the training split only and is
# then applied unchanged to both splits, so the test split never influences it.
normalizer = MinMaxScaler().fit(X_train)

X_train_normalized, X_test_normalized = (
    normalizer.transform(split) for split in (X_train, X_test)
)

### KNN

In [53]:
# Build an 8-nearest-neighbour regressor and fit it on the min-max-scaled
# training features in a single step (`fit` returns the estimator itself).
knn_r = KNeighborsRegressor(n_neighbors=8).fit(X_train_normalized, y_train)

# Echo the fitted estimator as the cell output.
knn_r

KNeighborsRegressor(n_neighbors=8)

In [54]:
# Evaluate the fitted KNN regressor on the normalized test split.
# For scikit-learn regressors `score` returns R^2 (1.0 is a perfect fit).
knn_r.score(X_test_normalized, y_test)

0.8908784355910494

In [45]:
# Score the same model on the normalized training split: a training score far
# above the test score would indicate that the model is overfitting.
knn_r.score(X_train_normalized, y_train)

0.9171697995493016

In [33]:
# Sweep n_neighbours = 1..19, evaluating each setting in its own OS process.
# The worker `n_neighbours_test` is imported from the local `parallelization` module.
# NOTE(review): nothing is returned to this cell, so the worker presumably
# persists its scores itself (e.g. to disk) — confirm against that module.
training_accuracy = []  # not populated by this cell
test_accuracy = []      # not populated by this cell
processes = []

neighbours_settings = range(1,20)

for n_neighbour in neighbours_settings:
    p = multiprocessing.Process(
        target=n_neighbours_test,
        args=(n_neighbour, X_train_normalized, X_test_normalized, y_train, y_test),
    )
    p.start()
    processes.append(p)

# Block until every worker has finished.
for process in processes:
    process.join()

# A crashed worker is only visible through its exit code. Surface failures here
# instead of letting later cells run on missing or stale results.
failed_settings = [
    setting
    for setting, worker in zip(neighbours_settings, processes)
    if worker.exitcode != 0
]
if failed_settings:
    raise RuntimeError(
        f"n_neighbours_test failed for n_neighbours={failed_settings}"
    )

In [42]:
# Load the pickled score pair stored for each candidate n_neighbours and report
# the candidate with the smallest gap `scores[1] - scores[0]`.
#
# The reported score and n_neighbours are taken from the SAME candidate as the
# minimum gap; previously the values of the last loop iteration were printed
# next to the minimum gap, which mislabelled the result.
#
# NOTE(review): only candidates 1..9 are read here although the sweep cell
# launches 1..19 — confirm whether the narrower range is intentional.
# NOTE(review): the print labels scores[0] as "train score"; which element is
# the train score and which the test score depends on how the files were
# written — verify against the writer.
candidate_neighbours = list(range(1, 10))
first_scores = []
scores_dif = []
for i in candidate_neighbours:
    scores = pd.read_pickle(r".\knn_scores\knn_scores_" + str(i))
    first_scores.append(scores[0])
    scores_dif.append(scores[1] - scores[0])

# Position of the smallest gap (first one wins on ties, matching `min`).
best_position = scores_dif.index(min(scores_dif))
print(
    "train score: ", first_scores[best_position],
    "overfitting: ", scores_dif[best_position],
    "n_neighbours: ", candidate_neighbours[best_position],
)

train score:  0.8906426440717451 overfitting:  0.02379365592525906 n_neighbours:  9


In [46]:
# Persist the fitted KNN regressor. The file is opened in a `with` block so the
# handle is flushed and closed even if pickling raises (the bare `open(...)`
# passed inline was never closed explicitly).
with open(r".\models\knn_r_89.p", "wb") as model_file:
    pickle.dump(knn_r, model_file)

### Decision Tree

In [22]:
# Depth-limited regression tree (at most 15 levels), trained directly on the
# un-normalized training features; construction and fitting are chained
# because `fit` returns the estimator itself.
tree = DecisionTreeRegressor(max_depth=15).fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
tree

DecisionTreeRegressor(max_depth=15)

In [23]:
# Evaluate the fitted decision tree on the (un-normalized) test split.
# For scikit-learn regressors `score` returns R^2 (1.0 is a perfect fit).
tree.score(X_test, y_test)

0.9074820023373599

In [28]:
# Score the same tree on the training split: a training score far above the
# test score would indicate that the model is overfitting.
tree.score(X_train, y_train)

0.9219094265448073

In [29]:
# Persist the fitted decision tree. The file is opened in a `with` block so the
# handle is flushed and closed even if pickling raises (the bare `open(...)`
# passed inline was never closed explicitly).
with open(r".\models\tree_91.p", "wb") as model_file:
    pickle.dump(tree, model_file)

### Decision Tree - Bagging 

In [60]:
# Bagging ensemble of 30 regression trees (max depth 26). Each tree is trained
# on 100000 training rows drawn WITHOUT replacement (bootstrap=False).
# NOTE(review): this presumably requires the training split to contain at
# least 100000 rows — confirm against the dataset size.
#
# The per-tree subsamples are random, so the seed is pinned to make the fitted
# ensemble (and the scores reported below) reproducible across runs.
bagging_reg = BaggingRegressor(
    DecisionTreeRegressor(max_depth=26),
    n_estimators=30,
    max_samples=100000,
    bootstrap=False,
    random_state=42,
)

bagging_reg.fit(X_train, y_train)

BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=26),
                 bootstrap=False, max_samples=100000, n_estimators=30)

In [61]:
# Evaluate the fitted bagging ensemble on the test split
# (`score` returns R^2 for scikit-learn regressors).
bagging_reg.score(X_test, y_test)

0.9323694908938109

In [62]:
# Training-split score, for comparison with the test score: a large gap would
# indicate overfitting.
bagging_reg.score(X_train, y_train)

0.9556238887902971

In [63]:
# Persist the fitted bagging ensemble. The file is opened in a `with` block so
# the handle is flushed and closed even if pickling raises (the bare
# `open(...)` passed inline was never closed explicitly).
with open(r".\models\bagging_reg_93.p", "wb") as model_file:
    pickle.dump(bagging_reg, model_file)

### Random Forest

In [78]:
# Random forest of 30 trees, each limited to depth 26, trained on the
# un-normalized training features.
#
# A random forest resamples rows and features at random while fitting, so the
# seed is pinned to make the fitted model (and the scores reported below)
# reproducible across runs.
forest = RandomForestRegressor(
    n_estimators=30,
    max_depth=26,
    random_state=42,
)

forest.fit(X_train, y_train)

RandomForestRegressor(max_depth=26, n_estimators=30)

In [79]:
# Evaluate the fitted random forest on the test split
# (`score` returns R^2 for scikit-learn regressors).
forest.score(X_test, y_test)

0.9513557539587023

In [80]:
# Training-split score, for comparison with the test score: a large gap would
# indicate overfitting.
forest.score(X_train, y_train)

0.9877060946179733

In [88]:
# Persist the fitted random forest. The file is opened in a `with` block so the
# handle is flushed and closed even if pickling raises (the bare `open(...)`
# passed inline was never closed explicitly).
with open(r".\models\forest_95.p", "wb") as model_file:
    pickle.dump(forest, model_file)

### XGB

In [83]:
# Create an XGBoost regressor; no arguments are passed, so the library's
# default hyperparameters apply.
xgb_reg = xgboost.XGBRegressor()

In [84]:
# Train the XGBoost regressor on the un-normalized training split.
xgb_reg.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [85]:
# Evaluate the fitted XGBoost regressor on the test split via its
# scikit-learn-style `score` method.
xgb_reg.score(X_test, y_test)

0.9602454048214857

In [86]:
# Training-split score, for comparison with the test score: a large gap would
# indicate overfitting.
xgb_reg.score(X_train, y_train)

0.9616934008635222

In [90]:
# Persist the fitted XGBoost regressor. The file is opened in a `with` block so
# the handle is flushed and closed even if pickling raises (the bare
# `open(...)` passed inline was never closed explicitly).
with open(r".\models\xgb_reg.p", "wb") as model_file:
    pickle.dump(xgb_reg, model_file)

### Gradient Boost

In [101]:
# Gradient-boosted regression trees: 100 boosting stages, trees up to depth 15,
# optimised with the Huber loss. Construction and fitting are chained because
# `fit` returns the estimator itself.
gb_reg = GradientBoostingRegressor(loss="huber", max_depth=15, n_estimators=100).fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
gb_reg

GradientBoostingRegressor(loss='huber', max_depth=15)

In [102]:
# Evaluate the fitted gradient-boosting model on the test split
# (`score` returns R^2 for scikit-learn regressors).
gb_reg.score(X_test, y_test)

0.9725836760109995

In [103]:
# Training-split score, for comparison with the test score: a large gap would
# indicate overfitting.
gb_reg.score(X_train, y_train)

0.9866541054560686

In [104]:
# Persist the fitted gradient-boosting model. The file is opened in a `with`
# block so the handle is flushed and closed even if pickling raises (the bare
# `open(...)` passed inline was never closed explicitly).
with open(r".\models\gb_reg.p", "wb") as model_file:
    pickle.dump(gb_reg, model_file)