In [1]:
# std
import os
import sys
import inspect
import time
import pathlib
from math import sqrt
from math import log2
# packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline

# packages
from matplotlib.colors import ListedColormap

## sklearn
from sklearn.preprocessing import StandardScaler,PowerTransformer,MinMaxScaler,QuantileTransformer,normalize
from sklearn.model_selection import train_test_split, learning_curve, ShuffleSplit
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2

from sklearn.metrics import r2_score
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import median_absolute_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


# Put the parent of the notebook's working directory on sys.path so that the project
# packages imported below (common, GD, KNN) can be resolved - presumably they live there.
# The working directory is used instead of inspect.getfile(inspect.currentframe()):
# inside a notebook the frame's "file" is a synthetic per-cell name, and depending on the
# IPython/ipykernel version it may resolve to a temporary directory, which would put the
# wrong folder on sys.path.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

from common.DataParser import parse_metro
from common.model_trainer_reg import *
from common.regression_plotfunctions import *


from GD.LinearRegression import LinearRegression
from KNN.KNNRegressor import KNNRegressor

import metro_preprocessing


# Train and Test

In [2]:
# Load the raw metro table and keep a random subset of 5000 rows for the experiments.
# random_state pins the subset (same seed as the shuffle-split CV calls further down), so a
# fresh "Restart & Run All" draws the same rows and reproduces the same scores.
df_raw = parse_metro()
df_raw = df_raw.sample(5000, random_state=42)
# transform = False: the features are requested without the optional transformation step
# (see metro_preprocessing.preprocessing for what that step does).
X, Y = metro_preprocessing.preprocessing(df_raw, transform = False)

In [3]:
# Show the sampled raw rows; pandas abbreviates the rendering of a long frame.
df_raw

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
46407,,293.95,0.00,0.0,1,Thunderstorm,proximity thunderstorm,2018-08-03 23:00:00,1833
1425,,268.72,0.00,0.0,90,Snow,heavy snow,2012-11-23 23:00:00,1512
43904,,293.01,0.00,0.0,1,Clear,sky is clear,2018-05-08 09:00:00,5428
17234,,294.01,0.00,0.0,75,Mist,mist,2015-08-09 08:00:00,1713
32044,,275.62,0.00,0.0,90,Clouds,overcast clouds,2017-03-25 06:00:00,1006
...,...,...,...,...,...,...,...,...,...
47041,,291.48,1.78,0.0,90,Rain,moderate rain,2018-08-25 09:00:00,4100
5412,,280.60,0.00,0.0,90,Clouds,overcast clouds,2013-04-22 21:00:00,1627
45044,,301.51,0.00,0.0,40,Clouds,scattered clouds,2018-06-17 11:00:00,3790
30610,,270.71,0.00,0.0,90,Mist,mist,2017-01-30 11:00:00,3983


In [4]:
# Release the raw frame: the following cells only work with X and Y.
del df_raw
# Show that the name is really gone - the lookup fails and the error text is printed.
try:
    df_raw
except Exception as err:
    print(err)

name 'df_raw' is not defined


In [5]:
# Evaluation settings shared by all model cells:
# n_splits is passed as k to CV_shuffle_split, test_size is the held-out fraction.
n_splits, test_size = 10, 0.3

## SGD-Regression

In [None]:
# scikit-learn SGDRegressor: grid over alpha and max_iter, evaluated on a single holdout
# split and afterwards with shuffle-split cross validation.
MODEL = SGDRegressor
NAME = "SGD"
PATH = "out/"+NAME+"/"
params = {"alpha" : [0.0001,0.00001],
          "max_iter" : [1000,2000,3000,5000]}


modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)
########### train with TrainTestSplit  ###################
# Build the holdout result path once, so the printed path cannot drift away from the path
# that is actually handed to retResults (the original printed a different file name).
tts_csv = PATH + "sklearn_TTS_SGD_raw.csv"
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
results = modeltrainer.retResults(tts_csv)
print(tts_csv)
display(results)
############ shuffle_Cross validation  ###################
# No separate train() call here: judging by the recorded output of the KNN cells, the
# per-fold training is reported before the CV table appears - confirm in ModelTrainer.
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "sklearn_CV_SGD_raw.csv")
display(results)

## My SGD-Regression

In [None]:
# Own LinearRegression implementation (GD.LinearRegression), run on the same alpha/max_iter
# grid as the scikit-learn SGD cell.
MODEL = LinearRegression
# Set the output location explicitly, like every other model cell does, so this cell does
# not depend on the PATH value left behind by whichever cell ran before it.
NAME = "SGD"
PATH = "out/"+NAME+"/"
params = {"alpha" : [0.0001,0.00001],
          "max_iter" : [1000,2000,3000,5000]}


modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)
########### train with TrainTestSplit  ###################
# NOTE(review): the file names say "transforamtion" although X and Y were created with
# transform = False; the names are kept unchanged because other code may read these files.
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
results = modeltrainer.retResults(PATH + "my_TTS_SGD_transforamtion.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "my_CV_SGD_transforamtion.csv")
display(results)

## KNN-Regression

### Data for runtime analysis

In [None]:
NAME = "KNN"
PATH = "out/"+NAME+"/"

def dataset_size_experiment(
        subset_size=np.arange(0.1, 1.1, 0.1),
        path_csv=PATH+"sklearn_TTS_KNN_app.csv"
        ):
    """Train the currently selected model on growing prefixes of the data and collect the results.

    The function reads ``MODEL``, ``params``, ``thread_cnt``, ``test_size``, ``X`` and ``Y``
    from the notebook namespace at call time, so they have to be assigned before the call.

    Parameters
    ----------
    subset_size : iterable of numbers
        Sizes to evaluate, in the given order. Values below 1 - and a floating-point 1.0,
        which is the last entry of the default grid - are fractions of ``len(Y)``. Every
        other value is an absolute number of leading rows. Sizes larger than the data set
        are clamped to ``len(Y)``, and the loop stops after the first run that used the
        complete data set.
    path_csv : str
        Destination of the combined result table; missing parent directories are created.
        The default is built from the value ``PATH`` had when this cell was executed, not
        from its value at call time.

    Returns
    -------
    The concatenation (``pd.concat``) of ``modeltrainer.results`` of every run.
    """
    n_total = len(Y)
    results = []
    # k numbers the runs starting at 1 and is stored on each trainer before training.
    for k, subs in enumerate(subset_size, start=1):
        # A float 1.0 means "100 %", not "one row"; an integer 1 stays an absolute count.
        is_fraction = subs < 1 or (subs == 1 and not isinstance(subs, (int, np.integer)))
        if is_fraction:
            n_train = int(subs * n_total)
            print(f"{100*subs:.2f}% --> n_train={n_train}")
        else:
            # int() keeps the slice bound valid for float counts; min() clamps oversized
            # requests so the reported size is the one that is actually used.
            n_train = min(int(subs), n_total)
            print(f"{100*n_train/n_total:.2f}% --> n_train={n_train}")
        modeltrainer = ModelTrainer(MODEL, params, X[:n_train,:], Y[:n_train], thread_cnt=thread_cnt)
        modeltrainer.TTSplit(test_size = test_size)
        modeltrainer.k = k
        modeltrainer.train()
        results.append(modeltrainer.results)
        print("-"*30)
        if n_train >= n_total:
            # The whole data set has been used; larger sizes would only repeat this run.
            break
    results = pd.concat(results, ignore_index=True)
    # Make sure the target folder exists before writing.
    pathlib.Path(path_csv).parent.mkdir(parents=True, exist_ok=True)
    results.to_csv(path_csv, index=False)
    display(results)
    return results

In [None]:
# Absolute training-set sizes for the runtime study: 20 rows, then 200 doubled up to 51200.
subsets = [20] + [200 * 2**step for step in range(9)]

In [None]:
# Runtime study for scikit-learn's KNeighborsRegressor: one configuration per search
# algorithm, trained on every size in `subsets`.
# dataset_size_experiment reads MODEL, params and thread_cnt from the notebook namespace,
# so all of them are assigned before the call.
NAME = "KNN"
PATH = "out/runtimes/"
thread_cnt = 4
MODEL = KNeighborsRegressor
params = {
    "weights" : ["uniform"],
    "algorithm": ["brute", "kd_tree", "ball_tree"],
}
# Holdout (train/test split) runs only; the combined table is written to path_csv.
results = dataset_size_experiment(subsets, path_csv=PATH + "sklearn_TTS_KNN_app.csv")

In [None]:
# Runtime study for the own KNNRegressor: fixed n_neighbors and p, two chunk_size values,
# trained on every size in `subsets`.
# dataset_size_experiment reads MODEL, params and thread_cnt from the notebook namespace,
# so all of them are assigned before the call.
NAME = "KNN"
PATH = "out/runtimes/"
thread_cnt = 4
MODEL = KNNRegressor
params = {
    "n_neighbors" : [5],
    "p": [2],
    "chunk_size": [1, 4],
}
# Holdout (train/test split) runs only; the combined table is written to path_csv.
results = dataset_size_experiment(subsets, path_csv=PATH + "my_TTS_KNN_app.csv")

#### Rest: hyperparameter grid (holdout and shuffle-split CV)


In [6]:
# scikit-learn KNeighborsRegressor: grid over n_neighbors and p with uniform weights.
NAME = "KNN"
PATH = "out/"+NAME+"/"
MODEL = KNeighborsRegressor
params = {
    "weights" : ["uniform"],
    "n_neighbors" : [5,10],
    "p": [2,3],
}

modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)

# --- single holdout split -------------------------------------------------------------
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
results = modeltrainer.retResults(PATH + "sklearn_TTS_KNN_raw.csv")
display(results)

# --- shuffle-split cross validation ---------------------------------------------------
# No explicit train() call follows; the recorded output lists one training pass per fold.
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "sklearn_CV_KNN_raw.csv")
display(results)

Training all model variations took 2.3822s - holdout


Unnamed: 0,weights,n_neighbors,p,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train
0,uniform,5,2,0.036229,0.110392,0,0.503103,1933278.0,10,4998,1500,3498
1,uniform,5,3,0.024931,0.97443,0,0.497261,1956007.0,10,4998,1500,3498
2,uniform,10,2,0.021083,0.111877,0,0.516419,1881471.0,10,4998,1500,3498
3,uniform,10,3,0.021312,1.078696,0,0.497971,1953247.0,10,4998,1500,3498


Using CV with k={k} folds.
Training all model variations took 2.2810s - CV fold # =1
Training all model variations took 2.0458s - CV fold # =2
Training all model variations took 2.2041s - CV fold # =3
Training all model variations took 2.2203s - CV fold # =4
Training all model variations took 2.3379s - CV fold # =5
Training all model variations took 2.2698s - CV fold # =6
Training all model variations took 2.0794s - CV fold # =7
Training all model variations took 2.2180s - CV fold # =8
Training all model variations took 2.1963s - CV fold # =9
Training all model variations took 2.2517s - CV fold # =10


Unnamed: 0,weights,n_neighbors,p,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train
0,uniform,5,2,0.025866,0.115524,1,0.503103,1933278.0,10,4998,1500,3498
1,uniform,5,3,0.022403,0.916203,1,0.497261,1956007.0,10,4998,1500,3498
2,uniform,10,2,0.021389,0.115939,1,0.516419,1881471.0,10,4998,1500,3498
3,uniform,10,3,0.020899,1.039822,1,0.497971,1953247.0,10,4998,1500,3498
4,uniform,5,2,0.020941,0.08965,2,0.489195,1969592.0,10,4998,1500,3498
5,uniform,5,3,0.020492,0.830378,2,0.487893,1974610.0,10,4998,1500,3498
6,uniform,10,2,0.020285,0.104607,2,0.493744,1952051.0,10,4998,1500,3498
7,uniform,10,3,0.021042,0.935376,2,0.466912,2055512.0,10,4998,1500,3498
8,uniform,5,2,0.020857,0.098463,3,0.465306,2063780.0,10,4998,1500,3498
9,uniform,5,3,0.021009,0.896233,3,0.453582,2109030.0,10,4998,1500,3498


## My KNN-Regression

In [7]:
# Own KNNRegressor: same n_neighbors / p grid as the scikit-learn KNN cell, plus a fixed
# chunk_size.
NAME = "KNN"
PATH = "out/"+NAME+"/"
MODEL = KNNRegressor
params = {
    "weights" : ["uniform"],
    "n_neighbors" : [5,10],
    "p": [2,3],
    "chunk_size": [2],
}

modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)

# --- single holdout split -------------------------------------------------------------
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
results = modeltrainer.retResults(PATH + "my_TTS_KNN_raw.csv")
display(results)

# --- shuffle-split cross validation ---------------------------------------------------
# No explicit train() call follows; the recorded output lists one training pass per fold.
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "my_CV_KNN_raw.csv")
display(results)

chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.3813s - holdout


Unnamed: 0,weights,n_neighbors,p,chunk_size,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train
0,uniform,5,2,2,1e-05,0.552836,0,0.503103,1933278.0,10,4998,1500,3498
1,uniform,5,3,2,6e-06,2.137665,0,0.497261,1956007.0,10,4998,1500,3498
2,uniform,10,2,2,7e-06,0.536342,0,0.516419,1881471.0,10,4998,1500,3498
3,uniform,10,3,2,7e-06,2.151402,0,0.497971,1953247.0,10,4998,1500,3498


Using CV with k={k} folds.
chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.5473s - CV fold # =1
chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.2024s - CV fold # =2
chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.1999s - CV fold # =3
chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.1990s - CV fold # =4
chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.9018s - CV fold # =5
chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.6222s - CV fold # =6
chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.4060s - CV fold # =7
chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.1516s - CV fold # =8
chunking...
chunking...
chunking...
chunking...
Training all model variations took 5.0880s - CV fold # =9
chunking...
chunkin

Unnamed: 0,weights,n_neighbors,p,chunk_size,train_time,inference_time,k,R2_score,RMSE,D,N,n_test,n_train
0,uniform,5,2,2,1.1e-05,0.632534,1,0.503103,1933278.0,10,4998,1500,3498
1,uniform,5,3,2,7e-06,2.300569,1,0.497261,1956007.0,10,4998,1500,3498
2,uniform,10,2,2,8e-06,0.508259,1,0.516419,1881471.0,10,4998,1500,3498
3,uniform,10,3,2,7e-06,2.102785,1,0.497971,1953247.0,10,4998,1500,3498
4,uniform,5,2,2,6e-06,0.515087,2,0.489195,1969592.0,10,4998,1500,3498
5,uniform,5,3,2,6e-06,2.080321,2,0.487893,1974610.0,10,4998,1500,3498
6,uniform,10,2,2,6e-06,0.529763,2,0.493744,1952051.0,10,4998,1500,3498
7,uniform,10,3,2,6e-06,2.074296,2,0.466912,2055512.0,10,4998,1500,3498
8,uniform,5,2,2,7e-06,0.5183,3,0.465306,2063780.0,10,4998,1500,3498
9,uniform,5,3,2,8e-06,2.072098,3,0.453582,2109030.0,10,4998,1500,3498


## RF-Regression

In [None]:
# scikit-learn RandomForestRegressor: grid over n_estimators and max_features, evaluated on
# a single holdout split and afterwards with shuffle-split cross validation.
MODEL = RandomForestRegressor
# 1.0 (all features) replaces the former "auto" entry: for this regressor "auto" meant
# "use every feature", and scikit-learn deprecated the string in 1.1 and removed it in 1.3.
# The float form is accepted by old and new releases alike.
params = {"n_estimators" : [100,200],
            "max_features": [1.0, "sqrt","log2"]}
NAME = "RF"
PATH = "out/"+NAME+"/"



modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)
########### train with TrainTestSplit  ###################
# NOTE(review): the file names say "transforamtion" although X and Y were created with
# transform = False; the names are kept unchanged because other code may read these files.
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
results = modeltrainer.retResults(PATH + "sklearn_TTS_RF_transforamtion.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "sklearn_CV_RF_transforamtion.csv")
display(results)

## DT-Regression

In [None]:
# scikit-learn DecisionTreeRegressor: squared-error criterion, grid over max_features,
# evaluated on a single holdout split and afterwards with shuffle-split cross validation.
MODEL = DecisionTreeRegressor
# The squared-error criterion was called "mse" before scikit-learn 1.0 and is called
# "squared_error" since ("mse" was removed in 1.2). It is the estimator's default in both
# cases, so the name is read from a default instance instead of being hard-coded.
# 1.0 (all features) replaces the former "auto" entry, which meant the same for this
# regressor and was removed in scikit-learn 1.3.
params = {"criterion": [MODEL().criterion],
          "max_features": [1.0, "sqrt","log2"]}
NAME = "DT"
PATH = "out/"+NAME+"/"
# Same value as in the shared settings cell; repeated so this cell can run on its own.
n_splits = 10


modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
results = modeltrainer.retResults(PATH + "sklearn_TTS_DT_raw.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "sklearn_CV_DT_raw.csv")
display(results)