In [None]:
# std
import os
import sys
import inspect
import time
import pathlib
from math import sqrt
from math import log2
# packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline

# packages
from matplotlib.colors import ListedColormap

## sklearn
from sklearn.preprocessing import StandardScaler,PowerTransformer,MinMaxScaler,QuantileTransformer,normalize
from sklearn.model_selection import train_test_split, learning_curve, ShuffleSplit
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2

from sklearn.metrics import r2_score
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import median_absolute_error

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


# Put the parent directory of this notebook on sys.path so the project
# packages imported below (common, GD, KNN) can be resolved.
# inspect.getfile(inspect.currentframe()) is unreliable inside a notebook cell:
# the kernel compiles each cell from a pseudo/temporary file, so it may point
# at a temp directory instead of the notebook's folder. Jupyter starts the
# kernel in the notebook's directory, so the working directory is used instead.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
if parentdir not in sys.path:  # keep re-runs of this cell idempotent
    sys.path.insert(0, parentdir)

from common.DataParser import parse_superconductivity
from common.model_trainer_reg import *
from common.regression_plotfunctions import *

from GD.LinearRegression import LinearRegression
from KNN.KNNRegressor import KNNRegressor

import supercon_preprocessing

In [None]:
# Load the raw superconductivity data through the project's parser
# (common.DataParser.parse_superconductivity).
df_raw = parse_superconductivity()

# Uncomment to continue with a 3000-row sample instead of the full data.
#df_raw = df_raw.sample(3000)
# Last expression of the cell: shows the loaded data as the cell output.
df_raw

# Train and Test

In [None]:
# Build the feature matrix X and the target Y from the raw data.
# NOTE(review): `transform=True` presumably switches on a feature transformation
# inside supercon_preprocessing — confirm against that module.
X, Y = supercon_preprocessing.preprocessing(df_raw, transform=True)

In [None]:
# Shared evaluation settings read by the training cells below.
# Passed as `k` to ModelTrainer.CV_shuffle_split.
n_splits = 5
# Passed as `test_size` to ModelTrainer.TTSplit and CV_shuffle_split
# (presumably the held-out fraction, i.e. 30% — confirm in ModelTrainer).
test_size = 0.3

## SGD-Regression

In [None]:
# Evaluate scikit-learn's SGDRegressor with the project's ModelTrainer.
# MODEL, NAME, PATH and params are notebook globals; later cells overwrite them.
MODEL = SGDRegressor
NAME = "SGD"
# Output directory for the result CSVs of this model family.
PATH = "out/"+NAME+"/"
# Parameter grid: each key maps to the list of values to try.
params = {"alpha" : [0.0001]}


modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
# NOTE(review): retResults presumably writes the results to the given CSV path
# and returns them — confirm in common.model_trainer_reg.
results = modeltrainer.retResults(PATH + "sklearn_TTS_SGD.csv")
display(results)
############ shuffle_Cross validation  ###################
# No explicit train() call follows here; CV_shuffle_split presumably fits the
# model itself — TODO confirm.
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "sklearn_CV_SGD.csv")
display(results)

## My SGD-Regression

In [None]:
# Evaluate the project's own gradient-descent LinearRegression (GD.LinearRegression)
# with the same train/test-split and shuffle-split protocol as the sklearn SGD run.
MODEL = LinearRegression
# Set NAME/PATH explicitly instead of relying on whatever value another cell
# left behind, so the CSVs land in out/SGD/ regardless of execution order.
NAME = "SGD"
PATH = "out/"+NAME+"/"
# Parameter grid: each key maps to the list of values to try.
params = {"alpha" : [0.0001], "max_iter": [1000]}

modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
results = modeltrainer.retResults(PATH + "my_TTS_SGD.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "my_CV_SGD.csv")
display(results)

## KNN-Regression

In [None]:
# Shuffle-split cross-validation of scikit-learn's KNeighborsRegressor,
# comparing the three neighbour-search algorithms.
MODEL = KNeighborsRegressor
# Parameter grid: each key maps to the list of values to try.
params = {
    "weights" : ["uniform"],
    "algorithm": ["brute", "kd_tree", "ball_tree"]
}
NAME = "KNN"
# Output directory for the KNN result CSVs.
PATH = "out/"+NAME+"/"

modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)
############ shuffle_Cross validation  ###################
# No explicit train() call follows; CV_shuffle_split presumably fits the
# model itself — TODO confirm in common.model_trainer_reg.
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "sklearn_CV_KNN.csv")
display(results)

In [None]:
def dataset_size_experiment(
        subset_size=np.arange(0.1, 1.1, 0.1),
        path_csv=PATH+"sklearn_TTS_KNN_app.csv"
        ):
    """Train on growing prefixes of the data and collect the per-run results.

    For every entry of ``subset_size`` a fresh ``ModelTrainer`` is built on the
    first ``n_train`` rows of ``X``/``Y``, split with ``TTSplit`` and trained.
    The ``results`` of all runs are concatenated, written to ``path_csv``,
    displayed and returned.

    The notebook globals ``MODEL``, ``params``, ``X``, ``Y``, ``test_size`` and
    ``thread_cnt`` are read at call time, so they must be set before calling.

    Parameters
    ----------
    subset_size : iterable of numbers
        Sizes to evaluate. Values up to (and including) 1 are fractions of the
        full data set, so 1 means "all rows"; larger values are absolute row
        counts. A count beyond the data size is clamped to it and ends the
        experiment after that run, so pass the sizes in ascending order.
    path_csv : str
        Destination of the combined results CSV. The default is evaluated
        once, using the value ``PATH`` had when this cell was executed.

    Returns
    -------
    The object produced by ``pd.concat`` over the per-run ``results``
    (a DataFrame if ``ModelTrainer.results`` is one).

    Raises
    ------
    ValueError
        If ``subset_size`` is empty.
    """
    n_total = len(Y)
    collected = []
    k = 1
    for subs in subset_size:
        # Fractions (<= 1) scale with the data size; np.isclose guards against
        # the floating-point end point of np.arange landing just above 1.0.
        if subs < 1 or np.isclose(subs, 1.0):
            n_train = int(subs * n_total)
        else:
            n_train = int(subs)
        # An over-sized request is clamped, run once, and ends the sweep.
        overshoot = n_train > n_total
        n_train = min(n_train, n_total)
        print(f"{100*n_train/n_total:.2f}% --> n_train={n_train}")

        modeltrainer = ModelTrainer(MODEL, params, X[:n_train,:], Y[:n_train], thread_cnt=thread_cnt)
        modeltrainer.TTSplit(test_size = test_size)
        modeltrainer.k = k  # running index of this run
        modeltrainer.train()
        collected.append(modeltrainer.results)
        k += 1
        print("-"*30)
        if overshoot:
            break

    if not collected:
        raise ValueError("subset_size is empty; there is nothing to run")

    results = pd.concat(collected, ignore_index=True)
    results.to_csv(path_csv, index=False)
    display(results)
    return results

In [None]:
# Data-set-size experiment for scikit-learn's KNeighborsRegressor.
# dataset_size_experiment reads MODEL, params and thread_cnt from the notebook
# globals, so they are (re)set here right before the call.
MODEL = KNeighborsRegressor
params = {
    "weights" : ["uniform"],
    "algorithm": ["brute", "kd_tree", "ball_tree"]
}
NAME = "KNN"
PATH = "out/"+NAME+"/"
########### train with TrainTestSplit  ###################
thread_cnt = 4
# Sizes are all > 1, so they are treated as absolute row counts.
results = dataset_size_experiment([20, 200, 400, 800, 1600, 3200, 6400, 12800, 16000, 25600] ,path_csv=PATH+"sklearn_TTS_KNN_app.csv")

In [None]:
# Inference time vs. data-set size for the sklearn KNN algorithms (log-log axes).
# `results` is the output of the preceding sklearn size experiment.
# D was not defined in any visible cell; take the number of feature columns
# from X so the cell survives Restart & Run All.
D = X.shape[1]

fig, ax = plt.subplots(figsize=(10,8))
N = results["N"].to_numpy()
# Dashed reference line with linear growth; drawn first so it is part of the legend.
ax.plot(N, N*1e-4, ls="--", color="red", label="O(N)")
sns.lineplot(x="N", y="inference_time", hue="algorithm", data=results, ax=ax)
ax.grid()
ax.set_xscale("log")
ax.set_yscale("log")
fig.suptitle("Inference times [sklearn KNN]", fontsize=36)
ax.set_title(f"D={D}");

In [None]:
# Training time vs. data-set size for the sklearn KNN algorithms (log-log axes).
# `results` is the output of the preceding sklearn size experiment.
# D was not defined in any visible cell; take the number of feature columns
# from X so the cell survives Restart & Run All.
D = X.shape[1]

fig, ax = plt.subplots(figsize=(10,8))
N = results["N"].to_numpy()
# Dashed reference line with linear growth; drawn first so it is part of the legend.
ax.plot(N, N*1e-4, ls="--", color="red", label="O(N)")
sns.lineplot(x="N", y="train_time", hue="algorithm", data=results, ax=ax)
ax.grid()
ax.set_xscale("log")
ax.set_yscale("log")
fig.suptitle("Training times [sklearn KNN]", fontsize=36)
ax.set_title(f"D={D}");

In [None]:
# Data-set-size experiment for the project's own KNNRegressor (KNN.KNNRegressor).
# dataset_size_experiment reads MODEL, params and thread_cnt from the notebook
# globals, so they are (re)set here right before the call.
MODEL = KNNRegressor
# Parameter grid: each key maps to the list of values to try.
params = {"n_neighbors" : [5],
           "p": [2],
           "chunk_size": [1, 4]}
NAME = "KNN"
PATH = "out/"+NAME+"/"
########### train with TrainTestSplit  ###################
thread_cnt = 4
# Sizes are all > 1, so they are treated as absolute row counts.
results = dataset_size_experiment([20, 200, 400, 800, 1600, 3200, 6400, 12800, 16000, 25600] ,path_csv=PATH+"my_TTS_KNN_app.csv")

In [None]:
# Training time vs. data-set size for the own KNNRegressor, one line per
# chunk_size (log-log axes). `results` is the output of the preceding
# size experiment with MODEL = KNNRegressor.
# D was not defined in any visible cell; take the number of feature columns
# from X so the cell survives Restart & Run All.
D = X.shape[1]

fig, ax = plt.subplots(figsize=(10,8))
N = results["N"].to_numpy()
# Dashed reference line with linear growth; drawn first so it is part of the legend.
ax.plot(N, N*1e-4, ls="--", color="red", label="O(N)")
sns.lineplot(x="N", y="train_time", hue="chunk_size", data=results, ax=ax)
ax.grid()
ax.set_xscale("log")
ax.set_yscale("log")
# Title corrected: these timings belong to the own implementation, not sklearn.
fig.suptitle("Training times [my KNN]", fontsize=36)
ax.set_title(f"D={D}");

In [None]:
# Inference time vs. data-set size for the own KNNRegressor, one line per
# chunk_size (log-log axes). `results` is the output of the preceding
# size experiment with MODEL = KNNRegressor.
# D was not defined in any visible cell; take the number of feature columns
# from X so the cell survives Restart & Run All.
D = X.shape[1]

fig, ax = plt.subplots(figsize=(10,8))
N = results["N"].to_numpy()
# Dashed reference lines (linear and quadratic growth); drawn first so they
# are part of the legend.
ax.plot(N, N*1e-4, ls="--", color="red", label="O(N)")
ax.plot(N, 1e-6*np.power(N, 2), ls="--", color="blue", label="O(N^2)")
sns.lineplot(x="N", y="inference_time", hue="chunk_size", data=results, ax=ax)
ax.grid()
ax.set_xscale("log")
ax.set_yscale("log")
# Title corrected: these timings belong to the own implementation, not sklearn.
fig.suptitle("Inference times [my KNN]", fontsize=36)
ax.set_title(f"D={D}");

# Rest

In [None]:
# Train/test-split evaluation of the project's own KNNRegressor on the full data.
MODEL = KNNRegressor
# Parameter grid: each key maps to the list of values to try.
params = {"n_neighbors" : [5],
           "p": [2],
           "chunk_size": [1, 4]}
NAME = "KNN"
# Output directory for the KNN result CSVs.
PATH = "out/"+NAME+"/"

modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=4)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
results = modeltrainer.retResults(PATH + "my_TTS_KNN.csv")
display(results)
############ shuffle_Cross validation  ###################
# The cross-validation run is disabled here.
# NOTE(review): the reason is not recorded — presumably runtime; confirm before re-enabling.
#modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
#results = modeltrainer.retResults(PATH + "my_CV_KNN.csv")
#display(results)

In [None]:
# Profile the prediction step of the own KNNRegressor for several chunk sizes.
# `import time` at the top of the notebook binds `time` to the module, so a bare
# `time()` call is not callable; import the timer function explicitly here.
from time import perf_counter

profiles = []
chsizes = [1, 4, 64, 512]

# One seeded split shared by all chunk sizes, so every configuration is timed
# on exactly the same data and the cell is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

for chsize in chsizes:
    knn_test = KNNRegressor(p=1, chunk_size=chsize, profile=True)
    knn_test.fit(x_train, y_train)

    # Wall-clock time of the prediction only.
    start = perf_counter()
    knn_test.predict(x_test)
    total = perf_counter() - start
    print(total)

    # NOTE(review): knn_test.profile is read as a mapping from stage name to a
    # list of timings — confirm against KNN.KNNRegressor.
    profile = knn_test.profile
    processed = {"chunk_size": chsize, "total": total}

    for key in profile:
        if key in ["partition",  "distances"]:
            # These stages have several entries; report their sum.
            processed[key] = sum(profile[key])
        else:
            # All other stages: report the first recorded entry.
            processed[key] = profile[key][0]
    profiles.append(processed)

print("-"*30)

# Print one block of "name: value" lines per chunk size.
for prof in profiles:
    for key in prof:
        print(f'{key}: {prof[key]:.4f}')
    print("-"*30)

## RF-Regression

In [None]:
# Evaluate scikit-learn's RandomForestRegressor with a train/test split and
# shuffle-split cross-validation.
MODEL = RandomForestRegressor
# Parameter grid: each key maps to the list of values to try.
params = {"n_estimators" : [100]}
NAME = "RF"
PATH = "out/"+NAME+"/"

modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
# File suffix is derived from NAME: the previous "_DT" suffix was copied from
# the decision-tree cell and mislabelled the random-forest results.
# NOTE(review): anything that reads out/RF/sklearn_*_DT.csv must be pointed at
# the new out/RF/sklearn_*_RF.csv names.
results = modeltrainer.retResults(PATH + "sklearn_TTS_" + NAME + ".csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "sklearn_CV_" + NAME + ".csv")
display(results)

## DT-Regression

In [None]:
# Evaluate scikit-learn's DecisionTreeRegressor with a train/test split and
# shuffle-split cross-validation.
import sklearn  # only needed for the version check below

MODEL = DecisionTreeRegressor
# scikit-learn 1.0 renamed the "mse" criterion to "squared_error" and later
# releases reject the old spelling, so pick the one the installed version accepts.
_sklearn_major = int(sklearn.__version__.split(".")[0])
params = {"criterion": ["squared_error" if _sklearn_major >= 1 else "mse"]}
NAME = "DT"
PATH = "out/"+NAME+"/"


modeltrainer = ModelTrainer(MODEL, params, X, Y, thread_cnt=1)
########### train with TrainTestSplit  ###################
modeltrainer.TTSplit(test_size = test_size)
modeltrainer.train()
results = modeltrainer.retResults(PATH + "sklearn_TTS_DT.csv")
display(results)
############ shuffle_Cross validation  ###################
modeltrainer.CV_shuffle_split(k = n_splits, test_size = test_size, random_state = 42)
results = modeltrainer.retResults(PATH + "sklearn_CV_DT.csv")
display(results)