## Dodatna domača naloga

In [2]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random 

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, SVC
from hyperopt import hp, tpe, rand, fmin, Trials, space_eval
from hyperopt import pyll, base

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import scale, normalize, MinMaxScaler
from sklearn.decomposition import PCA

# NOTE: warnings are hidden solely because of the warnings raised by sns.distplot.
# The filter below has no category/message restriction, so it silences every
# warning for all code that runs after this point.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the comma-separated data file; every column except the last is treated
# as a feature and the last column as the target.
dataset = pd.read_csv('podatki.csv', sep=',')
x = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

In [None]:
# Split first, then scale with statistics taken from the training rows only,
# so that nothing about the test rows leaks into the features used for fitting.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=123, stratify=y)

# Same scaling formula as before (centre on the mean, divide by the range),
# but the mean and range now come from the training rows.
train_mean = x_train_raw.mean()
train_range = x_train_raw.max() - x_train_raw.min()
# A column that is constant in the training rows has range 0; dividing by 1
# instead keeps the scaled column finite rather than producing NaN/inf.
train_range = train_range.replace(0, 1)

x_train = (x_train_raw - train_mean) / train_range
x_test = (x_test_raw - train_mean) / train_range

# Kept so that any other cell referring to x_norm still finds it.
x_norm = (x - train_mean) / train_range

In [None]:
# Search space for the decision tree: depth 1..50, number of features 1..30
# and the split criterion.
# NOTE(review): max_features up to 30 presumably matches the number of feature
# columns in the data set -- confirm against podatki.csv.
prostor_drevo = dict(
    max_depth=hp.choice('max_depth', range(1, 51)),
    max_features=hp.choice('max_features', range(1, 31)),
    criterion=hp.choice('criterion', ["gini", "entropy"]),
)

def kriterijska_funkcija_drevo(parametri):
    """Objective for the decision-tree search.

    Builds a DecisionTreeClassifier from the "max_depth", "max_features" and
    "criterion" entries of `parametri`, cross-validates it on the module-level
    x_train / y_train and returns 1 minus the mean of the scores, so that a
    smaller value is better.
    """
    drevo = DecisionTreeClassifier(
        max_depth=parametri["max_depth"],
        max_features=parametri["max_features"],
        criterion=parametri["criterion"],
    )
    rezultati = cross_val_score(drevo, x_train, y_train)
    return 1 - rezultati.mean()

In [None]:
# This function is shared by all the algorithms.
def poisci_najboljse_parametre(prostor, kriterijska_fun, n_izracunov):
    """Minimise `kriterijska_fun` over `prostor` with hyperopt's TPE algorithm.

    Args:
        prostor: hyperopt search space, passed to `fmin` and `space_eval`.
        kriterijska_fun: objective to minimise; it receives one sampled point
            of `prostor` and returns the loss.
        n_izracunov: number of objective evaluations (`max_evals`).

    Returns:
        A tuple `(best, xs, ys)` where `best` is the best point decoded with
        `space_eval`, `xs` holds the raw `trial["misc"]["vals"]` entry of every
        trial and `ys` holds `1 - loss` of every trial.

    Side effects:
        Prints `best` together with the best `1 - loss` seen during the search.
    """
    trials = Trials()
    best = fmin(fn=kriterijska_fun,
                space=prostor,
                algo=tpe.suggest,
                max_evals=n_izracunov,
                trials=trials)
    best = space_eval(prostor, best)

    # All parameter values and objective values that were tried.
    xs = [trial["misc"]["vals"] for trial in trials.trials]
    ys = [1 - trial["result"]["loss"] for trial in trials.trials]

    # Report the score the search actually recorded for its best trial.
    # Re-running the objective here would cost one more evaluation and, for
    # estimators with internal randomness, could print a value that differs
    # from the one `best` was selected on.
    best_score = max(ys)

    print(best, best_score)
    return best, xs, ys

In [None]:
# Run 100 evaluations of the decision-tree objective over its search space.
best_drevo, xs_drevo, ys_drevo = poisci_najboljse_parametre(
    prostor=prostor_drevo,
    kriterijska_fun=kriterijska_funkcija_drevo,
    n_izracunov=100,
)

In [None]:
# Refit a tree with the best parameters on the training rows and score it on
# the held-out test rows.
nastavitve_drevo = {
    ime: best_drevo[ime] for ime in ("max_depth", "max_features", "criterion")
}
model_drevo = DecisionTreeClassifier(**nastavitve_drevo)

model_drevo.fit(x_train, y_train)
y_predict_drevo = model_drevo.predict(x_test)
acc_drevo = accuracy_score(y_test, y_predict_drevo)
print('Klasifikacijska točnost najboljšega odločitvenaega drevesa na testni množici je:', acc_drevo)




In [None]:
# One scatter plot per tree parameter: recorded parameter value against the
# score (1 - loss) of the same trial.
# NOTE(review): xs_drevo holds the raw values hyperopt recorded; for hp.choice
# parameters these are presumably option indices rather than the option
# values themselves -- confirm before reading the x axis literally.
parametri = ["max_depth", "max_features", "criterion"]
cols = len(parametri)
f, axes = plt.subplots(nrows=1, ncols=cols, figsize=(20, 5))

for i, val in enumerate(parametri):
    graf = axes[i]
    xss = [poskus[val] for poskus in xs_drevo]
    graf.scatter(xss, ys_drevo, s=20, linewidth=0.01, alpha=0.8, color='C1')
    graf.set_title(val)
    

In [None]:
# Distribution of the scores reached by all decision-tree trials.
x_drevo = pd.Series(data=ys_drevo, name="Klasifikacijska točnost")
ax_drevo = sns.distplot(
    x_drevo,
    rug=True,
    bins=10,
    color='C1',
)


In [None]:
# Search space for k-nearest neighbours: 1..20 neighbours and the weighting
# scheme. The 'ime' entry is a plain label; the kNN objective does not read it.
prostor_knn = dict(
    ime='knn',
    n_neighbors=hp.choice('n_neighbors', range(1, 21)),
    weights=hp.choice('weights', ['uniform', 'distance']),
)

def kriterijska_funkcija_knn(parametri):
    """Objective for the kNN search.

    Builds a KNeighborsClassifier from the "n_neighbors" and "weights" entries
    of `parametri`, cross-validates it on the module-level x_train / y_train
    and returns 1 minus the mean of the scores.
    """
    sosedje = KNeighborsClassifier(
        n_neighbors=parametri["n_neighbors"],
        weights=parametri["weights"],
    )
    rezultati = cross_val_score(sosedje, x_train, y_train)
    return 1 - rezultati.mean()

In [None]:
%%time
# The search routine is the same for every algorithm; only the search space
# and the objective passed to it differ.
best_knn, xs_knn, ys_knn = poisci_najboljse_parametre(
    prostor=prostor_knn,
    kriterijska_fun=kriterijska_funkcija_knn,
    n_izracunov=100,
)



In [None]:
# Refit kNN with the best parameters on the training rows and score it on the
# held-out test rows.
nastavitve_knn = {ime: best_knn[ime] for ime in ("n_neighbors", "weights")}
model_knn = KNeighborsClassifier(**nastavitve_knn)

model_knn.fit(x_train, y_train)
y_predict_knn = model_knn.predict(x_test)
acc_knn = accuracy_score(y_test, y_predict_knn)
print('Klasifikacijska točnost najboljšega kNN na testni množici je:', acc_knn)

In [None]:
# One scatter plot per kNN parameter: recorded parameter value against the
# score (1 - loss) of the same trial.
# NOTE(review): the x values are the raw values hyperopt recorded; for
# hp.choice parameters these are presumably option indices -- confirm.
parametri_knn = ["n_neighbors", "weights"]
cols = len(parametri_knn)
f, axes = plt.subplots(nrows=1, ncols=cols, figsize=(20, 5))

for i, val in enumerate(parametri_knn):
    graf = axes[i]
    xss = [poskus[val] for poskus in xs_knn]
    graf.scatter(xss, ys_knn, s=20, linewidth=0.01, alpha=0.5, color='C2')
    graf.set_title(val)

In [None]:
# Distribution of the scores reached by all kNN trials.
x_knn = pd.Series(data=ys_knn, name="Klasifikacijska točnost")
ax_knn = sns.distplot(
    x_knn,
    rug=True,
    bins=10,
    color='C2',
)

In [None]:
# Search space for the random forest: 1..50 trees, depth 1..20 and the split
# criterion.
prostor_gozd = dict(
    n_estimators=hp.choice('n_estimators', range(1, 51)),
    max_depth=hp.choice('max_depth', range(1, 21)),
    criterion=hp.choice('criterion', ["gini", "entropy"]),
)

def kriterijska_funkcija_gozd(parametri):
    """Objective for the random-forest search.

    Builds a RandomForestClassifier from the "n_estimators", "max_depth" and
    "criterion" entries of `parametri`, cross-validates it on the module-level
    x_train / y_train and returns 1 minus the mean of the scores.
    """
    gozd = RandomForestClassifier(
        n_estimators=parametri["n_estimators"],
        max_depth=parametri["max_depth"],
        criterion=parametri["criterion"],
    )
    rezultati = cross_val_score(gozd, x_train, y_train)
    return 1 - rezultati.mean()

In [None]:
# Run 100 evaluations of the random-forest objective over its search space.
best_gozd, xs_gozd, ys_gozd = poisci_najboljse_parametre(
    prostor=prostor_gozd,
    kriterijska_fun=kriterijska_funkcija_gozd,
    n_izracunov=100,
)

In [None]:
# Refit a random forest with the best parameters on the training rows and
# score it on the held-out test rows.
model_gozd = RandomForestClassifier(
                n_estimators=best_gozd["n_estimators"],
                max_depth=best_gozd["max_depth"],
                criterion=best_gozd["criterion"])

model_gozd.fit(x_train, y_train)
y_predict_gozd = model_gozd.predict(x_test)
acc_gozd = accuracy_score(y_test, y_predict_gozd)
# The label used to say "kNN" (copied from the kNN cell); this result belongs
# to the random forest.
print('Klasifikacijska točnost najboljšega naključnega gozda na testni množici je:', acc_gozd)

In [None]:
# One scatter plot per random-forest parameter: recorded parameter value
# against the score (1 - loss) of the same trial.
# NOTE(review): the x values are the raw values hyperopt recorded; for
# hp.choice parameters these are presumably option indices -- confirm.
parametri_gozd = ["n_estimators", "max_depth", "criterion"]
cols = len(parametri_gozd)
f, axes = plt.subplots(nrows=1, ncols=cols, figsize=(20, 5))

for i, val in enumerate(parametri_gozd):
    graf = axes[i]
    xss = [poskus[val] for poskus in xs_gozd]
    graf.scatter(xss, ys_gozd, s=20, linewidth=0.01, alpha=0.8, color='C3')
    graf.set_title(val)

In [None]:
# Distribution of the scores reached by all random-forest trials.
x_gozd = pd.Series(data=ys_gozd, name="klasifikacijska točnost")
ax_gozd = sns.distplot(
    x_gozd,
    rug=True,
    bins=10,
    color='C3',
)

In [1]:
# Search space for the SVM. C is log-normally distributed; the kernel is one
# of three options, and each option carries only the extra parameter that is
# sampled for it. The 'ime' entry is a plain label; the SVM objective does not
# read it.
prostor_svm = dict(
    ime='svm',
    C=hp.lognormal('C', 0, 1),
    kernel=hp.choice('kernel', [
        # linear kernel
        dict(tip='linear'),
        # radial (RBF) kernel
        dict(tip='rbf', gamma=hp.lognormal('gamma', 0, 1)),
        # polynomial kernel
        dict(tip='poly', degree=hp.choice('degree', [1, 2, 3, 4, 5])),
    ]),
)

def kriterijska_funkcija_svm(parametri):
    """Objective for the SVM search.

    Reads "C" and the nested "kernel" entry of `parametri`, builds an SVC,
    cross-validates it on the module-level x_train / y_train and returns
    1 minus the mean of the scores.
    """
    C = parametri["C"]
    jedro = parametri["kernel"]
    kernel = jedro["tip"]

    # gamma and degree must be passed in all three cases; wherever the sampled
    # point does not carry one of them, it is set to 1.
    # NOTE(review): scikit-learn's SVC also applies gamma to the 'poly' kernel,
    # so gamma=1 is a real setting there rather than a no-op -- confirm intent.
    neumna_vrednost = 1
    gamma = jedro["gamma"] if kernel == "rbf" else neumna_vrednost
    degree = neumna_vrednost if kernel in ("rbf", "linear") else jedro["degree"]

    model = SVC(kernel=kernel, gamma=gamma, C=C, degree=degree)
    rezultati = cross_val_score(model, x_train, y_train)
    return 1 - rezultati.mean()

NameError: name 'hp' is not defined

In [None]:
# Run 100 evaluations of the SVM objective over its search space.
best_svm, xs_svm, ys_svm = poisci_najboljse_parametre(
    prostor=prostor_svm,
    kriterijska_fun=kriterijska_funkcija_svm,
    n_izracunov=100,
)

In [None]:

# Refit an SVM with the best parameters on the training rows and score it on
# the held-out test rows. gamma / degree fall back to 1 whenever the chosen
# kernel option does not carry them, mirroring the SVM objective.
neumna_vrednost = 1
najboljse_jedro = best_svm["kernel"]
tip_jedra = najboljse_jedro["tip"]
gamma = najboljse_jedro["gamma"] if tip_jedra == "rbf" else neumna_vrednost
degree = (
    neumna_vrednost
    if tip_jedra in ("rbf", "linear")
    else najboljse_jedro["degree"]
)

model_svm = SVC(kernel=tip_jedra, gamma=gamma, C=best_svm["C"], degree=degree)

model_svm.fit(x_train, y_train)
y_predict_svm = model_svm.predict(x_test)
acc_svm = accuracy_score(y_test, y_predict_svm)
print('Klasifikacijska točnost najboljšega SVM na testni množici je:', acc_svm)

In [None]:
# One scatter plot per SVM parameter: recorded parameter value against the
# score (1 - loss) of the same trial. The kernel is plotted as the index of
# the chosen option, so its title carries a legend for the indices.
parametri_svm = ["C", "kernel"]
cols = len(parametri_svm)
f, axes = plt.subplots(nrows=1, ncols=cols, figsize=(20, 5))

for i, val in enumerate(parametri_svm):
    xss = [poskus[val] for poskus in xs_svm]
    axes[i].scatter(xss, ys_svm, s=20, linewidth=0.01, alpha=0.8, color='C4')
    if val == "kernel":
        # Option 2 of the kernel choice is the 'poly' kernel; 'degree' is only
        # that kernel's extra parameter, so the legend names the kernel.
        axes[i].set_title(val + ", (0=linear, 1=rbf, 2=poly)")
    else:
        axes[i].set_title(val)

In [None]:

# Distribution of the scores reached by all SVM trials.
x_svm = pd.Series(data=ys_svm, name="klasifikacijska točnost")
ax_svm = sns.distplot(
    x_svm,
    rug=True,
    bins=10,
    color='C4',
)