## Loading Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import plotly.express as px
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

import plotly.graph_objs as go

## Loading Data

In [2]:
X_train = pd.read_csv('Xtrain_feature_sel.csv')
X_test = pd.read_csv('Xtest_feature_sel.csv')
y_train = pd.read_csv('ytrain_mod.csv')
y_test = pd.read_csv('ytest_mod.csv')
print("Shape of X Train: {}".format(X_train.shape))
print("Shape of X Test: {}".format(X_test.shape))
print("Shape of y Train: {}".format(y_train.shape))
print("Shape of y Test: {}".format(y_test.shape))

Shape of X Train: (8672, 19)
Shape of X Test: (2168, 19)
Shape of y Train: (8672, 1)
Shape of y Test: (2168, 1)


## Grid Search

In [9]:
# defining the parameters
#parameters= {'learning_rate':[0.1],
#             'n_estimators':[100],
#             'max_depth':[2,3],
#             'min_samples_leaf':[200]}
parameters= {'learning_rate':[0.1,0.01],
             'n_estimators':[10,50,100],
             'max_depth':[2,3,4],
             'min_samples_leaf':[3,1,5]}
# defining cross-validation method
#cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# defining the model
model = GradientBoostingClassifier()
# defining the grid serach (full grid here)
#Define the scoring
clf=GridSearchCV(model,parameters,scoring='recall',cv=5,n_jobs=-1) # with n_jobs=-1 we run the computation in parallel
clf.fit(X=X_train, y=y_train)
# getting best parameters
print(clf.best_params_)


One or more of the test scores are non-finite: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



{'learning_rate': 0.1, 'max_depth': 2, 'min_samples_leaf': 3, 'n_estimators': 10}


Mit diesen Parameter ergab das GBM im Notebook 8_GBM_Decision eine schlächtere Accuracy.

### Plot of the best parameters

In [4]:
# Get the best parameters found by the random search
best_params = clf.best_params_

# Create a dictionary of hyperparameter names and their values
hyperparams = {
    'n_estimators': best_params['n_estimators'],
    'learning_rate': best_params['learning_rate'],
    'max_depth': best_params['max_depth'],
    #'min_samples_split': best_params['min_samples_split'],
    'min_samples_leaf': best_params['min_samples_leaf'],
    #'max_features': best_params['max_features']
}

# Create a bar chart of the hyperparameters
fig = go.Figure(data=[go.Bar(
    x=list(hyperparams.keys()),
    y=list(hyperparams.values()),
    marker=dict(
        color=['rgba(255, 0, 0, 0.7)' if key in best_params else 'rgba(0, 0, 255, 0.7)' for key in hyperparams.keys()]
    )
)])
# Set the chart title and axis labels
fig.update_layout(
    title='Best Hyperparameters',
    xaxis_title='Hyperparameter',
    yaxis_title='Value'
)

# Display the chart
fig.show()

### Prediction

In [5]:
# Get the best CV model
best_model = clf.best_estimator_

# Make a prediction for the test set
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Print the predicted values
# print(y_pred_test)

### Evaluation

#### Confusion Matrix

In [6]:
print("Confusion matrix of the training set: {}".format(metrics.confusion_matrix(y_train,y_pred_train)))
print("Confusion matrix of the test set: {}".format(metrics.confusion_matrix(y_test,y_pred_test)))

Confusion matrix of the training set: [[  1   1   0  11   0   1   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   8   0  43   0  10   0   0   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   2   0  41   2  21   0   1   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   6   0 169   5 111   0  12   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   2   0  53   8  75   0  19   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   2   0  93   7 309   0 157   0   0   0   0   0   0   0   0   0   0
    0   0]
 [  0   1   0  16   0  95   0 136   0   8   0   0   0   0   0   0   0   0
    0   0]
 [  0   2   0   4   0  96   0 510   0 114   0   4   0   0   0   0   0   0
    0   0]
 [  0   1   0   0   0   5   0 212   0 145   0  20   0   0   0   0   0   0
    0   0]
 [  0   1   0   1   0   1   0 124   0 529   0 176   0   1   0   0   0   0
    0   0]
 [  0   1   0   0   0   2   0   1   0 166   0 209   0   6   0   0   0   0
    0   0]
 [  0   0   0   0   0   2  

#### Accuracy

In [7]:
print("Accuracy Score for the training set: {}".format(metrics.accuracy_score(y_train, y_pred_train)))
print("Accuracy Score for the test set: {}".format(metrics.accuracy_score(y_test, y_pred_test)))

Accuracy Score for the training set: 0.5215636531365314
Accuracy Score for the test set: 0.5285977859778598


## Random Search

Konnte mit der vorhandenen Hardware nicht berechnet werden, dauerte zu lange.

In [8]:
param_dist = {
    'n_estimators': sp_randint(50, 500),
    'learning_rate': [0.001, 0.01, 0.1, 0.5],
    'max_depth': sp_randint(3, 10),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 10),
    'max_features': sp_randint(1, len(X_train.columns))
}

# defining cross-validation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# defining the model
model = GradientBoostingClassifier()
# defining the grid serach (full grid here)
# Define the random search with cross-validation
n_iter_search = 100
random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5)

# Fit the random search to the data
random_search.fit(X_train, y_train)

#  Print the best hyperparameters and the corresponding accuracy score
print("Best parameters found: {}".format(random_search.best_params_))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

KeyboardInterrupt: 

### Plot of the best parameters

In [None]:
# Get the best parameters found by the random search
best_params = random_search.best_params_

# Create a dictionary of hyperparameter names and their values
hyperparams = {
    'n_estimators': best_params['n_estimators'],
    'learning_rate': best_params['learning_rate'],
    'max_depth': best_params['max_depth'],
    'min_samples_split': best_params['min_samples_split'],
    'min_samples_leaf': best_params['min_samples_leaf'],
    'max_features': best_params['max_features']
}

# Create a bar chart of the hyperparameters
fig = go.Figure(data=[go.Bar(
    x=list(hyperparams.keys()),
    y=list(hyperparams.values()),
    marker=dict(
        color=['rgba(255, 0, 0, 0.7)' if key in best_params else 'rgba(0, 0, 255, 0.7)' for key in hyperparams.keys()]
    )
)])
# Set the chart title and axis labels
fig.update_layout(
    title='Best Hyperparameters',
    xaxis_title='Hyperparameter',
    yaxis_title='Value'
)

# Display the chart
fig.show()

### Prediction

In [None]:
# Get the best CV model
best_model = random_search.best_estimator_

# Make a prediction for the test set
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Print the predicted values
# print(y_pred_test)

### Evaluation

#### Confusion Matrix

In [None]:
print("Confusion matrix of the training set: {}".format(metrics.confusion_matrix(y_train,y_pred_train)))
print("Confusion matrix of the test set: {}".format(metrics.confusion_matrix(y_test,y_pred_test)))

Confusion matrix of the training set: [[52126   669]
 [  932 29846]]
Confusion matrix of the test set: [[20657  1714]
 [ 2510 10936]]


#### Accuracy

In [None]:
print("Accuracy Score for the training set: {}".format(metrics.accuracy_score(y_train, y_pred_train)))
print("Accuracy Score for the test set: {}".format(metrics.accuracy_score(y_test, y_pred_test)))

Accuracy Score for the training set: 0.9808430952580379
Accuracy Score for the test set: 0.8820671748052601
