In [21]:
import pandas as pd
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
import seaborn as sns
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from keras.models import Sequential
from keras.layers import Dense

In [22]:
data = pd.read_csv("german_encoded.csv")

In [23]:
X_1 = data.iloc[:,:12]
X_2 = data.iloc[:,13:]
X = pd.concat([X_1, X_2], axis=1)
y = data.iloc[:,12]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# SVM

In [24]:
def linear():
    model = LinearSVC()
    scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_macro', error_score="raise")
    print('CV score:', scores.mean(), "+/-", scores.std())
    
linear()

CV score: 0.38146594192979455 +/- 0.04575627959554281


Results:
- CV score: 0.38146594192979455 +/- 0.04575627959554281

In [25]:
model = SVC()
scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_macro', error_score="raise")
print('CV score:', scores.mean(), "+/-", scores.std())

CV score: 0.3506991601863548 +/- 0.039759176896764635


Results:
- CV score: 0.3506991601863548 +/- 0.039759176896764635

## GridSearch

In [26]:
param_grid= {'kernel': ('linear', 'rbf'),'C': [5,10, 20, 30, 40, 100], 'gamma': [0.05, 0.01, 0.005, 0.001]}
base_estimator = SVC()
sh = GridSearchCV(base_estimator, param_grid, cv=10,scoring = 'f1_macro').fit(X_train, y_train)

In [27]:
param = sh.best_params_
score = sh.score(X_train, y_train)
print("Best param:", param)
print("Best score:",score)

Best param: {'C': 10, 'gamma': 0.05, 'kernel': 'rbf'}
Best score: 0.9726810007795694


Results:
- {'C': 10, 'gamma': 0.05, 'kernel': 'rbf'}, score = 0.9726810007795694

# SGD

In [28]:
model = SGDClassifier(loss = 'log_loss')

scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_macro', error_score="raise")
print(scores.mean())

0.27968085058027875


Results:
- 0.27968085058027875

In [29]:
param_grid = {
    'alpha': [0.0001, 0.00005, 0.00007, 0.00009],
    'penalty': ['l2','l1'],
    'eta0': [0.0001, 0.001, 0.005, 0.007, 0.009, 0.01, 0.015]
}

# create a GridSearchCV object with the specified hyperparameter grid
grids = GridSearchCV(model, param_grid, cv=10, scoring = 'f1_macro', verbose = 1)

# fit the grid search object to the data
grids.fit(X_train , y_train)

# print the best hyperparameters and the corresponding score
print("Best hyperparameters: ", grids.best_params_)
print("Best score: ", grids.best_score_)

Fitting 10 folds for each of 56 candidates, totalling 560 fits
Best hyperparameters:  {'alpha': 0.0001, 'eta0': 0.007, 'penalty': 'l2'}
Best score:  0.3157551526216903


Results:
- {'alpha': 0.0001, 'eta0': 0.007, 'penalty': 'l2'}, score = 0.3157551526216903

# RandomForest

In [30]:
model = RandomForestClassifier()
scores = cross_val_score(model, X_train, y_train, cv=10, scoring='f1_macro', error_score="raise")
print('CV score:', scores.mean(), "+/-", scores.std())

CV score: 0.4244119019294866 +/- 0.05548275123586658


Results:
- CV score: 0.4244119019294866 +/- 0.05548275123586658

In [31]:
model.fit(X_train,y_train)

In [32]:
from sklearn.tree import export_text
# Print out the trees in the ensemble
for i, tree in enumerate(model.estimators_):
    print(f"Tree {i}:\n{export_text(tree)}\n")

Tree 0:
|--- feature_1 <= 0.50
|   |--- feature_10 <= 3.50
|   |   |--- feature_15 <= 1.50
|   |   |   |--- feature_9 <= 0.50
|   |   |   |   |--- feature_8 <= 2.50
|   |   |   |   |   |--- feature_10 <= 2.50
|   |   |   |   |   |   |--- feature_8 <= 0.50
|   |   |   |   |   |   |   |--- class: 3.0
|   |   |   |   |   |   |--- feature_8 >  0.50
|   |   |   |   |   |   |   |--- feature_6 <= 1.50
|   |   |   |   |   |   |   |   |--- feature_13 <= 0.50
|   |   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |   |   |--- feature_13 >  0.50
|   |   |   |   |   |   |   |   |   |--- feature_15 <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- class: 2.0
|   |   |   |   |   |   |   |   |   |--- feature_15 >  0.50
|   |   |   |   |   |   |   |   |   |   |--- feature_12 <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |   |   |   |   |--- feature_12 >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Tree 91:
|--- feature_3 <= 2.50
|   |--- feature_6 <= 3.50
|   |   |--- feature_15 <= 1.50
|   |   |   |--- feature_4 <= 0.50
|   |   |   |   |--- feature_12 <= 1.00
|   |   |   |   |   |--- feature_0 <= 1.50
|   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |--- feature_0 >  1.50
|   |   |   |   |   |   |--- class: 2.0
|   |   |   |   |--- feature_12 >  1.00
|   |   |   |   |   |--- feature_6 <= 1.50
|   |   |   |   |   |   |--- feature_17 <= 0.50
|   |   |   |   |   |   |   |--- feature_18 <= 0.50
|   |   |   |   |   |   |   |   |--- feature_6 <= 0.50
|   |   |   |   |   |   |   |   |   |--- class: 2.0
|   |   |   |   |   |   |   |   |--- feature_6 >  0.50
|   |   |   |   |   |   |   |   |   |--- feature_11 <= 1.00
|   |   |   |   |   |   |   |   |   |   |--- class: 2.0
|   |   |   |   |   |   |   |   |   |--- feature_11 >  1.00
|   |   |   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |   |--- feature_18 >  0.50
|   |   |   |   |   |   |   |   |--- cl

In [33]:
# Get the list of decision tree estimators
estimators = model.estimators_

# Find the best decision tree
best_tree = None
best_score = -1

for estimator in estimators:
    score = estimator.score(X_train, y_train)
    if score > best_score:
        best_tree = estimator
        best_score = score

# Print the best tree
tree_str = export_text(best_tree)
print(tree_str)

|--- feature_8 <= 1.50
|   |--- feature_6 <= 3.50
|   |   |--- feature_0 <= 1.50
|   |   |   |--- feature_17 <= 0.50
|   |   |   |   |--- feature_13 <= 1.50
|   |   |   |   |   |--- feature_13 <= 0.50
|   |   |   |   |   |   |--- feature_14 <= 1.50
|   |   |   |   |   |   |   |--- feature_5 <= 1.50
|   |   |   |   |   |   |   |   |--- feature_1 <= 1.50
|   |   |   |   |   |   |   |   |   |--- feature_3 <= 1.50
|   |   |   |   |   |   |   |   |   |   |--- feature_4 <= 1.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 1.0
|   |   |   |   |   |   |   |   |   |   |--- feature_4 >  1.50
|   |   |   |   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |   |   |   |--- feature_3 >  1.50
|   |   |   |   |   |   |   |   |   |   |--- feature_12 <= 1.00
|   |   |   |   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |   |   |   |   |--- feature_12 >  1.00
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   



# k-NN

In [34]:
param_grid= {'n_neighbors': [15,20,25],'p': [1,2]}
models = KNeighborsClassifier()
ch = GridSearchCV(models, param_grid, cv=10,scoring = 'f1_micro').fit(X_train, y_train)

In [35]:
k_param = ch.best_params_
k_score = ch.best_score_
print('best param is', k_param)
print('best score is', k_score)

best param is {'n_neighbors': 15, 'p': 1}
best score is 0.3986666666666666


Results:
- {'n_neighbors': 15, 'p': 1}, score = 0.3986666666666666

# Decision Tree

In [36]:
# Define the decision tree classifier and the hyperparameters to optimize
clf = DecisionTreeClassifier()
param_grid = {'max_depth': [7, 8, 9],
              'min_samples_split': [9, 10, 11],
              'min_samples_leaf': [1, 2, 3, 4, 5]}

# Perform the grid search with cross-validation
dtgs = GridSearchCV(clf, param_grid=param_grid, cv=10,scoring = 'f1_micro')
dtgs.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding evaluation metric
print("Best hyperparameters:", dtgs.best_params_)
print("Best score:", dtgs.best_score_)

Best hyperparameters: {'max_depth': 8, 'min_samples_leaf': 4, 'min_samples_split': 11}
Best score: 0.4253333333333334


Results:
- {'max_depth': 8, 'min_samples_leaf': 4, 'min_samples_split': 11}, score = 0.4253333333333334

# Neural Network

In [37]:
# build the neural network model
nn = Sequential()
nn.add(Dense(10, input_dim=20, activation='relu'))
nn.add(Dense(5, activation='relu'))
nn.add(Dense(1, activation='sigmoid'))

# compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the model to the training data
nn.fit(X_train, y_train, epochs=50, batch_size=32)

# evaluate the model on the test data
loss, accuracy = nn.evaluate(X_train, y_train)

# print the test accuracy
print('Test accuracy:', accuracy)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.2253333330154419


Results:
- Test accuracy: 0.2253333330154419, 50 epochs, 10 in second layer, 5 in third layer, batch size = 32

# Scores

SVM: {'C': 20, 'gamma': 0.01, 'kernel': 'rbf'}, score = 0.8290713574757445

SGD: {'alpha': 7e-05, 'eta0': 0.005, 'penalty': 'l2'}, score = 0.6616935571076444

RandomForest: CV score: 0.663357274635801 +/- 0.04059552799388036

k-NN: {'n_neighbors': 20, 'p': 1}, score = 0.752

Decision Tree: {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 10}, score = 0.72

Neural Network: Test accuracy: 0.699999988079071, 50 epochs, 10 in second layer, batch size = 32

**SVM is the highest**