# Exercise 5 - kNN & SVM
Necessary libraries are being loaded.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 5.1 Extract from the data preparation
Explain with comments in the code what is performed here by the individual commands.

In [8]:
# NOTE(review): `preprocessing` is imported here but not used in any cell visible in this notebook.
from sklearn import preprocessing

# Read the prepared data set from the CSV file into a DataFrame.
# NOTE(review): the displayed output contains an 'Unnamed: 0' column, which is presumably a
# row index that was written to the CSV - consider read_csv(..., index_col=0); confirm.
df = pd.read_csv('prepared_data.csv')

# Work on a copy so that the freshly loaded frame `df` is not modified by later steps.
dfPrepared = df.copy()
# Show the first rows of the copy as the cell output.
dfPrepared.head()

Unnamed: 0,weekday,daytime,isHoliday,distance,count,startClusterName,startClusterZip,startClusterID,endClusterName,endClusterZip,endClusterID,year,month,total
0,Di,7,0,3,15,Hunedoara,71171,3254026000002,Hunedoara,71171,3254026000007,2021,3,45
1,Mi,17,0,0,10,Turda,80982,3241013050002,Turda,80982,3241013050008,2021,3,0
2,Sa,11,0,3,10,Turda,80982,3241013050007,Turda,80982,3241013030001,2021,3,30
3,Fr,13,0,0,15,Bran,91157,3254028001012,Bran,91157,3254028001004,2021,3,0
4,Fr,18,0,7,10,Bran,91157,3254028001003,Turda,80982,3241013070001,2021,3,70


## 5.2 One-Hot-Encoding & Normalization
Now carry out binary encoding for all relevant features and save the result in the dfTrans data frame.

In [9]:
# One-hot encode the categorical columns and store the result in dfTrans, as the task
# asks and as the following cells (heat map, normalisation) expect.
# dfPrepared itself is left untouched, so this cell can be re-run without a KeyError.

# Weekday dummies get no prefix: the new column names are the weekday labels themselves.
dfTrans = pd.get_dummies(dfPrepared, prefix='', prefix_sep='', columns=['weekday'])
# Start / end cluster dummies are named 'start_<name>' and 'end_<name>'.
dfTrans = pd.get_dummies(dfTrans, prefix='start', prefix_sep='_', columns=['startClusterName'])
dfTrans = pd.get_dummies(dfTrans, prefix='end', prefix_sep='_', columns=['endClusterName'])

dfTrans.head()

Unnamed: 0,daytime,isHoliday,distance,count,startClusterZip,startClusterID,endClusterZip,endClusterID,year,month,...,Mon,Sa,Sam,So/Fe,start_Bran,start_Hunedoara,start_Turda,end_Bran,end_Hunedoara,end_Turda
0,7,0,3,15,71171,3254026000002,71171,3254026000007,2021,3,...,False,False,False,False,False,True,False,False,True,False
1,17,0,0,10,80982,3241013050002,80982,3241013050008,2021,3,...,False,False,False,False,False,False,True,False,False,True
2,11,0,3,10,80982,3241013050007,80982,3241013030001,2021,3,...,False,True,False,False,False,False,True,False,False,True
3,13,0,0,15,91157,3254028001012,91157,3254028001004,2021,3,...,False,False,False,False,True,False,False,True,False,False
4,18,0,7,10,91157,3254028001003,80982,3241013070001,2021,3,...,False,False,False,False,True,False,False,False,False,True


Look at the correlations with the heat map. Remove characteristics that correlate 100% with others.

In [None]:
import seaborn as sns

# Pairwise correlations of all columns of the encoded frame, drawn as a heat map.
corr_matrix = dfTrans.corr()
sns.heatmap(corr_matrix)

In [None]:
# space

Now all non-binary attributes must be normalized. This time minmax_scale should be used for this. Use comments to explain the function of the individual lines of code.

In [None]:
from sklearn.preprocessing import minmax_scale

# Columns that are rescaled to the range [0, 1].
# NOTE(review): these two column names do not appear in the table output shown for
# prepared_data.csv earlier in this notebook - confirm they exist in dfTrans.
scaled_columns = ['age_first_order', 'pages_visited_avg']

# Work on a copy so that dfTrans keeps the unscaled values.
dfnorm = dfTrans.copy()

# minmax_scale returns a 2-D array with one column per input column (same order).
scaled = minmax_scale(dfnorm[scaled_columns], feature_range=(0, 1))

# Write the scaled values back over the original columns.
dfnorm[scaled_columns] = scaled

# Show the first rows of the two rescaled columns.
dfnorm[scaled_columns].head()

Have a short look at our prepared data frame. All numeric columns with a natural order have been normalised (min-max scaled to the range 0 to 1). All columns without a natural order are one-hot encoded. Our target column ‘high revenue’ is also binary. We want it that way because we perform a classification and assign each data record either to high revenue (1) or not (0).

In [None]:
# Display the normalised data frame as the cell output.
dfnorm

## 5.3 Training & evaluation of the models

In the following, the algorithms kNN and SVM are to be trained and tested. For this purpose, a data split of 70 % (training data) to 30 % (test data) must be carried out in advance.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the features from the binary target column.
target_column = 'high revenue'
x = dfnorm.drop(columns=[target_column])
y = dfnorm[target_column].astype(int)

# 70 % training data, 30 % test data. The split is stratified on y and uses a
# fixed random_state, so it is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=101,
)

### A1: k-Nearest Neighbors (kNN)
Train the kNN on the training data and evaluate on the test data. Test different values for k.

The following visualisation will help you to try out good ks. Here you can see again the change in the error rate with different k.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Candidate neighbourhood sizes k = 1 ... 29 and the test error measured for each of them.
k_range = range(1, 30)
error_rate = []

for k in k_range:
    # Fit a kNN classifier with the current k on the training data ...
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)

    # ... and record its error rate (1 - accuracy) on the test data.
    # After the loop, knn and y_pred belong to the last k tried (k = 29).
    y_pred = knn.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    error_rate.append(1 - test_accuracy)

# Error rate as a function of k.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, error_rate, color='blue', linestyle='dashed', marker='o',
        markerfacecolor='red', markersize=10)
ax.set_title('Elbow method for KNN')
ax.set_xlabel('Number of neighbours: k')
ax.set_ylabel('Error rate')
plt.show()

In [None]:
# space

Let's take a quick look at the distribution of the predicted classes

In [None]:
import numpy as np

# Distinct predicted classes (sorted) and how often each one was predicted.
# y_pred is whatever the most recently executed prediction cell produced.
predicted_classes, class_counts = np.unique(y_pred, return_counts=True)
n_predictions = len(y_pred)

# One line per class: absolute count and share of all predictions in percent.
for predicted_class, n_in_class in zip(predicted_classes, class_counts):
    share = (n_in_class / n_predictions) * 100
    print(f'Class {predicted_class}: {n_in_class} Datapoints ({share:.2f}%)')

This distribution also becomes apparent when we visualise the data points. Note: The n-dimensional space is reduced to 3 dimensions using PCA (theoretical background not part of the class!) in order to make visualisation possible. PCA (Principal Component Analysis) does not simply select three existing features, but creates new features (the so-called principal components), which are linear combinations of the original features. The aim is to reduce the dimensions in such a way that the maximum possible variance is retained in the data.

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# Fit a 3-component PCA on the training data and project both splits with it.
pca = PCA(n_components=3)
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# One coordinate array per principal component of the test data.
pc1, pc2, pc3 = x_test_pca.T

# 3-D scatter plot of the test points, coloured by the predicted class (y_pred).
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(pc1, pc2, pc3, c=y_pred, cmap='viridis', s=50, alpha=0.8)

# Legend with one entry per predicted class.
handles, labels = scatter.legend_elements()
legend1 = ax.legend(handles, labels, title="classes")
ax.add_artist(legend1)

# Axis labelling
ax.set(xlabel='PCA 1', ylabel='PCA 2', zlabel='PCA 3')

plt.show()

#### Model evaluation
Then use the scorer or other addressed methods to evaluate the Confusion Matrix and the accuracy of your models.

In [None]:
# space

In [None]:
# space

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Share of correctly classified test records.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Unpack the 2x2 confusion matrix row by row:
# first row = actual class 0 (tn, fp), second row = actual class 1 (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

# TPR: share of actual positives that were predicted as positive.
tpr = tp / (fn + tp)
print(f'True Positive Rate (TPR): {tpr:.2f}')

# FPR: share of actual negatives that were predicted as positive.
fpr = fp / (tn + fp)
print(f'False Positive Rate (FPR): {fpr:.2f}')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# NOTE(review): knn_model is not defined in any cell above this one in the notebook as shown;
# presumably it is meant to be created in the placeholder cell of section A1 - confirm.
# Predicted probability of the positive class (second column of predict_proba).
y_proba = knn_model.predict_proba(x_test)[:, 1]

# Points of the ROC curve and the area under it.
# fpr / tpr are arrays here and replace the scalar values of the metrics cell.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc_score = roc_auc_score(y_test, y_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {auc_score:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

## A1.1 kNN with Shepard's method

### For comparison, look at the same algorithm but with Shepard's method

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# kNN with k = 23 and weights='distance' (the distance-weighted variant this
# section refers to as Shepard's method), fitted on the training data.
knn_model = KNeighborsClassifier(n_neighbors=23, weights='distance').fit(x_train, y_train)

# Predictions for the test data and the per-class report for them.
y_pred = knn_model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

### A2: Support Vector Machine
Train an SVM on the training data and evaluate on the test data. Test with the parameter transfer (kernel = 'rbf', gamma = 'scale').

In [None]:
# space

#### Model evaluation
Then use the scorer or other addressed methods to evaluate the Confusion Matrix and the accuracy of your models.

In [None]:
# space

In [None]:
# space

In [None]:
import numpy as np

# Distinct predicted classes (sorted) and the number of test records assigned to each.
# NOTE(review): y_pred is presumably the SVM prediction from the placeholder cells above - confirm.
predicted_classes, class_counts = np.unique(y_pred, return_counts=True)
n_predictions = len(y_pred)

# Report the absolute count and the percentage share per class.
for predicted_class, n_in_class in zip(predicted_classes, class_counts):
    share = (n_in_class / n_predictions) * 100
    print(f'Class {predicted_class}: {n_in_class} Datapoints ({share:.2f}%)')

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Accuracy of the current predictions on the test data.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Rows of the 2x2 confusion matrix: actual class 0 -> (tn, fp), actual class 1 -> (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

# True positive rate = tp / all actual positives.
tpr = tp / (fn + tp)
print(f'True Positive Rate (TPR): {tpr:.2f}')

# False positive rate = fp / all actual negatives.
fpr = fp / (tn + fp)
print(f'False Positive Rate (FPR): {fpr:.2f}')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC

# RBF-kernel SVM; probability=True is required so that predict_proba is available.
svm_model = SVC(kernel='rbf', gamma='scale', random_state=1, probability=True)
svm_model.fit(x_train, y_train)

# Probability of the positive class for every test record.
y_prob = svm_model.predict_proba(x_test)[:, 1]

# ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

# Draw the ROC curve together with the diagonal reference line.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve for SVM Model')
ax.legend(loc="lower right")
plt.show()

### Evaluation of the algorithms
Assess the quality of the two algorithms by evaluating the results. Which one would you use?

In [None]:
# space

## 5.4 Optimization with GridSearch (Preview)
Gridsearch is a method for hyperparameter optimization. Gridsearch iteratively tries out all combinations of the selected parameters. The combination with the highest score is then output.

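For the sake of simplicity, cross-validation is not discussed in detail in this exercise. Note, however, that `GridSearchCV` scores every parameter combination with 5-fold cross-validation on the training data by default.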

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

### A1: SVM

In [None]:
from sklearn.svm import SVC

# Base estimator and the hyperparameter grid: every kernel/gamma combination
# (3 x 2 = 6 candidates) is evaluated by the grid search.
svm_model = SVC(random_state = 1)
param_grid1 = {"kernel": ["poly", "rbf", "sigmoid"], 
              "gamma": ["scale","auto"] }
gridSearch1 = GridSearchCV(estimator = svm_model, param_grid = param_grid1)
gridSearch1.fit(x_train, y_train)

# One row per parameter combination, including its score and rank.
results1 = pd.DataFrame(gridSearch1.cv_results_)

# rank_test_score is 1 for the BEST combination, so the best row is the one with the
# smallest rank (idxmin). idxmax would return the worst-ranked combination instead.
# The selected parameters are the same as gridSearch1.best_params_.
params1 = results1.loc[results1['rank_test_score'].idxmin()]
params1['params']

Now we want to train the model with the best parameters.

In [None]:
# space

Now look at the ConfusionMatrix again and compare.

In [None]:
# space

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# NOTE(review): y_pred is presumably the prediction of the model trained with the
# grid-search parameters in the placeholder cells above - confirm.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# 2x2 confusion matrix, unpacked row by row: (tn, fp) for actual 0, (fn, tp) for actual 1.
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

# Share of actual positives that were found.
tpr = tp / (fn + tp)
print(f'True Positive Rate (TPR): {tpr:.2f}')

# Share of actual negatives that were wrongly flagged as positive.
fpr = fp / (tn + fp)
print(f'False Positive Rate (FPR): {fpr:.2f}')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import SVC

# NOTE(review): kernel and gamma are hard-coded here - confirm that they match the
# parameter combination selected by the grid search above.
# probability=True is required so that predict_proba is available.
svm_model = SVC(kernel='sigmoid', gamma='scale', random_state=1, probability=True)
svm_model.fit(x_train, y_train)

# Probability of the positive class for every test record.
y_prob = svm_model.predict_proba(x_test)[:, 1]

# ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = roc_auc_score(y_test, y_prob)

# ROC curve plus the diagonal reference line.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve for SVM Model')
ax.legend(loc="lower right")
plt.show()

### A2: k-Nearest Neighbours

Now adapt the GridSearch to k-Nearest Neighbors and select different values for metric and n_neighbors in param_grid.

In [None]:
# space 

Now look at the ConfusionMatrix again and compare.

In [None]:
# Print the confusion matrix of the current predictions against the true test labels.
# NOTE(review): y_pred is presumably produced by the kNN grid-search cell above - confirm.
print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Accuracy of the current predictions on the test data.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Unpack the 2x2 confusion matrix: first row (tn, fp), second row (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)

# True positive rate: tp relative to all actual positives.
tpr = tp / (fn + tp)
print(f'True Positive Rate (TPR): {tpr:.2f}')

# False positive rate: fp relative to all actual negatives.
fpr = fp / (tn + fp)
print(f'False Positive Rate (FPR): {fpr:.2f}')

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

# NOTE(review): knn_model is whichever kNN model was assigned to that name last;
# confirm that it is the grid-search model and not the one from section A1.1.
# Predicted probability of the positive class (second column of predict_proba).
y_proba = knn_model.predict_proba(x_test)[:, 1]

# ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc_score = roc_auc_score(y_test, y_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f'ROC curve (AUC = {auc_score:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()

Although our true positive rate has improved, the ROC curve has deteriorated. Why?

In [None]:
# space