# 2 - Logistic Regression & Support Vector Classifiers / Support Vector Machines

In [None]:
# Dummy placeholder values for the names used by the cells below
df = y = x = x_sm = None

## Logistic Regression

In [None]:
# fit model
# A GLM with a Binomial family is fitted for y on x_sm (logistic regression);
# the fitted results object is kept in `model` for the prediction cell below.
import statsmodels.api as sm
model = sm.GLM(y, x_sm, family=sm.families.Binomial()).fit()

In [None]:
# make prediction
# Each list passed to predict() is one row of regressors, in the column order of x_sm.
# NOTE(review): the leading 1 is presumably the constant/intercept column of x_sm
# (e.g. added via sm.add_constant), not the "default" response - confirm.
# The second entry is the student indicator: 1 = student, 0 = non-student.
prob_stud = model.predict([1, 1])
prob_nonstud = model.predict([1, 0])

print(f'Probability of default for student: {prob_stud[0]:.3f}')
print(f'Probability of default for non-student: {prob_nonstud[0]:.3f}')

## Support Vector Classifiers

In [None]:
from sklearn import svm
import numpy as np
import matplotlib.pyplot as plt


# Toy data set: seven observations with two features and a colour label each
x1 = [3, 2, 4, 1, 2, 4, 4]
x2 = [4, 2, 4, 4, 1, 3, 1]
y = ['red'] * 4 + ['blue'] * 3

# Stack the feature lists as rows, then transpose -> one row per observation
x = np.array([x1, x2]).T

In [None]:
# fit a model
# Linear-kernel support vector classifier; `cost` is passed to svm.SVC as its
# parameter C. The classifier is then fitted on the features x and labels y.
cost = 10
clf = svm.SVC(kernel='linear', C=cost)
clf.fit(x, y)

In [10]:
# Error rate on the data passed in x: share of observations predicted wrongly
y_pred = clf.predict(x)
n = x.shape[0]  # number of observations
error = (n - (y_pred == y).sum()) / n

In [None]:
# Print attributes of the fitted classifier
# support_vectors_ has one row per support vector, so its length is the total count
print(f'Number of support vectors: {len(clf.support_vectors_)}')
# n_support_ holds one count per class, so the label says so
print(f'Number of support vectors per class: {clf.n_support_}')

In [None]:
# Draw the fitted separating line together with the observations.
# The boundary satisfies beta0 + beta1 * x1 + beta2 * x2 = 0.
beta0 = clf.intercept_[0]
beta1, beta2 = clf.coef_[0, 0], clf.coef_[0, 1]

# Two end points are enough for a straight line; solve the boundary for x2
x1_hyperplane = np.linspace(1, 4, 2)
x2_hyperplane = - beta1 / beta2 * x1_hyperplane - beta0 / beta2

fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(1, 1, 1)

ax.plot(x1_hyperplane, x2_hyperplane, '-k')
ax.scatter(x1, x2, c=y)

ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_title("Maximal margin Hyperplane")
plt.show()

#### Cross Validation


In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# Candidate values for the cost parameter C (other options can be added to the grid)
n_folds = 10
costs = np.linspace(1, 50, 20)
tune_parameters = {'C': costs}

# Grid search over the candidates, cross-validated with cv=n_folds
clf_tune = GridSearchCV(estimator=svm.SVC(kernel='linear'),
                        param_grid=tune_parameters,
                        cv=n_folds)
clf_tune.fit(x, y)

# Mean CV error per candidate, and the fold-score std scaled by 1/sqrt(n_folds)
error_tune = 1 - clf_tune.cv_results_['mean_test_score']
error_std = clf_tune.cv_results_['std_test_score'] / np.sqrt(n_folds)

best_cost = clf_tune.best_params_['C']
print(f"Best cost: {best_cost:.2f}")

In [None]:
# optionally plot the cross-validation error
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(1, 1, 1)

# Mean cross-validation error for every cost value
ax.plot(costs, error_tune,
        '-k', alpha=0.8, label='Cross validation error')

# Band of +/- error_std around the mean. Only the upper line gets the label:
# a `label` passed to a multi-line plot() call is attached to every line and
# would show up twice in the legend.
ax.plot(costs, error_tune + error_std, '--b',
        alpha=0.8, label='Cross validation error standard deviation')
ax.plot(costs, error_tune - error_std, '--b', alpha=0.8)

ax.set_xlabel('cost')
ax.set_ylabel('error')
plt.legend()
plt.show()

## Support Vector Machines: Polynomial Kernel
kernel='poly'

In [None]:
# Hyper-parameter grid: cost C and polynomial degree
n_folds = 10
costs = np.linspace(0.5, 100, 5)
degree = [1, 2, 3, 4, 5]
tune_parameters = {'C': costs,
                   'degree': degree}

# Tune the polynomial-kernel SVM over the whole grid
clf_tune = GridSearchCV(estimator=svm.SVC(kernel='poly'),
                        param_grid=tune_parameters,
                        cv=n_folds)
clf_tune.fit(x, y)

# CV error reshaped into a table with len(costs) rows and len(degree) columns
error_tune = (1 - clf_tune.cv_results_['mean_test_score']).reshape(
    len(costs), len(degree))

best_cost = clf_tune.best_params_['C']
best_degree = clf_tune.best_params_['degree']

print(f"Best parameter Polynomial: {clf_tune.best_params_},")
print(f"Best score Polynomial: {np.round(1 - clf_tune.best_score_, 4):.4f}")

# Plot: one curve per cost value, CV error against degree
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(1, 1, 1)

for cost_value, cost_errors in zip(costs, error_tune):
    ax.plot(degree, cost_errors, '.-', alpha=0.8,
            label='cost=' + str(cost_value))

ax.set_xlabel('degree')
ax.set_ylabel('CV error')
plt.legend()
plt.show()


## Support Vector Machines: Radial Kernel
kernel='rbf'

In [None]:
# Hyper-parameter grid: cost C and kernel coefficient gamma
n_folds = 10
costs = np.linspace(0.5, 10, 5)
gamma = np.linspace(0.0005, 0.005, 5)
tune_parameters = {'C': costs,
                   'gamma': gamma}

# Tune the radial-kernel SVM over the whole grid
clf_tune = GridSearchCV(estimator=svm.SVC(kernel='rbf'),
                        param_grid=tune_parameters,
                        cv=n_folds)
clf_tune.fit(x, y)

# CV error reshaped into a table with len(costs) rows and len(gamma) columns
error_tune = (1 - clf_tune.cv_results_['mean_test_score']).reshape(
    len(costs), len(gamma))

best_cost = clf_tune.best_params_['C']
best_gamma = clf_tune.best_params_['gamma']

print(f"Best parameter Radial: {clf_tune.best_params_}")
print(f"Best score Radial: {np.round(1 - clf_tune.best_score_, 4):.4f}")

# Plot: one curve per cost value, CV error against gamma
fig = plt.figure(figsize=(8, 5))
ax = fig.add_subplot(1, 1, 1)

for cost_value, cost_errors in zip(costs, error_tune):
    ax.plot(gamma, cost_errors, '.-', alpha=0.8,
            label='cost=' + str(cost_value))

ax.set_xlabel('gamma')
ax.set_ylabel('CV error')
plt.legend()
plt.show()


## Metrics / Helper Functions

#### Test / Train split

In [None]:
# option 1)
from sklearn.model_selection import train_test_split

# Response is the 'mpg01' column; all remaining columns are the predictors
y = df['mpg01']
x = df.drop(columns=['mpg01'])

# Hold out 20 % of the rows as test set; the seed is fixed via random_state
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)

In [None]:
# option 2)
i = df.index
# Index labels of the training rows: 800 labels drawn without replacement
i_train = np.random.choice(i, replace=False,
                           size=800)

# Save DataFrames
# i_train holds index *labels*, so rows are selected with .loc; .iloc would
# read them as positions and pick the wrong rows (or raise) whenever the
# index is not the default 0..n-1 range.
df_train = df.loc[i_train]
# All rows that were not drawn for training form the test set
df_test = df.drop(index=i_train)

#### Classification Error

In [None]:
import numpy as np
import statsmodels.api as sm

# Train
# Add a constant column to the training predictors, then fit a GLM with a
# Binomial family (logistic regression) on the training data.
x_train_sm = sm.add_constant(x_train)
model = sm.GLM(y_train, x_train_sm, family=sm.families.Binomial()).fit()


# Test
def class_err(x, y, model):
    """Return the classification error of a fitted model on ``(x, y)``.

    Args:
        x: Regressors handed to ``model.predict``.
        y: True labels; the formula presumes they are coded as 0 / 1.
        model: Fitted model whose ``predict`` output is rounded to obtain
            the predicted labels.

    Returns:
        Mean absolute difference between ``y`` and the rounded predictions.
    """
    labels = model.predict(x).round()
    return abs(y - labels).mean()


# Add the constant column to the test predictors as well, as done for training
x_test_sm = sm.add_constant(x_test)

# Classification error on the training data and on the held-out test data
e_train = class_err(x_train_sm, y_train, model)
e_test = class_err(x_test_sm, y_test, model)

# Report both errors rounded to four decimals
print('Train error:\n', np.round(e_train, 4),
'\nTest error:\n', np.round(e_test, 4))

#### Confusion Matrix

In [None]:
import pandas as pd

# Predicted test-set labels: model predictions rounded
y_pred_test = model.predict(x_test_sm).round()

# Cross-tabulate predicted labels (rows) against true labels (columns),
# with "Sum" margins for the row and column totals
confusion_test = pd.DataFrame({'predicted': y_pred_test, 'true': y_test})
confusion_test = pd.crosstab(index=confusion_test['predicted'],
                             columns=confusion_test['true'],
                             margins=True, margins_name="Sum")

print("\nConfusion Matrix - Test Set:\n", confusion_test)

#### TP, FP, FN, TN and Accuracy, Precision, Recall, F-1

In [None]:
# Note: compute the confusion matrix (confusion_test) first.
# confusion_test is built with predicted labels as rows and true labels as
# columns, so each cell is addressed explicitly as .loc[predicted, true].
tp = confusion_test.loc[1, 1]
tn = confusion_test.loc[0, 0]
fp = confusion_test.loc[1, 0]  # predicted 1, truly 0
fn = confusion_test.loc[0, 1]  # predicted 0, truly 1

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
# Guard F1 like precision and recall: it is undefined when both are 0
f1 = (2 * precision * recall / (precision + recall)
      if (precision + recall) > 0 else 0)

print(f"Accuracy: {accuracy:.4f}"
      f"\nPrecision: {precision:.4f}"
      f"\nRecall: {recall:.4f}"
      f"\nF1 Score: {f1:.4f}")

####