In [None]:
import itertools
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import NearestNeighbors

# Classification

In [None]:
# Read the spam train and test splits from CSV files located in the working directory.
train_set, test_set = (
    pd.read_csv(csv_path) for csv_path in ("./spam_train.csv", "./spam_test.csv")
)

In [None]:
# Features: every column except the first and the last one of each frame.
# Target: the "class" column (presumably the last column -- verify against the CSV header).
train_feature_names = list(train_set.columns[1:-1])
test_feature_names = list(test_set.columns[1:-1])

X_train = np.array(train_set[train_feature_names])
X_test = np.array(test_set[test_feature_names])
y_train = np.array(train_set["class"])
y_test = np.array(test_set["class"])

## KNN

In [None]:
# Sweep the neighbourhood size k over 1..20: refit on the training split each time and
# record the percentage of correct predictions (rounded to 2 decimals) on both splits.
accuracies_train, accuracies_test = [], []
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    predictions_test = knn.predict(X_test)
    predictions_train = knn.predict(X_train)

    test_hits = np.sum(predictions_test == y_test)
    train_hits = np.sum(predictions_train == y_train)
    accuracies_test.append(np.round(test_hits * 100 / len(y_test), 2))
    accuracies_train.append(np.round(train_hits * 100 / len(y_train), 2))

In [None]:
# Train vs. test accuracy as a function of k (same 1..20 grid as the sweep).
k_values = np.arange(1, 21)
for scores, split_name in ((accuracies_train, "Train"), (accuracies_test, "Test")):
    plt.plot(k_values, scores, marker="o", label=split_name)
_ = plt.xticks(k_values)  # one tick per evaluated k
plt.xlabel("K")
plt.ylabel("Accuracy")
plt.legend()

### Analysis of Misclassified Samples

In [None]:
# Refit KNN with the hand-picked neighbourhood size and predict the test split;
# y_pred feeds the misclassification analysis in the following cells.
optimal_k = 5
knn = KNeighborsClassifier(n_neighbors=optimal_k).fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [None]:
# Positions in the test split where the predicted label differs from the true label.
misclassified_indices = np.flatnonzero(y_pred != y_test)
misclassified_indices

In [None]:
# For each misclassified test sample, find its k closest training samples.
# kneighbors returns one row per query: distances and the matching X_train row indices.
k = 3
nn = NearestNeighbors(n_neighbors=k).fit(X_train)
misclassified_samples = X_test[misclassified_indices]
distances, indices = nn.kneighbors(misclassified_samples)

In [None]:
# One row per misclassified test sample: its test index, true and predicted label, and,
# for each of its k nearest training neighbours, the train index, the neighbour's label
# and the distance rounded to 2 decimals.
table = {
    "Misclassified Test idx": misclassified_indices,
    "True Class": y_test[misclassified_indices],
    "Pred Class": y_pred[misclassified_indices],
}
for rank in range(k):
    neighbor_rows = indices[:, rank]
    table["Neigbor#{}_idx".format(rank + 1)] = neighbor_rows
    table["Neigbor#{}_True Class".format(rank + 1)] = y_train[neighbor_rows]
    table["Neigbor#{}_Distance".format(rank + 1)] = np.around(distances[:, rank], decimals=2)

# Dict insertion order fixes the column order.
columns = list(table)
df = pd.DataFrame(table, columns=columns)

In [None]:
# Render the table of misclassified test samples and their nearest training neighbours.
df

In [None]:
# Inspect a single misclassified sample. idx is a position within misclassified_indices;
# the original note gives the valid range as 0..243 -- this depends on how many samples
# were misclassified, so verify against len(misclassified_indices).
idx = 243
test_idx = misclassified_indices[idx]
print("Test sample {} is misclassified.".format(test_idx))
print("True class: {}".format(y_test[test_idx]))
print("Predicted class: {}".format(y_pred[test_idx]))
for rank in range(k):
    train_idx = indices[idx, rank]
    print(
        "Neighbor#{}: Train index {} ; True Class {} ; Distance {:.2f}".format(
            rank + 1, train_idx, y_train[train_idx], distances[idx, rank]
        )
    )

## Logistic Regression

In [None]:
# Sweep the inverse regularisation strength C for logistic regression (liblinear solver)
# and record train/test accuracy in percent.
c_vals = [0.1, 1.0, 2.0, 3.0, 4.0, 5.0]
accuracies_train, accuracies_test = [], []
# NOTE: `i` stays bound to the last C after the loop; check later cells before renaming it.
for i in c_vals:
    clf = LogisticRegression(random_state=0, C=i, solver="liblinear")
    clf.fit(X_train, y_train)
    accuracies_test.append(100 * clf.score(X_test, y_test))
    accuracies_train.append(100 * clf.score(X_train, y_train))

In [None]:
# Train vs. test accuracy as a function of C for the logistic-regression sweep.
for scores, split_name in ((accuracies_train, "Train"), (accuracies_test, "Test")):
    plt.plot(c_vals, scores, marker="o", label=split_name)
_ = plt.xticks(c_vals)  # one tick per evaluated C
plt.xlabel("C")
plt.ylabel("Accuracy")
plt.legend()

### Analysis of Misclassified Samples

In [None]:
# Refit logistic regression at the selected regularisation strength and predict the
# test split; y_pred feeds the misclassification analysis in the following cells.
# C is taken from optimal_C explicitly rather than from a leftover loop variable, so the
# analysed model is the one named here regardless of what ran before this cell.
optimal_C = 1.0
clf = LogisticRegression(random_state=0, C=optimal_C, solver="liblinear")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Positions in the test split where the predicted label differs from the true label.
misclassified_indices = np.flatnonzero(y_pred != y_test)

In [None]:
# For each misclassified test sample, find its k closest training samples.
# kneighbors returns one row per query: distances and the matching X_train row indices.
k = 3
nn = NearestNeighbors(n_neighbors=k).fit(X_train)
misclassified_samples = X_test[misclassified_indices]
distances, indices = nn.kneighbors(misclassified_samples)

In [None]:
# One row per misclassified test sample: its test index, true and predicted label, and,
# for each of its k nearest training neighbours, the train index, the neighbour's label
# and the distance rounded to 2 decimals.
table = {
    "Misclassified Test idx": misclassified_indices,
    "True Class": y_test[misclassified_indices],
    "Pred Class": y_pred[misclassified_indices],
}
for rank in range(k):
    neighbor_rows = indices[:, rank]
    table["Neigbor#{}_idx".format(rank + 1)] = neighbor_rows
    table["Neigbor#{}_True Class".format(rank + 1)] = y_train[neighbor_rows]
    table["Neigbor#{}_Distance".format(rank + 1)] = np.around(distances[:, rank], decimals=2)

# Dict insertion order fixes the column order.
columns = list(table)
df = pd.DataFrame(table, columns=columns)

In [None]:
# Render the table of misclassified test samples and their nearest training neighbours.
df

In [None]:
# Inspect a single misclassified sample. idx is a position within misclassified_indices;
# the original note gives the valid range as 0..87 -- this depends on how many samples
# were misclassified, so verify against len(misclassified_indices).
idx = 1
test_idx = misclassified_indices[idx]
print("Test sample {} is misclassified.".format(test_idx))
print("True class: {}".format(y_test[test_idx]))
print("Predicted class: {}".format(y_pred[test_idx]))
for rank in range(k):
    train_idx = indices[idx, rank]
    print(
        "Neighbor#{}: Train index {} ; True Class {} ; Distance {:.2f}".format(
            rank + 1, train_idx, y_train[train_idx], distances[idx, rank]
        )
    )

## Support Vector Machine Classifier (RBF kernel)

In [None]:
# Sweep the regularisation parameter C for an SVC (kernel left at the scikit-learn
# default, gamma='auto') and record train/test accuracy in percent.
# NOTE: a make_pipeline(StandardScaler(), SVC(...)) variant of this fit is left disabled,
# so the features are passed in unscaled.
c_vals = [0.1, 1.0, 5.0, 10.0, 15.0, 20.0]
accuracies_train, accuracies_test = [], []
# NOTE: `i` stays bound to the last C after the loop; check later cells before renaming it.
for i in c_vals:
    clf = SVC(gamma='auto', C=i)
    clf.fit(X_train, y_train)
    accuracies_test.append(100 * clf.score(X_test, y_test))
    accuracies_train.append(100 * clf.score(X_train, y_train))

In [None]:
# Train vs. test accuracy as a function of C for the SVC sweep.
for scores, split_name in ((accuracies_train, "Train"), (accuracies_test, "Test")):
    plt.plot(c_vals, scores, marker="o", label=split_name)
_ = plt.xticks(c_vals)  # one tick per evaluated C
plt.xlabel("C")
plt.ylabel("Accuracy")
plt.legend()

In [None]:
# Refit the SVC at the selected regularisation strength and predict the test split;
# y_pred feeds the misclassification analysis in the following cells.
# C is taken from optimal_C explicitly rather than from a leftover loop variable, so the
# analysed model is the one named here regardless of what ran before this cell.
optimal_C = 5.0
clf = SVC(gamma='auto', C=optimal_C)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Positions in the test split where the predicted label differs from the true label.
misclassified_indices = np.flatnonzero(y_pred != y_test)

In [None]:
# For each misclassified test sample, find its k closest training samples.
# kneighbors returns one row per query: distances and the matching X_train row indices.
k = 3
nn = NearestNeighbors(n_neighbors=k).fit(X_train)
misclassified_samples = X_test[misclassified_indices]
distances, indices = nn.kneighbors(misclassified_samples)

In [None]:
# One row per misclassified test sample: its test index, true and predicted label, and,
# for each of its k nearest training neighbours, the train index, the neighbour's label
# and the distance rounded to 2 decimals.
table = {
    "Misclassified Test idx": misclassified_indices,
    "True Class": y_test[misclassified_indices],
    "Pred Class": y_pred[misclassified_indices],
}
for rank in range(k):
    neighbor_rows = indices[:, rank]
    table["Neigbor#{}_idx".format(rank + 1)] = neighbor_rows
    table["Neigbor#{}_True Class".format(rank + 1)] = y_train[neighbor_rows]
    table["Neigbor#{}_Distance".format(rank + 1)] = np.around(distances[:, rank], decimals=2)

# Dict insertion order fixes the column order.
columns = list(table)
df = pd.DataFrame(table, columns=columns)

In [None]:
# Render the table of misclassified test samples and their nearest training neighbours.
df

In [None]:
# Inspect a single misclassified sample. idx is a position within misclassified_indices;
# the original note gives the valid range as 0..89 -- this depends on how many samples
# were misclassified, so verify against len(misclassified_indices).
idx = 3
test_idx = misclassified_indices[idx]
print("Test sample {} is misclassified.".format(test_idx))
print("True class: {}".format(y_test[test_idx]))
print("Predicted class: {}".format(y_pred[test_idx]))
for rank in range(k):
    train_idx = indices[idx, rank]
    print(
        "Neighbor#{}: Train index {} ; True Class {} ; Distance {:.2f}".format(
            rank + 1, train_idx, y_train[train_idx], distances[idx, rank]
        )
    )