In [None]:
import mglearn as mglearn
import np as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

!pip install numpy scipy scikit-learn matplotlib pandas
from sklearn.model_selection import GridSearchCV


# Train the Classifiers

## Import data, setup models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import os
import warnings
import sys

# Unless explicit -W options were given, silence warnings in this process and
# export the same preference through PYTHONWARNINGS.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = (
        'ignore::UserWarning,ignore::ConvergenceWarning,ignore::RuntimeWarning'
    )

# Load the train / test splits from the working directory.
train_set = pd.read_csv("./spam_train.csv")
test_set = pd.read_csv("./spam_test.csv")

# Features are every column except the first and the last; the label is the
# "class" column.
train_feature_names = list(train_set.columns[1:-1])
test_feature_names = list(test_set.columns[1:-1])
X_train = np.array(train_set[train_feature_names])
y_train = np.array(train_set["class"])
X_test = np.array(test_set[test_feature_names])
y_test = np.array(test_set["class"])

np.random.seed(2)

# Baseline fits with default hyperparameters (1 neighbour for KNN), reporting
# accuracy on the training data and on the held-out test data.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
print(f"KNN Training score: {knn.score(X_train, y_train):.8f}")
print(f"KNN Testing score: {knn.score(X_test, y_test):.8f}")

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(f"Logistic Regression Training score: {logreg.score(X_train, y_train):.8f}")
print(f"Logistic Regression Testing score: {logreg.score(X_test, y_test):.8f}")

lsvc = LinearSVC()
lsvc.fit(X_train, y_train)
print(f"LinearSVC Training score: {lsvc.score(X_train, y_train):.8f}")
print(f"LinearSVC Testing score: {lsvc.score(X_test, y_test):.8f}")

## KNN Train

In [None]:
# Baseline 1-nearest-neighbour model: accuracy on the training and test data.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
print(f"KNN Training score: {knn.score(X_train, y_train):.8f}")
print(f"KNN Testing score: {knn.score(X_test, y_test):.8f}")


def knn_and_graph(neighbor_max):
    """Plot KNN training and testing accuracy for k = 1 .. neighbor_max.

    For every neighbour count a fresh KNeighborsClassifier is fitted on
    (X_train, y_train) and scored on both the training and the test data.
    After the plot, the test predictions and test accuracy of the last model
    of the sweep (k = neighbor_max) are printed, followed by the training and
    testing accuracy of a separately fitted k = 5 model.

    Parameters
    ----------
    neighbor_max : int
        Largest number of neighbours to evaluate.
    """
    plt.style.use('_mpl-gallery')

    # linspace(1, n, n) yields 1.0, 2.0, ..., n; each value is cast to int
    # before being handed to the classifier.
    neighbor_counts = np.linspace(1, neighbor_max, neighbor_max)
    train_scores = []
    test_scores = []
    for k in neighbor_counts:
        clf = KNeighborsClassifier(n_neighbors=int(k))
        clf.fit(X_train, y_train)
        train_scores.append(clf.score(X_train, y_train))
        test_scores.append(clf.score(X_test, y_test))

    plt.title("KNN")
    plt.xlabel("Neighbors")
    plt.ylabel("Accuracy (0 - 1)")
    plt.plot(list(neighbor_counts), train_scores, label='Training Score')
    plt.plot(list(neighbor_counts), test_scores, label='Testing Score')
    plt.legend()
    plt.show()

    # `clf` is still the final model of the sweep at this point.
    print(f"Test set predictions: {clf.predict(X_test)}")
    print(f"Test set accuracy: {clf.score(X_test, y_test):.2f}")

    # Reference model with a fixed k of 5.
    clf = KNeighborsClassifier(n_neighbors=5)
    clf.fit(X_train, y_train)

    print(clf.score(X_train, y_train))
    print(clf.score(X_test, y_test))

### KNN Train first view

In [None]:
# Wide view: sweep k from 1 to 100 and plot training vs. testing accuracy.
knn_and_graph(100)

### KNN Train second view

In [None]:
# Narrower view: sweep k from 1 to 20 and plot training vs. testing accuracy.
knn_and_graph(20)

## KNN Validation

In [None]:
# 5-fold cross-validated grid search over the number of neighbours (k = 1..19).
print("KNN")
k_range = range(1, 20)
knn = KNeighborsClassifier().fit(X_train, y_train)
hyperparameters = dict(n_neighbors=k_range)
clf = GridSearchCV(knn, hyperparameters, cv=5)
clf.fit(X_train, y_train)

# Per-candidate mean and standard deviation of the validation accuracy.
scores_knn = clf.cv_results_['mean_test_score']
std_scores = clf.cv_results_['std_test_score']
optimal_k_val = clf.best_params_['n_neighbors']
print("Best K: ", optimal_k_val)
print("Train score: ", KNeighborsClassifier(n_neighbors=optimal_k_val).fit(X_train, y_train).score(X_train, y_train))
print(std_scores)

# Mean cross-validated accuracy as a function of k.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(k_range, scores_knn)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validated Accuracy')
plt.show()

# Spread of the fold accuracies as a function of k. The x-axis is k here,
# not the regularisation strength C used by the linear models.
fig, ax = plt.subplots(figsize=(4, 4))
ax.plot(k_range, std_scores)
ax.set_xlabel('Value of K for KNN')
ax.set_ylabel('Cross-Validated Standard Deviation')
plt.show()

# Logistic Regression

## Logreg Train

In [None]:
# Baseline logistic regression with default hyperparameters.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(f"Logistic Regression Training score: {logreg.score(X_train, y_train):.8f}")
print(f"Logistic Regression Testing score: {logreg.score(X_test, y_test):.8f}")


def logreg_explore_and_graph(c_vals):
    """Grid-search C for logistic regression with 5-fold CV and plot the results.

    Runs GridSearchCV over ``c_vals`` using the module-level ``logreg``
    estimator and the training data, prints the best C, the training accuracy
    of a LogisticRegression refitted with that C, and the per-candidate
    standard deviations, then shows two plots: mean validation accuracy vs. C
    and validation standard deviation vs. C.

    Parameters
    ----------
    c_vals : sequence of float
        Candidate values for the ``C`` parameter.
    """
    search = GridSearchCV(logreg, {"C": c_vals}, cv=5)
    search.fit(X_train, y_train)

    cv_results = search.cv_results_
    mean_accuracy = cv_results['mean_test_score']
    accuracy_std = cv_results['std_test_score']
    best_c = search.best_params_['C']

    print("Best C: ", best_c)
    best_model = LogisticRegression(C=best_c).fit(X_train, y_train)
    print("Train score: ", best_model.score(X_train, y_train))
    print(accuracy_std)

    # One 4x4 figure per metric, both plotted against the candidate C values.
    curves = (
        (mean_accuracy, 'Cross-Validated Accuracy'),
        (accuracy_std, 'Cross-Validated Standard Deviation'),
    )
    for values, y_label in curves:
        fig, ax = plt.subplots()
        fig.set_figheight(4)
        fig.set_figwidth(4)
        plt.plot(c_vals, values)
        plt.xlabel('C Value')
        plt.ylabel(y_label)
        plt.show()

## Logreg Validate first view

In [None]:
# Coarse search: 10 log-spaced C values from 10**0 to 10**4.
logreg_explore_and_graph(np.logspace(0, 4, 10))

## Logreg Validate second view

In [None]:
# Narrower search: 20 log-spaced C values from 10**1 to 10**3.3.
logreg_explore_and_graph(np.logspace(1, 3.3, 20))

## Logreg Validate first linspace view

In [None]:
# Linear search: 40 evenly spaced C values between 10 and 40.
logreg_explore_and_graph(np.linspace(10, 40, 40))

### Logreg final linspace

In [None]:
# Fine linear search: 50 evenly spaced C values between 25 and 28.
logreg_explore_and_graph(np.linspace(25, 28, 50))

### Logreg Test

In [None]:
# Final evaluation of logistic regression.
# The C value equals np.linspace(25, 28, 50)[20]; presumably the best C
# reported by the final sweep -- confirm against that cell's output.
LOGREG_BEST_C = 26.224489795918366

# Fit on the training data only. Fitting on the test split and then scoring
# on that same split would report a training score as a "test score".
logreg = LogisticRegression(C=LOGREG_BEST_C).fit(X_train, y_train)
print(
    "Logreg best train score: ",
    logreg.score(X_train, y_train)
)
print(
    "Logreg best test score: ",
    logreg.score(X_test, y_test)
)

# Coefficients of the fitted model, one per input feature.
importance = logreg.coef_[0]
print(len(importance))
# Number of coefficients that are exactly zero.
print("ingored: ", (importance == 0).sum())


fig, ax = plt.subplots(figsize=(4, 4))
ax.bar(range(len(importance)), importance)
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
plt.show()

## 5-fold CV for LinearSVC

In [None]:
# Re-seed NumPy's global RNG before fitting the baseline LinearSVC.
np.random.seed(42)

# Baseline LinearSVC with default hyperparameters.
lsvc = LinearSVC()
lsvc.fit(X_train, y_train)
print(f"Linear SVC Training score: {lsvc.score(X_train, y_train):.8f}")
print(f"Linear SVC Testing score: {lsvc.score(X_test, y_test):.8f}")


def lsvc_explore_and_graph(c_vals):
    """Grid-search C for LinearSVC with 5-fold CV and plot the results.

    Runs GridSearchCV over ``c_vals`` using the module-level ``lsvc``
    estimator and the training data, prints the best C and the training
    accuracy of a LinearSVC refitted with that C, then shows two plots:
    mean validation accuracy vs. C and validation standard deviation vs. C.

    Parameters
    ----------
    c_vals : sequence of float
        Candidate values for the ``C`` parameter.
    """
    param_grid = {"C": c_vals}
    search = GridSearchCV(lsvc, param_grid, cv=5)
    search.fit(X_train, y_train)

    best_c = search.best_params_['C']
    print("Best C: ", best_c)

    refit_model = LinearSVC(C=best_c).fit(X_train, y_train)
    print("Train score: ", refit_model.score(X_train, y_train))

    # Same 4x4 layout for both metrics; only the series and y-label differ.
    metric_labels = {
        'mean_test_score': 'Cross-Validated Accuracy',
        'std_test_score': 'Cross-Validated Standard Deviation',
    }
    for metric_key, y_label in metric_labels.items():
        fig, ax = plt.subplots()
        fig.set_figheight(4)
        fig.set_figwidth(4)
        plt.plot(c_vals, search.cv_results_[metric_key])
        plt.xlabel('C Value')
        plt.ylabel(y_label)
        plt.show()

In [None]:
# Coarse search: 20 log-spaced C values from 10**0 to 10**4.
lsvc_explore_and_graph(np.logspace(0, 4, 20))

### LSVC Validate logspace 2

In [None]:
# Shifted search: 20 log-spaced C values from 10**-4 to 10**1.
lsvc_explore_and_graph(np.logspace(-4, 1, 20))

### LSVC Validate linspace 1

In [None]:
# Linear search over (0, 1]. LinearSVC requires C to be strictly positive, so
# the leading C=0 point of the 50-point grid is dropped; the remaining 49
# candidates are unchanged.
lsvc_explore_and_graph(np.linspace(0, 1, 50)[1:])

### LSVC Validate linspace 2

In [None]:
# Fine linear search: 50 evenly spaced C values between 0.01 and 0.03.
lsvc_explore_and_graph(np.linspace(0.01, 0.03, 50))

### LSVC Test

In [None]:
# Final evaluation of LinearSVC.
np.random.seed(42)

# The C value equals np.linspace(0.01, 0.03, 50)[2]; presumably the best C
# reported by the final sweep -- confirm against that cell's output.
LSVC_BEST_C = 0.010816326530612244

# Fit on the training data only. Fitting on the test split and then scoring
# on that same split would report a training score as a "test score".
lsvc = LinearSVC(C=LSVC_BEST_C).fit(X_train, y_train)
print(
    "LSVC best training score: ",
    lsvc.score(X_train, y_train)
)
print(
    "LSVC best test score: ",
    lsvc.score(X_test, y_test)
)

# Coefficients of the fitted model, and how many of them are exactly zero.
features = lsvc.coef_[0]
print((features == 0).sum())
print(features)