In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import scipy.io as sio 
import pandas as pd
import re
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from nltk.stem import PorterStemmer

In [2]:
def plotBinaryClassScatter(class1=None, class2=None):
    plt.clf()
    fig, ax = plt.subplots()
    fig.set_size_inches((10, 7))
    ax.scatter(class1[0], class1[1], c="red")
    ax.scatter(class2[0], class2[1], c="blue")
    return fig, ax

def plotDecisionBoundary(X, model, ax):
    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                        np.arange(y_min, y_max, 0.01))

    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    ax.contour(xx, yy, Z.reshape(xx.shape), alpha=0.4)

def calculateAccuracy(X, y, model):
    pred = model.predict(X)
    return np.sum((pred.ravel() == y.ravel()))/len(y)
    
    

# 1. Train non-linear decision boundary

## a. Load data

In [None]:
dataset = sio.loadmat("ex6data2.mat")

y = np.array(dataset["y"]).ravel()
X = np.array(dataset["X"])
print("Shapes:", X.shape, y.shape)

# Visualize data
plotBinaryClassScatter(
    class1=(X[:,0][y == 0], X[:,1][y == 0]), 
    class2=(X[:,0][y == 1], X[:,1][y == 1])
)

## 2. Train SVM

In [None]:
clf = svm.SVC(kernel='rbf', C=1.0, gamma=50)
clf.fit(X, y.ravel())

_, ax = plotBinaryClassScatter(
    class1=(X[:,0][y == 0], X[:,1][y == 0]), 
    class2=(X[:,0][y == 1], X[:,1][y == 1])
)
plotDecisionBoundary(X, clf, ax)

# Calculate accuracy:
print("Training accuracy:", calculateAccuracy(X, y, clf))

# 2. Determine the right kernel params

## a. Load data

In [None]:
dataset2 = sio.loadmat("ex6data3.mat")

y = np.array(dataset2["y"]).ravel()
X = np.array(dataset2["X"])
Xval = np.array(dataset2["Xval"])
yval = np.array(dataset2["yval"]).ravel()
print("Shapes:", X.shape, y.shape, Xval.shape, yval.shape)

# Visualize data
plotBinaryClassScatter(
    class1=(X[:,0][y == 0], X[:,1][y == 0]), 
    class2=(X[:,0][y == 1], X[:,1][y == 1])
)

## b. Train various models on various params

In [None]:
C_arr = [0.1, 0.5, 1, 2, 3, 5, 10, 20]
gamma_arr = [0.1, 0.2, 0.5, 1, 10, 20, 50, 80, 100]
min_err = 10e6
optimal_C = 0
optimal_gamma = 0
optimal_model = None 

for C in C_arr:
    for gamma in gamma_arr:
        clf = svm.SVC(C=C, gamma=gamma, kernel='rbf')
        clf.fit(X, y)
        pred = clf.predict(Xval).ravel()
        err = np.sum(np.double(pred != yval.ravel()))/len(pred)
        
        if (err < min_err):
            min_err = err 
            optimal_C = C 
            optimal_gamma = gamma
            optimal_model = clf

        print("\r", 
            "Training with C=%f, gamma=%f" % (C, gamma), 
            "; Error: %f" % err, 
            end="")

print("\nOptimal values of (C, gamma, err) =", (optimal_C, optimal_gamma, min_err))

# Plot decision boundary
_, ax = plotBinaryClassScatter(
    class1=(X[:,0][y == 0], X[:,1][y == 0]), 
    class2=(X[:,0][y == 1], X[:,1][y == 1])
)
plotDecisionBoundary(X, optimal_model, ax)

# 3. Training a spam classifier with SVM

## a. Load data

In [3]:
df = pd.read_table("vocab.txt", header=None)

dataset_train = sio.loadmat("spamTrain.mat")
X = np.array(dataset_train["X"]) 
y = np.array(dataset_train["y"])
print(X.shape, y.shape)

# Scramble data
megaset = np.hstack((X, y))
np.random.shuffle(megaset)

breakpnt = int(X.shape[0]*0.8)

X = megaset[None:breakpnt, None:-1]
y = megaset[None:breakpnt, -1:None]
Xval = megaset[breakpnt:None, None:-1]
yval = megaset[breakpnt:None, -1:None]

print(X.shape, y.shape, Xval.shape, yval.shape)

dataset_val = sio.loadmat("spamTest.mat")
Xtest = np.array(dataset_train["X"])
ytest = np.array(dataset_train["y"])

(4000, 1899) (4000, 1)
(3200, 1899) (3200, 1) (800, 1899) (800, 1)


In [4]:
# Test cell


## b. Define data processing functions

In [5]:
def readEmail(filename):
    email = None 
    with open(filename) as f:
        email = f.readlines() 
    return " ".join(email)
    
def preprocessEmail(email: str):
    regex_matches = [
        "(http|https)://[^\s]*", # http
        "[0-9]+", # number
        "[^\s]+@[^\s]+", # email address
        "[$]+", # dollar sign
        "<[^<>]+>", # html tags
        "[^A-Za-z0-9]", # non alphanumeric chars
        "[\s]+"
    ]
    regex_replacements = [
        "httpaddr",
        "number",
        "emailaddr",
        "dollar ",
        " ",
        " ",
        " "
    ]

    for match, replacement in zip(regex_matches, regex_replacements):
        email = re.sub(match, replacement, email)
    
    return email.lower().strip()

def getEncodedEmail(email: str):
    word_indices = np.array([],  dtype=np.int16)
    email = preprocessEmail(email)
    ps = PorterStemmer()

    for word in email.split():
        stemmed_word = ps.stem(word)
        data_row = df[df[1] == stemmed_word]
        if (not data_row.empty):
            word_indices = np.append(word_indices, int(data_row[0]))
    
    return word_indices

def getFeatureVector(encoded_email: np.ndarray):
    feature_vec = np.zeros((df.shape[0], 1))
    for idx in encoded_email:
        feature_vec[idx] = 1
    return feature_vec

## c. Train SVM

In [6]:
C_arr = [50]
gamma_arr = [0.0005]
min_err = 10e6
optimal_C = 0
optimal_gamma = 0
optimal_model = None 

for C in C_arr:
    for gamma in gamma_arr:
        print("Training with C=%f, gamma=%f" % (C, gamma), end="; ")
        clf = svm.SVC(C=C, gamma=gamma, kernel='rbf')
        clf.fit(X, y.ravel())

        val_pred = clf.predict(Xval).ravel()
        err = np.mean(np.double(val_pred != yval.ravel()))

        print("Error=", err)
        if (err < min_err):
            min_err = err 
            optimal_C = C 
            optimal_gamma = gamma
            optimal_model = clf

print("\nOptimal values of (C, gamma, err) =", (optimal_C, optimal_gamma, min_err))
print("Test set accuracy:", calculateAccuracy(Xtest, ytest, optimal_model))

Training with C=50.000000, gamma=0.000500; Error= 0.0175

Optimal values of (C, gamma, err) = (50, 0.0005, 0.0175)
Test set accuracy: 0.994


## e. Test real emails

In [7]:
# sample1 = readEmail("emailSample1.txt")
# sample2 = readEmail("emailSample2.txt")
# spam1 = readEmail("spamSample1.txt")
# spam2 = readEmail("spamSample2.txt")
spam3 = r"do you want to be a billionnaire? participate our training programme today!"
spam4 = "bkav antivirus, 100\% effective! purchase now and have your computer protected \
    by the most advanced antivirus software in the world!"
normal = "hello, i just saw the 'statistics in application' book that you are selling on facebook. \
    do you accept COD shipping? please contact me as soon as possible. best regards."


for mail in (spam3, spam4, normal):
    feature_vec = getFeatureVector(getEncodedEmail(mail)).reshape(1, -1)
    pred = optimal_model.predict(feature_vec)
    if int(pred) == 1:
        print("=> This is a spam email")
    else:
        print("=> This is not a spam email")

=> This is a spam email
=> This is a spam email
=> This is not a spam email


# 4. Training a spam classifier with neural network

## a. Load data

In [None]:
# Calls the block from (3)

## b. Train

In [9]:
lambdas = [0.1, 0.5, 1, 10, 20, 30, 40]
max_val_accuracy = 0
optimal_model = None
optimal_lambda = 0

for lambd in lambdas:
    print("Training with lambda=%f" % lambd, end="; ")
    clf = MLPClassifier(solver='lbfgs', alpha=lambd, hidden_layer_sizes=(1,))
    clf.fit(X, y.ravel())

    val_accuracy = calculateAccuracy(Xval, yval, clf)

    if val_accuracy > max_val_accuracy:
        max_val_accuracy = val_accuracy
        optimal_model = clf
        optimal_lambda = lambd

    print("Val acc=%f" % (val_accuracy))

print("Optimal values of (accuracy, lambda) =", (max_val_accuracy, optimal_lambda))
print("Test set accuracy: ", calculateAccuracy(Xtest, ytest, optimal_model))
    

Training with lambda=0.100000; Val acc=0.976250
Training with lambda=0.500000; Val acc=0.980000
Training with lambda=1.000000; Val acc=0.710000
Training with lambda=10.000000; Val acc=0.981250
Training with lambda=20.000000; Val acc=0.710000
Training with lambda=30.000000; Val acc=0.981250
Training with lambda=40.000000; Val acc=0.981250
Optimal values of (accuracy, lambda) = (0.98125, 10)
Test set accuracy:  0.99575


In [10]:
# Test with logistic regression

lambdas = [0.1, 0.5, 1, 10, 20, 30, 40]
max_val_accuracy = 0
optimal_model = None
optimal_lambda = 0

for lambd in lambdas:
    print("Training with lambda=%f" % lambd, end="; ")
    clf = LogisticRegression(solver='lbfgs', C=1/lambd)
    clf.fit(X, y.ravel())

    val_accuracy = calculateAccuracy(Xval, yval, clf)

    if val_accuracy > max_val_accuracy:
        max_val_accuracy = val_accuracy
        optimal_model = clf
        optimal_lambda = lambd

    print("Val acc=%f" % (val_accuracy))

print("Optimal values of (accuracy, lambda) =", (max_val_accuracy, optimal_lambda))
print("Test set accuracy: ", calculateAccuracy(Xtest, ytest, optimal_model))

Training with lambda=0.100000; Val acc=0.980000
Training with lambda=0.500000; Val acc=0.981250
Training with lambda=1.000000; Val acc=0.982500
Training with lambda=10.000000; Val acc=0.981250
Training with lambda=20.000000; Val acc=0.972500
Training with lambda=30.000000; Val acc=0.973750
Training with lambda=40.000000; Val acc=0.972500
Optimal values of (accuracy, lambda) = (0.9825, 1)
Test set accuracy:  0.996


## d. Test with real emails

In [None]:
# Use test block from (3)