In [None]:
! pip install -q kaggle
! rm -rf ~/.kaggle
! mkdir ~/.kaggle
! echo '{"username":"jademeskill","key":"fe82fde46aeb72df41f1b111160c034b"}' > ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json
! rm ./spambase.*
! kaggle datasets download -d colormap/spambase
! unzip ./spambase.zip
! rm ./spambase.zip

In [159]:
import numpy
import pandas
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

# Naive bayes

1. model learning:

   Note:

   features: remove attributes that is not related to word (the last four attributes)

   labels: the last column

   count P(c) -> how many samples are positive, and how many are negtive

   if freq_word>0, then this word exists. You could use this to calculate P(a|c) -> for each class, what is the prob of each word

   remember to use laplace smoothing.

2. model evaluation (on val dataset -> performance(model, val)):
   
   for each new sample, $\prod{P(a|c)}P(c)$; and find the maximum class
   

   

In [160]:
class NaiveBayes:
    def __init__(self, trainingData, validationData):
        self.name = "Gaussian Naive Bayes (with Laplace Smoothing)"
        self.actual = [row.iloc[-1] for _, row in validationData.iterrows()]
        self.results = self.naiveBayes(trainingData, validationData)

    def splitByClass(self, data):
        separated = dict()
        for index in range(len(data)):
            vector = data.iloc[index].tolist()
            classValue = vector[-1]
            if classValue not in separated:
                separated[classValue] = list()
            separated[classValue].append(vector)
        return separated

    def summarizeDataset(self, data):
        summaries = [(numpy.mean(column), numpy.std(column), len(column)) for column in zip(*data)]
        summaries = summaries[:-1]
        return summaries

    def summarizeByClass(self, data):
        separated = self.splitByClass(data)
        summaries = dict()
        for classValue, rows in separated.items():
            summaries[classValue] = self.summarizeDataset(rows)
        return summaries

    def gaussianProbabilityDensity(self, x, mean, standardDeviation):
        exponent = numpy.exp(-((x - mean) ** 2 / ((2 * standardDeviation ** 2) + 1e-17)))
        return (1 / ((numpy.sqrt(2 * numpy.pi) * standardDeviation) + 1e-17)) * exponent

    def calculateClassProbabilities(self, summaries, row):
        totalRows = sum([summaries[label][0][2] for label in summaries])
        totalClasses = len(summaries)
        probabilities = dict()
        for classValue, classSummaries in summaries.items():
            probabilities[classValue] = (summaries[classValue][0][2] + 1)/(float(totalRows) + totalClasses)
            for index in range(len(classSummaries)):
                mean, standardDeviation, _ = classSummaries[index]
                probabilities[classValue] *= self.gaussianProbabilityDensity(row.iloc[index], mean, standardDeviation)
        return probabilities
    
    def predict(self, summaries, row):
        probabilities = self.calculateClassProbabilities(summaries, row)
        bestLabel, bestProbability = None, -1

        for classValue, probability in probabilities.items():
            if bestLabel is None or probability > bestProbability:
                bestProbability = probability
                bestLabel = classValue
            
        return bestLabel
    
    def naiveBayes(self, trainingData, validationData):
        summaries = self.summarizeByClass(trainingData)
        predictions = list()

        for rowIndex in range(len(validationData)):
            prediction = self.predict(summaries, validationData.iloc[rowIndex])
            predictions.append(prediction)

        return predictions

# KNN
1. model learning: None

2. model evaluation(on val dataset): You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.

   ```
   Note:
   parallel programing
   numpy.cos() to calcuate the similarity
   ```

In [161]:
class KNN:
    def __init__(self, trainingData, validationData):
        self.data = validationData
        self.actual = [row.iloc[-1] for _, row in self.data.iterrows()]
        self.results = self.evaluation(self.data)
        self.name = "KNN"

    def euclideanDistance(self, row1, row2):
        distance = 0.0
        for i in range(len(row1) - 1):
            distance += (row1.iloc[i] - row2.iloc[i]) ** 2
        return numpy.sqrt(distance)

    def nearestNeighbors(self, data, testRow, numNeighbors):
        distances = []

        for _, row in data.iterrows():
            distance = self.euclideanDistance(testRow, row)
            distances.append((row, distance))

        distances.sort(key=lambda tup: tup[1])
        neighbors = list()
        for index in range(numNeighbors):
            neighbors.append(distances[index][0])
        return neighbors

    def predict(self, data, testRow, numNeighbors):
        neighbors = self.nearestNeighbors(data, testRow, numNeighbors)
        output = [row.iloc[-1] for row in neighbors]
        prediction = max(set(output), key=output.count)
        return prediction

    def evaluation(self, data):
        predictions = []
        for _, row in data.iterrows():
            predictions.append(self.predict(data, row, 5))

        return predictions

# LR

1. model learning: You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.
    
    $y = sigmoid(MX)$

step 1: add one more column (all value is 1) in X -> X' = np.c_[np.ones((len(X), 1)), X]

step 2:vector M = np.random.randn(len(X[0])+1, 1);

key formula for step 3 (Note: n is the size of the TRAINING dataset; $cdot$ is dot production ):

1. $pred_y = sigmoid(M\cdot X')$

2. $loss = -\sum(y\cdot log(pred_y)+(1-y)\cdot log(1-pred_y))/n$

3. $gm=X'\cdot (pred_y - y)*2/n$

Step 3 example code:
   ```
   #Step 3: performing gradient descent on whole dataset:
   best_model = M
   best_performace = 0
   for i in range(epoch):
     pred_y = ...
     gm = ...
     _p = performace(model, val)
     if _p > best_performance:
        best_model = M
        best_performance = _p
     M = M - learning_rate*gm
   ```

2. model evaluation(on val dataset):
  
   calculate pred_y, if more than 0.5, then the predicted label is 1.

In [166]:
class LR:
    def __init__(self, trainingData, validationData):
        self.name = "Logistic Regression"
        self.data = self.normalize(validationData, self.datasetMinMax(validationData))
        self.actual = [row.iloc[-1] for _, row in self.data.iterrows()]
        self.results = self.logisticRegression(trainingData, 0.1, 100, self.data)

    def datasetMinMax(self, data):
        minmax = list()
        for index in range(len(data.columns)):
            columns = [row.iloc[index] for _, row in data.iterrows()]
            minVal = min(columns)
            maxVal = max(columns)
            minmax.append([minVal, maxVal])
        return minmax
    
    def normalize(self, data, minmax):
        for rowIndex in range(len(data)):
            index = 0
            for column in data.columns:
                data.loc[rowIndex, column] = (data.iloc[rowIndex].loc[column] - minmax[index][0]) / (minmax[index][1] - minmax[index][0])
                index += 1
        return data

    def predict(self, row, coefficients):
        y = coefficients[0]
        for index in range(len(row) - 1):
            y += coefficients[index + 1] * row.iloc[index]
        return 1.0 / (1.0 + numpy.exp(-y))

    def stochasticGradientDescent(self, data, learningRate, epochs):
        coefficient = [0.0 for i in range(len(data.iloc[0]))]
        for epoch in range(epochs):
            for _, row in data.iterrows():
                y = self.predict(row, coefficient)
                error = row.iloc[-1] - y
                coefficient[0] = coefficient[0] + learningRate * error * y * (1.0 - y)
                for i in range(len(row) - 1):
                    coefficient[i + 1] = coefficient[i + 1] + learningRate * error * y * (1.0 - y) * row.iloc[i]
        return coefficient

    def logisticRegression(self, trainingData, learningRate, epochs, validationData):
        predictions = list()
        coefficient = self.stochasticGradientDescent(trainingData, learningRate, epochs)
        for _, row in validationData.iterrows():
            y = round(self.predict(row, coefficient))
            predictions.append(y)
        return predictions

# Model Evaluation

https://scikit-learn.org/stable/modules/model_evaluation.html

In [163]:
def performance(model, data):
    actual = model.actual
    results = model.results
    result = {"Confusion Matrix": confusion_matrix(actual, results), "Accuracy Score": accuracy_score(actual, results), "AUC Score": roc_auc_score(actual, results)}
    return result

In [167]:
from time import time

# Dataset
data = pandas.read_csv("./spambase.csv")
data = data.sample(frac=1)

training = data[:int(len(data)*.8)]
test = data[int(len(data)*.8):]

models = list()
models.append(NaiveBayes)
models.append(KNN)
models.append(LR)

fold5 = KFold(5)
for model in models:
    avgTime = 0.0
    avgAcc = 0.0
    avgAUC = 0.0
    avgTP = 0
    avgFP = 0
    avgFN = 0
    avgTN = 0
    
    for train_idx, val_idx in fold5.split(training):
        sub_val = training.iloc[val_idx[0]:val_idx[-1]]
        sub_train = training.iloc[train_idx[0]:train_idx[-1]]
        startTime = time()
        clf = model(sub_train, sub_val)
        endTime = time() - startTime
        result = performance(clf, test)
        avgTime += endTime
        avgAcc += result["Accuracy Score"]
        avgTP += result["Confusion Matrix"][0][0]
        avgFP += result["Confusion Matrix"][0][1]
        avgFN += result["Confusion Matrix"][1][0]
        avgTN += result["Confusion Matrix"][1][1]
        avgAUC += result["AUC Score"]

    print(clf.name)
    print("\tAverage Time: ", avgTime/5)
    print("\tAverage Accuracy: ", avgAcc/5)
    print("\tAverage AUC Score: ", avgAUC/5)
    print("\tAverage True Positives: ", int(avgTP/5))
    print("\tAverage False Positives: ", int(avgFP/5))
    print("\tAverage False Negatives: ", int(avgFN/5))
    print("\tAverage True Negatives: ", int(avgTN/5))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[rowIndex, column] = (data.iloc[rowIndex].loc[column] - minmax[index][0]) / (minmax[index][1] - minmax[index][0])
  data.loc[rowIndex, column] = (data.iloc[rowIndex].loc[column] - minmax[index][0]) / (minmax[index][1] - minmax[index][0])
  data.loc[rowIndex, column] = (data.iloc[rowIndex].loc[column] - minmax[index][0]) / (minmax[index][1] - minmax[index][0])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[rowIndex, column] = (data.iloc[rowIndex].loc[column] - minmax[index][0]) / (minmax[index][1] - minmax[index][0])
  return 1.0 / (1.0 + numpy.exp(-y))
A value is trying to be set on a copy of a slice from a

Logistic Regression
	Average Time:  82.24754905700684
	Average Accuracy:  0.6065894853263064
	Average AUC Score:  0.5219215044008471
	Average True Positives:  689
	Average False Positives:  236
	Average False Negatives:  296
	Average True Negatives:  132
