<a href="https://colab.research.google.com/github/kendallcallison/AI_Final_Project/blob/main/AI_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

1. load csv file (panda, numpy)
2. split dataset. Example code:()
   ```
   random.shuffle(data) # change if you are using pandas dataframe
   training = data[:int(len(data)*0.8)]
   test = data[int(len(data)*0.8):]

   fold5 = KFold(5) # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
   for train_idx, val_idx in fold5.split(training):
      sub_val = training[val_idx]
      sub_train = training[train_idx]
      clf = model(sub_train, sub_val, ...) # training the model, and evaluate it on validation dataset
      performance(clf, test) # test the model on test dataset
   ```

In [50]:
import pandas as pd
import numpy as np
from collections import Counter
from concurrent.futures import ProcessPoolExecutor

# Load the dataset
data = pd.read_csv("spambase.csv")

# Define function to split dataset into train and test sets
def train_test_split(data, test_size = 0.2):
    data = data.sample(frac = 1).reset_index(drop = True)  # Shuffle the data
    split_idx = int(len(data) * (1 - test_size))
    train_data = data.iloc[:split_idx]
    test_data = data.iloc[split_idx:]
    return train_data, test_data

#Naive bayes

1. model learning:

   Note:

   features: remove attributes that is not related to word (the last four attributes)

   labels: the last column

   count P(c) -> how many samples are positive, and how many are negtive

   if freq_word>0, then this word exists. You could use this to calculate P(a|c) -> for each class, what is the prob of each word

   remember to use laplace smoothing.

2. model evaluation (on val dataset -> performance(model, val)):
   
   for each new sample, $\prod{P(a|c)}P(c)$ if word is in the email(freq_word > 0); and find the maximum class
   

   

In [40]:
# Define function to separate data by class
def nb_separate_by_class(data):
    separated = {}
    for _, row in data.iterrows():
        label = row['spam']
        if label not in separated:
            separated[label] = []
        separated[label].append(row.drop('spam'))
    return separated

# Define function to calculate mean and standard deviation for each feature
def nb_summarize(data):
    summaries = {}
    for label, rows in data.items():
        features = np.array([row[:-1] for row in rows])
        summaries[label] = [(np.mean(column), np.std(column)) for column in features.T]
    return summaries

# Define function to calculate Gaussian probability density function
def nb_gaussian_probability(x, mean, std):
    exponent = np.exp(-((x - mean) ** 2 / (2 * std ** 2)))
    return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

# Define function to calculate class probabilities
def nb_class_probabilities(summaries, input_data):
    probabilities = {}
    for label, class_summaries in summaries.items():
        probabilities[label] = 1
        for i in range(len(class_summaries)):
            mean, std = class_summaries[i]
            x = input_data[i]
            probabilities[label] *= nb_gaussian_probability(x, mean, std)
    return probabilities

# Define function to predict the class for a given input
def nb_predict(summaries, input_data):
    probabilities = nb_class_probabilities(summaries, input_data)
    best_label, best_prob = None, -1
    for label, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = label
    return best_label

# Define function to evaluate the model
def evaluate_nb(train_data, test_data):
    separated = nb_separate_by_class(train_data)
    summaries = nb_summarize(separated)
    predictions = []
    for i in range(len(test_data)):
        input_data = test_data.iloc[i][:-1]
        label = nb_predict(summaries, input_data)
        predictions.append(label)
    actual_labels = test_data['spam'].tolist()
    correct = sum(1 for i in range(len(predictions)) if predictions[i] == actual_labels[i])
    accuracy = correct / float(len(predictions)) * 100.0
    return accuracy

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size = 0.2)

# Evaluate the model
accuracy = evaluate_nb(train_data, test_data)
print("Accuracy:", accuracy)


Accuracy: 82.19326818675353


# KNN
1. model learning: None

2. model evaluation(on val dataset): You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.

   ```
   Note:
   parallel programing
   numpy.cos() to calcuate the similarity
   ```

In [41]:
# Define function to calculate cosine similarity using numpy
def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# Define function to calculate K Nearest Neighbors
def k_nearest_neighbors(train_data, val_row, k=3):
    similarities = train_data.iloc[:, :-1].apply(lambda row: cosine_similarity(row, val_row[:-1]), axis=1)
    neighbors = list(zip(train_data['spam'], similarities))
    neighbors.sort(key=lambda x: x[1], reverse=True)
    return neighbors[:k]

# Define function to predict class based on K Nearest Neighbors
def predict_class(neighbors):
    labels = [neighbor[0] for neighbor in neighbors]
    most_common_label = Counter(labels).most_common(1)[0][0]
    return most_common_label

# Define function to evaluate the model
def evaluate_knn(train_data, val_data, k=3):
    predictions = []
    with ProcessPoolExecutor() as executor:
        for i, val_row in val_data.iterrows():
            neighbors = k_nearest_neighbors(train_data, val_row, k)
            prediction = predict_class(neighbors)
            predictions.append(prediction)
    actual_labels = val_data['spam'].tolist()
    correct = sum(1 for i in range(len(predictions)) if predictions[i] == actual_labels[i])
    accuracy = correct / float(len(predictions)) * 100.0
    return accuracy

# Split the data into train and validation sets
train_data, val_data = train_val_split(data, val_size = 0.2)

# Evaluate the model
accuracy = evaluate_knn(train_data, test_data)
print("Accuracy:", accuracy)


Accuracy: 89.35939196525516


# LR

1. model learning: You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.
    
    $y = sigmoid(MX)$

step 1: add one more column (all value is 1) in X -> X' = np.c_[np.ones((len(X), 1)), X]

step 2:vector M = np.random.randn(len(X[0])+1, 1);

key formula for step 3 (Note: n is the size of the TRAINING dataset; $cdot$ is dot production ):

1. $pred_y = sigmoid(M\cdot X')$

2. $loss = -\sum(y\cdot log(pred_y)+(1-y)\cdot log(1-pred_y))/n$

3. $gm=X'\cdot (pred_y - y)*2/n$

Step 3 example code:
   ```
   #Step 3: performing gradient descent on whole dataset:
   best_model = M
   best_performace = 0
   for i in range(epoch):
     pred_y = ...
     gm = ...
     _p = performace(model, val)
     if _p > best_performance:
        best_model = M
        best_performance = _p
     M = M - learning_rate*gm
   ```

2. model evaluation(on val dataset):
  
   calculate pred_y, if more than 0.5, then the predicted label is 1.

In [64]:
# Define sigmoid function with numerical stability
def lr_sigmoid(z):
    return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

# Define function to initialize weights
def lr_initialize_weights(dim):
    return np.zeros((dim, 1))

# Define function to compute cost and gradients
def lr_compute_cost_and_gradients(X, y, weights):
    m = len(y)
    z = np.dot(X, weights)
    h = lr_sigmoid(z)
    epsilon = 1e-10  # Small epsilon to avoid taking the logarithm of zero
    cost = -np.mean(y * np.log(h + epsilon) + (1 - y) * np.log(1 - h + epsilon))
    gradient = np.dot(X.T, (h - y)) / m
    return cost, gradient

# Define function to train logistic regression model
def lr_train_logistic_regression(X, y, learning_rate=0.01, num_iterations=100):
    m, n = X.shape
    weights = lr_initialize_weights(n)
    for i in range(num_iterations):
        cost, gradient = lr_compute_cost_and_gradients(X, y, weights)
        weights -= learning_rate * gradient
    return weights

# Define function to predict using logistic regression model
def lr_predict(X, weights):
    z = np.dot(X, weights)
    return lr_sigmoid(z)

# Main function
def evaluate_lr(data):
    # Split the dataset into features (X) and labels (y)
    X = data.drop(columns=['spam']).values
    y = data['spam'].values.reshape(-1, 1)

    # Split the data into train and test sets (80% train, 20% test)
    split_idx = int(0.8 * len(X))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    # Train logistic regression model
    weights = lr_train_logistic_regression(X_train, y_train)

    # Make predictions on test set
    predictions = lr_predict(X_test, weights)
    predictions = (predictions >= 0.5).astype(int)

    # Calculate accuracy
    accuracy = np.mean(predictions == y_test) * 10000
    return accuracy

evaluate_lr(data)

76.0043431053203

# Model Evaluation

https://scikit-learn.org/stable/modules/model_evaluation.html

In [69]:
def performance(model, data):
  result = 0

  if (model == "nb"):
    print("nb ", end='')
    result = evaluate_nb(train_data, test_data)

  if (model == "knn"):
    print("knn ", end='')
    result = evaluate_knn(train_data, test_data)

  if (model == "lr"):
    print("lr ", end='')
    result = evaluate_lr(data)

  print("result: " + str(result) + "%")

  return

# run each algorithm and compare the results
performance("nb", data)
performance("knn", data)
performance("lr", data)

nb result: 83.27904451682954%
knn result: 89.35939196525516%
lr result: 76.0043431053203%


76.0043431053203