In [1]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Change working directory to where your .ipynb file is located
os.chdir('/content/drive/My Drive/CSDS 335/Project1/code')
# os.chdir('/content/drive/My Drive/Project1/code')

In [4]:
# Import all of the models and related functions
from data_loading import *
from cv_and_eval import *

from naive_bayes import *
from svm import *
from knn import *
from decisiontree import *
from adaboost import *

In [5]:
# Load the dataset
X2, y2 = load_dataset_2(212)
X1, y1 = load_dataset_1(212)

# Naive Bayes
Hyperparameter: The size of training set

### Dataset1

In [15]:
# Optimize parameters
print('Grid Search Results:')
split_prop = naive_bayes_test_params(X1, y1, [.5,.6,.7,.8,.9])

# Split the data based on hyperparam tuning
split_ind = int(split_prop*len(X1))
X_train = X1[:split_ind]
y_train = y1[:split_ind]

X_test = X1[split_ind:]
y_test = y1[split_ind:]

# Train and evaluate the final model
model1 = NaiveBayes()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

print()
acc, prec, rec, f1 = eval_predictions(y_test, y_pred)
print("Final Results:")
print(f'Accuracy: {acc}')
print(f'Precision: {prec}')
print(f'Recall: {rec}')
print(f'F1 score: {f1}')

Grid Search Results:
best split threshold: 0.5
best acc: 0.9404797601199402
best prec: 0.9421356421257482
best rec: 0.9026373626282187
best f1: 0.9179625156311394

Final Results:
Accuracy: 0.9473684210526315
Precision: 0.9696969696959902
Recall: 0.8888888888880658
F1 score: 0.9275362318831618


### Dataset2

In [16]:
# Optimize parameters
print('Grid Search Results:')
split_prop = naive_bayes_test_params(X2, y2, [.5,.6,.7,.8,.9])

# Split the data based on hyperparam tuning
split_ind = int(split_prop*len(X2))
X_train = X2[:split_ind]
y_train = y2[:split_ind]

X_test = X2[split_ind:]
y_test = y2[split_ind:]

# Train and evaluate the final model
model1 = NaiveBayes()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

print()
acc, prec, rec, f1 = eval_predictions(y_test, y_pred)
print("Final Results:")
print(f'Accuracy: {acc}')
print(f'Precision: {prec}')
print(f'Recall: {rec}')
print(f'F1 score: {f1}')

Grid Search Results:
best split threshold: 0.6
best acc: 0.7327142857142857
best prec: 0.5891774891722166
best rec: 0.7016233766155413
best f1: 0.6338959756922076

Final Results:
Accuracy: 0.6648648648648648
Precision: 0.5142857142849796
Recall: 0.562499999999121
F1 score: 0.537313432835019


# SVM
Hyperparameters: C(lambda) and gamma

### Dataset1

In [17]:
# Train test split
split_ind = int(0.8*len(X1))
X_train = X1[:split_ind]
y_train = y1[:split_ind]

X_test = X1[split_ind:]
y_test = y1[split_ind:]

# Optimize parameters using X_train
print('Grid Search Results:')
l1, g1 = SVM_test_params(X_train, y_train)

# Train best model and evaluate
model1 = rbf_SVM(C = l1, gamma = g1)
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

print()
acc, prec, rec, f1 = eval_predictions(y_test, y_pred)
print("Final Results:")
print(f'Accuracy: {acc}')
print(f'Precision: {prec}')
print(f'Recall: {rec}')
print(f'F1 score: {f1}')

Grid Search Results:


  return 2 * (prec * rec) / (prec + rec)


best gamma: 0.01
best lambda: 0.1
best acc: 0.9410392364793214
best prec: 0.8716071428518577
best rec: 0.854702911462551
best f1: nan

Final Results:
Accuracy: 0.9473684210526315
Precision: 0.9736842105237534
Recall: 0.8809523809502834
F1 score: 0.9249999999976874


### Dataset2

In [18]:
# Dataset 2
# Train test split
split_ind = int(0.8*len(X2))
X_train = X2[:split_ind]
y_train = y2[:split_ind]

X_test = X2[split_ind:]
y_test = y2[split_ind:]

# Optimize parameters using X_train
print('Grid Search Results:')
l2, g2 = SVM_test_params(X_train, y_train)

# Train best model and evaluate
model2 = rbf_SVM(C = l2, gamma = g2)
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)

print()
acc, prec, rec, f1 = eval_predictions(y_test, y_pred)
print("Final Results:")
print(f'Accuracy: {acc}')
print(f'Precision: {prec}')
print(f'Recall: {rec}')
print(f'F1 score: {f1}')

Grid Search Results:
best gamma: 1.0
best lambda: 0.1
best acc: 0.7234984984984985
best prec: 0.63089133088466
best rec: 0.5273613396363939
best f1: 0.5554126156023513

Final Results:
Accuracy: 0.7311827956989247
Precision: 0.6521739130406428
Recall: 0.4687499999985351
F1 score: 0.5454545454525619


# kNN

### Dataset1

In [8]:
# instantiate the KNNModel
knn_model = KNNModel()

#range of neighbor values to try
num_neighbors_values = [1, 3, 5, 7, 9]

# Perform 10-fold cross-validation
print("----- Dataset 1 -----")
results = cross_val_10fold1(knn_model, X1, y1, num_neighbors_values)

best_value = np.argmax(results['f1_score'])
print(f'Best Performing Configuration:')
print(f'Num Neighbors: {results["num_neighbors"][best_value]}')
print(f'Accuracy: {results["accuracy"][best_value]:.4}')
print(f'Precision: {results["precision"][best_value]:.4}')
print(f'Recall: {results["recall"][best_value]:.4}')
print(f'F1-Score: {results["f1_score"][best_value]:.4}')

# Instantiate the KNNModel
knn_model = KNNModel()

# Perform 10-fold cross-validation
print("----- Dataset 2 -----")
results = cross_val_10fold1(knn_model, X2, y2, num_neighbors_values)

best_value = np.argmax(results['f1_score'])
print(f'Best Performing Configuration:')
print(f'Num Neighbors: {results["num_neighbors"][best_value]}')
print(f'Accuracy: {results["accuracy"][best_value]:.4}')
print(f'Precision: {results["precision"][best_value]:.4}')
print(f'Recall: {results["recall"][best_value]:.4}')
print(f'F1-Score: {results["f1_score"][best_value]:.4}')

----- Dataset 1 -----
Best Performing Configuration:
Num Neighbors: 5
Accuracy: 0.9722
Precision: 0.9885
Recall: 0.9368
F1-Score: 0.9613
----- Dataset 2 -----
Best Performing Configuration:
Num Neighbors: 3
Accuracy: 0.6714
Precision: 0.5319
Recall: 0.4304
F1-Score: 0.4715


# Decision Tree

In [10]:
# Assuming X1 and y1 are your dataset
# Split the data based on hyperparam tuning
split_ind = int(0.8 * len(X1))
X_train = X1[:split_ind]
y_train = y1[:split_ind]

X_test = X1[split_ind:]
y_test = y1[split_ind:]

# Train the DecisionTree model
dt_model = DecisionTree()

# Perform cross-validation on the training set
cross_val_10fold2(dt_model, X_train, y_train)

# Train the model on the full training set
dt_model.train(X_train, y_train)

# Evaluate the final model using the testing set
y_pred = dt_model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis=1)


# Print evaluation metrics
acc = calculate_accuracy(y_test, y_pred_class)
prec = precision(y_test, y_pred_class)
rec = recall(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)

print("Final Results:")
print(f'Accuracy: {acc}')
print(f'Precision: {prec}')
print(f'Recall: {rec}')
print(f'F1 score: {f1}')


Final Results:
Accuracy: 0.9385964912280702
Precision: 0.9999999999971428
Recall: 0.8333333333313492
F1 score: 0.9090909090885477


  return 2 * (prec * rec) / (prec + rec)


In [11]:
# Assuming X2 and y2 are your dataset
split_ind = int(0.8 * len(X1))
X_train = X2[:split_ind]
y_train = y2[:split_ind]

X_test = X2[split_ind:]
y_test = y2[split_ind:]

# Train the DecisionTree model
dt_model = DecisionTree()

# Perform cross-validation on the training set
cross_val_10fold2(dt_model, X_train, y_train)

# Train the model on the full training set
dt_model.train(X_train, y_train)

# Evaluate the final model using the testing set
y_pred = dt_model.predict(X_test)
y_pred_class = np.argmax(y_pred, axis=1)


# Print evaluation metrics
acc = calculate_accuracy(y_test, y_pred_class)
prec = precision(y_test, y_pred_class)
rec = recall(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)

print("Final Results:")
print(f'Accuracy: {acc}')
print(f'Precision: {prec}')
print(f'Recall: {rec}')
print(f'F1 score: {f1}')


Final Results:
Accuracy: 0.5714285714285714
Precision: 0.3333333333222222
Recall: 0.499999999975
F1 score: 0.399999999984


# AdaBoost

In [12]:
#DecisionTREE
import math
import numpy as np
from collections import Counter

class TreeNode():
  def __init__(self, dataset, feature_index, threshold, prediction_probs, info_gain) -> None:
    self.data = dataset
    self.feature_index = feature_index
    self.threshold = threshold
    self.prediction_probs = prediction_probs
    self.info_gain = info_gain
    self.left = None
    self.right = None

class DecisionTree():

  def __init__(self, max_depth=6, min_samples_split=1, min_info_gain=0.0, num_features_split=None, adaboost_weight=None) -> None:
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.min_info_gain = min_info_gain
    self.num_features_split = num_features_split
    self.adaboost_weight = adaboost_weight
    self.tree = None


  #using entropy class formula
  def entropy(self, class_probs: list) -> float:
    return sum([-p * np.log2(p) for p in class_probs if p > 0])

  def class_probs(self, targets: list) -> list:
    total = len(targets)
    return [target_count/total for target_count in Counter(targets).values()]

  def data_entropy(self, targets: list) -> float:
    return self.entropy(self.class_probs(targets))

  def partition_entropy(self, subsets: list) -> float:
    total = sum([len(subset) for subset in subsets])
    return sum([self.data_entropy(subset) * (len(subset) / total) for subset in subsets])

  def split(self, dataset: np.array, feature_index: int, threshold: float) -> tuple:
    #all rows that are less than threshold
    below_threshold_group = dataset[:, feature_index] < threshold
    group1 = dataset[below_threshold_group]
    group2 = dataset[~below_threshold_group]
    return group1, group2

  def target_probs(self, dataset: np.array) -> np.array:
    target_values = dataset[:, -1]
    total_target_values = len(target_values)
    target_probs = np.zeros(len(self.target_values), dtype=float)

    for i, target_val in enumerate(self.target_values):
      target_index = np.where(target_values == i)[0]
      if len(target_index) > 0:
        target_probs[i] = len(target_index) / total_target_values

    return target_probs

  def best_split(self, dataset: np.array) -> tuple:
    min_entropy = math.inf
    min_entropy_feature_index = None
    min_entropy_threshold = None

    for i in range(dataset.shape[1]-1):
      threshold = np.median(dataset[:, i])
      subtree1, subtree2 = self.split(dataset, i, threshold)
      split_entropy = self.partition_entropy([subtree1[:, -1], subtree2[:, -1]])
      #finding split with lowest entropy
      if split_entropy < min_entropy:
        min_entropy = split_entropy
        min_entropy_feature_index = i
        min_entropy_threshold = threshold
        subtree1_min, subtree2_min = subtree1, subtree2

    return subtree1_min, subtree2_min, min_entropy_feature_index, min_entropy_threshold, min_entropy

  def build_tree(self, dataset: np.array, curr_depth: int) -> TreeNode:
    if curr_depth >= self.max_depth:
      return None

    subtree1, subtree2, split_feature_index, split_threshold, split_entropy = self.best_split(dataset)

    target_probs = self.target_probs(dataset)

    node_entropy = self.entropy(target_probs)
    info_gain = node_entropy - split_entropy

    node = TreeNode(dataset, split_feature_index, split_threshold, target_probs, info_gain)

    if(self.min_samples_split > subtree1.shape[0] or self.min_samples_split > subtree2.shape[0]):
      return node

    elif info_gain < self.min_info_gain:
      return node

    curr_depth = curr_depth + 1
    #continue recursively until one of return conditions is met
    node.left = self.build_tree(subtree1, curr_depth)
    node.right = self.build_tree(subtree2, curr_depth)

    return node
  def predict_one_sample(self, X: np.array) -> np.array:
    node = self.tree

    while node:
        if node.left is None and node.right is None:  # Check if the node is a leaf
            return node.prediction_probs
        else:
            if X[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right

    # Handle the case where no leaf node is reached
    return None  # Or raise an exception if needed

  def train(self, X_train: np.array, Y_train: np.array) -> None:
    self.target_values = np.unique(Y_train)
    train_data = np.concatenate((X_train, np.reshape(Y_train, (-1, 1))), axis=1)

    self.tree = self.build_tree(dataset=train_data, curr_depth=0)


  def predict_probs(self, X_set: np.array) -> np.array:
    pred_prob = np.apply_along_axis(self.predict_one_sample, 1, X_set)

    return pred_prob

  def predict(self, X_set: np.array) -> np.array:
    predictions = []
    for sample in X_set:
        prediction = self.predict_one_sample(sample)
        predictions.append(prediction)
    return np.array(predictions)

In [13]:
import numpy as np
from collections import Counter

class AdaBoost():
    def __init__(self, num_base_learners=10):
        self.num_base_learners = num_base_learners
        self.base_learners = []
        self.base_learner_weights = []

    def calc_adaboost_weight(self, base_learner, X, y):
        prediction = base_learner.predict(X)
        prediction = np.argmax(prediction, axis=1)
        err = 1 - np.mean(prediction == y)
        return 0.5 * np.log((1 - err) / max(err, 1e-10))

    def train(self, X_train, y_train):
        n_samples = X_train.shape[0]
        weights = np.ones(n_samples) / n_samples

        for _ in range(self.num_base_learners):
            base_learner = DecisionTree(max_depth=6)
            base_learner.train(X_train, y_train)
            adaboost_weight = self.calc_adaboost_weight(base_learner, X_train, y_train)
            self.base_learners.append(base_learner)
            self.base_learner_weights.append(adaboost_weight)

            predictions = base_learner.predict(X_train)
            predicted_labels = np.argmax(predictions, axis=1)  # Convert probabilities to class labels
            err = np.mean(predicted_labels != y_train)
            beta = err / (1 - err)
            errors = (predicted_labels != y_train)
            weights *= np.exp(beta * errors)
            weights /= np.sum(weights)


    def predict(self, X):
        pred_scores = np.zeros((len(X), len(self.base_learners[0].target_values)))

        for i, base_learner in enumerate(self.base_learners):
            pred_probs = base_learner.predict_probs(X)
            pred_scores += pred_probs * self.base_learner_weights[i]

        return np.argmax(pred_scores, axis=1)



In [14]:
#Dataset 1
adaboost = AdaBoost()

adaboost.train(X1, y1)

print('For Dataset1:')
_, _, _, _ = cross_val_10fold3(adaboost, X1, y1)
print()

#Dataset 2
adaboost2 = AdaBoost()

adaboost2.train(X2, y2)

print('For Dataset2:')
_, _, _, _ = cross_val_10fold3(adaboost2, X2, y2)

For Dataset1:
----- Over 10 folds -----
Accuracy: 0.9525
Precision: 0.9688
Recall: 0.904
F1-Score: 0.9339

For Dataset2:
----- Over 10 folds -----
Accuracy: 0.7573
Precision: 0.6929
Recall: 0.5245
F1-Score: 0.5865
