In [117]:
# Import tokenizer
from nltk.tokenize import WordPunctTokenizer
import numpy as np

In [118]:
# Open "SMSSpamCollection" file to read the data
# For each line in the file, split the line into two parts: the label (first word) and the message (the rest of the line)
# Store the label in the list "labels" and the tokenized message in the list "messages"
labels = []
messages = []

tokenizer = WordPunctTokenizer()

with open("SMSSpamCollection") as file:
    for line in file:
        line = line.strip()
        label, message = line.split("\t", 1)
        labels.append(label)
        messages.append(tokenizer.tokenize(message.lower()))

In [119]:
# Print the first 5 messages
for i in range(5):
    print(labels[i])
    print(messages[i])

ham
['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']
ham
['ok', 'lar', '...', 'joking', 'wif', 'u', 'oni', '...']
spam
['free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005', '.', 'text', 'fa', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', 'c', "'", 's', 'apply', '08452810075over18', "'", 's']
ham
['u', 'dun', 'say', 'so', 'early', 'hor', '...', 'u', 'c', 'already', 'then', 'say', '...']
ham
['nah', 'i', 'don', "'", 't', 'think', 'he', 'goes', 'to', 'usf', ',', 'he', 'lives', 'around', 'here', 'though']


In [120]:
def tf_idf_vectorizor(messages, vector_size=20):
    num_messages = len(messages)                        # Count the number of messages provided in the input

    document_frequencies = {}                           # Store the number of messages which contain the specified word
    
    words_in_messages = []                              # Convert each message in messages into a list of dictonaries

    for message in messages:                            # For each message in messages:
        words_in_message = {}                               # Store the count of each word in the message in a dictionary
        unique_words = set(message)
        for word in message:                            # For each word in the message:
            if word in words_in_message:                    # If the word is already in the dictionary, increment the count
                words_in_message[word] += 1
            else:                                           # Otherwise, add the word to the dictionary with a count of 1
                words_in_message[word] = 1
        words_in_messages.append(words_in_message)
        for word in unique_words:                       # Update the document frequencies for each unique word in the message
            if word in document_frequencies:
                document_frequencies[word] += 1
            else:
                document_frequencies[word] = 1

    """
    tf_idf_vectors = []                                 # Store the TF-IDF vectors for all messages
    for i, message in enumerate(messages):              # For each message in messages:
        tf_idf_vector = []                                  # Calculate the TF-IDF vector and store it in tf_idf_vectors
        for word in message:
            term_frequency = words_in_messages[i][word] / len(message)
            document_frequency = document_frequencies[word]
            inverse_document_frequency = np.log(num_messages / document_frequency)
            tf_idf_vector.append(term_frequency * inverse_document_frequency)
        tf_idf_vectors.append(tf_idf_vector)
    """

    # Calculate the TF-IDF vectors for all messages and store values in dictionary
    tf_idf_vectors = []
    for i, message in enumerate(messages):
        tf_idf_vector = {}
        for word in message:
            term_frequency = words_in_messages[i][word] / len(message)
            document_frequency = document_frequencies[word]
            inverse_document_frequency = np.log(num_messages / document_frequency)
            tf_idf_vector[word] = term_frequency * inverse_document_frequency
        tf_idf_vectors.append(tf_idf_vector)

    # Get the most frequent words
    most_frequent_words = sorted(document_frequencies, key=document_frequencies.get, reverse=True)[:vector_size]

    # Create fixed length vector for each message
    sentence_vectors = []
    for i, message in enumerate(messages):
        vector = []
        # For each word in the document_frequency dictionary, add the TF-IDF value to the vector
        for word in most_frequent_words:
            if word in tf_idf_vectors[i]:
                vector.append(tf_idf_vectors[i][word])
            else:
                vector.append(0)
        sentence_vectors.append(vector)
    
    return sentence_vectors                               # Return the list of TF-IDF vectors

In [136]:
# Vectorize all messages, vector size is the size of the fixed length TF-IDF vector
vectors = tf_idf_vectorizor(messages, vector_size=30)

In [137]:
# Print the first 5 fixed length vectors
# Ordered by appearance in the document_frequency dictionary
# Words not appearing in the message are represented by 0
for i in range(5):
    print(messages[i])
    print(vectors[i])

['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']
[0, 0, 0, 0, 0, 0.06049624534521248, 0, 0, 0, 0, 0, 0.0803680816823087, 0, 0, 0.17242415432365424, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['ok', 'lar', '...', 'joking', 'wif', 'u', 'oni', '...']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0.23715494936964898, 0, 0, 0, 0, 0.5172724629709627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005', '.', 'text', 'fa', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', 'c', "'", 's', 'apply', '08452810075over18', "'", 's']
[0.017727044722041806, 0, 0.09193547119199424, 0, 0.07344525075111702, 0, 0.03981021770723546, 0, 0, 0, 0, 0.04945728103526689, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.12213852858349523, 0, 0, 0, 0.063034

In [138]:
def split_dataset(messages, labels, vectors):
    # Split the dataset into training and testing sets with a 80-20 split
    # Return the training and testing sets for both messages and labels
    # TODO: Change split to 80-20 (0.8), currently set to 0.2 for code testing
    split_index = int(0.8 * len(messages))
    train_messages = messages[:split_index]
    test_messages = messages[split_index:]
    train_labels = labels[:split_index]
    test_labels = labels[split_index:]
    train_vectors = vectors[:split_index]
    test_vectors = vectors[split_index:]
    return train_messages, test_messages, train_labels, test_labels, train_vectors, test_vectors

In [139]:
train_messages, test_messages, train_labels, test_labels, train_vectors, test_vectors = split_dataset(messages, labels, vectors)

In [126]:
# Implement the k-NN algorithm using the Euclidean distance metric on two vectors
def knn_classifier(test_vectors, train_vectors, train_labels, k):
    # Store the predicted labels for all test messages
    predicted_labels = []

    for test_vector in test_vectors:                        # For each test message:
        distances = []                                      # Calculate the Euclidean distance between the test message and all training messages
        for train_vector in train_vectors:
            distance = np.linalg.norm(np.array(train_vector) - np.array(test_vector))
            distances.append(distance)

        # Sort the distances from smallest to largest and reorder the labels correspondingly
        sorted_indices = np.argsort(distances)

        # Store the k-nearest labels
        nearest_labels = []
        for i in range(k):
            nearest_labels.append(train_labels[sorted_indices[i]])

        # Predict the label for the test message based on the majority label of the k-nearest labels
        predicted_label = max(set(nearest_labels), key=nearest_labels.count)
        predicted_labels.append(predicted_label)

    return predicted_labels

In [160]:
# Print the first n predicted labels for k neighbors
n = len(test_vectors)
k = 1
predicted_labels = knn_classifier(test_vectors[:n], train_vectors, train_labels, k)
for i in range(n):
    print(test_labels[i], predicted_labels[i])

ham ham
spam ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
spam spam
ham ham
spam ham
ham ham
ham ham
ham ham
ham ham
ham spam
ham spam
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham spam
ham ham
ham ham
ham ham
ham ham
ham ham
ham spam
ham ham
ham spam
ham ham
ham spam
spam spam
spam spam
ham ham
ham ham
ham ham
ham ham
ham ham
spam ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
spam ham
ham ham
ham ham
spam spam
ham ham
ham spam
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
spam spam
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
spam spam
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham spam
ham ham
spam spam
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
ham ham
spam spam
ham ham
ham ham
spam spam
ham ham
ham ham
spam spam
spam spam
spam s

In [129]:
# Define a function to calculate performance metric
def performance_metric(predicted_labels, actual_labels):
    # Ensure the number of predicted labels and actual labels are the same
    assert len(predicted_labels) == len(actual_labels)
    # Initialize the performance metric counts
    true_positives = 0
    false_positives = 0
    true_negatives = 0
    false_negatives = 0
    # Loop through each pair of predicted and actual labels and update the counts
    for i in range(len(predicted_labels)):
        if predicted_labels[i] == "spam" and actual_labels[i] == "spam":
            true_positives += 1
        elif predicted_labels[i] == "spam" and actual_labels[i] == "ham":
            false_positives += 1
        elif predicted_labels[i] == "ham" and actual_labels[i] == "ham":
            true_negatives += 1
        elif predicted_labels[i] == "ham" and actual_labels[i] == "spam":
            false_negatives += 1
    # Calculate accuracy, precision, recall, and F1 score
    accuracy = (true_positives + true_negatives) / len(actual_labels)
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0
    # Return all performance metrics
    return true_positives, false_positives, true_negatives, false_negatives, accuracy, precision, recall, f1_score

In [161]:
# Test performance metric function with a test set
true_positives, false_positives, true_negatives, false_negatives, accuracy, precision, recall, f1_score = performance_metric(predicted_labels, test_labels[:n])
print("True Positives:", true_positives)
print("False Positives:", false_positives)
print("True Negatives:", true_negatives)
print("False Negatives:", false_negatives)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

True Positives: 129
False Positives: 76
True Negatives: 894
False Negatives: 16
Accuracy: 0.9174887892376682
Precision: 0.6292682926829268
Recall: 0.8896551724137931
F1 Score: 0.7371428571428572
