In [1]:
# Import the tokenizer plus counting/numeric helpers
from collections import Counter

import numpy as np
from nltk.tokenize import WordPunctTokenizer

In [2]:
# Read the "SMSSpamCollection" file, one SMS per line.
# Each line holds a label (the text before the first tab) followed by the message (the rest of the line).
# Labels are stored in "labels"; lower-cased, tokenized messages are stored in "messages".
labels = []
messages = []

tokenizer = WordPunctTokenizer()

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open("SMSSpamCollection", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line has no label/message pair and would break the unpacking below.
            continue
        label, message = line.split("\t", 1)
        labels.append(label)
        messages.append(tokenizer.tokenize(message.lower()))

In [15]:
# Preview the first five entries: each label followed by its token list
for i in range(5):
    print(labels[i], messages[i], sep="\n")

ham
['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']
ham
['ok', 'lar', '...', 'joking', 'wif', 'u', 'oni', '...']
spam
['free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005', '.', 'text', 'fa', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', 'c', "'", 's', 'apply', '08452810075over18', "'", 's']
ham
['u', 'dun', 'say', 'so', 'early', 'hor', '...', 'u', 'c', 'already', 'then', 'say', '...']
ham
['nah', 'i', 'don', "'", 't', 'think', 'he', 'goes', 'to', 'usf', ',', 'he', 'lives', 'around', 'here', 'though']


In [3]:
def split_dataset(messages, labels, train_fraction=0.8):
    """Split messages and labels into leading training and trailing testing portions.

    The split is positional (no shuffling): the first
    ``int(train_fraction * len(messages))`` items form the training set and
    the remainder forms the testing set.

    Args:
        messages: Sequence of messages.
        labels: Sequence of labels, aligned index-by-index with ``messages``.
        train_fraction: Share of the data placed in the training set,
            between 0 and 1 inclusive. Defaults to 0.8 (an 80-20 split).

    Returns:
        Tuple ``(train_messages, test_messages, train_labels, test_labels)``.

    Raises:
        ValueError: If ``messages`` and ``labels`` differ in length, or if
            ``train_fraction`` is outside ``[0, 1]``.
    """
    if len(messages) != len(labels):
        # Slicing mismatched inputs at one index would silently misalign labels.
        raise ValueError("messages and labels must have the same length")
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    split_index = int(train_fraction * len(messages))
    train_messages = messages[:split_index]
    test_messages = messages[split_index:]
    train_labels = labels[:split_index]
    test_labels = labels[split_index:]
    return train_messages, test_messages, train_labels, test_labels

In [4]:
# Partition the corpus into training and testing portions
(train_messages, test_messages,
 train_labels, test_labels) = split_dataset(messages, labels)

In [18]:
def term_frequency(word, document):
    """Return the relative frequency of ``word`` within ``document``.

    Args:
        word: Token to count.
        document: List of tokens.

    Returns:
        Occurrences of ``word`` divided by the number of tokens in
        ``document``; 0.0 when ``document`` is empty.
    """
    if not document:
        # An empty document contains no tokens; avoid dividing by zero.
        return 0.0
    return document.count(word) / len(document)

In [19]:
def document_frequency(word, documents):
    """Return how many entries of ``documents`` contain ``word``.

    A document is counted once no matter how often the word occurs in it.
    """
    return sum(1 for tokens in documents if word in tokens)

In [20]:
def idf(word, documents):
    """Return the inverse document frequency of ``word`` over ``documents``.

    Computed as ``log(len(documents) / df)``, where ``df`` is the number of
    documents containing ``word``.

    Returns:
        The natural-log IDF value, or 0 when no document contains the word
        (which avoids a division by zero).
    """
    # Scan the corpus once and reuse the count for both the guard and the ratio.
    df = document_frequency(word, documents)
    if df == 0:
        return 0
    return np.log(len(documents) / df)

In [21]:
def tf_idf(word, document, documents):
    """Return the TF-IDF weight of ``word`` for ``document`` within ``documents``.

    This is the product of the word's term frequency in ``document`` and its
    inverse document frequency over ``documents``.
    """
    tf_part = term_frequency(word, document)
    idf_part = idf(word, documents)
    return tf_part * idf_part

In [22]:
def tf_idf_vectorizor(messages):
    """Return one TF-IDF vector per message.

    Each vector has one entry per token of its message, in token order, so
    repeated tokens produce repeated entries. The whole ``messages`` list
    serves as the corpus for the document-frequency part.
    """
    return [
        [tf_idf(token, tokens, messages) for token in tokens]
        for tokens in messages
    ]

In [23]:
# Build TF-IDF vectors for the training messages
train_vectors = tf_idf_vectorizor(messages=train_messages)

In [24]:
# Show the first five training messages, each followed by its TF-IDF vector
for i in range(5):
    print(train_messages[i], train_vectors[i], sep="\n")

['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']
[0.1279150421400878, 0.2176927489283138, 0.35011165852614484, 0.2501993554928794, 0.05978210845106636, 0.25417061298472626, 0.10773202686101135, 0.23206110252380252, 0.1378720668258631, 0.08049501253520311, 0.27545501397497585, 0.156595371061921, 0.16549429190767345, 0.20980766561004177, 0.28305174550805734, 0.1719172369004759, 0.32123052600281377, 0.17334288039168272, 0.26903206898217347, 0.13304973549108565, 0.13282266862577047, 0.35011165852614484, 0.16700961041479323, 0.17334288039168272]
['ok', 'lar', '...', 'joking', 'wif', 'u', 'oni', '...']
[0.368955168882722, 0.6171179877284687, 0.5200286411750482, 0.9636915780084414, 0.6822801031826295, 0.23767376676088595, 0.9130084394949208, 0.5200286411750482]
['free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '2

In [11]:
def tf_idf_vectorizor(messages):
    """Return one TF-IDF vector per message, computed in a single pass over the corpus.

    Document frequencies are gathered once for the whole corpus, and each
    distinct word's IDF is computed once, rather than being recomputed for
    every token.

    Args:
        messages: List of messages, each a list of tokens.

    Returns:
        List with one vector per message. Each vector has one entry per token
        of the message, in token order (repeated tokens produce repeated
        entries). An entry is ``(count in message / message length) *
        log(number of messages / number of messages containing the word)``.
    """
    num_messages = len(messages)

    # Per-message token counts, and the number of messages containing each word.
    counts_per_message = [Counter(message) for message in messages]
    doc_freq = Counter()
    for message in messages:
        # Use a set so each message contributes at most once per word.
        doc_freq.update(set(message))

    # IDF depends only on the word, so compute it once per distinct word.
    idf_by_word = {
        word: np.log(num_messages / df) for word, df in doc_freq.items()
    }

    tf_idf_vectors = []
    for message, counts in zip(messages, counts_per_message):
        length = len(message)
        tf_idf_vectors.append(
            [(counts[word] / length) * idf_by_word[word] for word in message]
        )

    return tf_idf_vectors

In [12]:
# Recompute the training TF-IDF vectors using the optimized vectorizer
train_vectors = tf_idf_vectorizor(messages=train_messages)

In [13]:
# Show the first five training messages, each followed by its TF-IDF vector
for i in range(5):
    print(train_messages[i], train_vectors[i], sep="\n")

['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']
[0.1279150421400878, 0.2176927489283138, 0.35011165852614484, 0.2501993554928794, 0.05978210845106636, 0.25417061298472626, 0.10773202686101135, 0.23206110252380252, 0.1378720668258631, 0.08049501253520311, 0.27545501397497585, 0.156595371061921, 0.16549429190767345, 0.20980766561004177, 0.28305174550805734, 0.1719172369004759, 0.32123052600281377, 0.17334288039168272, 0.26903206898217347, 0.13304973549108565, 0.13282266862577047, 0.35011165852614484, 0.16700961041479323, 0.17334288039168272]
['ok', 'lar', '...', 'joking', 'wif', 'u', 'oni', '...']
[0.368955168882722, 0.6171179877284687, 0.5200286411750482, 0.9636915780084414, 0.6822801031826295, 0.23767376676088595, 0.9130084394949208, 0.5200286411750482]
['free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '2