## Imports

In [18]:
import pandas as pd
import re
import numpy as np
from collections import Counter

## Preprocessing + calculating word counts

In [19]:
def preprocess_text(text):
    """Normalize a raw message: collapse URLs to a placeholder and lowercase.

    Args:
        text: Raw message string.

    Returns:
        The message with every http(s) URL replaced by the literal token
        'url' and all remaining characters lowercased.
    """
    # The substitution runs on the raw text, before lowercasing, so the
    # pattern must be case-insensitive; otherwise links written in capitals
    # (e.g. "HTTP://WWW.EXAMPLE.COM") are left in place and end up as
    # ordinary vocabulary tokens.
    url_pattern = r'https?:\/\/(?:www\.)?[^\s\/$.?#].[^\s]*'
    text = re.sub(url_pattern, 'url', text, flags=re.IGNORECASE)
    return text.lower()


def create_word_counts(texts, max_vocab_size=None):
    """Count whitespace-separated words per text and build a vocabulary.

    Args:
        texts: Iterable of strings; each one is split on whitespace.
        max_vocab_size: If given, the vocabulary keeps only this many of the
            most frequent words across all texts. ``None`` (the default)
            keeps every word; ``0`` yields an empty vocabulary.

    Returns:
        A ``(word_counts, vocabulary)`` tuple where ``word_counts`` is a list
        with one ``Counter`` per input text (in input order, not filtered by
        the vocabulary) and ``vocabulary`` is a set of words.
    """
    word_counts = [Counter(text.split()) for text in texts]

    # Corpus-wide frequencies, accumulated in document order
    corpus_freq = Counter()
    for doc_counts in word_counts:
        corpus_freq.update(doc_counts)

    # Compare against None explicitly: a plain truthiness test would treat a
    # requested size of 0 as "no limit" and return the full vocabulary.
    if max_vocab_size is None:
        vocabulary = set(corpus_freq)
    else:
        vocabulary = {word for word, _ in corpus_freq.most_common(max_vocab_size)}

    return word_counts, vocabulary

## Naive Bayes implementation

In [20]:
def train_naive_bayes(word_counts, labels, vocabulary, alpha=1.0):
    num_classes = 2
    class_counts = np.bincount(labels)
    word_counts_by_class = {0: {word: 0 for word in vocabulary}, 1: {word: 0 for word in vocabulary}}
    word_totals = np.zeros(num_classes)  # Array for storing sum of words in each class

    # Count word frequencies by class
    for i in range(len(word_counts)):
        label = labels[i]
        for word, count in word_counts[i].items():
            if word in vocabulary:
                word_counts_by_class[label][word] += count
                word_totals[label] += count 
    
    # Calculate prior probabilities
    priors = class_counts / len(labels)
    
    # Calculate conditional probabilities with Laplace smoothing
    num_words = len(vocabulary)
    conditional_probs = {c: {} for c in range(num_classes)}
    
    for c in range(num_classes):
        for word in vocabulary:
            word_count = word_counts_by_class[c].get(word, 0)
            conditional_probs[c][word] = (word_count + alpha) / (word_totals[c] + alpha * num_words)
    
    return priors, conditional_probs


def predict_naive_bayes(word_counts, priors, conditional_probs, vocabulary):
    """Predict the most probable class for each document.

    Args:
        word_counts: Iterable of mappings ``word -> count``, one per document.
        priors: Sequence of class prior probabilities; its length defines the
            class ids ``0 .. len(priors) - 1``.
        conditional_probs: Mapping ``class id -> {word: P(word | class)}``.
        vocabulary: Collection of known words; words outside it are skipped.

    Returns:
        1-D integer NumPy array with one predicted class id per document
        (an empty integer array when ``word_counts`` is empty).
    """
    # The log-priors are the same for every document, so compute them once.
    # A class with prior 0 legitimately gets -inf here (it can never win),
    # so the divide-by-zero warning from log(0) is silenced.
    with np.errstate(divide='ignore'):
        log_priors = np.log(np.asarray(priors, dtype=float))
    class_ids = range(len(log_priors))

    predictions = []
    for doc_counts in word_counts:
        log_probs = log_priors.copy()  # fresh accumulator per document
        for word, word_count in doc_counts.items():
            if word not in vocabulary:
                continue
            # Add the word's log-likelihood contribution for each class
            for c in class_ids:
                # Small epsilon to avoid log(0) if a class lacks this word
                log_probs[c] += word_count * np.log(conditional_probs[c].get(word, 1e-10))

        # Choose the class with the highest log probability (ties -> lowest id)
        predictions.append(np.argmax(log_probs))

    # Explicit integer dtype so an empty input does not yield a float array
    return np.array(predictions, dtype=np.intp)

## Define file paths and function to save predictions

In [21]:
def save_predictions(predictions, output_file_path):
    """Write predictions with integer formatting and report the destination.

    Args:
        predictions: Array-like of numeric values, written one row per line.
        output_file_path: Target handed directly to ``np.savetxt``.
    """
    np.savetxt(
        output_file_path,
        predictions,
        newline='\n',
        fmt='%d',
    )
    confirmation = f"Saved output to {output_file_path}"
    print(confirmation)


# File paths (all relative, so they resolve against the current working directory)
# Training messages; read below into a column named 'text'
input_file_path = './spam-classification/train/in.tsv'
# Training labels; read below into a column named 'label'
expected_file_path = './spam-classification/train/expected.tsv'
# Messages to classify; read below into a column named 'text'
test_file_path = './spam-classification/test/in.tsv'
# Destination passed to save_predictions for the predicted labels
output_file_path = './out.tsv'

## Load training data

In [22]:
# Training messages: tab-separated file read without a header row; the
# column is named 'text'
in_data = pd.read_csv(
    input_file_path,
    names=['text'],
    header=None,
    sep='\t',
)
in_data

Unnamed: 0,text
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."
...,...
3795,"Feb &lt;#&gt; is \I LOVE U\"" day. Send dis t..."
3796,"Actually nvm, got hella cash, we still on for ..."
3797,We tried to contact you re your reply to our o...
3798,"It's ok, at least armand's still around"


In [23]:
# Training labels: tab-separated file read without a header row; the column
# is named 'label'
expected_data = pd.read_csv(
    expected_file_path,
    names=['label'],
    header=None,
    sep='\t',
)
expected_data

Unnamed: 0,label
0,0
1,0
2,1
3,0
4,0
...,...
3795,0
3796,0
3797,1
3798,0


## Preprocess data and train the model

In [24]:
# Normalize the raw training messages (URL placeholder + lowercase)
in_data['cleaned_text'] = in_data['text'].apply(preprocess_text)

# Per-message word counts and the vocabulary learned from the training set
word_counts_train, vocabulary = create_word_counts(in_data['cleaned_text'])

# Labels as a plain NumPy array so they are indexed by position
y_train = expected_data['label'].values

# Messages and labels are loaded from two separate files; fail fast if they
# do not line up one-to-one instead of silently training on shifted labels.
assert len(word_counts_train) == len(y_train), (
    f"{len(word_counts_train)} messages but {len(y_train)} labels"
)

# Train Naive Bayes model
priors, conditional_probs = train_naive_bayes(word_counts_train, y_train, vocabulary)

## Load test data

In [25]:
# Messages to classify: tab-separated file read without a header row; the
# column is named 'text'
test_data = pd.read_csv(
    test_file_path,
    names=['text'],
    header=None,
    sep='\t',
)
test_data

Unnamed: 0,text
0,Yup song bro. No creative. Neva test quality. ...
1,"No dude, its not fake..my frnds got money, tht..."
2,Dude while were makin those weirdy brownies my...
3,URGENT! We are trying to contact you. Last wee...
4,Pls dont restrict her from eating anythin she ...
...,...
1765,This is the 2nd time we have tried 2 contact u...
1766,Will �_ b going to esplanade fr home?
1767,"Pity, * was in mood for that. So...any other s..."
1768,The guy did some bitching but I acted like i'd...


## Preprocess test data and make predictions

In [26]:
# Prepare test data. Only the per-message counts are kept: prediction uses
# the vocabulary learned from the training set. Indexing the result (rather
# than unpacking into `_`) avoids rebinding IPython's last-output variable.
test_data['cleaned_text'] = test_data['text'].apply(preprocess_text)
word_counts_test = create_word_counts(test_data['cleaned_text'])[0]

# Predict labels for test data
y_pred = predict_naive_bayes(word_counts_test, priors, conditional_probs, vocabulary)

# Save predictions to file
save_predictions(y_pred, output_file_path)

# Print class counts; minlength=2 guarantees an entry for both classes even
# when one of them is never predicted (otherwise class_counts[1] would fail)
class_counts = np.bincount(y_pred, minlength=2)
print(f'Number of data in class non-spam is {class_counts[0]}')
print(f'Number of data in class spam is {class_counts[1]}')


Saved output to ./out.tsv
Number of data in class non-spam is 1552
Number of data in class spam is 218
