# data loading & pre-processing

In [8]:
import pandas as pd
import os
# Check if the file exists in the specified path

# Load the SMS spam dataset
file_path = 'SMSSpamCollection'
df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'])

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocess The Text Messages

In [25]:
import re
# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuations and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize the text
    tokens = re.findall(r'\b\w+\b', text)
    return tokens

# Apply the preprocessing function to the 'message' column
df['processed_message'] = df['message'].apply(preprocess_text)

# Display the first few rows of the dataframe with the processed messages
df.head(20)

Unnamed: 0,label,message,processed_message
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, dont, think, he, goes, to, usf, he, l..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...,"[freemsg, hey, there, darling, its, been, 3, w..."
6,ham,Even my brother is not like to speak with me. ...,"[even, my, brother, is, not, like, to, speak, ..."
7,ham,As per your request 'Melle Melle (Oru Minnamin...,"[as, per, your, request, melle, melle, oru, mi..."
8,spam,WINNER!! As a valued network customer you have...,"[winner, as, a, valued, network, customer, you..."
9,spam,Had your mobile 11 months or more? U R entitle...,"[had, your, mobile, 11, months, or, more, u, r..."


## Splitting the dataset into training & test sets

In [26]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and test sets while maintaining the same class distribution
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Display the class distribution in the training set
print("Training set class distribution:")
print(train_df['label'].value_counts(normalize=True))

# Display the class distribution in the test set
print("Test set class distribution:")
print(test_df['label'].value_counts(normalize=True))

Training set class distribution:
label
ham     0.865829
spam    0.134171
Name: proportion, dtype: float64
Test set class distribution:
label
ham     0.866368
spam    0.133632
Name: proportion, dtype: float64


## Maximum Likelyhood Estimator