In [2]:
import csv
import pandas as pd
import numpy as np
import chardet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

### Load spam.csv into a pandas dataframe and remove empty columns

In [3]:
with open('Data/spam.csv', 'rb') as f:
        raw_data = f.read(10000)
        result = chardet.detect(raw_data)
        detected_encoding = result['encoding']

print(f"Encoding: {detected_encoding}")

Encoding: Windows-1252


In [4]:
data_path = 'Data/spam.csv'
data = pd.read_csv(data_path, encoding='Windows-1252')
print(data[:10])

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   
5  spam  FreeMsg Hey there darling it's been 3 week's n...        NaN   
6   ham  Even my brother is not like to speak with me. ...        NaN   
7   ham  As per your request 'Melle Melle (Oru Minnamin...        NaN   
8  spam  WINNER!! As a valued network customer you have...        NaN   
9  spam  Had your mobile 11 months or more? U R entitle...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
5        NaN        NaN  
6        NaN  

In [5]:
print(f'There are {len(data.columns)} columns')
print("Column names:")
for c in data.columns:
    print(f'-{c}')

There are 5 columns
Column names:
-v1
-v2
-Unnamed: 2
-Unnamed: 3
-Unnamed: 4


In [6]:
null_columns = data.isna().any()
null_column_list = null_columns[null_columns].index.tolist()
if(null_column_list):
    print("The following columns have null values:")
    for item in null_column_list:
        print(f'-{item}')
else:
    print("There are no columns or rows with null values")

The following columns have null values:
-Unnamed: 2
-Unnamed: 3
-Unnamed: 4


In [7]:
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

### Adding column names to dataset

In [8]:
data.columns = ['label', 'email']

In [9]:
print(data[:10])

  label                                              email
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5  spam  FreeMsg Hey there darling it's been 3 week's n...
6   ham  Even my brother is not like to speak with me. ...
7   ham  As per your request 'Melle Melle (Oru Minnamin...
8  spam  WINNER!! As a valued network customer you have...
9  spam  Had your mobile 11 months or more? U R entitle...


### How many values in each column?

In [10]:
print(data["label"].value_counts())

label
ham     4825
spam     747
Name: count, dtype: int64


### How many values in total dataset?

In [11]:
print(len(data))

5572


In [30]:
import re
import string
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
def preprocess_text(text):
    """
    Preprocessing technique: Text cleaning and normalization
    - Convert to lowercase
    - Remove URLs
    - Remove special characters and digits
    - Remove extra whitespace
    """
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(f'[{re.escape(string.punctuation)}]', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

data['email_processed'] = data['email'].apply(preprocess_text)

In [20]:
vectorizer = TfidfVectorizer(
    max_features=3000,  # Increased from typical 1000
    ngram_range=(1, 2),  # Unigrams and bigrams
    min_df=2,  # Ignore terms that appear in less than 2 documents
    max_df=0.8,  # Ignore terms that appear in more than 80% of documents
    sublinear_tf=True)

In [21]:
X = vectorizer.fit_transform(data['email_processed'])
y = data['label'].map({'ham': 0, 'spam': 1})

In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n" + "="*70)
print("BASELINE MODEL: Multinomial Naive Bayes")
print("="*70)

baseline_model = MultinomialNB()
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)

baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
baseline_precision = precision_score(y_test, y_pred_baseline)
baseline_recall = recall_score(y_test, y_pred_baseline)
baseline_f1 = f1_score(y_test, y_pred_baseline)

print(f"Accuracy:  {baseline_accuracy:.4f}")
print(f"Precision: {baseline_precision:.4f}")
print(f"Recall:    {baseline_recall:.4f}")
print(f"F1-Score:  {baseline_f1:.4f}")


BASELINE MODEL: Multinomial Naive Bayes
Accuracy:  0.9722
Precision: 0.9917
Recall:    0.7987
F1-Score:  0.8848
