In [None]:
# The project is telling either the SMS is spam or ham.
# Author: Muhammad Humayun Khan

import pandas as pd
import numpy as np

dataset_path = '../datasets/sms_spam_dataset/spam.csv'

# Load the dataset
df = pd.read_csv(dataset_path, encoding='latin-1')[['v1', 'v2']]

df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [50]:
df.shape

(5572, 2)

In [31]:

# Rename the columns for clarity
df.columns = ['label', 'text']

In [32]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [33]:
# Clean labels
df['label'] = df['label'].astype(str).str.strip().str.lower()


In [34]:
# Filter valid rows - only ham and spam
df = df[df['label'].isin(['ham', 'spam'])]

In [35]:
# Reset index
df = df.reset_index(drop=True)


In [36]:
# Map labels
y = df['label'].map({'ham': 0, 'spam': 1})


In [37]:
# Check for sanity
print(df['label'].value_counts())
print("Any NaNs in y:", y.isnull().sum())

# the following data showing the more than 80cent of the dataset as ham and it will effect the training towards ham


label
ham     4825
spam     747
Name: count, dtype: int64
Any NaNs in y: 0


In [38]:
# finding the missing rows
df.isnull().sum()

label    0
text     0
dtype: int64

In [39]:
# finding the duplicate rows
df.duplicated().sum()

403

In [None]:
# Now after exploring the dataset, the next steps are
# 1. Text preprocessing which includes: lowercasing, removing punctuation, removing stopwords, tokenization, stemming/lemmatization
# 2. Text Representation/Feature extraction: to convert text to numerical form for ML Model: using TF-IDF or Count Vectorization
# 3. Model Training: Encode labels spam = 1, ham = 0, split data into train/test and train the model using Random Forest
# 4. Evaluate the model using accuracy, precision, recall, f1-score

In [40]:
# Now we need to clean the text which is Text Preprocessing
# Create the function to apply all the text preprocessing steps
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# Initialize once for performance
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation, digits, symbols
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [41]:
# apply on the dataframe
df['text'] = df['text'].apply(preprocess_text)

In [42]:
# check the output
df.head()

Unnamed: 0,label,text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah dont think go usf life around though


In [43]:
df.sample(10)

Unnamed: 0,label,text
1187,ham,there point hangin mr right he makin u happy
2540,ham,said gon na snow start around pm tonite predic...
1914,spam,new textbuddy chat horny guy ur area p free re...
1163,ham,new theory argument win situation loses person...
4605,ham,oh haha den shld went today gee nvm la kaiez d...
2852,ham,doubt could handle time per night case
3424,ham,true easier
2993,ham,idea guess well work hour supposed leave since...
4423,ham,update face book status frequently
5356,ham,tell something thats okay


In [44]:
# Now the next step after the text preprocessing is to convert the text to numerical form for ML Model
from sklearn.feature_extraction.text import TfidfVectorizer


# Initialize vectorizer
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),         # use unigrams + bigrams
    max_df=0.95,                # remove very common terms
    min_df=2,                   # remove rare words
    stop_words='english'       # remove common stopwords
)

# Fit and transform the cleaned text
X = vectorizer.fit_transform(df['text'])

print("TF-IDF shape:", X.shape) # the output will show the number of rows and unique words such as rows = 5572 and columns = 7855



TF-IDF shape: (5572, 7941)


In [45]:
# Now train/test split the data and train the model
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Oversample only the training set due to imbalance dataset
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)


In [20]:
# Now train the model using Random Forest
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

# Initialize the model
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Train the model
rf_model.fit(X_train, y_train)



In [21]:
# predict the model on test
rf_preds = rf_model.predict(X_test)

In [22]:
# Evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("Random Forest Evaluation:")
print("Accuracy:", accuracy_score(y_test, rf_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_preds))
print("Classification Report:\n", classification_report(y_test, rf_preds))


Random Forest Evaluation:
Accuracy: 0.9775784753363229
Confusion Matrix:
 [[965   1]
 [ 24 125]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [48]:
# Now lets predict a new sample message or spam or ham for the Random Forest Classifier
def predict_message(text):
    
    def preprocess_text(text):
        text = text.lower()
        text = ''.join(char for char in text if char.isalnum() or char.isspace())
        tokens = text.split()
        stopwords = set(['the', 'is', 'in', 'and', 'to', 'a', 'of', 'that', 'it', 'this', 'for', 'with'])
        tokens = [word for word in tokens if word not in stopwords]
        return ' '.join(tokens)

    cleaned_text = preprocess_text(text)

    # TF-IDF transform
    text_vector = vectorizer.transform([cleaned_text])  

    # Predict using trained model
    prediction = rf_model.predict(text_vector)

    return "Spam" if prediction[0] == 1 else "Ham"

print(predict_message("Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/123 to claim now!"))
print(predict_message("Win a brand new car by texting WIN to 90909"))
print(predict_message("Hey, are we still meeting at 6pm today?"))




ValueError: X has 7941 features, but RandomForestClassifier is expecting 3380 features as input.