## **SMS Spam Classification Using Word2Vec**

In [1]:
# Import required libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# Initialize lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Step 1: Load the dataset
# Read the SMS spam collection dataset
df = pd.read_csv(r'C:\Users\Administrator\OneDrive\Desktop\MLDS\NLP\Spam_SMS.csv').iloc[:,[1,0]]
df.head(10)

Unnamed: 0,Message,Class
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham
5,FreeMsg Hey there darling it's been 3 week's n...,spam
6,Even my brother is not like to speak with me. ...,ham
7,As per your request 'Melle Melle (Oru Minnamin...,ham
8,WINNER!! As a valued network customer you have...,spam
9,Had your mobile 11 months or more? U R entitle...,spam


In [11]:
df["Class"].value_counts()

Class
ham     4827
spam     747
Name: count, dtype: int64

In [12]:
# Step 2: Convert ratings to binary (0 for negative, 1 for positive)
df['Class'] = df['Class'].apply(lambda x: 1 if x == "spam" else 0)
df["Class"].value_counts()

Class
0    4827
1     747
Name: count, dtype: int64

In [13]:
# Step 3: Text Preprocessing Pipeline
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    url_pattern = r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    text = re.sub(url_pattern, '', text)
    # Remove special characters
    text = re.sub('[^a-zA-Z0-9- ]+', '', text)
    # Remove stopwords and lemmatize
    text = " ".join(lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)  
    # Remove extra spaces
    text = " ".join(text.split())
    return text

In [14]:
# Apply to all messages
df['Message'] = df['Message'].apply(preprocess_text)
df.head(10)

Unnamed: 0,Message,Class
0,go jurong point crazy available bugis n great ...,0
1,ok lar joking wif u oni,0
2,free entry 2 wkly comp win fa cup final tkts 2...,1
3,u dun say early hor u c already say,0
4,nah dont think go usf life around though,0
5,freemsg hey darling 3 week word back id like f...,1
6,even brother like speak treat like aid patent,0
7,per request melle melle oru minnaminunginte nu...,0
8,winner valued network customer selected receiv...,1
9,mobile 11 month u r entitled update latest col...,1


In [58]:
# Step 4: Train-Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['Class'],test_size=0.20,random_state=42)

In [59]:
# Step 5: Feature Extraction - Word2Vec
from gensim.models import Word2Vec
from tqdm import tqdm
import numpy as np

# Tokenize reviews for Word2Vec
tokenized_reviews = [review.split() for review in X_train]
# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_reviews,vector_size=300, window=5, min_count=2, workers=4,sg=1)

In [60]:
# w2v_model.wv.index_to_key #Vocabulary

In [61]:
# Convert text to average word2vec vector for each review.
# Returns zeros vector if no words are found in vocabulary
def get_avg_word2vec(reviews, w2v_model, vector_size=300):
    doc_vectors = []
    for review in tqdm(reviews):
        word_vectors = [w2v_model.wv[word] for word in review.split() if word in w2v_model.wv]
        if word_vectors:
            doc_vectors.append(np.mean(word_vectors, axis=0))
        else:
            doc_vectors.append(np.zeros(vector_size))
    return np.array(doc_vectors)

In [62]:
X_train_w2v = get_avg_word2vec(X_train, w2v_model)
X_test_w2v = get_avg_word2vec(X_test, w2v_model)

100%|██████████| 4459/4459 [00:00<00:00, 17694.52it/s]
100%|██████████| 1115/1115 [00:00<00:00, 17698.82it/s]


In [64]:
# Step 6: Model Training and Evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score
# Initialize classifier
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)

In [66]:
# Train and evaluate with Word2Vec features
rf_classifier_w2v=rf_classifier.fit(X_train_w2v, y_train)
y_pred_w2v = rf_classifier_w2v.predict(X_test_w2v)
print("\nWord2Vec Results:")
print("Word2Vec accuracy: ",accuracy_score(y_test,y_pred_w2v))
print("Word2Vec F1_Score: ",f1_score(y_test,y_pred_w2v))
print("Conf_Metrix:\n",confusion_matrix(y_test, y_pred_w2v))
print("====================================================================")


Word2Vec Results:
Word2Vec accuracy:  0.97847533632287
Word2Vec F1_Score:  0.922077922077922
Conf_Metrix:
 [[949   5]
 [ 19 142]]


In [67]:
# Step 7:Manual predictions
def predict_new_texts(texts, w2v_model, classifier):
    # Preprocess new texts
    processed_texts = [preprocess_text(text) for text in texts]
    # Convert texts to vectors
    text_vectors = get_avg_word2vec(processed_texts, w2v_model)
    # Predict
    predictions = classifier.predict(text_vectors)
    return predictions

In [68]:
new_texts = [
"Eh u remember how 2 spell his name... Yes i did. He v naughty make until i v wet.",
"Fine if that’s the way u feel. That’s the way its gota b",
"England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/ú1.20 POBOXox36504W45WQ 16+",
"Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged"]
# Get predictions
predictions = predict_new_texts(new_texts, w2v_model, rf_classifier)
# Map predictions to labels
sentiment_map = {0: 'Ham', 1: 'Spam', 2: 'Neutral', 3: 'Positive'}
results = [sentiment_map[pred] for pred in predictions]
# Print results
print("\nText Classification Results:")
print("-" * 50)
for text, result in zip(new_texts, results):
    print(f"Text: {text}")
    print(f"Classification: {result}")
    print("-" * 50)

100%|██████████| 4/4 [00:00<00:00, 4143.55it/s]


Text Classification Results:
--------------------------------------------------
Text: Eh u remember how 2 spell his name... Yes i did. He v naughty make until i v wet.
Classification: Ham
--------------------------------------------------
Text: Fine if that’s the way u feel. That’s the way its gota b
Classification: Ham
--------------------------------------------------
Text: England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/ú1.20 POBOXox36504W45WQ 16+
Classification: Spam
--------------------------------------------------
Text: Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged
Classification: Spam
--------------------------------------------------



