<a href="https://colab.research.google.com/github/jagadish-sonamale/nlp-projects/blob/main/spam_ham_bow_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spam or Ham Prediction using Bag of Words, TF-IDF, Word2Vec and NLTK

## Tokenization

In [59]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Download the NLTK resources this notebook relies on.
# Packages that are already present are reported as up-to-date, so re-running is cheap.
nltk.download('punkt_tab')  # tokenizer data needed by word_tokenize
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')  # lexical data needed by WordNetLemmatizer

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
def preprocess(text, lemmatizer, stop_words=None):
  """
  Preprocesses a single text message.

  Every non-letter character is replaced with a space, the text is
  lower-cased and tokenized, stop words are dropped and the remaining
  tokens are lemmatized.

  Args:
    text: Raw message string.
    lemmatizer: Object exposing ``lemmatize(word)`` (e.g. ``WordNetLemmatizer``).
    stop_words: Optional collection of words to drop. When omitted, NLTK's
      English stop-word list is used. Pass a prebuilt set to avoid reloading
      the list on every call.

  Returns:
    The surviving lemmatized tokens joined by single spaces.
  """
  if stop_words is None:
    # Load the list once per call rather than once per token, and use a set
    # so each membership test is a hash lookup instead of a list scan.
    stop_words = set(stopwords.words('english'))
  review = re.sub('[^a-zA-Z]', ' ', text).lower()
  tokens = word_tokenize(review)
  refined_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
  return " ".join(refined_tokens)

In [8]:
# Read the tab-separated SMS file: first field is the label, second is the message text.
# NOTE(review): pandas' default quote handling may merge lines that contain a
# double quote in free text like this - confirm the row count against the raw file.
sms_path = '/content/sample_data/SMSSpamCollection.txt'
data = pd.read_csv(
    sms_path,
    sep='\t',
    names=['label', 'message'],
)

In [9]:
# Show the row count, per-column non-null counts and dtypes of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [10]:
# Encode labels, i.e., Spam as 1 and Ham as 0.
# An explicit mapping replaces "anything that is not 'ham' is spam", so an
# unexpected label raises instead of being silently counted as spam.
# Already-encoded values (0/1) pass through unchanged, which keeps the cell
# safe to re-run: previously a second run turned every label into 1.
label_codes = {'ham': 0, 'spam': 1}
encoded_labels = data['label'].map(lambda a: label_codes.get(a, a))
unexpected = ~encoded_labels.isin(list(label_codes.values()))
if unexpected.any():
  raise ValueError(
      f"Unexpected label values: {sorted(encoded_labels[unexpected].astype(str).unique())}"
  )
data['label'] = encoded_labels.astype(int)

In [11]:
# Run every raw message through preprocess(); a single lemmatizer instance is shared by all rows
lemmatizer = WordNetLemmatizer()
data['processed_message'] = data['message'].apply(
    lambda message: preprocess(message, lemmatizer)
)

In [12]:
# Hold out 25% of the messages for testing; the fixed seed makes the split repeatable.
features = data['processed_message']
labels = np.array(data['label'])
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    train_size=0.75,
    random_state=42,
)

## BoW and Naive Bayes

### Vectorization

In [13]:
# Apply the Bag-of-Words (BoW) technique to vectorize the text.
# The vocabulary combines single words and pairs of consecutive words.
vectorizer = CountVectorizer(ngram_range=(1,2))
# Keep the sparse matrices returned by the vectorizer: MultinomialNB accepts
# sparse input, and densifying a unigram+bigram count matrix with .toarray()
# allocates a cell for every (message, term) pair, almost all of them zero.
# The vocabulary is learned from the training messages only and reused for the test set.
X_train_bow_vec = vectorizer.fit_transform(X_train)
X_test_bow_vec = vectorizer.transform(X_test)

### Machine Learning - Prediction

In [14]:
# Train a Multinomial Naive Bayes classifier on the BoW features.
# NOTE: despite its name, `random_forest_model` is not a random forest; the
# name is left unchanged because later cells refer to it.
random_forest_model = MultinomialNB()
random_forest_model.fit(X_train_bow_vec, y_train)

In [15]:
# Predict on the training features so the fit can be scored on the data the model was trained on
y_train_pred = random_forest_model.predict(X_train_bow_vec)

In [16]:
# Score the BoW model on the training set
accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
train_report = classification_report(y_true=y_train, y_pred=y_train_pred)
print(f"Accuracy: {accuracy}")
print(train_report)

Accuracy: 0.9971284996410624
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3618
           1       0.98      1.00      0.99       561

    accuracy                           1.00      4179
   macro avg       0.99      1.00      0.99      4179
weighted avg       1.00      1.00      1.00      4179



In [17]:
# Predict on the held-out messages and score the BoW model on them
y_test_pred = random_forest_model.predict(X_test_bow_vec)
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
test_report = classification_report(y_true=y_test, y_pred=y_test_pred)
print(f"Accuracy: {accuracy}")
print(test_report)

Accuracy: 0.9856424982053122
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1207
           1       0.97      0.92      0.94       186

    accuracy                           0.99      1393
   macro avg       0.98      0.96      0.97      1393
weighted avg       0.99      0.99      0.99      1393



## TF-IDF and Naive Bayes

### Vectorization

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# TF-IDF features over single words and pairs of consecutive words, with the
# vocabulary capped at 2500 terms.
tfidf = TfidfVectorizer(max_features=2500, ngram_range=(1,2))
# Keep the sparse matrices returned by the vectorizer: MultinomialNB accepts
# sparse input, so converting to dense arrays with .toarray() only costs memory.
# The vocabulary and IDF weights are learned from the training messages only.
X_train_tfidf_vec = tfidf.fit_transform(X_train)
X_test_tfidf_vec = tfidf.transform(X_test)

In [20]:
# Fit a separate Multinomial Naive Bayes classifier on the TF-IDF features
model = MultinomialNB()
model.fit(X_train_tfidf_vec, y_train)

In [21]:
# Predict on the training features; this overwrites the BoW model's y_train_pred
y_train_pred = model.predict(X_train_tfidf_vec)

In [22]:
# Score the TF-IDF model on the training set
tfidf_train_accuracy = accuracy_score(y_train, y_train_pred)
tfidf_train_report = classification_report(y_train, y_train_pred)
print(f"Accuracy : {tfidf_train_accuracy}")
print(f"Classification report:\n {tfidf_train_report}")

Accuracy : 0.9822924144532185
Classification report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3618
           1       0.99      0.87      0.93       561

    accuracy                           0.98      4179
   macro avg       0.99      0.94      0.96      4179
weighted avg       0.98      0.98      0.98      4179



In [117]:
# Predict on the held-out messages and score the TF-IDF model on them
y_test_pred = model.predict(X_test_tfidf_vec)
tfidf_test_accuracy = accuracy_score(y_test, y_test_pred)
tfidf_test_report = classification_report(y_test, y_test_pred)
print(f"Accuracy : {tfidf_test_accuracy}")
print(f"Classification report:\n {tfidf_test_report}")

Accuracy : 0.9791816223977028
Classification report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1207
           1       0.98      0.86      0.92       186

    accuracy                           0.98      1393
   macro avg       0.98      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393



## Word2Vec

### Vectorization

In [24]:
!pip install gensim



In [25]:
import gensim.downloader as api

In [26]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
# NOTE(review): presumably a large download on first use - confirm size and caching before re-running.
wv = api.load('word2vec-google-news-300')

In [129]:
# Sentence vectorization using averaging
def get_vector(text, wv):
  """
  Embed a message as the mean of its per-word vectors.

  Args:
    text: Whitespace-separated, already preprocessed message.
    wv: Keyed-vectors object exposing ``vector_size``, ``key_to_index``
      and ``wv[word]`` lookup.

  Returns:
    A 1-D array of length ``wv.vector_size``. Words missing from
    ``wv.key_to_index`` contribute a zero vector (they still count towards
    the mean); a message with no tokens yields an all-zero vector.
  """
  words = text.strip().split()
  if len(words) == 0:
    return np.zeros(wv.vector_size)
  word_vectors = []
  for word in words:
    if word in wv.key_to_index:
      word_vectors.append(wv[word])
    else:
      # Out-of-vocabulary word: stand in a zero vector of the same length
      word_vectors.append(np.zeros(wv.vector_size))
  return np.array(word_vectors).mean(axis=0)

In [130]:
# Embed every message; each row of the resulting matrix is one averaged message vector
train_vectors = [get_vector(message, wv) for message in X_train.values]
test_vectors = [get_vector(message, wv) for message in X_test.values]
X_train_w2v = np.array(train_vectors)
X_test_w2v = np.array(test_vectors)

### Training

In [124]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the averaged Word2Vec features
classifier = GaussianNB()
gnb = classifier.fit(X_train_w2v, y_train)

### Evaluation

In [125]:
# Score the Word2Vec + Gaussian NB model on the training set
y_train_pred = gnb.predict(X_train_w2v)
w2v_train_accuracy = accuracy_score(y_train, y_train_pred)
w2v_train_report = classification_report(y_train, y_train_pred)
print(f"Accuracy for training set : {w2v_train_accuracy}")
print(f"Classification report:\n {w2v_train_report}")

Accuracy for training set : 0.8624072744675759
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.85      0.91      3618
           1       0.49      0.92      0.64       561

    accuracy                           0.86      4179
   macro avg       0.74      0.89      0.78      4179
weighted avg       0.92      0.86      0.88      4179



In [126]:
# Score the Word2Vec + Gaussian NB model on the held-out messages
y_test_pred = gnb.predict(X_test_w2v)
w2v_test_accuracy = accuracy_score(y_test, y_test_pred)
w2v_test_report = classification_report(y_test, y_test_pred)
print(f"Accuracy for Test set : {w2v_test_accuracy}")
print(f"Classification report:\n {w2v_test_report}")

Accuracy for Test set : 0.8837042354630295
Classification report:
               precision    recall  f1-score   support

           0       0.98      0.88      0.93      1207
           1       0.54      0.91      0.68       186

    accuracy                           0.88      1393
   macro avg       0.76      0.89      0.80      1393
weighted avg       0.92      0.88      0.90      1393

