<a href="https://colab.research.google.com/github/jagadish-sonamale/nlp-projects/blob/main/spam_ham_bow_tf-idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spam or Ham Prediction using Bag of Words, TF-IDF and NLTK

## Tokenization

In [None]:
import csv
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Download the NLTK resources this notebook relies on:
#   'punkt_tab' - tokenizer data used by word_tokenize
#   'stopwords' - the stop-word lists read via stopwords.words(...)
#   'wordnet'   - the corpus behind WordNetLemmatizer
# The last call is the cell's final expression, so its return value is what
# the notebook displays for this cell.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
_STOP_WORDS_CACHE = {}


def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
  if 'english' not in _STOP_WORDS_CACHE:
    _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOP_WORDS_CACHE['english']


def preprocess(text, lemmatizer, stop_words=None):
  """
  Preprocesses a single text message.

  Every character that is not an ASCII letter is replaced by a space, the
  result is lower-cased and tokenized, stop words are dropped and each
  remaining token is lemmatized.

  Args:
    text: The raw message string.
    lemmatizer: Object exposing ``lemmatize(word)``; the notebook passes a
      ``WordNetLemmatizer``.
    stop_words: Optional collection of lower-case words to discard. When
      omitted, NLTK's English stop-word list is used.

  Returns:
    The lemmatized, non-stop-word tokens joined by single spaces.
  """
  if stop_words is None:
    # Looking the list up once (and as a set) avoids re-reading the corpus and
    # scanning a list for every single token of every message.
    stop_words = _english_stop_words()
  review = re.sub('[^a-zA-Z]', ' ', text).lower()
  tokens = word_tokenize(review)
  refined_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
  return " ".join(refined_tokens)

In [None]:
# Load the text file whose rows are "<label>\t<message>", the label being
# either spam or ham.
# The file is assumed to be raw tab-separated text with no quoting convention,
# so quote handling is switched off: with the default quoting, a message that
# contains an unbalanced double quote can make the parser swallow the
# following rows into a single field.
data = pd.read_csv(
    '/content/sample_data/SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Sanity-check the load: prints the row count, the column names, the non-null
# count per column and the dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Encode labels, i.e., Spam as 1 and Ham as 0.
# An explicit mapping is used instead of "anything that is not 'ham' is spam":
#   * the identity entries for 0 and 1 make the cell safe to re-run (the
#     column is overwritten in place, so a second run sees integers);
#   * any other value (typo, missing label) is reported instead of being
#     silently counted as spam.
LABEL_CODES = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
encoded_labels = data['label'].map(LABEL_CODES)
unmapped = encoded_labels.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected label values: {data.loc[unmapped, 'label'].unique().tolist()}"
    )
data['label'] = encoded_labels.astype(int)

In [None]:
# Clean every message with preprocess(), sharing one lemmatizer instance
# across all rows, and store the result in a new column.
shared_lemmatizer = WordNetLemmatizer()
data['processed_message'] = [
    preprocess(message, shared_lemmatizer) for message in data['message']
]

## BoW and Naive Bayes

### Vectorization

In [None]:
# Apply the Bag-of-Words (BoW) technique to vectorize the text.
# The vocabulary combines single words and pairs of consecutive words.
vectorizer = CountVectorizer(ngram_range=(1, 2))
# fit_transform returns a sparse document-term matrix. It is kept sparse on
# purpose: with uni- and bi-grams the vocabulary is very wide, so a dense copy
# would be almost entirely zeros and cost a lot of memory, while
# train_test_split and MultinomialNB both accept sparse input.
# NOTE(review): the vocabulary is learned from all rows, i.e. before the
# train/test split performed further down.
X = vectorizer.fit_transform(data['processed_message'])
y = data['label'].to_numpy()

### Machine Learning - Prediction

In [None]:
# Hold out part of the data for evaluation: 75% of the rows go to training,
# the remainder to testing. The fixed random_state makes the shuffle
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

In [None]:
# Train a Multinomial Naive Bayes classifier on the Bag-of-Words features.
# NOTE: despite its name, `random_forest_model` holds a MultinomialNB
# estimator, not a random forest; the name is kept because the prediction
# cell that follows refers to it.
random_forest_model = MultinomialNB()
random_forest_model.fit(X_train, y_train)

In [None]:
# Predict labels (0 = ham, 1 = spam) for the held-out messages using the
# Naive Bayes model stored in `random_forest_model`.
y_pred = random_forest_model.predict(X_test)

In [None]:
# Score the BoW model on the held-out split: overall accuracy first, then the
# per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.9655419956927495
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1207
           1       0.82      0.96      0.88       186

    accuracy                           0.97      1393
   macro avg       0.90      0.96      0.93      1393
weighted avg       0.97      0.97      0.97      1393



## TF-IDF and Naive Bayes

### Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vectorize the cleaned messages with TF-IDF weights over single words and
# pairs of consecutive words, capping the vocabulary at 2500 terms.
tfidf = TfidfVectorizer(max_features=2500, ngram_range=(1, 2))
# fit_transform returns a sparse matrix; it is left sparse because
# train_test_split and MultinomialNB accept it directly and a dense copy would
# mostly store zeros.
# NOTE(review): the vocabulary and IDF weights are learned from all rows,
# i.e. before the train/test split performed in the next cell.
X = tfidf.fit_transform(data['processed_message'])
y = data['label'].to_numpy()

In [None]:
# Split the TF-IDF features 75% / 25% into train and test sets; the fixed
# random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

In [None]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF training features.
model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
# Predict labels (0 = ham, 1 = spam) for the held-out TF-IDF features.
y_pred = model.predict(X_test)

In [None]:
# Score the TF-IDF model on the held-out split: overall accuracy, then the
# per-class precision / recall / F1 table.
tfidf_accuracy = accuracy_score(y_test, y_pred)
tfidf_report = classification_report(y_test, y_pred)
print(f"Accuracy : {tfidf_accuracy}")
print(f"Classification report:\n {tfidf_report}")

Accuracy : 0.9798994974874372
Classification report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1207
           1       0.99      0.86      0.92       186

    accuracy                           0.98      1393
   macro avg       0.98      0.93      0.95      1393
weighted avg       0.98      0.98      0.98      1393

