<a href="https://colab.research.google.com/github/jagadish-sonamale/nlp-projects/blob/main/spam_ham_bow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spam or Ham Prediction Using Bag of Words and NLTK

In [17]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK resources the rest of this notebook relies on:
#   punkt_tab - tokenizer data needed by word_tokenize
#   stopwords - word lists read via stopwords.words('english')
#   wordnet   - lexical database used by WordNetLemmatizer
# nltk.download() skips packages that are already up to date.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
_ENGLISH_STOPWORDS = None  # lazily filled by _english_stopwords()


def _english_stopwords():
  """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def preprocess(text, lemmatizer):
  """
  Preprocesses a single text message.

  Every character that is not an ASCII letter is replaced by a space, the
  result is lowercased and tokenized, English stop words are dropped and the
  remaining tokens are lemmatized.

  Args:
    text: Raw message string.
    lemmatizer: Object exposing ``lemmatize(word)`` (e.g. WordNetLemmatizer).

  Returns:
    The lemmatized, stop-word-free tokens joined by single spaces.
  """
  review = re.sub('[^a-zA-Z]', ' ', text).lower()
  # Look the stop words up in a cached set: calling stopwords.words() inside
  # the comprehension would rebuild the list for every single token.
  stop_words = _english_stopwords()
  refined_tokens = [
    lemmatizer.lemmatize(word)
    for word in word_tokenize(review)
    if word not in stop_words
  ]
  return " ".join(refined_tokens)

In [19]:
# Read the tab-separated file of labelled SMS messages. Column names are passed
# explicitly, so the first line of the file is treated as data, not a header.
# NOTE(review): with default quoting, a '"' inside a message may be parsed as a
# field delimiter; confirm the row count matches the file, or consider
# quoting=csv.QUOTE_NONE.
data = pd.read_csv(
  '/content/sample_data/SMSSpamCollection.txt',
  sep='\t',
  names=['label', 'message'],
)

In [20]:
# Sanity-check the load: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [21]:
# Encode labels as integers: ham -> 0, spam -> 1.
# The guard makes this cell safe to re-run: once the column holds only 0/1 it is
# left alone (a plain "0 if a == 'ham' else 1" would turn every 0 into 1 on a
# second run).
label_to_id = {'ham': 0, 'spam': 1}
if not data['label'].isin([0, 1]).all():
  encoded_label = data['label'].map(label_to_id)
  if encoded_label.isna().any():
    # Fail loudly instead of silently counting unknown/missing labels as spam.
    unexpected = data.loc[encoded_label.isna(), 'label'].unique().tolist()
    raise ValueError(f"Unexpected label values: {unexpected}")
  data['label'] = encoded_label.astype('int64')

In [22]:
# Clean every raw message with preprocess(), sharing a single lemmatizer
# instance across all rows, and keep the result in a new column.
lemmatizer = WordNetLemmatizer()
data['processed_message'] = data['message'].apply(
  lambda message: preprocess(message, lemmatizer)
)

In [23]:
# Bag-of-Words features: count occurrences of single words and of pairs of
# consecutive words (ngram_range=(1, 2)) in each processed message.
vectorizer = CountVectorizer(ngram_range=(1, 2))
bow_counts = vectorizer.fit_transform(data['processed_message'])

# Densify the count matrix into a NumPy array.
# NOTE(review): a unigram+bigram vocabulary can make this dense array large;
# RandomForestClassifier also accepts sparse input, so dropping .toarray()
# may be worth considering if memory becomes a problem.
X = bow_counts.toarray()
y = data['label'].to_numpy(copy=True)

In [24]:
# Keep 75% of the rows for training and hold out the rest for testing; the
# fixed random_state makes the shuffle reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (far fewer spam than ham) - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  train_size=0.75,
  random_state=42,
)

In [25]:
# Train a 100-tree random forest on the training split; random_state pins the
# randomness so the fitted model is reproducible.
random_forest_model = RandomForestClassifier(
  n_estimators=100,
  random_state=42,
)
random_forest_model.fit(X_train, y_train)

In [26]:
# Predict a 0 (ham) / 1 (spam) label for every held-out test message.
y_pred = random_forest_model.predict(X_test)

In [27]:
# Score the held-out predictions: overall accuracy first, then per-class
# precision, recall, F1 and support.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.968413496051687
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1207
           1       1.00      0.76      0.87       186

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393

