<a href="https://colab.research.google.com/github/hafsatariq18/NLP_SpamHam_Classifier/blob/main/Spam_Ham.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas nltk scikit-learn

In [None]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# Fetch the NLTK corpora used later by the stop-word filter and the lemmatizer.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Seed NumPy's global random number generator.
np.random.seed(42)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the SMS dataset from the working directory, decoding the file as Latin-1.
df = pd.read_csv('spam.csv', encoding='latin1')
# Preview the first rows: `v1` holds the label, `v2` the message text, and the
# remaining "Unnamed" columns are empty in the rows shown.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Discard the empty trailing columns, keeping only the label (`v1`) and text (`v2`).
# Reassigning instead of using `inplace=True`, together with `errors='ignore'`,
# keeps the cell idempotent: running it a second time no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
print(df.head())


     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Show (rows, columns) of the working frame.
df.shape

(5572, 2)

In [None]:
# Count how many messages carry each label in `v1` to inspect the class balance.
label_counts = df['v1'].value_counts()
label_counts



v1
ham     4825
spam     747
Name: count, dtype: int64

In [None]:
# Lower-case every message so later token comparisons are case-insensitive.
df['v2'] = df['v2'].str.lower()
print(df.head())


     v1                                                 v2
0   ham  go until jurong point, crazy.. available only ...
1   ham                      ok lar... joking wif u oni...
2  spam  free entry in 2 a wkly comp to win fa cup fina...
3   ham  u dun say so early hor... u c already then say...
4   ham  nah i don't think he goes to usf, he lives aro...


In [None]:
# Translation table that deletes every ASCII punctuation character. Nothing is
# inserted in its place, so e.g. "don't" collapses to "dont".
punctuation_table = str.maketrans('', '', string.punctuation)
df['v2'] = df['v2'].str.translate(punctuation_table)
print(df.head())

     v1                                                 v2
0   ham  go until jurong point crazy available only in ...
1   ham                            ok lar joking wif u oni
2  spam  free entry in 2 a wkly comp to win fa cup fina...
3   ham        u dun say so early hor u c already then say
4   ham  nah i dont think he goes to usf he lives aroun...


In [None]:
# English stop words as a set for O(1) membership checks.
# NOTE(review): punctuation has already been stripped from `v2`, and tokens such
# as "dont" survive this filter — presumably because the NLTK list stores
# contractions with their apostrophes; confirm whether that is intended.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ``stop_words`` removed.

    The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept_tokens)


df['v2'] = df['v2'].apply(remove_stopwords)
print(df.head())


     v1                                                 v2
0   ham  go jurong point crazy available bugis n great ...
1   ham                            ok lar joking wif u oni
2  spam  free entry 2 wkly comp win fa cup final tkts 2...
3   ham                u dun say early hor u c already say
4   ham        nah dont think goes usf lives around though


In [None]:
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join with spaces.

    Every token is passed to ``lemmatizer.lemmatize`` with its default arguments.
    """
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


df['v2'] = df['v2'].apply(lemmatize_text)
print(df.head())


     v1                                                 v2
0   ham  go jurong point crazy available bugis n great ...
1   ham                            ok lar joking wif u oni
2  spam  free entry 2 wkly comp win fa cup final tkts 2...
3   ham                u dun say early hor u c already say
4   ham           nah dont think go usf life around though


In [None]:
# Bag-of-words features: one column per vocabulary term, one row per message.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['v2'])

# Wrap the sparse matrix in a sparse-backed DataFrame for inspection instead of
# calling `X.toarray()`: the document-term matrix is almost entirely zeros, and
# materialising it densely costs memory for every (message, term) pair just to
# print five rows.
X_df = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())

print(X_df.head())


   008704050406  0089my  0121  01223585236  01223585334  0125698789  02  \
0             0       0     0            0            0           0   0   
1             0       0     0            0            0           0   0   
2             0       0     0            0            0           0   0   
3             0       0     0            0            0           0   0   
4             0       0     0            0            0           0   0   

   020603  0207  02070836089  ...  ìï  ìïll  ûthanks  ûªm  ûªt  ûªve  ûï  \
0       0     0            0  ...   0     0        0    0    0     0   0   
1       0     0            0  ...   0     0        0    0    0     0   0   
2       0     0            0  ...   0     0        0    0    0     0   0   
3       0     0            0  ...   0     0        0    0    0     0   0   
4       0     0            0  ...   0     0        0    0    0     0   0   

   ûïharry  ûò  ûówell  
0        0   0       0  
1        0   0       0  
2        0   0   

In [None]:
# Encode the string labels in `v1` as integers for the classifiers
# (in the printed output the ham rows map to 0 and the spam row to 1).
le = LabelEncoder()
y = le.fit_transform(df['v1'])
print(y)

[0 0 1 ... 0 0 0]


In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): X comes from a CountVectorizer fitted on every row before this
# split, so its vocabulary also contains words that occur only in test messages.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Report overall accuracy plus per-class precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.97847533632287
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Split the cleaned text *before* vectorising. Fitting TF-IDF on the full corpus
# would let document frequencies from the held-out messages influence the
# feature weights (test-set leakage). The same test_size / random_state as the
# bag-of-words experiment yields the same row partition, so results stay comparable.
text_train_tfidf, text_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    df['v2'], y, test_size=0.2, random_state=42
)

# Learn vocabulary and IDF weights from the training messages only, then apply
# that fitted transform unchanged to the test messages.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(text_train_tfidf)
X_test_tfidf = tfidf_vectorizer.transform(text_test_tfidf)

# Fit a logistic-regression classifier with scikit-learn's default settings.
model_tfidf = LogisticRegression()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)

y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# Report overall accuracy plus per-class precision / recall / F1.
accuracy_tfidf = accuracy_score(y_test_tfidf, y_pred_tfidf)
print("TF-IDF Accuracy:", accuracy_tfidf)
print("TF-IDF Classification Report:")
print(classification_report(y_test_tfidf, y_pred_tfidf))


TF-IDF Accuracy: 0.9443946188340807
TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       965
           1       0.96      0.61      0.75       150

    accuracy                           0.94      1115
   macro avg       0.95      0.80      0.86      1115
weighted avg       0.95      0.94      0.94      1115



In [None]:
# Split the cleaned text first so the embeddings are learned from training
# messages only; training Word2Vec on the whole corpus would expose the model to
# the held-out messages. `y` is the label array produced by the LabelEncoder
# cell above. The same test_size / random_state as the other experiments yields
# the same row partition.
text_train_w2v, text_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    df['v2'], y, test_size=0.2, random_state=42
)

# Word2Vec expects an iterable of token lists.
sentences = [text.split() for text in text_train_w2v]

# `seed` plus a single worker thread makes training repeatable; multi-threaded
# training is subject to OS scheduling jitter. (Reproducibility across interpreter
# launches additionally requires a fixed PYTHONHASHSEED.)
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=1, seed=42)


def document_vector(text):
    """Return the mean Word2Vec embedding of the tokens in ``text``.

    Tokens missing from the trained vocabulary are skipped. If no token is
    known (or ``text`` is empty) a zero vector of the model's dimensionality is
    returned, so every document maps to a vector of the same length.
    """
    words = text.split()
    word_vecs = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if not word_vecs:
        # Size taken from the model rather than a hard-coded 100 so the two
        # cannot drift apart when `vector_size` is changed.
        return np.zeros(word2vec_model.wv.vector_size)
    return np.mean(word_vecs, axis=0)


X_train_w2v = np.array([document_vector(text) for text in text_train_w2v])
X_test_w2v = np.array([document_vector(text) for text in text_test_w2v])

# Fit a logistic-regression classifier with scikit-learn's default settings.
model_w2v = LogisticRegression()
model_w2v.fit(X_train_w2v, y_train_w2v)

y_pred_w2v = model_w2v.predict(X_test_w2v)

# Report overall accuracy plus per-class precision / recall / F1.
accuracy_w2v = accuracy_score(y_test_w2v, y_pred_w2v)
print("Word2Vec Accuracy:", accuracy_w2v)
print("Word2Vec Classification Report:")
print(classification_report(y_test_w2v, y_pred_w2v))
