In [70]:
import pandas as pd

df = pd.read_csv("spam.csv", encoding='latin-1')
print(df.head())


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [71]:
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])


In [72]:
df.columns = ['label', 'message']


In [73]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [74]:
# Encoding the label
df["label"] = df["label"].map({"ham": 0, "spam": 1})


In [75]:
df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [76]:
# Text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt')
nltk.download('stopwords')

def preprocess(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return " ".join(tokens)

df["clean_message"] = df["message"].apply(preprocess)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kabir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kabir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
df

Unnamed: 0,label,message,clean_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u pound prize 2 claim...
5568,0,Will Ì_ b going to esplanade fr home?,b going esplanade fr home
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood suggestions
5570,0,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=3000)

X = vectorizer.fit_transform(df["clean_message"])
y = df["label"]


In [79]:
X.toarray()


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [80]:
#Train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [81]:
# train model 
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)


In [82]:
#Testing 
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9766816143497757
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       0.99      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [83]:
#Testing on random text
def predict_sms(text):
    text = preprocess(text)
    vector = vectorizer.transform([text])
    prediction = model.predict(vector)
    return "Spam" if prediction[0] == 1 else "Ham"

print(predict_sms("Congratulations! You won a free ticket"))
print(predict_sms("Hey, are we meeting today?"))


Ham
Ham


SMS SPAM DETECTION USING WORD2VEC


In [84]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [85]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return tokens

df['tokens'] = df['message'].apply(preprocess)


In [86]:
df

Unnamed: 0,label,message,clean_message,tokens
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,"[nah, think, goes, usf, lives, around, though]"
...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u pound prize 2 claim...,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,0,Will Ì_ b going to esplanade fr home?,b going esplanade fr home,"[b, going, esplanade, fr, home]"
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood suggestions,"[pity, mood, suggestions]"
5570,0,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...,"[guy, bitching, acted, like, interested, buyin..."


In [87]:
# train word2vec model
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)


In [88]:
model.wv['free']


array([-3.6445180e-01,  5.6640887e-01,  3.4922412e-01,  1.3465442e-01,
        1.0277254e-01, -8.0854607e-01,  1.4044768e-01,  1.2829796e+00,
       -6.0146946e-01, -4.1965133e-01, -2.9880658e-01, -9.7192329e-01,
       -1.2894125e-01,  2.2499248e-01,  1.8895781e-01, -4.2836520e-01,
        1.1988947e-01, -7.0803398e-01, -1.1294402e-01, -1.0884480e+00,
        2.6112002e-01,  1.7589438e-01,  3.1206504e-01, -3.7400770e-01,
       -3.8979182e-01, -4.6162385e-02, -4.5246485e-01, -3.5225418e-01,
       -3.4832406e-01,  1.3158572e-01,  5.6260824e-01,  2.9787338e-01,
        1.6719255e-01, -2.7599043e-01, -2.7011520e-01,  6.1129814e-01,
       -4.9298465e-02, -6.0038811e-01, -4.4279340e-01, -1.1369694e+00,
        1.9509691e-01, -5.4753113e-01, -2.4484351e-01, -1.6963089e-01,
        4.9671233e-01, -3.0283621e-01, -5.5508137e-01, -1.1290051e-01,
        3.3247942e-01,  3.9870268e-01,  1.6542156e-01, -3.0411682e-01,
       -6.0768653e-02, -4.0650991e-04, -3.6591882e-01,  1.9071808e-01,
      

In [89]:
# converting setence to  vector now , since qord2vec converts word to vector
def sentence_vector(tokens):
    vectors = []
    for word in tokens:
        if word in model.wv:
            vectors.append(model.wv[word])
    if len(vectors) == 0:
        return np.zeros(100)
    return np.mean(vectors, axis=0)

X = np.array([sentence_vector(tokens) for tokens in df['tokens']])


In [90]:
df

Unnamed: 0,label,message,clean_message,tokens
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,"[go, jurong, point, crazy, available, bugis, n..."
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,"[nah, think, goes, usf, lives, around, though]"
...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u pound prize 2 claim...,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,0,Will Ì_ b going to esplanade fr home?,b going esplanade fr home,"[b, going, esplanade, fr, home]"
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood suggestions,"[pity, mood, suggestions]"
5570,0,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...,"[guy, bitching, acted, like, interested, buyin..."


In [91]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [92]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)


In [93]:
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.863677130044843
