In [46]:
import pandas as pd

In [47]:
!pip install chardet



In [48]:
import chardet

In [49]:
# Read the raw bytes once and close the file, then let chardet guess the
# encoding so pandas can be told explicitly how to decode the CSV.
with open('spam.csv', 'rb') as file:
    rawdata = file.read()

result = chardet.detect(rawdata)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.7269493857068697, 'language': ''}


In [50]:
# Decode with the encoding chardet reported for this file, and keep only the
# first two columns (the ham/spam label and the message text).
spam_data= pd.read_csv('spam.csv',encoding='Windows-1252',usecols=[0,1])

In [52]:
spam_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [53]:
# Give the raw `v1`/`v2` columns descriptive names. The result is reassigned
# instead of using `inplace=True`, which offers no performance benefit and
# prevents method chaining.
spam_data = spam_data.rename(columns={'v1': 'category', 'v2': 'message'})

In [54]:
import spacy

In [55]:
# Load spaCy's small English pipeline once; `lemmatize` reads this global `nlp`.
nlp= spacy.load('en_core_web_sm')

In [56]:
import nltk

In [57]:
from nltk.tokenize import word_tokenize


In [58]:
from nltk import PorterStemmer


In [59]:
def lemmatize(text):
    """Return `text` with every token replaced by its lemma.

    The text is processed by the module-level spaCy pipeline `nlp`; the
    lemma of each resulting token is collected in order and the lemmas are
    joined with single spaces.

    Args:
        text: The string to lemmatize.

    Returns:
        A single space-separated string of token lemmas.
    """
    return ' '.join(token.lemma_ for token in nlp(text))





In [60]:
# Lemmatize each message individually; `lemmatize` invokes the spaCy pipeline
# once per row, so this cell is likely the slowest one in the notebook.
spam_data['lemmatize_message']=spam_data['message'].map(lemmatize)

In [61]:
spam_data['lemmatize_message'][0]

'go until jurong point , crazy .. available only in bugis n great world la e buffet ... Cine there get amore wat ...'

In [62]:
spam_data.head()

Unnamed: 0,category,message,lemmatize_message
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point , crazy .. available onl..."
1,ham,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor ... u c already then sa...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah I do not think he go to usf , he live arou..."


In [63]:
# Binary target: 1 for 'ham', 0 for every other label (i.e. spam). Note that
# class 0 in the reports further down is therefore the spam class.
spam_data['category_encode'] = spam_data['category'].eq('ham').astype('int64')

In [64]:
spam_data.head()

Unnamed: 0,category,message,lemmatize_message,category_encode
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point , crazy .. available onl...",1
1,ham,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor ... u c already then sa...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah I do not think he go to usf , he live arou...",1


In [65]:
# Features: the lemmatized message text. Target: the 0/1 label (1 = ham, 0 = spam).
X= spam_data['lemmatize_message']
y= spam_data['category_encode']

In [66]:
from sklearn.model_selection import train_test_split

In [67]:
# 80/20 train/test split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified although the classes are imbalanced;
# consider passing `stratify=y` so both splits keep the same ham/spam ratio.
train_X, test_X, train_Y, test_Y = train_test_split(X,y, train_size=0.8, random_state=3008)


In [68]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
from imblearn.over_sampling import SMOTE

In [70]:
cv= CountVectorizer()

In [71]:
# Learn the vocabulary from the training split only and encode it as a
# sparse document-term count matrix.
train_X_cv=cv.fit_transform(train_X)

In [72]:
train_X_cv.shape

(4457, 6872)

In [73]:
# NOTE(review): this densifies the whole sparse matrix just to display it;
# converting a small slice (e.g. the first few rows) would be much cheaper.
train_X_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [90]:
train_Y.value_counts()

Unnamed: 0_level_0,count
category_encode,Unnamed: 1_level_1
1,3870
0,587


Let's balance the training set now by oversampling the minority (spam) class with SMOTE; the test set is left untouched.

In [74]:
# Oversample the minority class in the training matrix only. SMOTE generates
# synthetic samples at random, so a fixed random_state (the same seed value
# used for the train/test split) is needed to make the balanced set, and the
# metrics of the model trained on it, reproducible across runs.
sm = SMOTE(random_state=3008)
X_bal, y_bal = sm.fit_resample(train_X_cv, train_Y)

In [75]:
y_bal.value_counts()

Unnamed: 0_level_0,count
category_encode,Unnamed: 1_level_1
0,3870
1,3870


Now the training data is balanced: both classes have 3,870 samples.

In [76]:
from sklearn.naive_bayes import MultinomialNB

In [77]:
nb_model= MultinomialNB()

In [78]:
nb_model.fit(X_bal,y_bal)

In [79]:
# Encode the test split with the vocabulary already learned from the training
# data (transform only, no refit).
test_X_cv=cv.transform(test_X)

In [80]:
nb_preds=nb_model.predict(test_X_cv)

In [81]:
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

In [82]:
print(classification_report(test_Y,nb_preds))

              precision    recall  f1-score   support

           0       0.94      0.89      0.91       160
           1       0.98      0.99      0.99       955

    accuracy                           0.98      1115
   macro avg       0.96      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [83]:
accuracy_score(test_Y,nb_preds)

0.9757847533632287

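Creating a pipeline: the same vectorizer and Naive Bayes steps wrapped in a single scikit-learn `Pipeline`, so raw text can be passed directly to `fit` and `predict`.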

In [84]:
from sklearn.pipeline import Pipeline

In [85]:
# Chain a fresh CountVectorizer and MultinomialNB into one estimator.
# No resampling step is included in this pipeline.
spam_detection= Pipeline([('vectorizer',CountVectorizer()),('nb',MultinomialNB())])

In [86]:
spam_detection.fit(train_X,train_Y)

In [87]:
preds=spam_detection.predict(test_X)

In [89]:
print(classification_report(test_Y,preds))

              precision    recall  f1-score   support

           0       0.98      0.88      0.92       160
           1       0.98      1.00      0.99       955

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

