## **Spam classification using Naive Bayes and Decision Tree**

In [71]:
import pandas as pd
from sklearn.metrics import precision_score,recall_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am",
              "an", "and", "any", "are", "as", "at", "be", "because", "been", "before",
              "being", "below", "between", "both", "but", "by", "could", "did", "do",
              "does", "doing", "down", "during", "each", "few", "for", "from", "further",
              "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
              "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i",
              "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its",
              "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on",
              "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
              "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so",
              "some", "such", "than", "that", "that's", "the", "their", "theirs", "them",
              "themselves", "then", "there", "there's", "these", "they", "they'd",
              "they'll", "they're", "they've", "this", "those", "through", "to", "too",
              "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're",
              "we've", "were", "what", "what's", "when", "when's", "where", "where's",
              "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
              "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
              "yourselves" ]

In [72]:
data = pd.read_csv('spam2.csv')
print("\nData :\n",data)
print("\nData statistics\n",data.info())


Data :
         v1                                                 v2
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham              Will �_ b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2 

In [73]:
replace_with_dig = {
    'ham':1,
    'spam':0
}
data.replace(to_replace=replace_with_dig,inplace=True,value=None)

# **Using CountVectorizer**

In [74]:
# Creating an object CountVectorizer which will not include stopwords while counting
vectorizer = CountVectorizer(stop_words={'english'})

In [75]:
on_sentences_1 = vectorizer.fit(data['v2'])
on_sentences_1.vocabulary_

{'go': 3538,
 'until': 8005,
 'jurong': 4329,
 'point': 5899,
 'crazy': 2321,
 'available': 1304,
 'only': 5516,
 'in': 4073,
 'bugis': 1751,
 'great': 3622,
 'world': 8462,
 'la': 4455,
 'buffet': 1749,
 'cine': 2044,
 'there': 7621,
 'got': 3582,
 'amore': 1070,
 'wat': 8241,
 'ok': 5483,
 'lar': 4491,
 'joking': 4297,
 'wif': 8365,
 'oni': 5512,
 'free': 3346,
 'entry': 2939,
 'wkly': 8420,
 'comp': 2161,
 'to': 7732,
 'win': 8378,
 'fa': 3076,
 'cup': 2380,
 'final': 3196,
 'tkts': 7719,
 '21st': 411,
 'may': 4910,
 '2005': 402,
 'text': 7573,
 '87121': 784,
 'receive': 6275,
 'question': 6168,
 'std': 7207,
 'txt': 7908,
 'rate': 6220,
 'apply': 1157,
 '08452810075over18': 77,
 'dun': 2793,
 'say': 6610,
 'so': 7002,
 'early': 2814,
 'hor': 3913,
 'already': 1043,
 'then': 7616,
 'nah': 5219,
 'don': 2704,
 'think': 7636,
 'he': 3768,
 'goes': 3546,
 'usf': 8050,
 'lives': 4645,
 'around': 1208,
 'here': 3818,
 'though': 7656,
 'freemsg': 3353,
 'hey': 3828,
 'darling': 2437,
 'it

In [76]:
on_sentences_1 = vectorizer.transform(data['v2'])
print(on_sentences_1.shape)
on_sentences_1.toarray()[1]

(5572, 8624)


array([0, 0, 0, ..., 0, 0, 0])

In [77]:
on_sentences_1= vectorizer.fit_transform(data['v2'])
print(on_sentences_1.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# **Using Naive Baye's**

In [78]:
from sklearn.naive_bayes import MultinomialNB
multi_naive_bayes = MultinomialNB()

In [79]:
#Splitting into training and testing 
X_train,X_test,y_train,y_test = train_test_split(on_sentences_1,data['v1'],random_state=179)

multi_naive_bayes.fit(X_train,y_train)

predicted_using_naive_bayes = multi_naive_bayes.predict(X_test)

In [80]:
print("Precision :", precision_score(y_test,predicted_using_naive_bayes))
print("Recall :", recall_score(y_test,predicted_using_naive_bayes))

Precision : 0.990924092409241
Recall : 0.9844262295081967


# **Using Decision Tree Classifier**

In [81]:
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier()

DTC.fit(X_train,y_train)

predict_using_DTC = DTC.predict(X_test)
print("Precision Score:", precision_score(y_test,predict_using_DTC))
print("Recall Score:", recall_score(y_test,predict_using_DTC))

Precision Score: 0.9812550937245313
Recall Score: 0.9868852459016394


# **Till now we have done for unigram now let's do for Bigram**

In [82]:
bigram_vectorizer = CountVectorizer(stop_words={'english'},ngram_range=(2,2))

In [83]:
on_sentence_bigram = bigram_vectorizer.fit_transform(data['v2'])
print(on_sentence_bigram.toarray())


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [84]:
X_train,X_test,y_train,y_test = train_test_split(on_sentence_bigram,data['v1'],random_state=179)


# **Using Naive Bayes**

In [85]:
multi_naive_bayes = MultinomialNB()
multi_naive_bayes.fit(X_train,y_train)
predicted_using_nb = multi_naive_bayes.predict(X_test)


In [86]:
print("Precision :", precision_score(y_test,predicted_using_nb))
print("Recall :", recall_score(y_test,predicted_using_nb))

Precision : 0.9961389961389961
Recall : 0.8459016393442623


# **Using Decision Tree Classifier**

In [87]:
DTC = DecisionTreeClassifier()
DTC.fit(X_train,y_train)
predicted_using_DTC = DTC.predict(X_test)


In [88]:
print("Precision Score:", precision_score(y_test,predicted_using_DTC))
print("Recall Score:", recall_score(y_test,predicted_using_DTC))

Precision Score: 0.9626391096979332
Recall Score: 0.9926229508196721


## **Using TFIDF Vectorizer**

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words={'english'})

In [90]:
sentences_tfidf = tfidf_vectorizer.fit_transform(data['v2'])

In [91]:
X_train,X_test,y_train,y_test = train_test_split(sentences_tfidf,data['v1'],random_state=179)

# **Using Naive Bayes**

In [92]:
multi_naive_bayes = MultinomialNB()
multi_naive_bayes.fit(X_train,y_train)
predicted_using_nb_tfidf = multi_naive_bayes.predict(X_test)

In [93]:
print("Precision :", precision_score(y_test,predicted_using_nb_tfidf))
print("Recall :", recall_score(y_test,predicted_using_nb_tfidf))

Precision : 0.9561128526645768
Recall : 1.0


# **Using Decision Tree Classifier**

In [94]:
DTC = DecisionTreeClassifier()
DTC.fit(X_train,y_train)
predicted_using_DTC_tfidf = DTC.predict(X_test)

In [95]:
print("Precision Score:", precision_score(y_test,predicted_using_DTC_tfidf))
print("Recall Score:", recall_score(y_test,predicted_using_DTC_tfidf))

Precision Score: 0.9780130293159609
Recall Score: 0.9844262295081967


# **For Bigram**

In [96]:
tfidf_vectorizer = TfidfVectorizer(stop_words={'english'},ngram_range=(2,2))

In [97]:
sentences_tfidf = tfidf_vectorizer.fit_transform(data['v2'])

In [98]:
X_train,X_test,y_train,y_test = train_test_split(sentences_tfidf,data['v1'],random_state=179)

**Using Naive Bayes**

In [99]:
multi_naive_bayes = MultinomialNB()
multi_naive_bayes.fit(X_train,y_train)
predicted_using_nb_tfidf_bigram = multi_naive_bayes.predict(X_test)

In [100]:
print("Precision :", precision_score(y_test,predicted_using_nb_tfidf_bigram))
print("Recall :", recall_score(y_test,predicted_using_nb_tfidf_bigram))

Precision : 0.9377401998462721
Recall : 1.0


**Using Decision Tree Classifier**

In [101]:
DTC = DecisionTreeClassifier()
DTC.fit(X_train,y_train)
predicted_using_DTC_tfidf_bigram = DTC.predict(X_test)

In [102]:
print("Precision Score:", precision_score(y_test,predicted_using_DTC_tfidf_bigram))
print("Recall Score:", recall_score(y_test,predicted_using_DTC_tfidf_bigram))

Precision Score: 0.9657097288676236
Recall Score: 0.9926229508196721
