# SMS Spam Detection

In [150]:
import nltk
import pandas as pd

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from textblob import Word

from sklearn import model_selection, preprocessing, linear_model,  metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

## 2-Read Data:

In [151]:
Email_Data = pd.read_csv("spam.csv",encoding ='latin1')

Email_Data.head()


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## 3- Text Preprocessing:

In [152]:
stop_words = stopwords.words('english')

In [153]:
def Convert_ToLower(data):
    words = []
    for word in data.split():
        words.append(word.lower())
    return " ".join(words)

In [154]:
def Not_Stopwords (input):
    words = ""
    for word in input.split() : 
        if word not in stop_words: 
            words =words + " " +word
    return words

In [155]:
def Lemmatize_Text(data ): 
    words = []
    for word in data.split():
        words.append(Word(word).lemmatize())
    return " ".join(words)   

In [156]:
Email_Data['Message'] = Email_Data['Message'].apply(Convert_ToLower)
Email_Data['Message'] = Email_Data['Message'].apply(Not_Stopwords)
Email_Data['Message'] =Email_Data['Message'].apply(Lemmatize_Text)

Email_Data.head()

Unnamed: 0,Category,Message
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think go usf, life around though"


## 4-Data Preparation:

In [169]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(Email_Data['Message'], Email_Data['Category'])

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

tfidf_vect = TfidfVectorizer()
xtrain_tfidf=tfidf_vect.fit_transform(train_x)
xtest_tfidf =tfidf_vect.transform(test_x)
xtrain_tfidf.data


array([0.19841476, 0.18195367, 0.24657999, ..., 0.18189197, 0.80051084,
       0.59931828])

## 5-Model Training:

In [170]:
def train_model(classifier, feature_vector_X, y_train, feature_vector_Test):
    
    classifier.fit(feature_vector_X, y_train)
    
    predictions = classifier.predict(feature_vector_Test)
    train_pred = classifier.predict(feature_vector_X)
    
    return metrics.accuracy_score(predictions, test_y) , metrics.accuracy_score(train_pred, y_train)



In [173]:
accuracyTest , accuracyTrain = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)
print ("Accuracy of Test : ", accuracyTest)
print ("Accuracy of Train : ", accuracyTrain)

Accuracy of Test :  0.9676956209619526
Accuracy of Train :  0.9691313711414213


In [174]:
accuracyTest , accuracyTrain = train_model(RandomForestClassifier(n_estimators=40, random_state=42), xtrain_tfidf, train_y, xtest_tfidf)
print ("Accuracy of Test : ", accuracyTest)
print ("Accuracy of Train : ", accuracyTrain)

Accuracy of Test :  0.9712849964106246
Accuracy of Train :  0.9997607083034219


In [177]:
base_estimator = DecisionTreeClassifier(max_depth=1)
accuracyTest , accuracyTrain =train_model(AdaBoostClassifier(estimator=base_estimator, n_estimators=50, learning_rate=1.0, random_state=42) ,xtrain_tfidf, train_y, xtest_tfidf )
print ("Accuracy of Test : ", accuracyTest)
print ("Accuracy of Train : ", accuracyTrain)


Accuracy of Test :  0.968413496051687
Accuracy of Train :  0.9794209140942809


In [175]:
accuracyTest , accuracyTrain = train_model(KNeighborsClassifier(n_neighbors=3), xtrain_tfidf, train_y, xtest_tfidf)
print ("Accuracy of Test : ", accuracyTest)
print ("Accuracy of Train : ", accuracyTrain)



Accuracy of Test :  0.923905240488155
Accuracy of Train :  0.9452022014836086
