In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#importing wordcloud for text visualization
from wordcloud import WordCloud

# importing nltk for text processing
import nltk
from nltk.corpus import stopwords




In [4]:
# Fetch the NLTK resources used later in this notebook:
#   - 'stopwords': the word lists read via stopwords.words('english')
#   - 'punkt': tokenizer data, presumably what nltk.word_tokenize relies on
#     NOTE(review): newer NLTK releases may need 'punkt_tab' instead — confirm
#     against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hiyansh/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/hiyansh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Load the raw dataset; the path is relative, so 'spam.csv' must sit in the
# notebook's working directory.
df = pd.read_csv('spam.csv')

In [7]:
# Preview the first rows to see which columns the raw file contains.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
# Remove the three 'Unnamed: N' columns (empty in the preview of the raw file).
# Reassigning the result instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution no
# longer raises KeyError once the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Give the two remaining columns descriptive names: v1 holds the label,
# v2 holds the message body.
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
from sklearn.preprocessing import LabelEncoder

# Convert the string labels to integer codes. As the preview below shows,
# 'ham' becomes 0 and 'spam' becomes 1.
encoder = LabelEncoder()
encoder.fit(df['target'])
df['target'] = encoder.transform(df['target'])

df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
# Rows that repeat an earlier row exactly (the first occurrence is not flagged).
duplicates = df.loc[df.duplicated()]
duplicates

Unnamed: 0,target,text
102,0,As per your request 'Melle Melle (Oru Minnamin...
153,0,As per your request 'Melle Melle (Oru Minnamin...
206,0,"As I entered my cabin my PA said, '' Happy B'd..."
222,0,"Sorry, I'll call later"
325,0,No calls..messages..missed calls
...,...,...
5524,1,You are awarded a SiPix Digital Camera! call 0...
5535,0,"I know you are thinkin malaria. But relax, chi..."
5539,0,Just sleeping..and surfing
5553,0,Hahaha..use your brain dear


In [21]:
# Count how many rows are exact duplicates of an earlier row.
df.duplicated().sum()

403

In [22]:
# Row count before de-duplication, for comparison with the count afterwards.
len(df)

5572

In [23]:
# Keep only the first occurrence of each repeated row, then report how many
# rows are left.
df = df.drop_duplicates(keep='first')
len(df)

5169

In [24]:
# Sanity check: after de-duplication this count should be 0.
df.duplicated().sum()

0

In [25]:
from nltk.stem.porter import PorterStemmer
import string
# Single shared stemmer instance; transform_text calls stemmer.stem on each token.
stemmer = PorterStemmer()

In [None]:
# Scratch demo of the token-cleaning steps on a toy sentence.
exam = nltk.word_tokenize('This is a sample !text for tokenization.!')
print(exam)

# Keep only the tokens made up entirely of letters/digits.
y = list(filter(str.isalnum, exam))
print(y)
print(exam)  # the original token list is still untouched at this point

# Take an independent copy, so clearing y afterwards leaves exam intact.
exam = list(y)
print(exam)

y.clear()
print(y)



['This', 'is', 'a', 'sample', '!', 'text', 'for', 'tokenization', '.', '!']
['This', 'is', 'a', 'sample', 'text', 'for', 'tokenization']
['This', 'is', 'a', 'sample', '!', 'text', 'for', 'tokenization', '.', '!']
['This', 'is', 'a', 'sample', 'text', 'for', 'tokenization']
[]


In [33]:
def transform_text(text):
    """Normalize a raw message into a space-joined string of stemmed tokens.

    Steps: lowercase the text, tokenize it with ``nltk.word_tokenize``, keep
    only alphanumeric tokens, drop English stopwords, and stem every remaining
    token with the notebook-level ``stemmer``.

    Args:
        text: The raw message string.

    Returns:
        The surviving stemmed tokens joined by single spaces; an empty string
        when no token survives the filters.
    """
    # Build the stopword lookup once per call and keep it in a set. The
    # previous version called stopwords.words('english') inside the
    # comprehension, so the list was rebuilt and linearly searched for every
    # single token.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())

    # str.isalnum() already rejects punctuation-only tokens, so a separate
    # string.punctuation check would never filter anything further.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(stemmer.stem(tok) for tok in kept)




In [34]:
# Quick check of transform_text on the same toy sentence used in the demo above.
transform_text('This is a sample !text for tokenization.!')

'sampl text token'

In [35]:
# Run transform_text over every message and store the cleaned text alongside
# the original in a new column.
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer capped at a 500-term vocabulary, which fixes the feature
# matrix built from it at 500 columns.
tfidf = TfidfVectorizer(max_features=500)


In [43]:
# Fit the vectorizer on the cleaned text and materialize the result as a dense
# NumPy array (one row per message, one column per vocabulary term).
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split performed further down, so test rows influence the
# vocabulary and IDF weights. Consider splitting first and fitting on the
# training rows only.
X = tfidf.fit_transform(df['transformed_text']).toarray()
print(X.shape)
print(type(X))

(5169, 500)
<class 'numpy.ndarray'>


In [48]:
# Target labels as a NumPy array. Both X and y are derived from df, so they
# line up row for row. (This rebinds the name `y`, which the tokenization
# scratch cell used for a list.)
y = df['target'].values
print(y.shape)
print(y)
print(type(y))

(5169,)
[0 0 1 ... 0 0 0]
<class 'numpy.ndarray'>


In [50]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
# NOTE(review): no stratify= is passed. If the two classes are imbalanced,
# stratify=y would keep their ratio equal in both splits — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [52]:
# Confirm the split sizes: feature/label row counts should match per split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4135, 500), (1034, 500), (4135,), (1034,))

In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [55]:
# Settings shared by every tree-ensemble / boosting model below.
ensemble_params = {'n_estimators': 50, 'random_state': 2}

# Stand-alone models.
# NOTE(review): these are created without a random_state; check whether any
# of them needs one for repeatable scores.
svc = SVC(gamma=1.0, kernel="sigmoid")
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(penalty='l1', solver='liblinear')

# Ensemble / boosting models, all built with the shared settings.
rfc = RandomForestClassifier(**ensemble_params)
abc = AdaBoostClassifier(**ensemble_params)
bc = BaggingClassifier(**ensemble_params)
etc = ExtraTreesClassifier(**ensemble_params)
gbdt = GradientBoostingClassifier(**ensemble_params)
xgb = XGBClassifier(**ensemble_params)

In [56]:
# Registry of display name -> estimator. Insertion order determines the order
# in which the evaluation loop trains and reports the models.
clfs = dict([
    ('SVC', svc),
    ('KNN', knc),
    ('NB', mnb),
    ('DT', dtc),
    ('LR', lrc),
    ('RF', rfc),
    ('Adaboost', abc),
    ('Bgc', bc),
    ('ETC', etc),
    ('GBDT', gbdt),
    ('xgb', xgb),
])

In [57]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    """Fit one estimator on the training split and score it on the test split.

    Args:
        clfs: A single estimator exposing ``fit`` and ``predict`` (one model,
            despite the plural parameter name).
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        X_test: Features passed to ``predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        A ``(accuracy, precision)`` tuple computed with ``accuracy_score`` and
        ``precision_score`` on ``y_test`` versus the predictions.
    """
    clfs.fit(X_train, y_train)
    predictions = clfs.predict(X_test)
    return accuracy_score(y_test, predictions), precision_score(y_test, predictions)

In [58]:
# Train every registered model and collect its scores, in registry order.
accuracy_scores = []
precision_scores = []

# The loop variable must not be called `clfs`: `for name, clfs in clfs.items()`
# rebinds the registry dict to the last estimator once the loop finishes, so
# re-running this cell (or using `clfs` as a dict afterwards) would fail.
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9709864603481625
Precision:  0.952755905511811

For:  KNN
Accuracy:  0.9294003868471954
Precision:  0.9736842105263158

For:  NB
Accuracy:  0.9758220502901354
Precision:  0.9838709677419355

For:  DT
Accuracy:  0.9313346228239845
Precision:  0.8363636363636363

For:  LR
Accuracy:  0.9574468085106383
Precision:  0.904

For:  RF
Accuracy:  0.971953578336557
Precision:  0.953125





For:  Adaboost
Accuracy:  0.9535783365570599
Precision:  0.8702290076335878

For:  Bgc
Accuracy:  0.960348162475822
Precision:  0.8768115942028986

For:  ETC
Accuracy:  0.9777562862669246
Precision:  0.9552238805970149

For:  GBDT
Accuracy:  0.9477756286266924
Precision:  0.941747572815534

For:  xgb
Accuracy:  0.9680851063829787
Precision:  0.9242424242424242
