In [86]:
import pandas as pd
import numpy as np

In [87]:
#importing dataset
dt=pd.read_csv("spam.csv", encoding= 'unicode_escape')
dt.head(10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [88]:
#preprocessing
#which includes Tokenization, Stemming and Lemmatization, Stopword Removal
#Tokenization: splitting sentences into words
dt['spam']=dt['type'].map({'spam':1, 'ham':0}).astype(int)
dt.head(5)

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [89]:
dt.columns
len(dt['text'])

116

In [90]:
p=dt['text'][1]

In [91]:
def tokenizer(text):
    return text.split()

In [92]:
dt['text']=dt['text'].apply(tokenizer)

In [101]:
dt['text'][1]

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [97]:
#stemming: removing the suffix

from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer("english", ignore_stopwords=False)

In [98]:
def stem_it(text):
    return [porter.stem(word) for word in text]

In [102]:
dt['text']=dt['text'].apply(stem_it)
dt['text'][1]
# reducing words into their root words

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [104]:
#Lemmatization: finding the lemma(base form) of a word
dt['text'][63]

['sorri',
 'my',
 'roommat',
 'took',
 'forever,',
 'it',
 'ok',
 'if',
 'i',
 'come',
 'by',
 'now?']

In [105]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [106]:
def lemmit_it(text):
    return [lemmatizer.lemmatize(word, pos="a") for word in text]

In [107]:
dt['text']=dt['text'].apply(lemmit_it)
dt['text'][63]

['sorri',
 'my',
 'roommat',
 'took',
 'forever,',
 'it',
 'ok',
 'if',
 'i',
 'come',
 'by',
 'now?']

In [118]:
w1=['yupy','better']
w1

['yupy', 'better']

In [119]:
w1=stem_it(w1)
w1=lemmit_it(w1)
w1

['yupi', 'good']

In [129]:
#stopword removal:removal of useless words
w2=['tired','i',"haven't",'slept','well','the','past','few','nights']

In [127]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [131]:
def stop_it(text):
    review =[word for word in text if not word in stop_words]
    return review

In [132]:
stop_it(w2)

['tired', 'slept', 'well', 'past', 'nights']

In [133]:
dt['text']=dt['text'].apply(stop_it)

In [134]:
dt.head(10)

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, bugi, n, ...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, ear, hor..., u, c, alreadi, say...]",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1


In [135]:
dt['text']=dt['text'].apply(' '.join)

In [153]:
dt.head()

Unnamed: 0,type,text,spam
0,ham,"go jurong point, crazy.. avail bugi n great wo...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say ear hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


In [154]:
#Vectorization
#it gives numerical values to the text
#Term document frequency gives words with maximum frequency
#TF-IDF gives words having maximum importance even if they are less frequent
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
y=dt.spam.values

x=tfidf.fit_transform(dt['text'])

In [157]:
#splitting
from sklearn.model_selection import train_test_split
x_train,x_text,y_train,y_text=train_test_split(x,y,random_state=1,test_size=0.2,shuffle=False)

In [192]:
#using Logistic Regression as classifier
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()
clf.fit(x_train,y_train)
y_pred=clf.predict(x_text)
from sklearn.metrics import accuracy_score
acc_log = accuracy_score(y_pred,y_text)*100
print("accuracy:",acc_log)

accuracy: 87.5


In [191]:
from sklearn.svm import LinearSVC
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train,y_train)
y_pred = linear_svc.predict(x_text)
acc_linear_svc= accuracy_score(y_pred, y_text)*100
print("accuracy:", acc_linear_svc)

accuracy: 87.5


In [190]:
type(acc_linear_svc)

numpy.float64