In [119]:
import pandas as pd

In [120]:
# Load the labelled messages into a DataFrame and preview the first ten rows.
csv_path = "gmail_spam_pred.csv"
df = pd.read_csv(csv_path)
df.head(10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [121]:
# Print the frame's index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    116 non-null    object
 1   text    116 non-null    object
dtypes: object(2)
memory usage: 1.9+ KB


In [122]:
# Encode the text label as an integer target column: ham -> 0, spam -> 1.
label_to_int = {'ham': 0, 'spam': 1}
encoded_labels = df["type"].map(label_to_int)
df["gmail_spam_pred"] = encoded_labels.astype(int)
df.head()

Unnamed: 0,type,text,gmail_spam_pred
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


#### Tokenizing

In [123]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list.
    Punctuation stays attached to the neighbouring word.
    """
    tokens = text.split()
    return tokens
# Replace every raw message string with its list of whitespace-separated tokens,
# then show the tokens of the row labelled 1 as a spot check.
df['text'] = df['text'].map(tokenizer)
df.loc[1, 'text']

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

#### Stemming

In [124]:
from nltk.stem.snowball import SnowballStemmer 
# English Snowball stemmer (note: despite the variable name, this is a
# SnowballStemmer instance, not a PorterStemmer).
porter = SnowballStemmer("english", ignore_stopwords=False)
def stem_it(text):
    """Return a new list holding the stem of every token in ``text``, in order."""
    stems = []
    for token in text:
        stems.append(porter.stem(token))
    return stems
# Stem every token list in the column, then show row 1 as a spot check.
df['text'] = df['text'].apply(stem_it)
df.loc[1, 'text']

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

In [125]:
import nltk
# Fetch the WordNet corpus before constructing the lemmatizer.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmit_it(text):
    """Lemmatize every token in ``text`` as an adjective (``pos='a'``) and return the results as a list.
    NOTE(review): the tokens reaching this function have already been stemmed
    by the previous cell - confirm that lemmatizing stems is intended.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token, pos='a'))
    return lemmas
# Lemmatize every token list in the column, then show row 100 as a spot check.
df['text'] = df['text'].apply(lemmit_it)
df.loc[100, 'text']

[nltk_data] Downloading package wordnet to C:\Users\Jainam
[nltk_data]     Shah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['pleas',
 "don't",
 'text',
 'me',
 'anymore.',
 'i',
 'have',
 'noth',
 'els',
 'to',
 'say.']

#### Stopword Removal

In [126]:
import nltk
from nltk.corpus import stopwords
# Make sure the stop-word corpus is present before it is read; on a machine
# that has never fetched it, stopwords.words() raises LookupError.
nltk.download('stopwords', quiet=True)
# A set gives constant-time membership tests; a list would be scanned once
# per token.
stop_words = set(stopwords.words("english"))
def stop_it(text):
    """Return the tokens of ``text`` that are not English stop words.
    The order of the surviving tokens, and any duplicates among them, are kept.
    NOTE(review): this filter runs on tokens that were already stemmed, so a
    stop word whose stem differs from its listed form is not removed (the
    row-0 preview still contains 'onli', the stem of 'only').
    """
    return [word for word in text if word not in stop_words]
# Filter every token list in the column, then show row 100 as a spot check.
df['text'] = df['text'].apply(stop_it)
df.text[100]


['pleas', 'text', 'anymore.', 'noth', 'els', 'say.']

In [127]:
# Preview the first ten rows after tokenising, stemming, lemmatising and stop-word removal.
df.head(10)

Unnamed: 0,type,text,gmail_spam_pred
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1


In [128]:
# Collapse each token list back into one space-separated string, the input
# form expected by the vectoriser in the next section.
df['text'] = [' '.join(tokens) for tokens in df['text']]
df.head()

Unnamed: 0,type,text,gmail_spam_pred
0,ham,"go jurong point, crazy.. avail onli bugi n gre...",0
1,ham,ok lar... joke wif u oni...,0
2,spam,free entri 2 wkli comp win fa cup final tkts 2...,1
3,ham,u dun say earli hor... u c alreadi say...,0
4,ham,"nah think goe usf, live around though",0


#### TF-IDF Vectorisation

In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Target vector: the 0/1 spam labels as an array.
y = df["gmail_spam_pred"].values
# Learn the vocabulary and IDF weights from every row of df['text'] and build
# the TF-IDF document-term matrix for the whole corpus.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(df["text"])

In [130]:
from sklearn.model_selection import train_test_split
# Split the raw documents first and fit TF-IDF on the training rows only, so
# the held-out rows contribute nothing to the vocabulary or the IDF weights
# (splitting a matrix fitted on the full corpus leaks test data into training).
# shuffle=False keeps the positional split (leading rows train, trailing 20%
# test); random_state is omitted because it is ignored when nothing is shuffled.
text_train, text_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.2, shuffle=False)
x_train = tfidf.fit_transform(text_train)
# transform() reuses the training vocabulary; unseen test words are dropped.
x_test = tfidf.transform(text_test)

#### Logistic Regression Model

In [131]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier with default settings on the training
# features and predict labels for the held-out rows.
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
from sklearn.metrics import accuracy_score
# accuracy_score's signature is (y_true, y_pred). Accuracy itself is symmetric,
# but passing the arguments in the documented order keeps this call correct if
# it is later swapped for an asymmetric metric such as precision or recall.
acc_log = accuracy_score(y_test, y_pred) * 100
print("accuracy: ", acc_log)

accuracy:  87.5


#### LinearSVC Classification Accuracy

In [132]:
from sklearn.svm import LinearSVC
# Fit a linear support-vector classifier (seeded with random_state=0) on the
# training features and predict labels for the held-out rows.
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_test)
# accuracy_score's signature is (y_true, y_pred). Accuracy itself is symmetric,
# but passing the arguments in the documented order keeps this call correct if
# it is later swapped for an asymmetric metric such as precision or recall.
acc_linear_svc = accuracy_score(y_test, y_pred)*100
print("accuracy: ", acc_linear_svc)

accuracy:  87.5
