In [33]:
import os 
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem.porter import *

from sklearn.feature_extraction.text import CountVectorizer


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [34]:
data=pd.read_csv('emails.csv',encoding = "ISO-8859-1")

In [35]:
data

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [36]:
def remove_pattern(input_text,pattern):
  r = re.findall(pattern,input_text)
  for i in r:
    input_text=re.sub(i,'',input_text)
    
  return input_text

In [38]:
data['tidy']=np.vectorize(remove_pattern)(data['text'],',[\w]*')
data['tidy']=data['tidy'].str.replace('[^a-zA-z]'," ")
data['tidy']=data['tidy'].apply(lambda x:' '.join([w for w in x.split() if len(w)>2]))

In [39]:
data.head()

Unnamed: 0,text,spam,tidy
0,Subject: naturally irresistible your corporate...,1,Subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,Subject the stock trading gunslinger fanny mer...
2,Subject: unbelievable new homes made easy im ...,1,Subject unbelievable new homes made easy wanti...
3,Subject: 4 color printing special request add...,1,Subject color printing special request additio...
4,"Subject: do not have money , get software cds ...",1,Subject not have money get software cds from h...


In [40]:
tokenized_review=data['tidy'].apply(lambda x:x.split())
tokenized_review.head()

0    [Subject, naturally, irresistible, your, corpo...
1    [Subject, the, stock, trading, gunslinger, fan...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, color, printing, special, request, a...
4    [Subject, not, have, money, get, software, cds...
Name: tidy, dtype: object

In [41]:
stemmer=PorterStemmer()

In [42]:
tokenized_review=tokenized_review.apply(lambda x:[stemmer.stem(i) for i in x])

In [43]:
tokenized_review.head()

0    [subject, natur, irresist, your, corpor, ident...
1    [subject, the, stock, trade, gunsling, fanni, ...
2    [subject, unbeliev, new, home, made, easi, wan...
3    [subject, color, print, special, request, addi...
4    [subject, not, have, money, get, softwar, cd, ...
Name: tidy, dtype: object

In [44]:

for i in range(len(tokenized_review)):
    tokenized_review[i]=' '.join(tokenized_review[i])

data['tidy']=tokenized_review

In [45]:
data.head()

Unnamed: 0,text,spam,tidy
0,Subject: naturally irresistible your corporate...,1,subject natur irresist your corpor ident reall...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trade gunsling fanni merril ...
2,Subject: unbelievable new homes made easy im ...,1,subject unbeliev new home made easi want show ...
3,Subject: 4 color printing special request add...,1,subject color print special request addit info...
4,"Subject: do not have money , get software cds ...",1,subject not have money get softwar cd from her...


In [46]:
all_words=' '.join([word for word in data['tidy']])

In [47]:
all_words_list=all_words.split(' ')

In [48]:
all_word_frame=pd.DataFrame(all_words_list)

In [49]:
X=all_word_frame[0].value_counts()

In [50]:

X

the               50110
and               27480
you               19154
for               16696
enron             13388
ect               11427
subject           10202
thi               10031
your               9399
that               9296
with               8855
vinc               8532
will               8252
have               7859
from               6746
are                6521
hou                5577
com                5444
pleas              5113
kaminski           4801
not                4576
would              4426
our                4359
can                4257
thank              3730
forward            3161
time               3145
all                2929
ani                2822
research           2820
                  ...  
lundquist             1
spradley              1
verifictaion          1
furthest              1
cfarrel               1
humphri               1
polder                1
bergbau               1
podjac                1
aix                   1
zofia           

In [51]:

df2 = pd.DataFrame(np.array(all_words.split(' ')).reshape(-1, 1), columns=['words'])

In [52]:
unique_words = list(df2['words'].str.split(' ', expand=True).stack().unique())

In [53]:
len(unique_words)

25348

In [54]:
word_counts = list(df2['words'].value_counts())

In [55]:

df4 = pd.DataFrame(np.hstack((np.array(unique_words).reshape(-1,1),np.array(word_counts).reshape(-1,1))), columns=['words','word_count'])

In [56]:
df4_new=df4[df4['word_count'].map(len)==1]

In [57]:
df4_words_list = list(df4_new['words'])

In [58]:
unique_words=list(unique_words)

In [59]:
for word in df4_words_list:
    unique_words.remove(word)

In [60]:
len(unique_words)

5480

In [61]:
type(unique_words)

list

In [62]:
df4_new.head()

Unnamed: 0,words,word_count
5480,benno,9
5481,witti,9
5482,siyvia,9
5483,afterward,9
5484,laugh,9


In [63]:
new_data=data

In [64]:
new_data.head()

Unnamed: 0,text,spam,tidy
0,Subject: naturally irresistible your corporate...,1,subject natur irresist your corpor ident reall...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trade gunsling fanni merril ...
2,Subject: unbelievable new homes made easy im ...,1,subject unbeliev new home made easi want show ...
3,Subject: 4 color printing special request add...,1,subject color print special request addit info...
4,"Subject: do not have money , get software cds ...",1,subject not have money get softwar cd from her...


In [65]:
tokenized_tidy=new_data['tidy'].apply(lambda x:x.split())
tokenized_tidy.head()

0    [subject, natur, irresist, your, corpor, ident...
1    [subject, the, stock, trade, gunsling, fanni, ...
2    [subject, unbeliev, new, home, made, easi, wan...
3    [subject, color, print, special, request, addi...
4    [subject, not, have, money, get, softwar, cd, ...
Name: tidy, dtype: object

In [66]:
for i in tokenized_tidy:
    for j in i:
        if j not in unique_words:
            i.remove(j)

In [67]:
len(tokenized_tidy)

5728

In [68]:
for i in range(len(tokenized_tidy)):
    tokenized_tidy[i]=' '.join(tokenized_tidy[i])

new_data['new_tidy']=tokenized_tidy

In [69]:
new_data.head()

Unnamed: 0,text,spam,tidy,new_tidy
0,Subject: naturally irresistible your corporate...,1,subject natur irresist your corpor ident reall...,subject natur irresist your corpor ident reall...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trade gunsling fanni merril ...,subject the stock trade gunsling fanni merril ...
2,Subject: unbelievable new homes made easy im ...,1,subject unbeliev new home made easi want show ...,subject unbeliev new home made easi want show ...
3,Subject: 4 color printing special request add...,1,subject color print special request addit info...,subject color print special request addit info...
4,"Subject: do not have money , get software cds ...",1,subject not have money get softwar cd from her...,subject not have money get softwar cd from her...


In [70]:
rev_vectorizer=CountVectorizer(stop_words='english',max_features=20000,ngram_range=(1,3))
rev= rev_vectorizer.fit_transform(new_data['new_tidy'])

In [71]:
train_rev=rev

xtrain_rev,xvalid_rev,ytrain,yvalid=train_test_split(train_rev,new_data['spam'],test_size=0.2)

In [72]:
lreg=LogisticRegression()
lreg.fit(xtrain_rev,ytrain)

prediction = lreg.predict_proba(xvalid_rev) # predicting on the validation set
prediction_int = prediction[:,1] >= 0.5 # if prediction is greater than or equal to 0.3 than 1 else 0
prediction_int = prediction_int.astype(np.int)

In [73]:
f1_score(yvalid, prediction_int)

0.97879858657243823

In [74]:
lreg.fit(xtrain_rev,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [75]:
y_pred=lreg.predict(xvalid_rev)

In [76]:
print(accuracy_score(yvalid,y_pred))

0.989528795812
