In [88]:
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding,Input,Dense,Dropout,Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential,load_model
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# List the ham and spam directories, then read every ham file into a list
ham = os.listdir('ham')
spam = os.listdir('spam')
ham_file = []
for i in ham:
    # A context manager closes each handle right after reading instead of leaving it to the garbage collector.
    with open('ham/'+i,'r') as file:
        ham_file.append(file.read())
# The ham files are all utf8 encoded, so no encoding fallback is needed here

In [4]:
# Some spam files are latin1 encoded, so retry with latin1 when the default decoding fails
spam_file = []
for j in spam:
    try:
        with open('spam/'+j,'r') as file:
            a = file.read()
    except UnicodeDecodeError:
        # Only a decoding failure triggers the fallback; other errors
        # (missing file, permissions, ...) are no longer silently swallowed.
        with open('spam/'+j,'r', encoding = 'latin1') as file:
            a = file.read()
    spam_file.append(a)

In [5]:
# One label per message: 0.0 for every ham mail, 1.0 for every spam mail
zero = np.zeros(shape = len(ham_file))
one = np.ones(shape = len(spam_file))

In [6]:
# Ham labels first, spam labels second, as one flat Python list
labels = [*zero, *one]

In [7]:
# Same ordering as the labels: all ham messages, then all spam messages
emails = [*ham_file, *spam_file]

In [8]:
# Column name -> values mapping used to build the DataFrame
dic = dict(mails = emails, labels = labels)

In [9]:
# Raw dataset: one row per message with its text and its class label
df = pd.DataFrame(data = dic)

In [10]:
# Show the combined frame (pandas elides the middle rows in the rendered output)
df

Unnamed: 0,mails,labels
0,From exmh-workers-admin@redhat.com Thu Aug 22...,0.0
1,From Steve_Burt@cursor-system.com Thu Aug 22 ...,0.0
2,From timc@2ubh.com Thu Aug 22 13:52:59 2002\n...,0.0
3,From irregulars-admin@tb.tf Thu Aug 22 14:23:...,0.0
4,From Stewart.Smith@ee.ed.ac.uk Thu Aug 22 14:...,0.0
...,...,...
2995,From yelanotyami912@bot.or.th Tue Dec 3 15:1...,1.0
2996,From bolttish@hotmail.com Tue Dec 3 15:16:06...,1.0
2997,From removeme@marysstore.com Tue Dec 3 15:16...,1.0
2998,From eBayInternetMarketing@yahoo.com Tue Dec ...,1.0


In [47]:
# Work on a separate frame whose labels are integers; `df` itself is left untouched
df_clean = df.assign(labels = df['labels'].astype(int))

### Data Cleaning

In [134]:
# Pick one raw message (index label 34) to prototype the cleaning steps on
a = df_clean.loc[34, 'mails']

In [135]:
# Break the sample message into its individual lines
ans = a.split('\n')

In [138]:
# Find the first empty line and glue everything from there on into one string.
# NOTE(review): presumably that blank line is the header/body separator — confirm.
# Joining with '' discards the line breaks, so text from adjacent lines runs together.
ind = ans.index('')
''.join(ans[ind:])

'Hi,Thank you for the useful replies, I have found some interesting tutorials in the ibm developer connection.https://www6.software.ibm.com/developerworks/education/j-sec1andhttps://www6.software.ibm.com/developerworks/education/j-sec2Registration is needed.I will post the same message on the Web Application Security list, as suggested by someone.For now, I thing I will use md5 for password checking (I will use the approach described in secure programmin fo linux and unix how-to).I will separate the authentication module, so I can change its implementation at anytime.Thank you again!Mario Torre-- Please avoid sending me Word or PowerPoint attachments.See http://www.fsf.org/philosophy/no-word-attachments.html '

In [140]:
# Apply the header stripping to every message: keep only what follows the first blank line
mail = []
for i in df_clean['mails']:
    ans = i.split('\n')
    # A message without any blank line has no separator to find; keep it
    # whole instead of letting list.index raise ValueError.
    ind = ans.index('') if '' in ans else 0
    # NOTE(review): joining with '' drops the line breaks, so the last word
    # of one line fuses with the first word of the next.
    text = ''.join(ans[ind:])
    mail.append(text)

In [143]:
# Store the header-stripped text in a new 'mail' column next to the raw 'mails' column
df_clean['mail'] = mail

In [157]:
# Index labels of every message whose body contains an '<html>' tag.
# Iterating the column once avoids the hard-coded row count and the per-row
# list rebuild, and yields real index labels for the later .loc lookups.
ans = [idx for idx, body in df_clean['mail'].items() if '<html>' in body]

In [167]:
# Take an explicit copy of the HTML rows: the 'mail' column of this frame is
# overwritten later, and that must not be an ambiguous write into df_clean.
to_clean = df_clean.loc[ans].copy()

In [177]:
# Inspect the rows selected for HTML stripping (before cleaning)
to_clean

Unnamed: 0,mails,labels,mail
882,From fork-admin@xent.com Fri Oct 4 18:19:18 ...,0,--------------080209060700030309080805Content-...
1422,Return-Path: anthony@interlink.com.au\nDeliver...,0,>>> Tim Peters wrote> > I've actually got a bu...
1553,From razor-users-admin@lists.sourceforge.net ...,0,The following razor debugging sequence show my...
1682,Return-Path: tim.one@comcast.net\nDelivery-Dat...,0,[Jeremy]> The total collections are 1100 messa...
1712,Return-Path: guido@python.org\nDelivery-Date: ...,0,> > But it also identified as spam everything ...
...,...,...,...
2991,From akakay54@excite.com Tue Dec 3 11:57:08 ...,1,"<html><body><center><table bgcolor=3D""663399"" ..."
2992,From dega_jc271@hotmail.com Tue Dec 3 11:57:...,1,<HR><html><head> <title>Secured Investements ...
2995,From yelanotyami912@bot.or.th Tue Dec 3 15:1...,1,<html><head><title>Toy</title></head><body bgc...
2996,From bolttish@hotmail.com Tue Dec 3 15:16:06...,1,<html><head><title>Untitled Document</title><m...


In [187]:
# Replace each HTML message with the visible text of its <body>
cleaned_mail = []
for i in to_clean['mail']:
    soup = BeautifulSoup(i,'lxml')
    body = soup.find('body')
    # find('body') returns None when the parser produced no <body> element;
    # fall back to the whole document text instead of raising AttributeError.
    cleaned_mail.append(body.text if body is not None else soup.text)
to_clean['mail'] = cleaned_mail

In [188]:
# Inspect the same rows again to check the result of the HTML stripping
to_clean

Unnamed: 0,mails,labels,mail
882,From fork-admin@xent.com Fri Oct 4 18:19:18 ...,0,--------------080209060700030309080805Content-...
1422,Return-Path: anthony@interlink.com.au\nDeliver...,0,>>> Tim Peters wrote> > I've actually got a bu...
1553,From razor-users-admin@lists.sourceforge.net ...,0,The following razor debugging sequence show my...
1682,Return-Path: tim.one@comcast.net\nDelivery-Dat...,0,[Jeremy]> The total collections are 1100 messa...
1712,Return-Path: guido@python.org\nDelivery-Date: ...,0,> > But it also identified as spam everything ...
...,...,...,...
2991,From akakay54@excite.com Tue Dec 3 11:57:08 ...,1,Get 12 FREE VHS or DVDs! Click HERE ...
2992,From dega_jc271@hotmail.com Tue Dec 3 11:57:...,1,Secured Investements WEALTH WITHOUT RISK!...
2995,From yelanotyami912@bot.or.th Tue Dec 3 15:1...,1,ABC's Good Morning America ranks it the #1 Ch...
2996,From bolttish@hotmail.com Tue Dec 3 15:16:06...,1,Let Mortgage Lenders co...


In [189]:
# Write the HTML-stripped rows back into the main frame at the same index labels
df_clean.loc[ans] = to_clean

In [195]:
# Persist the cleaned frame (without the index column) so later cells can start from the CSV
df_clean.to_csv('cleaned.csv',index = False)

In [7]:
# Reload the cleaned frame from disk; this rebinds df_clean to the CSV contents
df_clean = pd.read_csv('cleaned.csv')

In [13]:
# Keep only the cleaned text and its label for modelling
data = df_clean.loc[:, ['mail','labels']]

In [14]:
# Keep the first 2999 rows (positional slice).
# NOTE(review): the reason for this cutoff is not visible in the notebook — confirm.
data = data.iloc[:2999]

#### Learning

In [18]:
# Drop every row that has a missing value in either column.
# NOTE(review): presumably these are empty bodies that came back as NaN from the CSV round trip — confirm.
data = data.dropna()

In [19]:
# Size of the hashing space handed to one_hot for the word ids
vocab_size = 1000
# Turn every message into a list of integer word ids
encoded = list(map(lambda text: one_hot(text, vocab_size), data['mail']))

In [20]:
# Summary statistics of the encoded message lengths, used to choose a padding length
pd.DataFrame(list(map(len, encoded))).describe()

Unnamed: 0,0
count,2998.0
mean,266.354903
std,595.30172
min,6.0
25%,75.25
50%,145.0
75%,260.0
max,13225.0


In [21]:
# Fixed sequence length for the model; 260 matches the 75% quantile of the
# encoded lengths reported by describe() above.
max_len = 260

In [55]:
# Bring every encoded message to exactly max_len ids:
# shorter ones are right-padded with 0, longer ones are cut at max_len.
padded = []
for tokens in encoded:
    shortfall = max_len - len(tokens)
    if shortfall > 0:
        padded.append(tokens + [0] * shortfall)
    else:
        padded.append(tokens[:max_len])

In [56]:
# Stack the equal-length id lists into a 2-D array (one row per message)
padded = np.asarray(padded)

In [57]:
# Embedding -> Flatten -> two hidden ReLU layers -> one sigmoid unit giving the spam probability
model = Sequential()
# Size the embedding table from vocab_size (the value given to one_hot)
# instead of repeating the literal 1000, so the two cannot drift apart.
model.add(Embedding(vocab_size, 50, input_length = max_len))
model.add(Flatten())
model.add(Dense(100, activation = 'relu'))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as 'acc'
model.compile(loss = 'binary_crossentropy', optimizer = Adam(), metrics = ['acc'])

In [37]:
# Print the layer-by-layer output shapes and parameter counts
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 260, 50)           50000     
                                                                 
 flatten_1 (Flatten)         (None, 13000)             0         
                                                                 
 dense_6 (Dense)             (None, 100)               1300100   
                                                                 
 dense_7 (Dense)             (None, 100)               10100     
                                                                 
 dense_8 (Dense)             (None, 1)                 101       
                                                                 
Total params: 1,360,301
Trainable params: 1,360,301
Non-trainable params: 0
_________________________________________________________________


In [59]:
# Labels of the rows that survived the cleaning, as a NumPy array aligned with `padded`
labels = np.array(data['labels'])

In [60]:
# Shuffle features and labels together; the fixed seed makes the ordering reproducible across runs
x,y = shuffle(padded,labels, random_state = 42)

In [63]:
# Hold out 40% of the messages for evaluation; the fixed seed makes the split reproducible
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size = 0.4, random_state = 42)

In [64]:
# Train for 10 epochs; the held-out split is reported as validation data after each epoch
model.fit(
    xtrain,
    ytrain,
    epochs = 10,
    batch_size = 32,
    validation_data = (xtest,ytest),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1bad44ca6b0>

In [67]:
# Model outputs for the held-out messages (one sigmoid value per row)
pred = model.predict(xtest)



In [72]:
# Round each single-value output row to the nearest integer to get class labels.
# Note: this rebinds `pred`, so the cell only works directly after the predict cell.
pred = [round(row[0]) for row in pred]

In [74]:
# f1_score expects (y_true, y_pred); pass the ground truth first
f1_score(ytest, pred)

0.9870801033591732

In [77]:
# Persist the trained model to disk in HDF5 format
model.save('model.h5')

In [80]:
# Reload the saved model; `model` now refers to the copy read from disk
model = load_model('model.h5')

### MAIL PIPELINE

In [107]:
def prediction(path):
    """Classify one raw e-mail file with the trained model.

    The file goes through the same steps used to build the training data:
    keep the text after the first blank line, reduce HTML to its body text,
    hash the words with ``one_hot`` and pad/truncate to ``max_len`` ids.

    Parameters
    ----------
    path : str
        Location of the raw e-mail file.

    Returns
    -------
    The model's sigmoid output rounded to the nearest integer
    (0 is the ham label, 1 the spam label used for training).
    """
    try:
        with open(path) as data:
            x = data.read()
    except UnicodeDecodeError:
        # Same latin1 fallback the spam loader uses when building the dataset.
        with open(path, encoding = 'latin1') as data:
            x = data.read()
    ans = x.split('\n')
    # Without a blank line there is no separator to find; use the whole
    # message instead of letting list.index raise ValueError.
    ind = ans.index('') if '' in ans else 0
    text = ''.join(ans[ind:])
    if '<html>' in text:
        soup = BeautifulSoup(text,'lxml')
        # The training data kept only the <body> text, so do the same here;
        # fall back to the full document text when no <body> was parsed.
        body = soup.find('body')
        text = body.text if body is not None else soup.text
    encoded = one_hot(text,vocab_size)
    if len(encoded) < max_len:
        # Right-pad with zeros up to the fixed model input length.
        encoded = encoded + [0 for i in range(max_len - len(encoded))]
    elif len(encoded) > max_len:
        encoded = encoded[:max_len]
    padded = np.array(encoded)
    # The model expects a batch dimension: shape (1, max_len).
    padded = padded.reshape(1,-1)
    pred = model.predict(padded)
    return round(pred[0][0])

In [110]:
# Smoke-test the pipeline on one file from the spam folder
prediction('spam/00004.eac8de8d759b7e74154f142194282724')



1