In [1]:
from os import walk
from os.path import join 
import pandas as pd
import nltk
import numpy as np
from sklearn.model_selection import train_test_split

from wordcloud import WordCloud
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline

In [2]:
# Files under SpamData/01_Processing.
EXAMPLE_FILE = 'SpamData/01_Processing/practice_email.txt'
SPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
SPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_2'
EASY_NONSPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_1'
EASY_NONSPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_2'
DATA_JSON = 'SpamData/01_Processing/email-text-data.json'
WORD_ID = 'SpamData/01_Processing/word-by-id.csv'

# Word-cloud resources (icons and font).
WHALE_ICON = 'SpamData/01_Processing/wordcloud_resources/whale-icon.png'
SKULL_ICON = 'SpamData/01_Processing/wordcloud_resources/skull-icon.png'
THUMBS_UP_ICON = 'SpamData/01_Processing/wordcloud_resources/thumbs-up.png'
THUMBS_DOWN_ICON = 'SpamData/01_Processing/wordcloud_resources/thumbs-down.png'
FONTSANS = 'SpamData/01_Processing/wordcloud_resources/OpenSansCondensed-Bold.ttf'

# Number of word ids; used below as the number of word columns in the dense matrix.
VOCAB_SIZE = 2500

# Sparse training / test inputs, read with np.loadtxt in the next cells.
TRAINING_DATA = 'SpamData/02_Training/training-data.txt'
TEST_DATA = 'SpamData/02_Training/testing-data.txt'

# Files written with np.savetxt by the last cells of this notebook.
TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'
TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TARGET_FILE = 'SpamData/03_Testing/test-target.txt'

In [3]:
# Load the space-separated sparse training rows as integers.
sparse_trained_data = np.loadtxt(fname=TRAINING_DATA, dtype=int, delimiter=' ')


In [4]:
# Load the space-separated sparse test rows as integers.
sparse_test_data = np.loadtxt(fname=TEST_DATA, dtype=int, delimiter=' ')

In [5]:
# First column of the first test row (column 0 is read as the doc id further down).
sparse_test_data[0, 0]

8

In [8]:
# First five sparse rows; make_full_matrix reads the columns as
# (doc_id, word_id, category, frequency).
sparse_trained_data[:5, :]

array([[ 0,  2,  1,  1],
       [ 0,  3,  1,  2],
       [ 0,  4,  1,  1],
       [ 0,  7,  1,  3],
       [ 0, 11,  1,  1]])

In [9]:
# Number of distinct values in column 0 (doc ids) of the test data.
np.unique(sparse_test_data[:, 0]).shape

(1724,)

In [10]:
# Column labels for the dense frame: two bookkeeping columns followed by one
# integer label per word id (0 .. VOCAB_SIZE - 1).
column_names = ['DOC_ID', 'CATEGORY', *range(VOCAB_SIZE)]

In [11]:
# Expect VOCAB_SIZE + 2 labels: DOC_ID, CATEGORY and one per word id.
len(column_names)

2502

In [12]:
# Distinct doc ids (column 0) of the training data; used as the frame's row index.
index_names = np.unique(sparse_trained_data[:, 0])

In [13]:
# Frame with no data yet: one row per doc id, one column per label.
full_train_data = pd.DataFrame(columns=column_names, index=index_names)

In [15]:
# Replace the missing values of the freshly created frame with 0.
full_train_data = full_train_data.fillna(0)

In [16]:
# Create a full matrix from sparse matrix

def make_full_matrix(sparse_matrix, no_words):
    """Expand a sparse (doc, word, category, frequency) listing into a dense frame.

    Parameters
    ----------
    sparse_matrix : array-like of shape (n_entries, 4)
        Each row is ``(doc_id, word_id, category, frequency)``. A flat
        length-4 array (a single entry) and an empty array are accepted too.
        Assumes each (doc_id, word_id) pair occurs at most once and that the
        category is the same on every row of a document -- TODO confirm
        against the code that writes the sparse files.
    no_words : int
        Number of word columns; valid word ids are ``0 .. no_words - 1``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct doc id (ascending; index named ``DOC_ID``), a
        ``CATEGORY`` column followed by integer-labelled columns
        ``0 .. no_words - 1`` holding the frequencies. Words that are not
        listed for a document are 0.

    Raises
    ------
    ValueError
        If a word id lies outside ``0 .. no_words - 1``.
    """
    entries = np.asarray(sparse_matrix)
    if entries.ndim < 2:
        # A one-line (or empty) input arrives as a flat array; restore the
        # (n_entries, 4) layout so the column slicing below works.
        entries = entries.reshape(-1, 4)

    doc_ids = entries[:, 0]
    word_ids = entries[:, 1].astype(np.intp)
    categories = entries[:, 2]
    freqs = entries[:, 3]

    if word_ids.size and (word_ids.min() < 0 or word_ids.max() >= no_words):
        raise ValueError(
            'word ids must lie in 0..{}; got range {}..{}'.format(
                no_words - 1, word_ids.min(), word_ids.max()))

    # row_pos[k] is the row of the dense matrix that sparse entry k belongs to.
    index_names, row_pos = np.unique(doc_ids, return_inverse=True)

    # Column 0 holds CATEGORY, columns 1.. hold the word frequencies.
    dense = np.zeros((index_names.size, no_words + 1), dtype=entries.dtype)
    dense[row_pos, 0] = categories
    dense[row_pos, word_ids + 1] = freqs

    return pd.DataFrame(
        dense,
        index=pd.Index(index_names, name='DOC_ID'),
        columns=['CATEGORY'] + list(range(no_words)),
    )

In [17]:
# Dense training frame: one row per document, CATEGORY plus VOCAB_SIZE word columns.
full_train_df = make_full_matrix(sparse_matrix=sparse_trained_data, no_words=VOCAB_SIZE)

In [19]:
# Preview the first rows of the dense training frame.
full_train_df.head()

Unnamed: 0_level_0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,0,1,2,1,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,6,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,6,0,0,2,4,0,3,14,0,...,0,0,0,0,0,0,0,0,0,0
4,1,5,1,2,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Check the container type returned by np.loadtxt.
type(sparse_trained_data)

numpy.ndarray

# Calculating the probability of an email being spam

In [21]:
# NOTE: despite its name, spam_df is not a DataFrame and is not spam-specific:
# it is the number of entries in the CATEGORY column, i.e. the total number of
# training emails (spam and non-spam).
spam_df = full_train_df['CATEGORY'].size

In [22]:
# Total number of training emails (rows of full_train_df).
spam_df

4013

In [23]:
# CATEGORY is compared against 1 (spam) and 0 (non-spam) in the cells below,
# so the sum of the labels divided by the number of emails is P(spam).
spam_cnt = full_train_df['CATEGORY'].values
prob_spam = spam_cnt.sum() / spam_df

In [24]:
# Share of training emails whose CATEGORY is 1.
prob_spam

0.310989284824321

In [25]:
# Keep every column except the CATEGORY label, i.e. only the word-frequency columns.
full_train_features = full_train_df.loc[:, full_train_df.columns != 'CATEGORY']

In [26]:
# Total token count per email: row-wise sum over all word columns.
email_lengths = full_train_features.sum(axis='columns')

In [28]:
# Token totals of the first few emails.
email_lengths.head()

DOC_ID
0     87
1     53
2     40
3    183
4     43
dtype: int64

In [29]:
# Token totals of the emails labelled CATEGORY == 1 (spam).
is_spam = full_train_df['CATEGORY'] == 1
spam_lengths = email_lengths.loc[is_spam]
spam_lengths.shape

(1248,)

In [30]:
# Total number of tokens across all spam emails.
spam_wc = spam_lengths.sum()
spam_wc

176322

In [31]:
# Token totals of the emails labelled CATEGORY == 0 (non-spam).
is_ham = full_train_df['CATEGORY'] == 0
ham_lengths = email_lengths.loc[is_ham]
ham_lengths.shape

(2765,)

In [32]:
# Sanity check: 0 means every email fell into exactly one of the two
# CATEGORY groups (1 or 0).
email_lengths.shape[0] - spam_lengths.shape[0] - ham_lengths.shape[0]

0

In [33]:
# Total number of tokens across all non-spam emails.
nonspam_wc = ham_lengths.sum()

# Sanity check: the two class totals should add up to the overall total (0 expected).
spam_wc + nonspam_wc - email_lengths.sum()


0

## Summing the tokens

In [34]:
# (number of training emails, CATEGORY + VOCAB_SIZE word columns)
full_train_df.shape

(4013, 2501)

In [35]:
# Word-frequency rows of the spam emails only.
train_spam = full_train_features.loc[full_train_df['CATEGORY'] == 1]

In [36]:
# Per-token totals over the spam emails, with 1 added to every count
# (add-one smoothing, so no token ends up with a zero count).
summed_spam_tokens = train_spam.sum(axis=0) + 1
summed_spam_tokens.head()

0    2179
1     935
2    1217
3    2022
4    1219
dtype: int64

In [37]:
# Word-frequency rows of the non-spam emails only.
train_non_spam = full_train_features.loc[full_train_df['CATEGORY'] == 0]

# Per-token totals over the non-spam emails, with 1 added to every count
# (add-one smoothing, so no token ends up with a zero count).
summed_nonspam_tokens = train_non_spam.sum(axis=0) + 1
summed_nonspam_tokens.head(10)

0    5484
1    2590
2    2045
3     938
4    1612
5    1606
6    1714
7     412
8    1243
9    1298
dtype: int64

# P(token | spam)

In [38]:
# Each of the VOCAB_SIZE token counts was incremented by 1 above, so the
# smoothed counts total spam_wc + VOCAB_SIZE; dividing by that makes the
# probabilities sum to 1.
prob_tokens_spam = summed_spam_tokens/(spam_wc + VOCAB_SIZE)

In [39]:
# Smoothed P(token | spam) for the first few word ids.
prob_tokens_spam.head()

0    0.012185
1    0.005229
2    0.006806
3    0.011307
4    0.006817
dtype: float64

In [40]:
# Same normalisation as for spam: the smoothed non-spam counts total
# nonspam_wc + VOCAB_SIZE.
prob_tokens_nonspam = summed_nonspam_tokens/(nonspam_wc + VOCAB_SIZE)

In [41]:
# Sanity check: the probabilities over all tokens should sum to 1.
prob_tokens_nonspam.sum()

1.0

# P(token)

In [42]:
# Share of all training tokens accounted for by each word id.
# NOTE(review): unlike the two class-conditional estimates above, no +1
# smoothing is applied here, so a word id with no occurrences in the training
# set gets probability exactly 0 -- confirm downstream code tolerates that.
prob_tokens_all = full_train_features.sum()/email_lengths.sum()

In [43]:
# Unsmoothed P(token) for the first few word ids.
prob_tokens_all.head()

0    0.017848
1    0.008208
2    0.007595
3    0.006891
4    0.006591
dtype: float64

In [44]:
# Write the three token-probability Series to their text files.
np.savetxt(TOKEN_SPAM_PROB_FILE, prob_tokens_spam)
np.savetxt(TOKEN_HAM_PROB_FILE, prob_tokens_nonspam)
np.savetxt(TOKEN_ALL_PROB_FILE, prob_tokens_all)

In [45]:
# Build the dense test frame with the same layout as the training frame.
full_test = make_full_matrix(sparse_matrix=sparse_test_data, no_words=VOCAB_SIZE)


In [46]:
# (number of test emails, CATEGORY + VOCAB_SIZE word columns)
full_test.shape

(1724, 2501)

In [47]:
# Split the test frame into the CATEGORY labels and the word-frequency features.
y_test = full_test['CATEGORY']
x_test = full_test.loc[:, full_test.columns != 'CATEGORY']

In [48]:
# Preview the first rows of the test feature matrix.
x_test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0,0,1,4,2,1,2,4,1,2,...,0,0,0,0,0,0,0,0,0,0
12,6,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,0,2,1,1,2,0,0,3,0,4,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Write the test labels and the test feature matrix to their text files.
# NOTE(review): no fmt is passed, so np.savetxt uses its default float format
# even though these are integer counts -- confirm the consumer expects that.
np.savetxt(TEST_TARGET_FILE, y_test)
np.savetxt(TEST_FEATURE_MATRIX, x_test)