In [60]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import os
from operator import itemgetter
import json

# Directory (relative to the working directory) that holds the e-mail CSV files.
path = './/Data//Emails'
# Directory (relative to the working directory) that holds the English word list.
dict_path = './/Data//English Words'

In [3]:
# Load the training e-mails; later cells read the `text` and `spam` columns.
train_data = pd.read_csv(os.path.join(path,'emails.train.csv'))
# Bare last expression so the notebook renders the frame.
train_data

Unnamed: 0,id,text,spam
0,0,Subject: naturally irresistible your corporate...,1
1,2,Subject: unbelievable new homes made easy im ...,1
2,3,Subject: 4 color printing special request add...,1
3,4,"Subject: do not have money , get software cds ...",1
4,5,"Subject: great nnews hello , welcome to medzo...",1
5,6,Subject: here ' s a hot play in motion homela...,1
6,9,Subject: save your money buy getting this thin...,1
7,11,Subject: save your money buy getting this thin...,1
8,12,Subject: brighten those teeth get your teeth...,1
9,14,Subject: fpa notice : ebay misrepresentation o...,1


Possible features: 
* Word frequency
 * Each element of the vector indicates the frequency of the respective word: occurrences divided by total words
* Length frequency
 * Element i of the vector indicates the frequency of words with length i: occurrences divided by total words
* Total number of words
 * Numeric value
* Total number of letters
 * Numeric value
* Number of non-alphabet characters in subject
* Number of non-alphabet characters in body
* Indicators in subject
 * Reply, forward ..
* Personal greeting
* Presence of personal names
* Number of grammatically incorrect words
* Number of unique words

In [56]:
def word_split(text):
    """Count the alphabetic words in ``text``.

    A word is a maximal run of characters for which ``str.isalpha()`` is
    true; every other character (digits, whitespace, punctuation, ...) acts
    as a separator. Words are counted exactly as written (case-sensitive).

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    dict
        Maps each word to the number of times it occurs in ``text``.
    """
    word_dict = {}
    letters = []
    for char in text:
        if char.isalpha():
            letters.append(char)
        elif letters:
            # A separator closes the word collected so far.
            word = ''.join(letters)
            word_dict[word] = word_dict.get(word, 0) + 1
            letters = []
    # Flush the trailing word: without this, a text that ends in a letter
    # would silently lose its last word.
    if letters:
        word = ''.join(letters)
        word_dict[word] = word_dict.get(word, 0) + 1
    return word_dict

def local_words(text_list):
    """Gather the distinct words of every text into one flat list.

    Each text is tokenized with ``word_split`` and its distinct words are
    appended in turn, so a word appears once for every text that contains it.

    Parameters
    ----------
    text_list
        Collection of texts that supports ``len()`` and positional access
        through ``.iloc``.

    Returns
    -------
    list
        The per-text distinct words, concatenated in text order.
    """
    collected = []
    for position in range(len(text_list)):
        # Iterating the count dictionary yields each distinct word once.
        collected.extend(word_split(text_list.iloc[position]))
    return collected

def local_word_dict(text_list):
    """Total the word counts over every text in ``text_list``.

    Each text is tokenized with ``word_split`` and its counts are added to a
    single running dictionary.

    Parameters
    ----------
    text_list
        Collection of texts that supports ``len()`` and positional access
        through ``.iloc``.

    Returns
    -------
    dict
        Maps each word to its total number of occurrences across all texts.
    """
    totals = {}
    for i in range(len(text_list)):
        # Update the running totals in place; rebuilding the whole dictionary
        # for every text makes the cost grow with (texts x vocabulary size).
        for word, count in word_split(text_list.iloc[i]).items():
            totals[word] = totals.get(word, 0) + count
    return totals

def top_words(n, word_dict):
    """Return the ``n`` most frequent words together with their counts.

    Parameters
    ----------
    n : int
        How many words to return; must not exceed ``len(word_dict)``.
    word_dict : dict
        Maps each word to its count.

    Returns
    -------
    dict
        The ``n`` highest-count entries of ``word_dict``, inserted in
        descending order of count (ties keep ``word_dict``'s order).

    Raises
    ------
    AssertionError
        If ``n`` is larger than the number of words in ``word_dict``.
    """
    # `<=` rather than `<`: asking for every word in the dictionary is valid.
    assert n <= len(word_dict)
    ranked = sorted(word_dict, key=word_dict.get, reverse=True)
    # max(n, 0) keeps a negative n returning an empty dict instead of
    # slicing from the end of the ranking.
    return {word: word_dict[word] for word in ranked[:max(n, 0)]}
        
            
# Text subsets: every e-mail, the spam ones (spam > 0) and the rest (spam < 1).
emails = train_data.text
spam = train_data.text[train_data.spam > 0]
non_spam = train_data.text[train_data.spam < 1]

# Flat word lists: each word listed once per e-mail that contains it.
all_words = local_words(emails)
spam_words = local_words(spam)
non_spam_words = local_words(non_spam)

# Total occurrence counts per word for each subset.
email_dict = local_word_dict(emails)
spam_dict = local_word_dict(spam)
non_spam_dict = local_word_dict(non_spam)

In [58]:
# Show the ten highest-count words in the spam e-mails with their totals.
print(top_words(10, spam_dict))

{'the': 6239, 'to': 5686, 'and': 4588, 'of': 3949, 'you': 3400, 'a': 3374, 'in': 2788, 'your': 2531, 'for': 2258, 'is': 2093}


In [80]:
# Read the English word list (a JSON object whose keys are the words) and
# keep the keys as a set for fast membership tests.
english_dict_file = os.path.join(dict_path, 'words_dictionary.json')
with open(english_dict_file) as english_dictionary:
    english_words_dict = json.load(english_dictionary)
english_words_set = set(english_words_dict)

In [94]:
def text_dict_check(text_dict, english_dict_set):
    """Return the share of distinct words that appear in ``english_dict_set``.

    Only the keys of ``text_dict`` matter; each distinct word is counted
    once, regardless of how often it occurs.

    Parameters
    ----------
    text_dict : dict
        Maps each word of a text to its count.
    english_dict_set : set
        Words considered correct.

    Returns
    -------
    float
        Number of keys found in ``english_dict_set`` divided by the number
        of keys, or ``0.0`` when ``text_dict`` is empty.
    """
    if not text_dict:
        # A text without any words has no proportion to report; return 0.0
        # rather than dividing by zero.
        return 0.0
    correct_words = sum(1 for word in text_dict if word in english_dict_set)
    return correct_words / len(text_dict)

# Average, over the e-mails of each class, of the share of distinct words
# that are found in the English dictionary.
non_spam_prop = np.mean(np.array(
    [text_dict_check(word_split(mail), english_words_set) for mail in non_spam]
))
spam_prop = np.mean(np.array(
    [text_dict_check(word_split(mail), english_words_set) for mail in spam]
))

In [177]:
def create_design_matrix(data, english_dict_set):
    """Build the feature matrix, one row per e-mail in ``data``.

    Columns:
      0. distinct words divided by total words in the e-mail;
      1. share of distinct words found in ``english_dict_set``
         (see ``text_dict_check``).

    Parameters
    ----------
    data
        Table with a ``text`` column that supports ``.iloc``.
    english_dict_set : set
        Words considered correct.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), 2)``. Rows of e-mails that
        contain no words are left as zeros.
    """
    features = 2
    observations = len(data)
    X = np.zeros((observations, features))
    texts = data.text
    for i in range(observations):
        d = word_split(texts.iloc[i])
        if not d:
            # No words at all: both ratios would divide by zero, so the row
            # keeps its initial zeros.
            continue
        # Ratio of distinct words to total words.
        X[i, 0] = len(d) / sum(d.values())
        # Proportion of distinct words present in the dictionary.
        X[i, 1] = text_dict_check(d, english_dict_set)
    return X
    
def create_response_vector(data):
    """Return the ``spam`` column as an array with 0 recoded to -1.

    The column is copied into a new array first, so ``data`` itself is not
    modified; every other value is kept as it is.
    """
    labels = np.array(data.spam)
    zero_mask = labels == 0
    labels[zero_mask] = -1
    return labels

In [178]:
from sklearn.linear_model import LogisticRegression

def data_split(data, seed=None):
    """Randomly split ``data`` into training and test features and labels.

    One fifth of the rows (``len(data) // 5``) is held out for testing and
    the remaining rows are used for training. Rows are drawn without
    replacement, so no row appears twice in the test set and the two sets
    together cover every row exactly once.

    Parameters
    ----------
    data
        Table with ``text`` and ``spam`` columns, as expected by
        ``create_design_matrix`` and ``create_response_vector``.
    seed : int, optional
        Seed for a dedicated random generator, for a reproducible split.
        When omitted, NumPy's global random state is used.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    n_obs = len(data)
    n_test = n_obs // 5
    rng = np.random if seed is None else np.random.RandomState(seed)
    # A permutation yields unique indices; sampling with replacement would
    # put duplicate rows in the test set and shrink its effective size.
    shuffled = rng.permutation(n_obs)
    test_indices = shuffled[:n_test]
    # Keep the training rows in their original order.
    train_indices = np.sort(shuffled[n_test:])
    Y = create_response_vector(data)
    X = create_design_matrix(data, english_words_set)
    X_train = X[train_indices]
    Y_train = Y[train_indices]
    X_test = X[test_indices]
    Y_test = Y[test_indices]
    return X_train, X_test, Y_train, Y_test

# Split the labelled e-mails into training and test subsets.
X_train, X_test, Y_train, Y_test = data_split(train_data)

# Fit a logistic regression on the training subset.
lr = LogisticRegression()
lr.fit(X_train, Y_train)

# Score on the training subset first, then on the held-out subset.
print(lr.score(X_train, Y_train))
print(lr.score(X_test, Y_test))

0.755232029117
0.771144278607
-3155
3297


0.23700571997015668
