In [27]:
import json
import pandas as pd
import numpy as np
import email as em

# Leo los mails (poner los paths correctos).
ham_txt = json.load(open('./data/ham_dev.json'))
spam_txt = json.load(open('./data/spam_dev.json'))

txt = ham_txt + spam_txt
output = []

for (nro, text) in enumerate(txt):
    current = {}

    if nro <= len(ham_txt):
        # HAM es -1
        current['class'] = -1
    else:
        # SPAM es 1
        current['class'] = 1

    current['email'] = text
    output.append(current)

# This is the merged, created dataset
df = pd.DataFrame(output)

# Split and save holdout and training data
# We hold out about 10% of the data
mask = np.random.rand(len(df)) < 0.8

development = df[mask]
development.to_msgpack('./data/development.msg')

holdout = df[~mask]
holdout.to_msgpack('./data/holdout.msg')

In [28]:
development

Unnamed: 0,class,email
0,-1,message-id: <13247446.1075856567056.javamail.e...
1,-1,message-id: <2011334.1075840007235.javamail.ev...
2,-1,message-id: <18606548.1075840884320.javamail.e...
3,-1,message-id: <31974158.1075845117294.javamail.e...
4,-1,message-id: <5265286.1075840899541.javamail.ev...
6,-1,message-id: <3906275.1075857067358.javamail.ev...
7,-1,message-id: <8305802.1075863310355.javamail.ev...
8,-1,message-id: <2430584.1075857027581.javamail.ev...
9,-1,message-id: <28740831.1075840828087.javamail.e...
11,-1,message-id: <15795469.1075840776586.javamail.e...


In [29]:
# Load up development dataset
import pandas as pd
import email

print("Loading data")
df = pd.read_msgpack('./data/development.msg', encoding='latin-1')
df['email'] = df['email'].apply(email.message_from_string)

Loading data


In [None]:
import pandas as pd
import re
import multiprocessing
import collections

def generate_content_types(row):
    email = row['email']
    output = collections.defaultdict(False)
    check = ['x-world', 'application', 'text', 'text/plain', 'text/html', 'video', 'audio', 'image', 'drawing', 'model', 'multipart', 'x-conference', 'i-world', 'music', 'message', 'x-music', 'www', 'chemical', 'paleovu', 'windows', 'xgl']
    
    for part in email.walk():
        ct = part.get_content_type()
        
        for kind in check:
            output['has_' + kind] |= ct.startswith(kind)
    
    return output

def generate_number_of_spaces(row):
    email = str(row['email'])
    
    return {
        'spaces': email.count(' '),
        'newlines': email.count('\n')
    }

def number_of_images(row):
    email = row['email']
    output = { 'multipart_number': 0, 'number_of_images': 0 }
    rgx = re.compile('\.(jpeg|jpg|png|gif|bmp)')
    
    for part in email.walk():
        output['multipart_number'] += 1
        
        if part.get_content_type().startswith('image/'):
            output['number_of_images'] += 1
        elif part.get_content_type() == 'text/html' or part.get_content_type() == 'text/plain':
            output['number_of_images'] += len(re.findall(rgx, part.get_payload()))
    
    return output

def 

# Functions which create the output features
transforms = [
    lambda row: {'class': row['class']},
    lambda row: {'length': len(row['email'])},
    generate_multipart_number,
    generate_content_types,
    generate_number_of_spaces,
    number_of_images]

# Set up thread pool
def transform_row(x):
    (index, row) = x
    current = {}
    
    for function in transforms:
        current.update(function(row))
    
    return current

print("Processing")
pool = multiprocessing.Pool(20)

# Create dataframe
transformed = pool.map(transform_row, df.iterrows())

print("Done!")

try:
    del ds
except:
    pass

ds = pd.DataFrame(transformed)

Processing


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix

dataset = ds[[x for x in ds.columns if x != 'class']].values
labels = ds['class'].apply(lambda x: x == 'spam')

model = DecisionTreeClassifier()
res = cross_val_score(model, dataset, labels, cv=10, scoring='roc_auc')
print(np.mean(res), np.std(res))

In [None]:
import collections

c = collections.Counter()

for row in df[df['class'] == 'spam']['email']:
    for entry in row.walk():
        c[entry.get_content_type()] += 1
        
c.most_common(50)

In [4]:
# Calculate number people in xbcc xto, ...
import numpy as np

def get_receiver_numbers(row):
    def normalize(contacts):
        if pd.isnull(contacts):
            return []
        else:
            contacts = str(contacts)
            contacts = ''.join([x for x in contacts if x not in ['#', '\n', '\t', '\r']])
            contacts = contacts.split(',')
            return [c for c in contacts if c != '']
    
    check = ['to', 'x-to', 'from', 'x-from', 'cc', 'x-cc', 'bcc', 'x-bcc']
    output = {}
    
    for header in check:
        output[header] = 0
    
    for header in row['email'].keys():
        header = header.lower()
        
        if header in check:
            output[header] = len(normalize(row['email'][header]))
    
    return output

In [3]:
df['email'][0].keys()

['message-id',
 'date',
 'from',
 'to',
 'subject',
 'cc',
 'mime-version',
 'content-type',
 'content-transfer-encoding',
 'bcc',
 'x-from',
 'x-to',
 'x-cc',
 'x-bcc',
 'x-origin',
 'x-filename']

In [7]:
df['email'].apply(lambda x: x.get_subject())

AttributeError: 'Message' object has no attribute 'get_subject'

In [1]:
import pandas

df = pandas.read_msgpack('./data/processed.msg')

In [2]:
df

Unnamed: 0,class,has_application,has_audio,has_chemical,has_drawing,has_i-world,has_image,has_message,has_model,has_multipart,...,people_in_cc,people_in_from,people_in_to,people_in_x-bcc,people_in_x-cc,people_in_x-from,people_in_x-to,spaces,title_case_words_to_words_ratio,upper_case_letters_to_letters_ratio
0,-1,False,False,False,False,False,False,False,False,False,...,2,1,1,0,2,1,1,129,0.000000,0.000000
1,-1,False,False,False,False,False,False,False,False,False,...,-1,1,1,0,0,2,1,698,0.000000,0.000000
2,-1,False,False,False,False,False,False,False,False,False,...,4,1,8,0,8,2,16,223,0.000000,0.000000
3,-1,False,False,False,False,False,False,False,False,False,...,-1,1,1,0,0,2,2,167,0.000000,0.000000
4,-1,False,False,False,False,False,False,False,False,False,...,-1,1,3,0,0,1,3,409,0.000000,0.000000
5,-1,False,False,False,False,False,False,False,False,False,...,4,1,1,0,4,1,1,207,0.000000,0.000000
6,-1,False,False,False,False,False,False,False,False,False,...,-1,1,1,0,0,2,1,56,0.000000,0.000000
7,-1,False,False,False,False,False,False,False,False,False,...,3,1,2,0,3,1,2,1355,0.000000,0.000000
8,-1,False,False,False,False,False,False,False,False,False,...,1,1,3,0,2,2,6,231,0.000000,0.000000
9,-1,False,False,False,False,False,False,False,False,False,...,-1,1,1,0,0,1,1,505,0.000000,0.000000


In [32]:
import sklearn.tree
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix
import numpy as np

dataset = df[[x for x in df.columns if x != 'class']].values
labels = df['class'].apply(lambda x: x == 1)
model = sklearn.tree.DecisionTreeClassifier()
res = cross_val_score(model, dataset, labels, cv=10, scoring='accuracy')
print(np.mean(res), np.std(res))

0.973493614746 0.0201790309039


In [31]:

model = sklearn.tree.DecisionTreeClassifier()
model = model.fit(dataset, labels)
sklearn.tree.export_graphviz(model.tree_)

In [33]:
res

array([ 0.99888951,  0.99930594,  0.99944475,  0.99458559,  0.96126076,
        0.9583449 ,  0.9566787 ,  0.95362399,  0.9555679 ,  0.9572341 ])