In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Use talon to extract the email signatures as training corpus.

In [2]:
import talon
from talon import quotations
talon.init()
text = """
Dear Vince,

Thank you for registering for Henwood's ERCOT Symposium on January 23, 2001.
We are pleased to confirm your attendance.  Attached is a copy of the
program agenda.  As indicated, registration will begin at 9:30am.  The
program begins at 10:00am and runs until 3:00pm.  Lunch along with
refreshments will be provided. Demonstrations of Henwood's software
applications and eBusiness solutions will be available following the
workshop for interested parties.

Directions to the Hyatt Regency Houston are attached for your convenience.
Please do not hesitate to contact me with any questions or concerns that you
may have.

We look forward to seeing you in Houston!

Heather Mason
Marketing Assistant
Henwood Energy Services, Inc.
2710 Gateway Oaks Dr.
Suite 300 N
Sacramento, CA 95833
Phone: (916) 569-0985
Fax: (916) 569-0999
"""

reply = quotations.extract_from_plain(text)
from talon import signature
from talon.signature.bruteforce import extract_signature
text, signature = extract_signature(text)
print(text)

Dear Vince,

Thank you for registering for Henwood's ERCOT Symposium on January 23, 2001.
We are pleased to confirm your attendance.  Attached is a copy of the
program agenda.  As indicated, registration will begin at 9:30am.  The
program begins at 10:00am and runs until 3:00pm.  Lunch along with
refreshments will be provided. Demonstrations of Henwood's software
applications and eBusiness solutions will be available following the
workshop for interested parties.

Directions to the Hyatt Regency Houston are attached for your convenience.
Please do not hesitate to contact me with any questions or concerns that you
may have.

We look forward to seeing you in Houston!

Heather Mason
Marketing Assistant
Henwood Energy Services, Inc.
2710 Gateway Oaks Dr.
Suite 300 N
Sacramento, CA 95833
Phone: (916) 569-0985
Fax: (916) 569-0999


Talon results are not always good and can only apply certain well-formated cases. So pick the email signature corpus manually. ~~First, use the Python email package to extract the main body (including the signature). Then apply the talon to strip off the signature part.~~ First, manually extract the ground signatures from the top 100 emails of 'kaminski-v/conferences/', and save in one file (_EnronSignatures.txt_) separated by an empty line. Also the emails from Jeb Bush folder within file '12+December+2003+Public+2.txt', and save into one file '_JebBushSignatures.txt_'. 

Next, use the signature corpus to train the word vector to recognize the email signatures.

In [3]:
# read in the signature text file
fname = "EnronSignatures.txt"
fsig = open(fname, 'r')
sigList = []
sigText = []
for line in fsig.readlines():
    #print len(line)
    if len(line) <= 3:
        sigList.append(' '.join(sigText)) # combine all the texts of each signature
        sigText = []
    else:
        sigText.append(' '.join(line.lower().split()))
fsig.close()

print(sigList[:10])

['sincerely, thaleia zariphopoulou chair of the scientific committee v.n.neuhaus professor dpts of mathematics and msis the university of texas at austin', 'thank you, clare fitzgerald director, training courses marcus evans 312-540-3000x6785', 'joanna vidal events coordinator risk waters group t: (212) 925 1864 ext. 197 f: (212) 925 7585 jvidal@riskwaters.com www.riskwaters.com', 'duane seppi carnegie mellon university graduate school of industrial administrations pittsburgh, pa 15213-3890 t: 001 412-268-2298 f: 001 412-269-8896', 'helyette geman universite de paris dauphine finance department au de ka grand ecole corgy pontois, paris france 95021 t: 00 33 60-807-4200', 'vincent kaminski enron credit 1400 smith street room eb1962 houston, tx 77002-7361 t: 001 713-853-3848 f: 001 713-646-2503', 'peter nance teknecon, inc. 1515 s. capital of texas highway suite 101 austin, tx 78746 t: 001 512-732-7084 f: 001 512-732-7099', 'chris harris innogy holdings place windmill hill business park 

#### Use word2vec to train directly
from gensim import corpora, models, similarities
sigTexts = []
for term in sigList:
    sigTexts.append([word for word in term.split()])
model = models.word2vec.Word2Vec(sigTexts, min_count=1)
print(model.doesnt_match('thank you but'.split()))

On the other hand, extract the body(excluding email signatures) text from Enron dataset 'kaminski-v/conferences/', and use as negative training set. Try to make use of both talon and Python email packages.

import os
import re
import talon
from talon import quotations
from talon import signature
from talon.signature.bruteforce import extract_signature
import email
from email.parser import Parser
talon.init()
dirName = "conferences"
#filelist = [fn for fn in os.listdir(dirName) if fn != "."]
#print filelist
os.chdir(dirName)
fcontent = open("content.txt",'w')
for fe in range(1,101):
    fname = str(fe)+'.'
    if not os.path.exists(fname):
        continue
    fmail = open(fname, 'r')
    mailText = ''.join(fmail.readlines())
    content = email.message_from_string(mailText)
    if content is None:
        continue
    body = []
    if content.is_multipart():
        for payload in content.get_payload():
            body.append(payload.get_payload())
    else:
        body.append(content.get_payload())
    if body is None: # discard mail without body
        fmail.close()
        continue
    #body = ''.join(body)
    bodyText = []
    sigSymbols = ['-- forwarded by', 'sincerely', 'kind regards', 'thank you', 'thanks']
    for entry in body:
        findEnd = False
        for sym in sigSymbols:
            if sym in entry.lower():
                findEnd = True
        if findEnd:
            break
        bodyText.append(entry)
    reply = quotations.extract_from_plain(''.join(bodyText))
    texts, signature = extract_signature(reply)
    fcontent.write(texts)
    fcontent.write('\n')
    fmail.close()
fcontent.close()
os.chdir('../')

Now train the model using signature text as the positive dataset and body content text as the negative dataset.

In [7]:
import random
random.seed(0)
from gensim.models.doc2vec import LabeledSentence, Doc2Vec
import collections
import sklearn.naive_bayes
import sklearn.linear_model
import nltk
def load_data(path_to_data):
    train_pos = []
    train_neg = []
    sigwords = []
    with open(path_to_data+"EnronSignatures.txt",'r') as fes:
        for line in fes:
            if len(line) <= 3:
                train_pos.append((' '.join(sigwords)).split()) # combine all the texts of each signature
                sigwords = []
            else:
                sigwords.append(' '.join(line.lower().strip().split()))
    with open(path_to_data+"train_content.txt",'r') as ftc:
        for line in ftc:
            words = [w.lower() for w in line.strip().split()]
            if len(words) < 2:
                continue
            train_neg.append(words)
    return train_pos, train_neg

train_pos, train_neg = load_data('./')
print train_pos[:10]
print train_neg[:10]

[['sincerely,', 'thaleia', 'zariphopoulou', 'chair', 'of', 'the', 'scientific', 'committee', 'v.n.neuhaus', 'professor', 'dpts', 'of', 'mathematics', 'and', 'msis', 'the', 'university', 'of', 'texas', 'at', 'austin'], ['thank', 'you,', 'clare', 'fitzgerald', 'director,', 'training', 'courses', 'marcus', 'evans', '312-540-3000x6785'], ['joanna', 'vidal', 'events', 'coordinator', 'risk', 'waters', 'group', 't:', '(212)', '925', '1864', 'ext.', '197', 'f:', '(212)', '925', '7585', 'jvidal@riskwaters.com', 'www.riskwaters.com'], ['duane', 'seppi', 'carnegie', 'mellon', 'university', 'graduate', 'school', 'of', 'industrial', 'administrations', 'pittsburgh,', 'pa', '15213-3890', 't:', '001', '412-268-2298', 'f:', '001', '412-269-8896'], ['helyette', 'geman', 'universite', 'de', 'paris', 'dauphine', 'finance', 'department', 'au', 'de', 'ka', 'grand', 'ecole', 'corgy', 'pontois,', 'paris', 'france', '95021', 't:', '00', '33', '60-807-4200'], ['vincent', 'kaminski', 'enron', 'credit', '1400', '

Extract features for training process using gensim.

In [13]:
def feature_extraction(train_pos, train_neg):
    labeled_train_pos = []
    for index, words in enumerate(train_pos):
        sentence = LabeledSentence(words, ["TRAIN_POS_%s"%index])
        labeled_train_pos.append(sentence)
    labeled_train_neg = []
    for index, words in enumerate(train_neg):
        sentence = LabeledSentence(words, ["TRAIN_NEG_%s"%index])
        labeled_train_neg.append(sentence)
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
    sentences = labeled_train_pos + labeled_train_neg
    model.build_vocab(sentences)
    for i in range(5):
        print "Training iteration %d" %(i)
        random.shuffle(sentences)
        model.train(sentences)
    train_pos_vec, train_neg_vec = [], []
    for index in range(len(labeled_train_pos)):
        doc_vec = model.docvecs["TRAIN_POS_%s"%index]
        train_pos_vec.append(doc_vec)
    for index in range(len(labeled_train_neg)):
        doc_vec = model.docvecs["TRAIN_NEG_%s"%index]
        train_neg_vec.append(doc_vec)
    return train_pos_vec, train_neg_vec

train_pos_vec, train_neg_vec = feature_extraction(train_pos, train_neg)

Training iteration 0
Training iteration 1
Training iteration 2
Training iteration 3
Training iteration 4


Use the transformed vectors to build model.

In [14]:
def build_model(train_pos_vec, train_neg_vec):
    Y = ["pos"]*len(train_pos_vec) + ["neg"]*len(train_neg_vec)
    X = train_pos_vec + train_neg_vec
    lr_model = sklearn.linear_model.LogisticRegression()
    lr_model.fit(X,Y)
    return lr_model

lr_model = build_model(train_pos_vec, train_neg_vec)

Evaluate the model using the training (testing) data by presenting the confusion matrix.

In [16]:
def evaluate_model(model, test_pos_vec, test_neg_vec, print_confusion=False):
    test_pos_predict = model.predict(test_pos_vec)
    test_neg_predict = model.predict(test_neg_vec)
    test_pos_Y = ["pos"]*len(test_pos_vec)
    test_neg_Y = ["neg"]*len(test_neg_vec)
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(test_pos_predict)):
        if test_pos_predict[i] == test_pos_Y[i]:
            tp += 1
        else:
            fn += 1
    for i in range(len(test_neg_predict)):
        if test_neg_predict[i] == test_neg_Y[i]:
            tn += 1
        else:
            fp += 1
    accuracy = float(tp+tn) / float(tp+tn+tp+tn)
    precision = float(tp) / float(tp+fp)
    recall = float(tp) / float(tp+fn)
    if print_confusion:
        print "predicted:\tpos\tneg"
        print "actual:"
        print "pos\t\t%d\t%d" % (tp, fn)
        print "neg\t\t%d\t%d" % (fp, tn)
    print "accuracy: %f" % (accuracy)
    print "precision: %f" % (precision)
    print "recall: %f" % (recall)
    
evaluate_model(lr_model, train_pos_vec, train_neg_vec, True)

predicted:	pos	neg
actual:
pos		0	45
neg		0	338
accuracy: 0.500000
