In [90]:
import csv
import sys
import time
from email.utils import parsedate

csv.field_size_limit(sys.maxsize)

def _split_addresses(field):
    """Split a space-separated address field into a list, dropping empty tokens."""
    return [token for token in field.strip().split(' ') if token.strip() != '']


def get_emails_from_file(email_file):
    """Parse a tab-separated email dump into a dict keyed by Message-ID.

    Parameters
    ----------
    email_file : iterable of str
        An open text file (or any iterable of lines) whose header row names
        the columns Message-ID, From, To, Cc, Subject, In-Reply-To, Date, Body.

    Returns
    -------
    dict
        Maps each Message-ID to a dict with the keys 'ID', 'From', 'To',
        'Cc', 'Subject', 'In-Reply-To', 'Date' and 'Body'. 'To' and 'Cc' are
        lists of strings; 'Date' is a float timestamp, or None when the
        header cannot be parsed.
    """
    # Read from the argument. Reading from a global handle only works while
    # the caller happens to have a variable with that exact name in scope.
    reader = csv.DictReader(email_file, delimiter='\t')
    emails_converted = {}
    for line in reader:
        # Strip the ID like In-Reply-To is stripped, so that a reply's
        # In-Reply-To value can be used directly as a key into the result.
        message_id = line['Message-ID'].strip()
        # NOTE(review): parsedate() discards the UTC offset and time.mktime()
        # interprets the tuple as local time, so timestamps of messages sent
        # from different zones may not be directly comparable. Consider
        # parsedate_tz()/mktime_tz() if that matters for the analysis.
        parsed_date = parsedate(line['Date'].strip())
        emails_converted[message_id] = {
            'ID': message_id,
            'From': line['From'].strip(),
            'To': _split_addresses(line['To']),
            'Cc': _split_addresses(line['Cc']),
            'Subject': line['Subject'].strip(),
            'In-Reply-To': line['In-Reply-To'].strip(),
            'Date': time.mktime(parsed_date) if parsed_date else None,
            'Body': line['Body'].strip(),
        }
    return emails_converted

# Parse the dump. The file only has to stay open while it is being read, so
# the linking pass below runs after the handle is closed.
with open('emails.tsv', 'r') as f:
    emails = get_emails_from_file(f)

# Collect the responder's replies and link every replied-to email to the
# message that answered it.
response_emails = []
for message_id, message in emails.items():
    # `message_id` rather than `id`: a module-level `id` would shadow the
    # builtin for every later cell in the kernel session.
    parent_id = message['In-Reply-To']
    if message['From'] == 'joseph_engelman@brown.edu' and parent_id != '':
        response_emails.append(message)
    if parent_id != '' and parent_id in emails:
        # NOTE(review): when several messages reply to the same parent, the
        # last one visited wins, so the recorded follow-up depends on dict
        # iteration order. Confirm that is acceptable for labelling.
        emails[parent_id]['Followed-By'] = message_id

print("Found %d email responses." % len(response_emails))
print("Found %d total emails." % len(emails))

Found 677 email responses.
Found 21397 total emails.


In [91]:
from collections import defaultdict

def label_for_email(email, email_index=None, responder='joseph_engelman@brown.edu'):
    """Return how long `responder` took to answer `email`, in hours.

    Parameters
    ----------
    email : dict
        A parsed email. Its optional 'Followed-By' key holds the ID of the
        message that replied to it.
    email_index : dict, optional
        Maps message IDs to parsed emails. Defaults to the module-level
        `emails` dict.
    responder : str, optional
        Only a follow-up sent from this address yields a label.

    Returns
    -------
    float or None
        Absolute time difference between the email and its follow-up in
        hours. None when there is no follow-up, the follow-up is unknown or
        was sent by someone else, or either message has no parsed date.
    """
    if email_index is None:
        email_index = emails
    if 'Followed-By' not in email:
        return None
    followup = email_index.get(email['Followed-By'])
    if followup is None or followup['From'] != responder:
        return None
    sent, replied = email['Date'], followup['Date']
    # 'Date' is None when the header could not be parsed; subtracting would
    # raise TypeError, so such pairs are treated as unlabeled.
    if sent is None or replied is None:
        return None
    # abs(): the stored timestamps are not guaranteed to be ordered.
    return abs(replied - sent) / 3600.  # seconds -> hours

def get_avg_response_times(emails):
    """Compute the mean response time, in hours, for each sender.

    Parameters
    ----------
    emails : iterable of dict
        Parsed emails; each is labelled with label_for_email().

    Returns
    -------
    dict
        Maps a sender address to the average of the labels of that sender's
        emails. Senders with no labelled email are absent.
    """
    hours_by_sender = defaultdict(list)
    for email in emails:
        hours = label_for_email(email)
        # Compare against None explicitly: a truthiness test would also
        # discard a legitimate 0.0-hour response.
        if hours is None:
            continue
        hours_by_sender[email['From']].append(hours)
    return {
        sender: float(sum(hours)) / float(len(hours))
        for sender, hours in hours_by_sender.items()
    }

In [113]:
# Features: avg-sender-response-time, num-recipients, day-of-week, time-of-day, length-of-body, sent-time
# Label: time-to-respond (hours)

from datetime import datetime
import requests
import json

# Mean response time per sender, computed over every parsed email.
avg_response_times = get_avg_response_times(emails.values())
# Debug aid (left disabled): print each sender's average.
#for a in avg_response_times:
#    print a, ":", avg_response_times[a]

def features_for_email(email):
    """Build the feature vector for one email.

    Returns a list ordered as [sender's average response time, recipient
    count, weekday, shifted time of day, body length, sent timestamp], or
    None when the email has no usable date or its sender has no entry in
    avg_response_times.
    """
    timestamp = email['Date']
    if not timestamp:
        return None
    sent_at = datetime.fromtimestamp(timestamp)

    weekday = sent_at.weekday()

    # Fractional hour of the day, rotated so that 06:00 becomes 0 and the
    # hours before 06:00 land at the end of the range, then negated.
    clock_hours = sent_at.hour + sent_at.minute / 60. + sent_at.second / 3600.
    if clock_hours < 6.:
        shifted_hours = clock_hours + 18.
    else:
        shifted_hours = clock_hours - 6.
    shifted_hours = -shifted_hours

    recipient_count = len(email['To']) + len(email['Cc'])
    body_length = len(email['Body'])
    sent_time = float(timestamp)

    sender = email['From']
    if sender not in avg_response_times:
        return None
    return [
        avg_response_times[sender],
        recipient_count,
        weekday,
        shifted_hours,
        body_length,
        sent_time,
    ]

# Position of sent_time inside the list returned by features_for_email().
SENT_TIME_INDEX = 5

labeled_samples = []

for e in emails.values():
    # The responder's own emails are never training inputs.
    if e['From'] == 'joseph_engelman@brown.edu':
        continue
    features = features_for_email(e)
    if not features:
        continue

    label = label_for_email(e)
    # Compare against None explicitly: a truthiness test would also drop a
    # legitimate 0.0-hour response time.
    if label is not None:
        labeled_samples.append((features, label))

# Oldest first, so the weights below grow with recency.
labeled_samples.sort(key=lambda sample: sample[0][SENT_TIME_INDEX])
X = [ls[0] for ls in labeled_samples]
y = [ls[1] for ls in labeled_samples]
length = len(y)
# Linear ramp over [0, 1).
# NOTE(review): the oldest sample gets weight exactly 0; confirm that
# excluding it from a weighted fit is intended.
weights = [i / float(length) for i in range(0, length)]
print(len(y))
print(len(weights))

print("Done.")

587
587
Done.


In [115]:
from sklearn.linear_model import LinearRegression 
from sklearn import cross_validation

lr = LinearRegression()
# Pass the weights by keyword. A positional third argument binds to whatever
# fit()'s third parameter happens to be in the installed release.
# NOTE(review): confirm the installed scikit-learn's LinearRegression.fit
# accepts sample_weight; older releases did not.
lr.fit(X, y, sample_weight=weights)

# NOTE(review): cross_val_score is not given `weights`, so the scores below
# describe unweighted fits rather than the weighted model in `lr`. Pass them
# through fit_params if the two should match.
lrScores = cross_validation.cross_val_score(lr, X, y)

print(lrScores)
# With no `scoring` argument the estimator's own score() is used, which for
# a regressor is R^2, not a classification accuracy.
print("R^2: %0.2f (+/- %0.2f)" % (lrScores.mean(), lrScores.std() * 2))

[ 0.71875458  0.31086681  0.28920894]
Accuracy: 0.44 (+/- 0.40)


In [96]:
# Spot-check one (original email, response) pair against the fitted model.
idx = 20
resp = response_emails[idx]
orig = emails[resp['In-Reply-To']]

print("Original: %s" % (orig,))
# print "Response:", resp

# Use a dedicated name: rebinding `f` here would clobber the file-handle
# variable used by the loading cell.
orig_features = features_for_email(orig)
print("Features: %s" % (orig_features,))
print("Label: %s" % (label_for_email(orig),))
if orig_features is None:
    print("Prediction: n/a (no features for this email)")
else:
    # predict() takes a 2-D array of samples, so wrap the single vector.
    print("Prediction: %s" % (lr.predict([orig_features])[0],))

Original: {'Body': 'Hey guys!  *Football vs. Holy Cross */When:/ Saturday, October 11 /Calltime:/ 8:30AM /Uniform:/ KPB /Practice Field (in case you arrive late/are confused about where the  practice fields actually are):/ Berylson Family Fields  <https://www.google.com/maps/place/Berylson+Family+Fields/@41.8309209,-71.395243,16z/data=%214m2%213m1%211s0x0:0xa3156b139970afa6> Unfortunately, the Holy Cross "Good Time" Marching Band will not be at  the game. But, it will be fun! Football!!!!  *Princeton Scriptwriting and Song Learning */When (Scriptwriting):/ this Friday, 4:00PM at Andrew\\\'s Commons /When (Song Learning):/ this Friday, 7:00PMish at Historian Zoe\\\'s house  (the house that party was at, you know, the party, you remember that,  right, the party?) Feel free to drop by either or both of these! Scriptwriting is as usual,  but afterwards, we\\\'ll be heading up to Zoe\\\'s house to learn some Band  songs (you know, the ones we sing on the bus, you know, those ones, with  all



In [95]:
# Show each fitted coefficient to ten decimal places, one per line.
for coefficient in lr.coef_:
    print('{0:.10f}'.format(coefficient))

0.9949950976
0.2904772434
0.0582510735
0.4749926871
0.0021069264
-0.2113185300
-0.8603029850


In [116]:
from sklearn.externals import joblib

# Serialize the fitted model to disk; the value returned by dump() (the list
# of files written) is left as the cell's last expression so it is displayed.
joblib.dump(lr, 'lr.pkl')


['lr.pkl', 'lr.pkl_01.npy', 'lr.pkl_02.npy']

In [119]:
import pickle

# Persist the per-sender averages. The context manager flushes and closes
# the file; an inline open() would leave the handle to the garbage collector.
with open("art.pkl", "wb") as art_file:
    pickle.dump(avg_response_times, art_file)