# Loading up the data

In [3]:
import pandas as pd
import sqlite3
from collections import Counter
from sklearn.linear_model import LinearRegression, SGDClassifier,LogisticRegression, RandomizedLogisticRegression
from sklearn.metrics import classification_report, fbeta_score, accuracy_score
import numpy as np
from textblob import TextBlob
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier 

These data were run using the `make_dataset.py` script in `../src/data`

In [25]:
conn = sqlite3.connect("/home/ic/indices/sanders-r.sqlite3")
df = pd.read_sql_query("SELECT * FROM emails", conn)
#import csv
#with open('oracletest.csv', 'w') as f:
#    writer = csv.writer(f, delimiter=',')
#    for i in range(len(df)):
#        row = [df['body'][i], df['did_reply'][i]]
#        writer.writerow(row)
        

In [26]:
# How many times did we respond, out of how many rows there are total?
len(df.query('did_reply==1')) / len(df)

0.06631738590927892

# Oracle Testing

In [47]:
y_true = df['did_reply'][:101].values
true_indices = np.where(y_true)[0]

One of my friends went through all of the emails, and marked which ones she thought I was likely to respond to.

In [48]:


oracle_indices = [1,2,7,10,12,15,19,20,23,31,33,34,38,40,44,47,51,57,58,66,81,86,87,88,91,92,93,95,98,100]

y_oracle = np.zeros_like(y_true)
for i in oracle_indices:
    y_oracle[i] = 1

She was correct for four of the emails.

In [49]:
set(true_indices).intersection(oracle_indices)

{20, 58}

In [51]:
print(classification_report(y_true, y_oracle, target_names=['no reply', 'reply']))
print('f_2 = %s' % fbeta_score(y_true, y_oracle, 2, labels=['no reply', 'reply'], pos_label=1))

             precision    recall  f1-score   support

   no reply       0.86      0.69      0.76        89
      reply       0.07      0.17      0.10        12

avg / total       0.76      0.62      0.68       101

f_2 = 0.1282051282051282


# Baseline Testing

Can a machine doe better? Let's cut the dataset into a training slice (first 100 samples) and a validation slice (last 100 samples)

In [16]:
train_slice = slice(0, 1000)
test_slice = slice(1000, 1500)


Define a feature vector based on whether the sender is somebody who we have replied to in the past.

In [17]:
favorites = list(Counter(df.iloc[train_slice].query('did_reply == 1')['sender'].values).keys())
len(favorites)

def featureExtractor(row):
    favorites_vector = [favorite in row['sender'] for favorite in favorites]
    return favorites_vector

def label(row):
    return row['did_reply']

X = np.array([featureExtractor(row) for idx, row in df.iterrows()])
y_true = np.array([label(row) for idx, row in df.iterrows()])

KeyError: 'sender'

Machine learning magic happens here:

In [27]:
model = LinearRegression()
model.fit(X[train_slice], y_true[train_slice])
y_pred = model.predict(X[test_slice]) > .5

In [28]:
print(classification_report(y_true[test_slice], y_pred, target_names=["no reply", "reply"]))
print('f_2 = %s' % fbeta_score(y_true[test_slice], y_pred, 2, labels=['no reply', 'reply'], pos_label=1))
print(accuracy_score(y_true[test_slice], y_pred))  

             precision    recall  f1-score   support

   no reply       0.72      0.94      0.82       339
      reply       0.65      0.23      0.33       160

avg / total       0.70      0.71      0.66       499

f_2 = 0.258992805755
0.713426853707


Additional Features

In [29]:
#Favorites extractor from before
def favoritesExtractor(row):
    favorites_vector = [favorite in row['sender'] for favorite in favorites]
    return favorites_vector

#Perform rudimentary sentiment analysis on entire message body
def sentimentBodyExtractor(row):
    if row['subject'] is not None:
        
        polarity, subjectivity = TextBlob(row['body']).sentiment
    else:
        polarity, subjectivity = 0,0
        
    return [polarity,subjectivity]

#Perform rudimentary sentiment analysis on entire message subject
def sentimentSubjectExtractor(row):
    if row['subject'] is not None:
        
        polarity, subjectivity = TextBlob(row['subject']).sentiment
    else:
        polarity, subjectivity = 0,0
        
    return [polarity,subjectivity]

#Count occurences of keywords
def keywordExtractor(row):
    keywords = ["please", "response", "request", "reply", "when", "thanks", "?", ]
    score = 0
    text = TextBlob(row['body'])
    for word in text.words:
        if str(word).lower() in keywords:
            score += 1
    return [score]


#Determine whether there exists more than one recipient
def recipientsExtractor(row):
    feature_vector = [row['multiple_recipients'] == 1]
    return feature_vector

#length of body
def lengthExtractor(row):
    return [len(row['body'])]

#Determine if "Re: " appears in subject line
def reExtractor(row):
    re_vector = [row['re_in_subject'] == 1]
    return  re_vector

#Determine if "Fwd: " appears in subject line
def fwdExtractor(row):
    fw_vector = [row['fw_in_subject'] == 1]
    return fw_vector

Run more machine learning magic, using additional features

In [32]:
X = np.array([favoritesExtractor(row)
              + lengthExtractor(row)
              + recipientsExtractor(row)
              + reExtractor(row)
              + fwdExtractor(row)
              + sentimentBodyExtractor(row)
              + sentimentSubjectExtractor(row)
              + keywordExtractor(row)
              for idx, row in df.iterrows()])
y_true = np.array([label(row) for idx, row in df.iterrows()])
print(len(X[test_slice]))
print(len(y_true[test_slice]))
print("Linear Regressor")
model = LinearRegression()
model.fit(X[train_slice], y_true[train_slice])
y_pred = model.predict(X[test_slice]) > .5
print(classification_report(y_true[test_slice], y_pred, target_names=["no reply", "reply"]))
print('f_2 = %s' % fbeta_score(y_true[test_slice], y_pred, 2, labels=['no reply', 'reply'], pos_label=1))
print(accuracy_score(y_true[test_slice], y_pred))  

100
100
Linear Regressor
             precision    recall  f1-score   support

   no reply       0.79      0.79      0.79        72
      reply       0.46      0.46      0.46        28

avg / total       0.70      0.70      0.70       100

f_2 = 0.464285714286
0.7


With multiple features, performance is about the same. Let's use a different classifier that is more suitable to the number of features:

Performance is degraded when using a new classifier. Probably for reasons of insufficient data. Lets increase the slice sizes (the more_features branch has modified the download script to download more emails):

In [33]:
train_slice = slice(0, 1500)
test_slice = slice(1500, 2000)

In [34]:
X = np.array([favoritesExtractor(row)
               + lengthExtractor(row)
               + recipientsExtractor(row)
               + reExtractor(row)
               + fwdExtractor(row)
               + sentimentBodyExtractor(row)
               + sentimentSubjectExtractor(row)
               + keywordExtractor(row)
              for idx, row in df.iterrows()])
y_true = np.array([label(row) for idx, row in df.iterrows()])
model = RandomForestClassifier()  
print(len(X[train_slice]), len(X[test_slice]))
print("MLPClassifier")
#model = MLPClassifier(solver='relu', learning_rate='adaptive', hidden_layer_sizes=(100,100,), verbose=True, early_stopping=False, max_iter=2000)
model.fit(X[train_slice], y_true[train_slice])
#print("Features sorted by their score:")
#print(sorted(zip(map(lambda x: round(x, 4), model.feature_importances_), X[train_slice]), reverse=True))
y_pred = model.predict(X[test_slice]) > .5
print(classification_report(y_true[test_slice], y_pred, target_names=["no reply", "reply"]))
print('f_2 = %s' % fbeta_score(y_true[test_slice], y_pred, 2, labels=['no reply', 'reply'], pos_label=1))
print(accuracy_score(y_true[test_slice], y_pred))  

1500 499
MLPClassifier
             precision    recall  f1-score   support

   no reply       0.74      0.89      0.81       339
      reply       0.59      0.33      0.42       160

avg / total       0.69      0.71      0.68       499

f_2 = 0.357142857143
0.711422845691


The machine learning algorithm seems to work much better than the human.