# Loading up the data

In [1]:
import pandas as pd
import sqlite3
from collections import Counter
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, fbeta_score
import numpy as np

These data were generated by running the `make_dataset.py` script in `../src/data`.

In [2]:
# Open the raw SQLite file and load every email from the 'received' folder.
conn = sqlite3.connect("../data/raw/p-proposal.sqlite3")
df = pd.read_sql_query(
    sql="SELECT * FROM emails WHERE folder='received'",
    con=conn,
)

In [3]:
# Reply rate: number of rows with did_reply == 1 divided by the total row count.
n_replied = int((df['did_reply'] == 1).sum())
n_replied / len(df)

0.14

# Oracle Testing

In [4]:
# Ground-truth reply flags as an array, and the row positions where the flag is non-zero.
y_true = df.loc[:, 'did_reply'].values
true_indices = np.flatnonzero(y_true)

One of my friends went through all of the emails, and marked which ones she thought I was likely to respond to.

In [5]:
# Row positions the human "oracle" marked as likely to receive a reply.
oracle_indices = [11, 31, 39, 51, 63, 68, 74, 78, 120, 122, 131, 133, 134, 138, 145, 159, 179]
# Start from zeros shaped like y_true, then switch on every marked position at once.
y_oracle = np.zeros_like(y_true)
y_oracle[oracle_indices] = 1

She was correct for four of the emails.

In [6]:
# Positions the oracle marked that also appear among the true reply positions.
replied_positions = set(true_indices)
{idx for idx in oracle_indices if idx in replied_positions}

{63, 131, 133, 179}

In [7]:
# Per-class precision / recall / F1 of the oracle's guesses against the true labels.
print(classification_report(y_true, y_oracle, target_names=['no reply', 'reply']))
# F-beta with beta=2 for the positive class (label value 1).
# `beta` is passed by keyword because scikit-learn declares it keyword-only.
# `labels` is omitted: it takes label values, not display names, and is not
# used for the default binary averaging.
print('f_2 = %s' % fbeta_score(y_true, y_oracle, beta=2, pos_label=1))

             precision    recall  f1-score   support

   no reply       0.87      0.92      0.90       172
      reply       0.24      0.14      0.18        28

avg / total       0.78      0.81      0.80       200

f_2 = 0.15503875968992248


# Baseline Testing

Can a machine do better? Let's cut the dataset into a training slice (first 100 samples) and a validation slice (last 100 samples).

In [8]:
# First SPLIT_AT rows are used for training, the following SPLIT_AT rows for evaluation.
SPLIT_AT = 100
train_slice, test_slice = slice(0, SPLIT_AT), slice(SPLIT_AT, 2 * SPLIT_AT)


Define a feature vector based on whether the sender is somebody who we have replied to in the past.

In [9]:
# Distinct senders of training-slice emails that were replied to, in first-seen order.
replied_train = df.iloc[train_slice].query('did_reply == 1')
favorites = list(dict.fromkeys(replied_train['sender'].values))
len(favorites)

def featureExtractor(row):
    """Build one boolean per entry of the module-level `favorites` list.

    Each entry is the result of `favorite in row['sender']`, in the same
    order as `favorites`.

    NOTE(review): when both operands are strings, `in` is a substring test,
    so a favorite contained inside a longer sender string also yields True.
    Confirm that this (rather than equality) is intended.
    """
    matches = []
    for favorite in favorites:
        matches.append(favorite in row['sender'])
    return matches

def label(row):
    """Return the training target for one row: the value stored under 'did_reply'."""
    did_reply = row['did_reply']
    return did_reply

# Walk the frame once, then derive the feature matrix and the label vector
# from the same sequence of rows.
rows = [row for _, row in df.iterrows()]
X = np.array(list(map(featureExtractor, rows)))
y_true = np.array(list(map(label, rows)))

Machine learning magic happens here:

In [10]:
# Fit a linear regression on the training slice, then call anything scored
# above 0.5 on the held-out slice a predicted reply.
model = LinearRegression()
model.fit(X[train_slice], y_true[train_slice])
scores = model.predict(X[test_slice])
y_pred = scores > 0.5

In [11]:
# Per-class precision / recall / F1 of the model's predictions on the held-out slice.
print(classification_report(y_true[test_slice], y_pred, target_names=["no reply", "reply"]))
# F-beta with beta=2 for the positive class (label value 1).
# `beta` is passed by keyword because scikit-learn declares it keyword-only.
# `labels` is omitted: it takes label values, not display names, and is not
# used for the default binary averaging.
print('f_2 = %s' % fbeta_score(y_true[test_slice], y_pred, beta=2, pos_label=1))

             precision    recall  f1-score   support

   no reply       0.90      0.95      0.93        85
      reply       0.60      0.40      0.48        15

avg / total       0.85      0.87      0.86       100

f_2 = 0.42857142857142866


# Additional Features

In [None]:
#Favorites extractor from before
def favoritesExtractor(row):
    """Return, for each entry of the module-level `favorites`, whether it is `in` row['sender'].

    The result is a list of booleans in the same order as `favorites`.
    """
    return list(map(lambda favorite: favorite in row['sender'], favorites))

#Perform rudimentary sentiment analysis on entire message body
def sentimentExtractor(row):
    """Return a two-element list with the first two fields of TextBlob's sentiment for row['body'].

    In TextBlob's `Sentiment` namedtuple these fields are polarity (index 0)
    and subjectivity (index 1), so the result is `[polarity, subjectivity]`.

    NOTE(review): `TextBlob` is not imported in any cell visible here; the
    imports cell needs `from textblob import TextBlob` for this to run on a
    fresh kernel.
    """
    # Analyse the body once and reuse the result for both features.
    sentiment = TextBlob(row['body']).sentiment
    polarity, subjectivity = sentiment[0], sentiment[1]
    return [polarity, subjectivity]

#Flag rows whose 'multiple_recipients' column is set
def recipientsExtractor(row):
    """Return a one-element list: True when row['multiple_recipients'] equals 1."""
    has_multiple = row['multiple_recipients'] == 1
    return [has_multiple]

#Flag rows whose 're_in_subject' column is set (original note: "Re: " appears in subject line)
def reExtractor(row):
    """Return a one-element list: True when row['re_in_subject'] equals 1."""
    return [row['re_in_subject'] == 1]

#Flag rows whose 'fw_in_subject' column is set (original note: "Fwd: " appears in subject line)
def fwdExtractor(row):
    """Return a one-element list: True when row['fw_in_subject'] equals 1."""
    flag = row['fw_in_subject']
    return [flag == 1]

In [None]:
# Feature blocks are concatenated per row in this fixed order.
extractors = (
    favoritesExtractor,
    recipientsExtractor,
    reExtractor,
    fwdExtractor,
    sentimentExtractor,
)
# sum(..., []) joins the per-extractor lists into one flat feature list for the row.
X = np.array([
    sum((extract(row) for extract in extractors), [])
    for _, row in df.iterrows()
])

             precision    recall  f1-score   support

   no reply       0.77      0.79      0.78        72
      reply       0.42      0.39      0.41        28

avg / total       0.67      0.68      0.68       100

f_2 = 0.398550724638 

With multiple features, performance is about the same. Let's use a different classifier that is more suitable to the number of features:

Linear Regressor
            precision    recall  f1-score   support

   no reply       0.77      0.79      0.78        72
      reply       0.42      0.39      0.41        28

avg / total       0.67      0.68      0.68       100

f_2 = 0.398550724638

SGDClassifier
             precision    recall  f1-score   support

   no reply       0.73      0.86      0.79        72
      reply       0.33      0.18      0.23        28

avg / total       0.62      0.67      0.63       100

f_2 = 0.196850393701

In [None]:
# Performance is degraded: the SGDClassifier's f_2 (about 0.20) is well below the linear regressor's (about 0.40).

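The machine learning baseline seems to work much better than the human oracle (f_2 of about 0.43 vs. 0.16), although the two were scored on different subsets (the last 100 emails vs. all 200).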