In [17]:
from willireply.data import enron
from willireply.features import features
from willireply.features.feature_extractor import FeatureExtractor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, fbeta_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from pathlib import Path
from tqdm.autonotebook import tqdm
from sklearn.neural_network import MLPRegressor,MLPClassifier
from sklearn.linear_model import SGDRegressor, LinearRegression
import numpy as np
import ipywidgets as widgets

In [18]:
# Best Feature Extractor So Far

from willireply.features import features

my_common_words = ['ASAP', 'please', 'could you', 'unsubscribe', '?', '!']
subject_common_words_feature = lambda df: features.common_words_subject(df, my_common_words)
body_common_words_feature = lambda df: features.common_words_body(df, my_common_words)

fe = FeatureExtractor(
      subject_common_words_feature,
      body_common_words_feature,
      features.was_replied,
      features.was_forwarded,
      lambda df: np.log(1+features.number_of_ccs(df)),
      lambda df: np.log(1+features.number_of_recipients(df)),
      features.thread_length,
      lambda df: np.log(1+features.words_in_body(df)),
      features.words_in_subject
                     )

# Train on One User, Validate on Same User

In [33]:
labeled_users = [user.stem for user in enron.ENRON_INDEX_FOLDER.iterdir() if enron.is_labeled(user.stem)]

In [34]:
labeled_users

['ring-a',
 'stepenovitch-j',
 'shively-h',
 'hodge-j',
 'baughman-d',
 'donoho-l',
 'may-l',
 'sanders-r',
 'crandell-s',
 'hayslett-r']

In [35]:
emails_f2 = []

for user in labeled_users:
    df = enron.get_dataframe(user, received_only=True)

    # Randomly select the train and test data
    print(user)
    print(len(df), 'emails')
    if len(df) == 0:
        continue
    N_training_samples = int(0.8 * len(df))
    training_indices = np.random.choice(df.index, N_training_samples, replace=False)
    testing_indices = set(df.index) - set(training_indices)

    X_train = fe.extract(df.loc[training_indices])
    y_train = fe.get_labels(df.loc[training_indices])
    X_test = fe.extract(df.loc[testing_indices])
    y_test = fe.get_labels(df.loc[testing_indices])

    if sum(y_test) == 0 or sum(y_train) == 0:
        print('nothing')
        continue

    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test) > 0

    print(classification_report(y_test, y_pred, target_names=["no reply", "reply"]))
    f2 = fbeta_score(y_test, y_pred, 2, labels=['no reply', 'reply'], pos_label=1)
    print('f_2 = %s' % f2)
    print(sum(y_test) + sum(y_train))
    
    # Append the number of replies and the f2 score so we can take a weighted avergage
    emails_f2.append((sum(y_test) + sum(y_train), f2))

ring-a
437 emails


             precision    recall  f1-score   support

   no reply       0.98      0.98      0.98        86
      reply       0.00      0.00      0.00         2

avg / total       0.95      0.95      0.95        88

f_2 = 0.0
17
stepenovitch-j
939 emails


             precision    recall  f1-score   support

   no reply       0.96      0.97      0.96       174
      reply       0.54      0.50      0.52        14

avg / total       0.93      0.93      0.93       188

f_2 = 0.507246376812
48
shively-h
1306 emails


             precision    recall  f1-score   support

   no reply       0.94      0.97      0.96       245
      reply       0.22      0.12      0.15        17

avg / total       0.89      0.92      0.90       262

f_2 = 0.12987012987
70
hodge-j
1516 emails


             precision    recall  f1-score   support

   no reply       0.96      0.99      0.98       293
      reply       0.00      0.00      0.00        11

avg / total       0.93      0.95      0.94       304

f_2 = 0.0
38
baughman-d
1687 emails


             precision    recall  f1-score   support

   no reply       0.99      0.99      0.99       335
      reply       0.00      0.00      0.00         3

avg / total       0.98      0.98      0.98       338

f_2 = 0.0
27
donoho-l
317 emails


             precision    recall  f1-score   support

   no reply       0.95      0.97      0.96        61
      reply       0.00      0.00      0.00         3

avg / total       0.91      0.92      0.91        64

f_2 = 0.0
14
may-l
1492 emails


             precision    recall  f1-score   support

   no reply       0.98      0.99      0.99       289
      reply       0.71      0.50      0.59        10

avg / total       0.97      0.98      0.97       299

f_2 = 0.531914893617
38


sanders-r
5218 emails


             precision    recall  f1-score   support

   no reply       0.95      0.96      0.96       951
      reply       0.54      0.54      0.54        93

avg / total       0.92      0.92      0.92      1044

f_2 = 0.538793103448
481
crandell-s
342 emails


             precision    recall  f1-score   support

   no reply       0.90      0.89      0.90        63
      reply       0.00      0.00      0.00         6

avg / total       0.82      0.81      0.82        69

f_2 = 0.0
26
hayslett-r
1655 emails


             precision    recall  f1-score   support

   no reply       0.94      0.98      0.96       299
      reply       0.70      0.44      0.54        32

avg / total       0.92      0.93      0.92       331

f_2 = 0.472972972973
164


In [36]:
# number of replies, f2 score
emails_f2

[(17, 0.0),
 (48, 0.50724637681159412),
 (70, 0.12987012987012989),
 (38, 0.0),
 (27, 0.0),
 (14, 0.0),
 (38, 0.53191489361702127),
 (481, 0.53879310344827591),
 (26, 0.0),
 (164, 0.47297297297297297)]

In [None]:
f2s = 0
emails = 0
for e, f in emails_f2:
    f2s += f*e
    emails += e
f2s/emails

0.49676923038466025

# Estimate statistics

In [27]:
# Get a rough idea of how many replies the average user sends
for user in labeled_users:
    df = enron.get_dataframe(user, received_only=True)
    if len(df) == 0: continue
    print(len(df.loc[df.did_reply==1])/len(df))

0.038901601830663615
0.051118210862619806
0.05359877488514548
0.025065963060686015


0.016004742145820983
0.04416403785488959
0.02546916890080429


0.09218091222690686
0.07602339181286549
0.09909365558912386


# SGD across multiple users

BIG CAVEAT!! I'm not sure this code does what it's supposed to do

In [28]:
# 
emails_f2 = []
model = DecisionTreeClassifier()#warm_start=True)
for user in labeled_users[:35]:
    df = enron.get_dataframe(user, received_only=True)

    # Randomly select the train and test data
    print(user)
    print(len(df), 'emails')
    if len(df) == 0:
        continue
    #N_training_samples = int(0.5 * len(df))
    #training_indices = np.random.choice(df.index, N_training_samples, replace=False)
    #testing_indices = set(df.index) - set(training_indices)

    X_train = fe.extract(df) #.loc[training_indices])
    y_train = fe.get_labels(df) #.loc[training_indices])
    #X_test = fe.extract(df.loc[testing_indices])
    #y_test = fe.get_labels(df.loc[testing_indices])

    if sum(y_test) == 0 or sum(y_train) == 0:
        print('nothing')
        continue

    model.fit(X_train, y_train)
    

ring-a
437 emails


stepenovitch-j
939 emails


shively-h
1306 emails


hodge-j
1516 emails


baughman-d
1687 emails


donoho-l
317 emails


may-l
1492 emails


sanders-r
5218 emails


crandell-s
342 emails


hayslett-r
1655 emails


In [30]:
len(labeled_users)

10