In [1]:
from willireply.data import enron
from willireply.features import features
from willireply.features.feature_extractor import FeatureExtractor

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, fbeta_score

from pathlib import Path
from tqdm.autonotebook import tqdm
from sklearn.linear_model import SGDRegressor, LinearRegression
import numpy as np
import ipywidgets as widgets



In [10]:
# Best Feature Extractor So Far

from willireply.features import features

my_common_words = ['ASAP', 'please', 'could you', 'unsubscribe', '?', '!']
subject_common_words_feature = lambda df: features.common_words_subject(df, my_common_words)
body_common_words_feature = lambda df: features.common_words_body(df, my_common_words)

fe = FeatureExtractor(
      subject_common_words_feature,
      body_common_words_feature,
      features.was_replied,
      features.was_forwarded,
      lambda df: np.log(1+features.number_of_ccs(df)),
      lambda df: np.log(1+features.number_of_recipients(df)),
      features.thread_length,
      lambda df: np.log(1+features.words_in_body(df)),
      features.words_in_subject
                     )

# Train on One User, Validate on Same User

In [41]:
labeled_users = [user.stem for user in enron.ENRON_INDEX_FOLDER.iterdir() if enron.is_labeled(user.stem)]

In [30]:
labeled_users

['brawner-s',
 'horton-s',
 'neal-s',
 'giron-d',
 'delainey-d',
 'ring-a',
 'scholtes-d',
 'davis-d',
 'keavey-p',
 'williams-w3',
 'dean-c',
 'solberg-g',
 'merriss-s',
 'keiser-k',
 'stepenovitch-j',
 'steffes-j',
 'tycholiz-b',
 'gang-l',
 'forney-j',
 'benson-r',
 'schoolcraft-d',
 'platter-p',
 'harris-s',
 'stokley-c',
 'zufferli-j',
 'mcconnell-m',
 'meyers-a',
 'perlingiere-d',
 'shively-h',
 'dorland-c',
 'thomas-p',
 'grigsby-m',
 'fischer-m',
 'germany-c',
 'motley-m',
 'hyatt-k',
 'donohoe-t',
 'geaccone-t',
 'zipper-a',
 'panus-s',
 'pimenov-v',
 'lewis-a',
 'white-s',
 'ruscitti-k',
 'nemec-g',
 'hodge-j',
 'townsend-j',
 'baughman-d',
 'kitchen-l',
 'bass-e',
 'gilbertsmith-d',
 'lucci-p']

In [31]:
emails_f2 = []

for user in labeled_users:
    df = enron.get_dataframe(user, received_only=True)

    # Randomly select the train and test data
    print(user)
    print(len(df), 'emails')
    if len(df) == 0:
        continue
    N_training_samples = int(0.5 * len(df))
    training_indices = np.random.choice(df.index, N_training_samples, replace=False)
    testing_indices = set(df.index) - set(training_indices)

    X_train = fe.extract(df.loc[training_indices])
    y_train = fe.get_labels(df.loc[training_indices])
    X_test = fe.extract(df.loc[testing_indices])
    y_test = fe.get_labels(df.loc[testing_indices])

    if sum(y_test) == 0 or sum(y_train) == 0:
        print('nothing')
        continue

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test) > 0

    print(classification_report(y_test, y_pred, target_names=["no reply", "reply"]))
    f2 = fbeta_score(y_test, y_pred, 2, labels=['no reply', 'reply'], pos_label=1)
    print('f_2 = %s' % f2)
    print(sum(y_test) + sum(y_train))
    
    # Append the number of replies and the f2 score so we can take a weighted avergage
    emails_f2.append((sum(y_test) + sum(y_train), f2))

brawner-s
678 emails
             precision    recall  f1-score   support

   no reply       0.97      0.21      0.34       329
      reply       0.03      0.80      0.06        10

avg / total       0.94      0.22      0.33       339

f_2 = 0.129449838188
27
horton-s
1644 emails
             precision    recall  f1-score   support

   no reply       0.97      0.09      0.17       794
      reply       0.03      0.93      0.07        28

avg / total       0.94      0.12      0.17       822

f_2 = 0.151515151515
59
neal-s
2166 emails
             precision    recall  f1-score   support

   no reply       0.95      0.04      0.07      1002
      reply       0.08      0.98      0.14        81

avg / total       0.88      0.11      0.08      1083

f_2 = 0.288742690058
169
giron-d
2340 emails
             precision    recall  f1-score   support

   no reply       0.90      0.02      0.03      1073
      reply       0.08      0.98      0.15        97

avg / total       0.83      0.10      0.

donohoe-t
963 emails
             precision    recall  f1-score   support

   no reply       1.00      0.38      0.55       476
      reply       0.02      1.00      0.04         6

avg / total       0.99      0.39      0.54       482

f_2 = 0.0920245398773
10
geaccone-t
1036 emails
             precision    recall  f1-score   support

   no reply       1.00      0.04      0.07       467
      reply       0.10      1.00      0.19        51

avg / total       0.91      0.13      0.09       518

f_2 = 0.362215909091
102
zipper-a
1213 emails
             precision    recall  f1-score   support

   no reply       1.00      0.07      0.13       598
      reply       0.02      1.00      0.03         9

avg / total       0.99      0.08      0.12       607

f_2 = 0.0746268656716
25
panus-s
411 emails
             precision    recall  f1-score   support

   no reply       0.93      0.19      0.31       203
      reply       0.00      0.00      0.00         3

avg / total       0.91      0.18   

In [32]:
# number of replies, f2 score
emails_f2

[(27, 0.12944983818770225),
 (59, 0.15151515151515152),
 (169, 0.28874269005847952),
 (188, 0.30884265279583872),
 (198, 0.38429752066115702),
 (17, 0.16990291262135923),
 (12, 0.11194029850746269),
 (126, 0.31050228310502281),
 (30, 0.068407960199004969),
 (138, 0.25252525252525249),
 (16, 0.033112582781456949),
 (116, 0.51003344481605351),
 (48, 0.26620370370370366),
 (265, 0.47050561797752805),
 (86, 0.40275049115913553),
 (57, 0.39039039039039047),
 (29, 0.28901734104046245),
 (5, 0.047393364928909956),
 (53, 0.29914529914529914),
 (18, 0.17241379310344829),
 (17, 0.26315789473684209),
 (167, 0.25888958203368684),
 (108, 0.16011644832605532),
 (70, 0.18592297476759628),
 (33, 0.11088709677419357),
 (29, 0.13321492007104796),
 (50, 0.15580736543909346),
 (30, 0.11887072808320952),
 (895, 0.43066821166473546),
 (3, 0.15625),
 (55, 0.23847376788553259),
 (10, 0.092024539877300623),
 (102, 0.36221590909090912),
 (25, 0.074626865671641784),
 (7, 0.0),
 (26, 0.17578125),
 (19, 0.08021390

In [33]:
f2s = 0
emails = 0
for e, f in emails_f2:
    f2s += f*e
    emails += e
f2s/emails

0.34004800492675774

# Estimate statistics

In [37]:
# Get a rough idea of how many replies the average user sends
for user in labeled_users:
    df = enron.get_dataframe(user, received_only=True)
    if len(df) == 0: continue
    print(len(df.loc[df.did_reply==1])/len(df))

0.03982300884955752
0.035888077858880776
0.07802400738688828
0.08034188034188035
0.11301369863013698
0.038901601830663615
0.039473684210526314
0.07208237986270023
0.014962593516209476
0.047244094488188976
0.0067085953878406705
0.002938295788442703
0.0
0.15508021390374332
0.051118210862619806
0.14738598442714126
0.11668928086838534
0.114
0.09764309764309764
0.006954102920723227
0.06838709677419355
0.03854389721627409
0.0
0.096045197740113
0.062197392923649904
0.0009191176470588235
0.04451772464962902
0.05359877488514548
0.033066132264529056
0.02566371681415929
0.03625815808556925
0.021613832853025938
0.13094367227505485
0.015
0.047372954349698536
0.010384215991692628
0.09845559845559845
0.020610057708161583
0.0170316301703163
0.04609929078014184
0.009082217973231358
0.056287851799073745
0.056451612903225805
0.11556603773584906
0.025065963060686015
0.014184397163120567
0.016004742145820983
0.10103626943005181
0.10154617634768073
0.01858736059479554
0.09174311926605505


# SGD across multiple users

BIG CAVEAT!! I'm not sure this code does what it's supposed to do

In [44]:
# 
emails_f2 = []
model = SGDRegressor(warm_start=True)
for user in labeled_users[:35]:
    df = enron.get_dataframe(user, received_only=True)

    # Randomly select the train and test data
    print(user)
    print(len(df), 'emails')
    if len(df) == 0:
        continue
    #N_training_samples = int(0.5 * len(df))
    #training_indices = np.random.choice(df.index, N_training_samples, replace=False)
    #testing_indices = set(df.index) - set(training_indices)

    X_train = fe.extract(df) #.loc[training_indices])
    y_train = fe.get_labels(df) #.loc[training_indices])
    #X_test = fe.extract(df.loc[testing_indices])
    #y_test = fe.get_labels(df.loc[testing_indices])

    if sum(y_test) == 0 or sum(y_train) == 0:
        print('nothing')
        continue

    model.partial_fit(X_train, y_train)
    



brawner-s
678 emails
horton-s
1644 emails
neal-s
2166 emails
giron-d
2340 emails
delainey-d
1752 emails
ring-a
437 emails
scholtes-d
304 emails
davis-d
1748 emails
keavey-p
2005 emails
williams-w3
2921 emails
dean-c
2385 emails
solberg-g
1021 emails
merriss-s
1624 emails
nothing
keiser-k
748 emails
stepenovitch-j
939 emails
steffes-j
1798 emails
tycholiz-b
737 emails
gang-l
500 emails
forney-j
297 emails
benson-r
719 emails
schoolcraft-d
775 emails
platter-p
467 emails
harris-s
548 emails
nothing
stokley-c
0 emails
zufferli-j
177 emails
mcconnell-m
2685 emails
meyers-a
1088 emails
perlingiere-d
2426 emails
shively-h
1306 emails
dorland-c
998 emails
thomas-p
1130 emails
grigsby-m
1379 emails
fischer-m
1388 emails
germany-c
6835 emails
motley-m
200 emails


In [42]:
len(labeled_users)

70

## Do all the predicting. Same as before

In [48]:
emails_f2 = []    

for user in labeled_users[35:]:
    df = enron.get_dataframe(user, received_only=True)

    # Randomly select the train and test data
    print(user)
    print(len(df), 'emails')
    if len(df) == 0:
        continue

    X_test = fe.extract(df)# .loc[testing_indices])
    y_test = fe.get_labels(df) # .loc[testing_indices])
    y_pred = model.predict(X_test) > 0

    print(classification_report(y_test, y_pred, target_names=["no reply", "reply"]))
    f2 = fbeta_score(y_test, y_pred, 2, labels=['no reply', 'reply'], pos_label=1)
    print('f_2 = %s' % f2)
    print(sum(y_test) + sum(y_train))
    emails_f2.append((sum(y_test), f2))

hyatt-k
1161 emails
             precision    recall  f1-score   support

   no reply       0.96      0.43      0.59      1106
      reply       0.05      0.64      0.10        55

avg / total       0.92      0.44      0.57      1161

f_2 = 0.197740112994
58
donohoe-t
963 emails
             precision    recall  f1-score   support

   no reply       0.99      0.52      0.68       953
      reply       0.01      0.30      0.01        10

avg / total       0.98      0.52      0.68       963

f_2 = 0.0302419354839
13
geaccone-t
1036 emails
             precision    recall  f1-score   support

   no reply       0.93      0.50      0.65       934
      reply       0.13      0.68      0.22       102

avg / total       0.85      0.52      0.61      1036

f_2 = 0.365853658537
105
zipper-a
1213 emails
             precision    recall  f1-score   support

   no reply       0.98      0.47      0.64      1188
      reply       0.02      0.56      0.04        25

avg / total       0.96      0.47   

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

   no reply       1.00      0.49      0.66       206
      reply       0.00      0.00      0.00         0

avg / total       1.00      0.49      0.66       206

f_2 = 0.0
3
kaminski-v
18031 emails
             precision    recall  f1-score   support

   no reply       0.83      0.31      0.45     15395
      reply       0.14      0.64      0.23      2636

avg / total       0.73      0.36      0.42     18031

f_2 = 0.369778439443
2639
heard-m
594 emails
             precision    recall  f1-score   support

   no reply       0.91      0.53      0.67       517
      reply       0.17      0.64      0.27        77

avg / total       0.81      0.55      0.62       594

f_2 = 0.409015025042
80
blair-l
2397 emails
             precision    recall  f1-score   support

   no reply       0.89      0.36      0.51      2242
      reply       0.04      0.39      0.07       155

avg / total       0.84      0.36      0.48      2397

f_2 = 0.1435969

In [49]:
f2s = 0
emails = 0
for e, f in emails_f2:
    f2s += f*e
    emails += e
f2s/emails

0.33774663006978412