In [1]:
import re
import numpy as np
from willireply.data import enron
from willireply.features.feature_extractor import FeatureExtractor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, fbeta_score



This uses the FeatureExtractor class defined in willireply.features.feature_extractor. It's very simple.

It takes an arbitrary number of feature extractors, each of which takes the data frame and returns an MxN matrix (M rows, N columns).

It also has built-in methods to pull out the feature vector and the labels.

# Setup

In [2]:
def was_forwarded(df):
    """Flag messages whose subject line carries a forward marker.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``subject`` column of strings.

    Returns
    -------
    numpy.ndarray
        One boolean per row: True where the subject contains ``fw`` or
        ``fwd`` (case-insensitive), optionally followed by a colon, then a
        whitespace character. Rows with a missing subject are False.
    """
    # Raw string: '\:' and '\s' are invalid escape sequences in a plain
    # literal and trigger warnings on newer Python versions.
    # na=False: a missing subject counts as "not forwarded" instead of
    # propagating NaN into the feature array.
    return df['subject'].str.contains(
        r'fwd?\:?\s', flags=re.IGNORECASE, na=False).values

def subject_body_sizes(df):
    """Return the per-row lengths of the ``m_to`` and ``subject`` fields.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``m_to`` and ``subject`` columns whose values support
        ``len()``.

    Returns
    -------
    numpy.ndarray
        Array with one row per message and two columns:
        ``len(m_to)`` and ``len(subject)``, in that order.
    """
    # NOTE(review): the function name says "body" but the column read here is
    # 'm_to' -- confirm which field was intended.
    # Series.map(len) per column replaces DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return df[['m_to', 'subject']].apply(lambda column: column.map(len)).values

# Bundle the two feature functions defined in this cell into one extractor;
# `fe` is what the following cells call extract() / get_labels() on.
fe = FeatureExtractor(was_forwarded, subject_body_sizes)

A simple example to get you going

In [3]:
# Mailbox to analyse. The assert aborts the cell if enron.is_labeled()
# returns a falsy value for this user.
user = 'may-l'
assert enron.is_labeled(user)
# NOTE(review): received_only=True presumably restricts the frame to mail the
# user received -- confirm against enron.get_dataframe.
df = enron.get_dataframe(user, received_only=True)
# Features and labels for the full frame, produced by the extractor `fe`.
X = fe.extract(df)
y_true = fe.get_labels(df)

Split the data into two equal parts

# Experiment

In [4]:
# Randomly select the train and test data.
# A fixed seed makes the split -- and every score computed from it --
# reproducible when the notebook is re-run from a fresh kernel.
SPLIT_SEED = 0
rng = np.random.RandomState(SPLIT_SEED)

N_training_samples = int(0.5 * len(df))
training_indices = rng.choice(df.index, N_training_samples, replace=False)
# Everything not drawn for training is held out for testing. Kept as an Index
# (not a set): recent pandas versions reject a set as a .loc indexer.
testing_indices = df.index[~df.index.isin(training_indices)]

X_train = fe.extract(df.loc[training_indices])
y_train = fe.get_labels(df.loc[training_indices])
X_test = fe.extract(df.loc[testing_indices])
y_test = fe.get_labels(df.loc[testing_indices])

Do the regression, and predict

In [5]:
# Least-squares fit on the training half; fit() hands back the estimator.
model = LinearRegression().fit(X_train, y_train)
# Continuous regression scores for the held-out half, turned into boolean
# predictions by thresholding at 0.
# NOTE(review): the stored report below shows almost every message predicted
# as "reply"; if the labels are 0/1 a cut-off of 0 may be too low -- confirm
# the label encoding before changing it.
test_scores = model.predict(X_test)
y_pred = test_scores > 0

Display the statistics

In [6]:
# Per-class precision / recall / F1 for the held-out half.
print(classification_report(y_test, y_pred, target_names=["no reply", "reply"]))
# F-beta with beta=2. beta is passed by keyword because current scikit-learn
# no longer accepts it positionally. The former labels=['no reply', 'reply']
# argument is dropped: `labels` expects label values found in y, not display
# names.
print('f_2 = %s' % fbeta_score(y_test, y_pred, beta=2, pos_label=1))

             precision    recall  f1-score   support

   no reply       1.00      0.03      0.06       724
      reply       0.03      1.00      0.06        22

avg / total       0.97      0.06      0.06       746

f_2 = 0.135301353014


# Alternative experiment

Suppose we want to train on one person, and validate on another...

In [19]:
from willireply.features import features

# Word list handed to features.common_words by the adapter below.
my_common_words = ['ASAP', 'please', 'could you']


def common_words_feature(df):
    """Call features.common_words on ``df`` with ``my_common_words``.

    Gives the two-argument helper the single-argument ``f(df)`` form used by
    the other callables passed to FeatureExtractor.
    """
    return features.common_words(df, my_common_words)


# Rebinds `fe` to an extractor built from the shared feature library
# plus the word-list adapter.
fe = FeatureExtractor(
    features.was_forwarded,
    features.was_replied,
    features.number_of_recipients,
    common_words_feature,
)

In [20]:
# Two different mailboxes: the model is fitted on 'may-l' and evaluated on
# 'ring-a' in the next cell.
df_train = enron.get_dataframe('may-l', received_only=True)
df_validate = enron.get_dataframe('ring-a', received_only=True)

In [21]:
# Fit on one user's mail, then score a different user's mail.
model = LinearRegression()
model.fit(fe.extract(df_train), fe.get_labels(df_train))
y_pred = model.predict(fe.extract(df_validate)) > 0.05 # Not sure best place to set this predict...
y_true = fe.get_labels(df_validate)

print(classification_report(y_true, y_pred, target_names=["no reply", "reply"]))
# beta is passed by keyword because current scikit-learn no longer accepts it
# positionally. The former labels=['no reply', 'reply'] argument is dropped:
# `labels` expects label values found in y, not display names.
print('f_2 = %s' % fbeta_score(y_true, y_pred, beta=2, pos_label=1))

             precision    recall  f1-score   support

   no reply       0.96      0.68      0.79       420
      reply       0.04      0.35      0.08        17

avg / total       0.93      0.66      0.77       437

f_2 = 0.142857142857
