In [21]:
import re
import numpy as np
from willireply.data import enron
from willireply.features import features

from willireply.features.feature_extractor import FeatureExtractor
from sklearn.linear_model import LinearRegression, RidgeClassifier, LogisticRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, fbeta_score

This uses the FeatureExtractor class defined in willireply.features.feature_extractor. It's very simple.

It takes in a arbitrary number of feature extractors, which take the data frame and return an MxN matrix (M rows, N columns).

It then has built in methods to pull out the feature vector and labels.

# Setup

In [22]:
def was_forwarded(df):
    return df['subject'].str.contains('fwd?\:?\s', flags=re.IGNORECASE).values

def subject_body_sizes(df):
    return df[['m_to', 'subject']].applymap(lambda x: len(x)).values

def was_replied(df):
    """Looks to see if something like Re or RE: is in the subject. Uses regular expressions
    """
    return df['subject'].str.contains('re?\:?\s', flags=re.IGNORECASE).values
def common_words_body(df, words):
    """Given a list of common words (length N), returns an MxN matrix (M is length of df)
    Each cell is the number of times word[N] occurs in df[M].body (case insensitive"""
    return df[['body']].apply(lambda x: pd.Series([x['body'].lower().count(word.lower()) for word in words]), axis=1).values

def common_words_subject(df, words):
    """Given a list of common words (length N), returns an MxN matrix (M is length of df)
    Each cell is the number of times word[N] occurs in df[M].body (case insensitive"""
    return df[['subject']].apply(lambda x: pd.Series([x['subject'].lower().count(word.lower()) for word in words]), axis=1).values


my_common_words = ['ASAP', 'please', 'could you', '?', 'when', 'where', 'who', 'why']

words_body = lambda df: features.common_words_body(df, my_common_words)
words_subject = lambda df: features.common_words_subject(df, my_common_words)


def number_of_ccs(df):
    """Counts the number of CC'd"""
    if df['m_cc'] is not None and len(df['m_cc'].apply(lambda x: len(x.split(','))).values) > 0:
        return df['m_cc'].apply(lambda x: len(x.split(','))).values
    else:
        return 0
def number_of_recipients(df):
    """Counts the number of recipients"""
    return df['m_to'].apply(lambda x: len(x.split(','))).values

fe = FeatureExtractor(was_forwarded, subject_body_sizes, was_replied, number_of_recipients, number_of_ccs, words_body, words_subject)

A simple example to get you going

In [30]:
users = ['sanders-r','shively-h','stepenovitch-j']#,'dorland-c','germany-c','hayslett-r','jones-t','lay-k','mann-k','meyers-a','phanis-s',
           # 'ring-a','sanders-r','shively-h','stepenovitch-j','townsend-j','arnold-j','brawner-s','dasovich-j','ermis-f','gilbertsmith-d','heard-m',
         #'kaminski-v','lenhart-m','martin-t','mims-thurston-p','pimenov-v','ring-r','scholtes-d','skilling-j','stokley-c','tycholiz-b','williams-w3']
frames = []
for user in users:
    assert enron.is_labeled(user)
    df = enron.get_dataframe(user, received_only=True)
    frames.append(df)
df = pd.concat(frames)
X = fe.extract(df)
y_true = fe.get_labels(df)

Split the data into two equal parts

# Experiment

In [35]:
# Randomly select the train and test data
N_training_samples = int(0.5 * len(df))
training_indices = np.random.choice(df.index, N_training_samples, replace=False)
testing_indices = set(df.index) - set(training_indices)

X_train = fe.extract(df.loc[training_indices])
y_train = fe.get_labels(df.loc[training_indices])

X_test = fe.extract(df.loc[testing_indices])
y_test = fe.get_labels(df.loc[testing_indices])
print("Number of training samples: {} \n Number of testing samples: {}".format(len(X_train), len(X_test)))

Number of training samples: 6661 
 Number of testing samples: 2565


Do the regression, and predict

In [36]:
model = LinearRegression()
model.fit(X_train, y_train)
df_validate = enron.get_dataframe('shively-h', received_only=True)
y_pred = model.predict(fe.extract(df_validate)) > 0.05 # Not sure best place to set this predict...
y_true = fe.get_labels(df_validate)
#y_pred = model.predict(X_test) > 0

print("Training complete")

Training complete


In [33]:
missed = df.iloc[np.where(y_test > y_pred)[0]]
#ne = 
#ne
#y_true.shape
#df.equals((y_pred, y_true))
print(len(missed))

4


Display the statistics

In [34]:
print(classification_report(y_test, y_pred, target_names=["no reply", "reply"]))
print('f_2 = %s' % fbeta_score(y_test, y_pred, 2, labels=['no reply', 'reply'], pos_label=1))

             precision    recall  f1-score   support

   no reply       0.96      0.04      0.07      2358
      reply       0.09      0.98      0.17       241

avg / total       0.88      0.13      0.08      2599

f_2 = 0.341597002018


# Alternative experiment

Suppose we want to train on one person, and validate on another...

In [40]:
from willireply.features import features
my_common_words = ['ASAP', 'please', 'could you', '?', 'when', 'where', 'who', 'why']

words_body = lambda df: features.common_words_body(df, my_common_words)
words_subject = lambda df: features.common_words_subject(df, my_common_words)


fe = FeatureExtractor(
         features.was_forwarded,
         features.was_replied,
         features.number_of_recipients,
                      words_body,
        words_subject,
         number_of_ccs
                     )

In [41]:
print("Training")
df_train = enron.get_dataframe('sanders-r', received_only=True)
df_validate = enron.get_dataframe('shively-h', received_only=True)
print(len(df_train))

Training
5218


In [42]:
model = LinearRegression()
model.fit(fe.extract(df_train), fe.get_labels(df_train))
y_pred = model.predict(fe.extract(df_validate)) > 0.05 # Not sure best place to set this predict...
y_true = fe.get_labels(df_validate)

print(classification_report(y_true, y_pred, target_names=["no reply", "reply"]))
print('f_2 = %s' % fbeta_score(y_true, y_pred, 2, labels=['no reply', 'reply'], pos_label=1))

             precision    recall  f1-score   support

   no reply       0.97      0.06      0.12      1236
      reply       0.06      0.97      0.10        70

avg / total       0.93      0.11      0.12      1306

f_2 = 0.225763612218
