In [None]:
import email

import numpy as np
import pandas
import sklearn
import sklearn.decomposition
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.tree

import features

# Build the classification pipeline in three stages:
#   1. 'transform_email'   - wrap email.message_from_string in a transformer,
#   2. 'generate_features' - union of hand-written feature functions from the
#                            project-local `features` module plus a TF-IDF
#                            bag-of-words branch reduced to 200 components,
#   3. 'train_tree'        - a decision tree classifier with default settings.
#
# NOTE(review): FunctionTransformer forwards the whole input batch to the
# wrapped function in a single call; confirm that email.message_from_string
# and the `features.generate_*` helpers are meant to receive a batch rather
# than one message at a time.

# One FeatureUnion branch per hand-written feature function.
_handcrafted_branches = [
    (branch_name, sklearn.preprocessing.FunctionTransformer(branch_func))
    for branch_name, branch_func in (
        ('content_type_features', features.generate_content_type),
        ('email_counts_features', features.generate_email_counts),
        ('case_ratio_features', features.generate_upper_to_lower_case_ratios),
        ('email_chain_features', features.generate_subject_is_chain),
        ('link_features', features.generate_number_of_links),
        ('mailing_list_features', features.generate_is_mailing_list),
    )
]

# TF-IDF vectorisation followed by dimensionality reduction. The second step
# is named 'pca' but the estimator is TruncatedSVD.
_bag_of_words_branch = sklearn.pipeline.Pipeline([
    ('generate_bow', sklearn.feature_extraction.text.TfidfVectorizer()),
    ('pca', sklearn.decomposition.TruncatedSVD(n_components=200)),
])

_feature_union = sklearn.pipeline.FeatureUnion(
    _handcrafted_branches + [('bag_of_words_features', _bag_of_words_branch)]
)

pipeline = sklearn.pipeline.Pipeline([
    ('transform_email', sklearn.preprocessing.FunctionTransformer(email.message_from_string)),
    ('generate_features', _feature_union),
    ('train_tree', sklearn.tree.DecisionTreeClassifier()),
])

# Load processed data.
# NOTE(review): pandas.read_msgpack was deprecated in pandas 0.25 and removed
# in pandas 1.0, so this line requires pandas < 1.0 until the data file is
# re-exported in another format.
dataset = pandas.read_msgpack('./data/processed.msg', encoding='latin-1')

# Separate features and labels. The feature array is deliberately NOT bound to
# the name `features`: that would shadow the imported `features` module and
# break any later (or repeated) use of `features.generate_*`.
feature_columns = [column for column in dataset.columns if column != 'class']
feature_matrix = dataset[feature_columns].values
# Boolean target: True where the 'class' column equals 1.
labels = dataset['class'] == 1

# 10-fold cross-validated ROC AUC of the full pipeline.
# sklearn.cross_validation was removed in scikit-learn 0.20; the same function
# lives in sklearn.model_selection (available since 0.18).
res = sklearn.model_selection.cross_val_score(
    pipeline, feature_matrix, labels, cv=10, scoring='roc_auc'
)
# Report mean and standard deviation of the per-fold scores.
print(np.mean(res), np.std(res))