In [2]:
import os.path
import sys
import logging
import csv

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn
import sklearn.svm
import sklearn.linear_model
import sklearn.ensemble
import sklearn.pipeline

sys.path.append("../../")  # trick to import clairvoya from internal notebook directory
import clairvoya.runbench
import clairvoya.pulearning
import clairvoya.voya_plotter
import clairvoya.datasetup
import cPickle as pickle

%matplotlib inline

# Notebook-level logger: this notebook's own messages are recorded at INFO and above.
voya_logger = logging.getLogger('voya_notebook')
voya_logger.setLevel(logging.INFO)

# Reuse the file handler when this cell is re-run in the same kernel. Creating a
# fresh FileHandler on every run would attach duplicates and write each record to
# voya_notebook.log several times.
_log_path = os.path.abspath('voya_notebook.log')  # FileHandler stores its path in absolute form
fh = next(
    (handler for handler in voya_logger.handlers
     if isinstance(handler, logging.FileHandler) and handler.baseFilename == _log_path),
    None,
)
if fh is None:
    fh = logging.FileHandler('voya_notebook.log')
    fh.setLevel(logging.DEBUG)
    voya_logger.addHandler(fh)

# Send the clairvoya library's log records to the same file.
# Logger.addHandler skips a handler that is already attached, so repeating this is harmless.
logging.getLogger('clairvoya').addHandler(fh)

# Setup

In [3]:
# Load the full data set (labelled and unlabelled rows) from CSV.
df = pd.read_csv('../data/large_uni_f.csv')
# The return value is discarded, so the scaling is presumably applied to `df` in place
# -- NOTE(review): confirm against clairvoya.datasetup.scale_dataframe_features.
clairvoya.datasetup.scale_dataframe_features(df)
# Last expression of the cell: display the first rows as a sanity check.
df.head()

Unnamed: 0,cid,label,cid/fx/import_month_count,cid/fx/import_event_count,cid/fx/tariff_count,cid/fx/policy_count,cid/fx/export_ratio,cid/fx/foreign_currency,cid/fx/intl_office_count,cid/fx/intl_name,cid/fx/export_event_count
0,1513947,-1,0,0,0,0.0,0,0,0,0,0
1,1513431,-1,0,0,0,0.0,0,0,0,0,0
2,1378671,-1,0,0,0,0.0,0,0,0,0,0
3,1605193,1,0,0,0,0.333333,0,0,0,0,0
4,907802,-1,0,0,0,0.0,0,0,0,0,0


In [4]:
# Per-column summary statistics (count, mean, std, quartiles, min/max) of the loaded frame.
df.describe()

Unnamed: 0,cid,label,cid/fx/import_month_count,cid/fx/import_event_count,cid/fx/tariff_count,cid/fx/policy_count,cid/fx/export_ratio,cid/fx/foreign_currency,cid/fx/intl_office_count,cid/fx/intl_name,cid/fx/export_event_count
count,926078.0,926078.0,926078.0,926078.0,926078.0,926078.0,926078.0,926078.0,926078.0,926078.0,926078.0
mean,1698076.210071,-0.056403,0.031498,0.019285,0.016703,0.013463,0.007016,0.006936,0.000472,0.01366,0.004195
std,996782.450566,0.253168,0.144239,0.08312,0.076114,0.077152,0.057718,0.082992,0.005866,0.116074,0.037108
min,3.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,880987.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1718399.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2352453.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4836363.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Count how many rows carry each label value: 1 -> positive, -1 -> negative, 0 -> unlabelled.
label_values = df[['label']].values.ravel()
num_labels = pd.value_counts(label_values)
print(num_labels)

# Keep the individual group sizes for later cells.
num_pos = num_labels[1]
num_neg = num_labels[-1]
num_unl = num_labels[0]

 0    863776
-1     57268
 1      5034
dtype: int64


In [6]:
# Split the rows into unlabelled (label == 0) and labelled (everything else).
is_unlabelled = df['label'] == 0
df_unlab = df[is_unlabelled]
df_pos_neg = df[~is_unlabelled]

In [7]:
def generate_mixed_df(unlab_to_pos_frac):
    """Combine all labelled rows with a random sample of unlabelled rows.

    Reads the module-level ``df_pos_neg``, ``df_unlab`` and ``num_pos``; none of
    them is modified. Randomness comes from NumPy's global random state.

    Parameters
    ----------
    unlab_to_pos_frac : int or float
        Requested ratio of unlabelled rows to positive rows. The number of
        unlabelled rows drawn is ``int(num_pos * unlab_to_pos_frac)``; if that
        exceeds ``len(df_unlab)``, every unlabelled row is used.

    Returns
    -------
    pandas.DataFrame
        ``df_pos_neg`` followed by the sampled unlabelled rows, re-indexed 0..n-1.
    """
    num_required_unlab = int(num_pos * unlab_to_pos_frac)

    # DataFrame.reindex returns a new frame rather than shuffling in place, so a bare
    # call to it has no effect. Draw the random subset explicitly (positional, so it
    # also works when the index has gaps) to get a different sample on every call.
    shuffled_positions = np.random.permutation(len(df_unlab))
    sampled_unlab = df_unlab.iloc[shuffled_positions[:num_required_unlab]]

    return pd.concat([df_pos_neg, sampled_unlab], ignore_index=True)

# Configuring the benchmark

In [9]:
# Benchmark options; this dict is what later gets passed to clairvoya.runbench.run_benchmark.
# "num_cores" is also reused below as the n_jobs setting of the estimators.
config = dict(
    out_path=None,
    num_cores=3,
    verbosity=0,
    pu_learning=True,
)

# Base estimators that the pipelines and PU-learning wrappers below are built around
LR_estimator = sklearn.linear_model.LogisticRegression()
svm_estimator = clairvoya.pulearning.SVC_mod(probability=True, kernel='linear')
rf_estimator = sklearn.ensemble.RandomForestClassifier(
    n_estimators=70,
    max_depth=7,
    n_jobs=config["num_cores"],
)

# Two-step pipelines: the base estimator first, then a PU-learning wrapper ('po')
# built around that same estimator instance.
# NOTE(review): the estimator object is shared between both steps -- confirm this is intended.

# Logistic regression + PosOnly wrapper
LRPosOnly = sklearn.pipeline.Pipeline(steps=[
    ('lr', LR_estimator),
    ('po', clairvoya.pulearning.PosOnly(LR_estimator)),
])

# Logistic regression + PUBagging with the wrapper's default settings
LRBagging = sklearn.pipeline.Pipeline(steps=[
    ('lr', LR_estimator),
    ('po', clairvoya.pulearning.PUBagging(LR_estimator)),
])

# Random forest + PUBagging, max_samples=0.1
RFBagging = sklearn.pipeline.Pipeline(steps=[
    ('rf', rf_estimator),
    ('po', clairvoya.pulearning.PUBagging(
        rf_estimator,
        n_estimators=100,
        max_samples=0.1,
        n_jobs=config["num_cores"],
    )),
])

# Random forest + PUBagging, max_samples=0.5
RFBagging_0_5 = sklearn.pipeline.Pipeline(steps=[
    ('rf', rf_estimator),
    ('po', clairvoya.pulearning.PUBagging(
        rf_estimator,
        n_estimators=100,
        max_samples=0.5,
        n_jobs=config["num_cores"],
    )),
])

# SVMBagging = sklearn.pipeline.Pipeline([
#     ('svm' , svm_estimator),
#     ('po', clairvoya.pulearning.PUBagging),
# ]) 


# SVMPosOnly = sklearn.pipeline.Pipeline([
#     ('svm' , svm_estimator),
#     ('po', clairvoya.pulearning.PosOnly(svm_estimator)),
# ])

# SVM + double-weighting PU wrapper.
SVMDoubleWeight = sklearn.pipeline.Pipeline([
    ('svm', svm_estimator),
    # A pipeline step has to be an estimator *instance*; passing the bare class would
    # fail once the pipeline is fitted. Wrap the SVM the same way the RF and LR
    # double-weighting pipelines wrap their estimators.
    ('dw', clairvoya.pulearning.PULearnByDoubleWeighting(svm_estimator)),
])

# Random forest followed by the double-weighting PU wrapper around the same forest
RFDoubleWeight = sklearn.pipeline.Pipeline(steps=[
    ('rf', rf_estimator),
    ('dw', clairvoya.pulearning.PULearnByDoubleWeighting(rf_estimator)),
])

# Logistic regression followed by the double-weighting PU wrapper around the same model
LRDoubleWeight = sklearn.pipeline.Pipeline(steps=[
    ('lr', LR_estimator),
    ('dw', clairvoya.pulearning.PULearnByDoubleWeighting(LR_estimator)),
])

In [11]:
# Experiment settings
unlab_to_pos_range = range(1, 16)                # unlabelled-to-positive ratios to sweep: 1..15
num_runs_per = 20                                # benchmark repetitions per ratio
save_file = 'result_data/large1_results_rf.csv'  # CSV that the per-run AUC rows are appended to

# Classifiers to benchmark, keyed by the name that ends up in the `clf` column of the
# results CSV. Commented-out entries are currently disabled.
classifiers = {
    # Bagging eventually breaks with a "too many open files" error
    'RF_Bagging': RFBagging,
    'RF_Bagging_0_5': RFBagging_0_5,
#     'LR_Bagging': LRBagging,
#     'SVM_Bagging': clairvoya.pulearning.PUBagging,
    # PU learning
#     'LR_PosOnly(E&N2008)': LRPosOnly,
#     'SVM_PosOnly(E&N2008)': SVMPosOnly,  # Can't predict proba

#     'SVM_DoubleWeight(E&N2008)': SVMDoubleWeight,
#     'LR_DoubleWeight(E&N2008)': LRDoubleWeight,  # doesn't work as LR won't accept weights
    # Uses the wrapper directly around rf_estimator rather than the RFDoubleWeight pipeline.
    'RF_DoubleWeight(E&N2008)': clairvoya.pulearning.PULearnByDoubleWeighting(rf_estimator),
    
    # Normal (non-PU) baselines
#     'Logistic Regression': sklearn.linear_model.LogisticRegression(),
#     'Gradient Boosting': sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2),
    # A separate forest instance with the same settings as rf_estimator.
    'Random Forest': sklearn.ensemble.RandomForestClassifier(max_depth=7, n_estimators=70, n_jobs=config["num_cores"]),
}

# Parameter grids passed to run_benchmark alongside `classifiers`, keyed by classifier name.
# Grid search is off for the entries set to None (presumably interpreted that way by
# run_benchmark -- confirm); their previous grids are kept as comments.
classifiers_gridparameters = {
    # --- bagging wrappers ---
    'RF_Bagging': None,
    # {'po__n_estimators': [30, 100], 'po__max_samples': [0.1, 0.3, 0.7, 1.0],
    #  "rf__n_estimators": [10, 30, 50, 70, 100], 'rf__max_depth': [1, 2, 3, 4, 5, 7, 10]},
    'RF_Bagging_0_5': None,
    'LR_Bagging': {
        'po__n_estimators': [30, 100],
        'po__max_samples': [0.1, 0.3, 0.7, 1.0],
        'lr__fit_intercept': [True],
        'lr__C': [0.1, 0.3, 0.5, 0.7, 1.0],
    },
    'SVM_Bagging': {
        'po__n_estimators': [30, 100],
        'po__max_samples': [0.1, 0.3, 0.7, 1.0],
    },

    # --- PosOnly wrappers ---
    'SVM_PosOnly(E&N2008)': {
        'po__hold_out_ratio': [0.02, 0.05, 0.1, 0.2, 0.3, 0.5],
        'svm__C': [0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 1.0],
    },
    'LR_PosOnly(E&N2008)': {
        'po__hold_out_ratio': [0.02, 0.05, 0.1, 0.2, 0.3, 0.5],
        'lr__fit_intercept': [True],
        'lr__C': [0.1, 0.3, 0.5, 0.7, 1.0],
    },

    # --- double-weighting wrappers ---
    'SVM_DoubleWeight(E&N2008)': {
        'svm__C': [0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 1.0],
    },
    'LR_DoubleWeight(E&N2008)': {
        'lr__fit_intercept': [True],
        'lr__C': [0.1, 0.3, 0.5, 0.7, 1.0],
    },
    'RF_DoubleWeight(E&N2008)': None,

    # --- plain classifiers ---
    'Logistic Regression': {
        'fit_intercept': [True],
        'C': [0.1, 0.3, 0.5, 0.7, 1.0],
    },
    'Gradient Boosting': {
        "n_estimators": [10, 30, 50, 70, 100],
        'learning_rate': [0.1, 0.3, 0.7, 1.0],
        'max_depth': [1, 2, 3, 4, 5, 7, 10],
    },
    'Random Forest': None,
    # {"n_estimators": [10, 30, 50, 70, 100], 'max_depth': [1, 2, 3, 4, 5, 7, 10]},
}

## Now run it

In [None]:
# Make sure the results directory exists; opening the CSV would otherwise fail.
save_dir = os.path.dirname(save_file)
if save_dir and not os.path.isdir(save_dir):
    os.makedirs(save_dir)

# Write the header only when the file is new, so repeated runs keep appending rows.
if not os.path.exists(save_file):
    with open(save_file, 'wb') as f:
        f.write('clf,auc,gamma\n')

gamma_range = unlab_to_pos_range
# auc_results[clf_name] collects one list of AUC scores per gamma value.
auc_results = {clf_name: [] for clf_name in classifiers}
for g_num, gamma in enumerate(gamma_range):
    voya_logger.info('Running classifiers for gamma={} ({}/{})'.format(gamma, g_num+1, len(gamma_range)))
    # AUC scores of every run at this gamma, per classifier.
    run_results = {clf_name: [] for clf_name in classifiers}
    for i in xrange(num_runs_per):
        # Fresh labelled + unlabelled mix for each repetition.
        mixed_df = generate_mixed_df(gamma)
        config.update({"data_file": mixed_df})

        results_dict = clairvoya.runbench.run_benchmark(config, classifiers, classifiers_gridparameters)

        # Record each score in memory (run_results must be filled here, otherwise
        # auc_results only ever receives empty lists) and queue it for the CSV.
        csv_output = []
        for clf_name in classifiers:
            auc_score = results_dict[clf_name]['auc_score']
            run_results[clf_name].append(auc_score)
            csv_output.append((clf_name, auc_score, gamma))

        # Append after every run so partial results survive an interrupted sweep.
        with open(save_file, 'ab') as f:
            csv_f = csv.writer(f)
            csv_f.writerows(csv_output)

    for clf_name in classifiers:
        auc_results[clf_name].append(run_results[clf_name])

INFO:voya_notebook:Running classifiers for gamma=1 (1/15)


## Plots

In [None]:
# Uncomment to aggregate a different results file instead:
# save_file = 'result_data/test1_results copy.csv'
results_df = pd.read_csv(save_file)

# Mean, standard deviation and number of runs for each (classifier, gamma) pair.
runs_by_clf_and_gamma = results_df.groupby(["clf", "gamma"], as_index=False)
results_table = runs_by_clf_and_gamma.agg(['mean', 'std', 'count'])

In [None]:
# Plot the mean AUC (with standard-error bars) against gamma, one line per classifier.
colors = seaborn.color_palette("Set2", 10)
result_classifiers = results_df.clf.unique()

plt.figure(figsize=(10, 10))
for i, clf_name in enumerate(result_classifiers):
    # Label-based lookup of this classifier's rows; .loc replaces the deprecated .ix
    # indexer, which newer pandas releases no longer provide.
    clf_results = results_table.loc[clf_name]
    clf_gamma_range = clf_results.index
    auc_stats = clf_results['auc']
    auc_mean = auc_stats["mean"]
    auc_std = auc_stats["std"]
    auc_count = auc_stats["count"]
    auc_std_err = auc_std / np.sqrt(auc_count)  # standard error of the mean

    # Wrap around the palette so more classifiers than colours cannot raise IndexError.
    clf_color = colors[i % len(colors)]

    plt.errorbar(clf_gamma_range, auc_mean, yerr=auc_std_err, label=clf_name,
                 c=clf_color, capthick=1)
    plt.scatter(clf_gamma_range, auc_mean, c=clf_color, lw=0)

plt.xscale('log')
plt.ylabel('AUC Score')
plt.xlabel('Fraction of Unlabelled to Positive')
plt.legend()

# The title doubles as the file name of the saved figure.
title = "Train P {}, N {}".format(
    num_pos, num_neg)
plt.title(title)
plt.savefig(title + '.png')