In [None]:
import os.path
import sys
import logging

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn
import sklearn.svm
import sklearn.linear_model
import sklearn.ensemble

sys.path.append("../../")  # trick to import clairvoya from internal notebook directory
import clairvoya.runbench
import clairvoya.pulearning
import clairvoya.voya_plotter
import clairvoya.datasetup
import cPickle as pickle

%matplotlib inline

# Send records from this notebook's logger and from the clairvoya package
# to one shared log file.
voya_logger = logging.getLogger('voya_notebook')
voya_logger.setLevel(logging.INFO)

# Loggers are process-wide singletons, so re-running this cell would otherwise
# stack a fresh FileHandler on every execution (each record written several
# times, one leaked file handle per run). Reuse the handler that is already
# attached for this file when there is one.
log_file = os.path.abspath('voya_notebook.log')
fh = None
for existing_handler in voya_logger.handlers:
    if (isinstance(existing_handler, logging.FileHandler)
            and existing_handler.baseFilename == log_file):
        fh = existing_handler
        break
if fh is None:
    fh = logging.FileHandler(log_file)
fh.setLevel(logging.DEBUG)

# addHandler ignores a handler that is already attached, so these are safe to repeat.
voya_logger.addHandler(fh)
logging.getLogger('clairvoya').addHandler(fh)

# Run Conditions

In [None]:
# Data-set sizes. In the cells visible here these four values are only
# written into the plot title / results filename.
train_num_pos, train_num_unlab = 100, 1000
test_num_all, test_num_pos_frac = 3000, 0.5

# How many times the benchmark is repeated for each ratio below.
num_runs_per = 25

# Ratios swept by the benchmark; each one is multiplied by the number of
# positive rows to decide how many unlabelled rows get mixed in.
pos_to_unlab_range = (0.1, 0.5, 1, 3, 10)

In [None]:
# Load the benchmark data set (relative path) and preview the first rows.
data_csv = '../data/test1_uni_f.csv'
df = pd.read_csv(data_csv)
df.head()

In [None]:
# Summary statistics of the loaded data set, shown as the cell output.
df.describe()

In [None]:
# Count the rows carrying each value of the 'label' column.
label_values = df[['label']].values.ravel()
num_labels = pd.value_counts(label_values)  # numbers in each group
print(num_labels)

# Label coding used throughout the notebook: 1 = positive, -1 = negative, 0 = unlabelled.
num_pos, num_neg, num_unl = num_labels[1], num_labels[-1], num_labels[0]

In [None]:
# Split the data into rows with a known class (label != 0) and unlabelled rows (label == 0).
is_unlabelled = df['label'] == 0
df_unlab = df[is_unlabelled]
df_pos_neg = df[~is_unlabelled]

In [None]:
def generate_mixed_df(pos_to_unl_frac):
    """Return the labelled rows plus a random sample of the unlabelled rows.

    The sample size is ``int(num_pos * pos_to_unl_frac)``; if that exceeds the
    number of rows in ``df_unlab``, every unlabelled row is used. Reads the
    module-level ``df_pos_neg``, ``df_unlab`` and ``num_pos`` and leaves them
    unmodified.

    Parameters
    ----------
    pos_to_unl_frac : float
        Number of unlabelled rows to add, as a multiple of ``num_pos``.

    Returns
    -------
    pandas.DataFrame
        ``df_pos_neg`` followed by the sampled unlabelled rows, with a fresh
        0..n-1 index.
    """
    num_required_unlab = int(num_pos * pos_to_unl_frac)
    # DataFrame.reindex returns a new frame, so the shuffled copy has to be
    # used explicitly; the previous code discarded it and always took the
    # same leading rows. Sampling by position also works with a non-unique index.
    shuffled_positions = np.random.permutation(len(df_unlab))
    unlab_sample = df_unlab.iloc[shuffled_positions[:num_required_unlab]]
    return pd.concat([df_pos_neg, unlab_sample], ignore_index=True)

In [None]:
# Build one mixed frame with a ratio of 2 as a trial call; the benchmark loop
# further down reassigns mixed_df on every run.
mixed_df = generate_mixed_df(2)

# Configuring the benchmark

In [None]:
# Names of the classifiers to benchmark; each must be a key of the
# `classifiers` dict built inside the loop below.
clfs = ('Logistic Regression', 'Gradient Boosting', 'LR_PosOnly(E&N2008)', 'Random Forrest', 'SVM_DoubleWeight(E&N2008)')

# auc_results[clf_name] collects one list per ratio in pos_to_unlab_range;
# each of those lists holds the AUC score of every repeated run.
auc_results = {clf_name: [] for clf_name in clfs}

# Count from 1 so the progress message reads "1/5 ... 5/5" rather than "0/5 ... 4/5".
for g_num, pos_to_unlab in enumerate(pos_to_unlab_range, 1):
    voya_logger.info('Running classifiers for gamma={} ({}/{})'.format(pos_to_unlab, g_num, len(pos_to_unlab_range)))
    run_results = {clf_name: [] for clf_name in clfs}
    for i in xrange(num_runs_per):

        # Build the data for this run.
        mixed_df = generate_mixed_df(pos_to_unlab)

        config = {
            "data_file": mixed_df,
            "out_path": None,
            "num_cores": 3,
            "verbosity": 0,
            "pu_learning": True,
        }

        # Base estimators wrapped by the two PU-learning classifiers below.
        LR_estimator = sklearn.linear_model.LogisticRegression(C=0.4, penalty='l1')
        svc_estimator = sklearn.svm.SVC(C=2.5, kernel='linear', class_weight='auto', probability=True)

        # Estimators are constructed anew for every run.
        classifiers = {
            # 'Bagging LR': sklearn.ensemble.BaggingClassifier(LR_estimator, n_estimators=200, max_samples=0.3)
            'Logistic Regression': sklearn.linear_model.LogisticRegression(),
            'Gradient Boosting': sklearn.ensemble.GradientBoostingClassifier(n_estimators=100,
                                                                             learning_rate=0.1, max_depth=2),
            'LR_PosOnly(E&N2008)': clairvoya.pulearning.PosOnly(LR_estimator, hold_out_ratio=0.2),
            'SVM_DoubleWeight(E&N2008)': clairvoya.pulearning.PULearnByDoubleWeighting(svc_estimator),
            'Random Forrest': sklearn.ensemble.RandomForestClassifier(n_jobs=config["num_cores"]),
        }

        # No grid of parameters is supplied for any classifier.
        classifiers_gridparameters = {clf_name: None for clf_name in clfs}

        results_dict = clairvoya.runbench.run_benchmark(config, classifiers, classifiers_gridparameters)
        for clf_name in clfs:
            run_results[clf_name].append(results_dict[clf_name]['auc_score'])

    # Store this ratio's scores, keeping the order of pos_to_unlab_range.
    for clf_name in clfs:
        auc_results[clf_name].append(run_results[clf_name])

In [None]:
# Plot the mean AUC of each classifier against the swept ratio, with the
# standard error over the repeated runs as error bars.
colors = seaborn.color_palette("Set2", 10)

plt.figure(figsize=(10,10))
for i, clf_name in enumerate(clfs):
    # One row per ratio in pos_to_unlab_range, one column per repeated run.
    auc_scores = np.array(auc_results[clf_name])
    mean_auc = auc_scores.mean(axis=1)
    std_error = auc_scores.std(axis=1) / np.sqrt(num_runs_per)
    # The x values are the ratios the benchmark looped over; the name
    # `gamma_range` used here before is not defined anywhere in the notebook.
    plt.errorbar(pos_to_unlab_range, mean_auc, yerr=std_error, label=clf_name,
                 c=colors[i], capthick=0)
    plt.scatter(pos_to_unlab_range, mean_auc, c=colors[i], lw=0)

# Figure-level decoration and saving only need to happen once, after all
# classifiers have been drawn.
plt.ylabel('AUC Score')
plt.xlabel('Frac Unlabelled Positives')
plt.legend()

# `title` is also used by the results-saving cell to build its filename.
title = "Train P {}, U {}, Test {}, Testfrac {}".format(
    train_num_pos, train_num_unlab, test_num_all, test_num_pos_frac)
plt.title(title)

# Make sure the output directory exists so a fresh checkout does not fail here.
if not os.path.isdir('plots'):
    os.makedirs('plots')
plt.savefig('plots/{}.png'.format(title))

In [None]:
# Easy results save for now: pickle the raw per-run AUC scores.
# The filename reuses `title`, which is set by the plotting cell.
results_dir = 'result_data'
# Create the output directory if needed so the benchmark results are not
# lost to a missing-directory error at the very last step.
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)
results_path = os.path.join(results_dir, 'test 1 ' + title + '.pickle')
with open(results_path, 'wb') as f:
    pickle.dump(auc_results, f)