# Baselines: WDVD, ORES, FILTER

## Preamble

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import warnings

from sklearn.base import clone
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
import sys
sys.path.append('../src/')

import evaluation
import load_wdvd
import multipleinstance
import storage

from load_wdvd import transform
from transformers import EqualsTransformer

In [4]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one. Cells that end in several bare expressions (e.g. three `len(...)` calls)
# rely on this to produce one output per expression.
InteractiveShell.ast_node_interactivity = 'all'

In [5]:
# Raise pandas' display limits so that frames with up to 500 columns / 200 rows
# are rendered without truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 200)

## Loading

In [6]:
# Root directory of the per-split feature files; the loading cell reads
# `<split>/features.csv.bz2` for training, validation and test from here.
PATH_FEATURES = '../../data/features/'

In [7]:
# Directory prefix for every prediction file written via
# storage.dump_predictions() in this notebook.
OUTPUT_DIR = '../../data/classification/'

### Load Dataframe

In [8]:
# Only two columns are read from the feature files: the revision id (becomes
# the index via index_col=0) and the editing-tool flag.
fields = ['revisionId', 'isEditingTool']

# The builtin `int` / `bool` replace the `np.int` / `np.bool` aliases, which
# were deprecated in NumPy 1.20 and removed in NumPy 1.24. The aliases were the
# builtins themselves, so the parsed dtypes are unchanged.
dtype = {
    'revisionId': int,
    'isEditingTool': bool,
    'subject': int,
    'predicate': int,
    'object': int,
    'superSubject': int,
    'superObject': int,
}

# NOTE(review): every warning raised while parsing is silenced here; the
# reason is not visible in this notebook - confirm it is still needed.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    df_train = pd.read_csv(PATH_FEATURES + '/training/features.csv.bz2', index_col=0, usecols=fields, dtype=dtype)
    df_val = pd.read_csv(PATH_FEATURES + '/validation/features.csv.bz2', index_col=0, usecols=fields, dtype=dtype)
    df_test = pd.read_csv(PATH_FEATURES + '/test/features.csv.bz2', index_col=0, usecols=fields, dtype=dtype)

# Row counts per split (one output each).
len(df_train)
len(df_val)
len(df_test)

24280578

1703987

2827193

### Load WDVD Features

In [9]:
# Load the WDVD feature frame that supplies all model inputs below.
# NOTE(review): use_test_set=False, yet the following cells index df_wdvd by
# df_test.index - confirm what this flag controls in load_wdvd.
df_wdvd = load_wdvd.load_df_wdvd(use_test_set=False)

In [10]:
# Copy the editing-tool flag of each split onto the WDVD frame, addressing the
# rows through each split's index.
df_wdvd.loc[df_train.index, 'isEditingTool'] = df_train['isEditingTool']
df_wdvd.loc[df_val.index, 'isEditingTool'] = df_val['isEditingTool']
df_wdvd.loc[df_test.index, 'isEditingTool'] = df_test['isEditingTool']

In [11]:
# Replace the two-column split frames by the matching rows of the WDVD frame.
df_train = df_wdvd.loc[df_train.index]
df_val   = df_wdvd.loc[df_val.index]
df_test  = df_wdvd.loc[df_test.index]

# Boolean masks selecting revisions that were NOT made with an editing tool.
# `astype(bool)` replaces `astype(np.bool)`: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in 1.24; it was the builtin `bool` itself.
mask_nosat_train = ~df_train['isEditingTool'].values.astype(bool)
mask_nosat_val   = ~df_val['isEditingTool'].values.astype(bool)
mask_nosat_test  = ~df_test['isEditingTool'].values.astype(bool)

# Keep only those revisions; all later cells work on the reduced splits.
df_train = df_train.loc[mask_nosat_train]
df_val = df_val.loc[mask_nosat_val]
df_test = df_test.loc[mask_nosat_test]

In [12]:
# Row counts of the three splits after editing-tool revisions were dropped.
len(df_train)
len(df_val)
len(df_test)

5890968

550798

560524

### Utils

In [13]:
def classify(columns, clf, use_weights=False):
    """Fit `clf` on the notebook's splits and display validation/test metrics.

    Reads the module-level frames `df_train`, `df_val` and `df_test`. Two
    independent clones of `clf` are fitted: one on the training split (scored
    on the validation split) and one on training + validation (scored on the
    test split). `clf` itself is never fitted.

    Parameters
    ----------
    columns : list of str
        Feature columns selected from each frame.
    clf : estimator
        Unfitted classifier. Instances of
        `multipleinstance.BaseMultipleInstanceClassifier` additionally receive
        the `revisionSessionId` column as first argument of `fit` and
        `predict_proba`.
    use_weights : bool, default False
        If True, the `weight` column of `df_train` / `df_val` is passed as
        `sample_weight`; otherwise `sample_weight=None` is passed.

    Returns
    -------
    tuple
        `(proba_val, proba_test)`. For ordinary classifiers this is column 1
        of `predict_proba`; for multiple-instance classifiers it is whatever
        their `predict_proba` returns.

    Side effects
    ------------
    Displays a two-row metrics table (rows 'VAL' and 'TEST').
    """
    y_train = df_train['rollbackReverted'].values
    y_val   = df_val['rollbackReverted'].values
    y_test  = df_test['rollbackReverted'].values
    y_train_val = np.concatenate([y_train, y_val])

    X_train  = df_train[columns].values
    X_val    = df_val[columns].values
    X_test   = df_test[columns].values
    X_train_val = np.concatenate([X_train, X_val])

    if use_weights:
        weight_train = df_train['weight'].values
        weight_val = df_val['weight'].values
        weight_train_val = np.concatenate([weight_train, weight_val])
    else:
        weight_train = None
        weight_train_val = None

    # The "protected" group for the bias metrics is the unregistered users.
    protected_val   = ~df_val['isRegisteredUser'].values
    protected_test  = ~df_test['isRegisteredUser'].values

    # Separate clones so the validation model never sees validation labels.
    clf_val  = clone(clf)
    clf_test = clone(clf)

    if (isinstance(clf, multipleinstance.BaseMultipleInstanceClassifier)):
        # Multiple-instance classifiers take the session id as grouping key.
        g_train = df_train['revisionSessionId'].values
        g_val = df_val['revisionSessionId'].values
        g_test = df_test['revisionSessionId'].values
        g_train_val = np.concatenate([g_train, g_val])

        clf_val.fit(g_train, X_train, y_train, sample_weight=weight_train)
        clf_test.fit(g_train_val, X_train_val, y_train_val, sample_weight=weight_train_val)
        proba_val  = clf_val.predict_proba(g_val, X_val)
        proba_test = clf_test.predict_proba(g_test, X_test)
    else:
        clf_val.fit(X_train, y_train, sample_weight=weight_train)
        clf_test.fit(X_train_val, y_train_val, sample_weight=weight_train_val)
        proba_val   = clf_val.predict_proba(X_val)[:, 1]
        proba_test  = clf_test.predict_proba(X_test)[:, 1]

    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0.
    # NOTE(review): assumes evaluate_proba_performance_bias returns a one-row
    # DataFrame labelled by `index` - confirm against the evaluation module.
    metrics = pd.concat([
        evaluation.evaluate_proba_performance_bias(
            y_val, protected_val, proba_val, index='VAL'),
        evaluation.evaluate_proba_performance_bias(
            y_test, protected_test, proba_test, index='TEST'),
    ])

    display(metrics)

    return proba_val, proba_test

## Baselines on WDVC-2016-Links

### WDVD

In [14]:
# WDVD feature group "character" (per the constant's name); one of the seven
# groups merged into FEATURES_WDVD further down.
FEATURES_WDVD_CHARACTER = [
    'lowerCaseRatio',
    'upperCaseRatio',
    'nonLatinRatio',
    'latinRatio',
    'alphanumericRatio',
    'digitRatio',
    'punctuationRatio',
    'whitespaceRatio',
    'longestCharacterSequence',
    'asciiRatio',
    'bracketRatio'
]

In [15]:
# WDVD feature group "word" (per the constant's name); merged into
# FEATURES_WDVD further down.
FEATURES_WDVD_WORD = [
    'languageWordRatio',
    'containsLanguageWord',
    'lowerCaseWordRatio',
    'longestWord',
    'containsURL',
    'badWordRatio',
    'proportionOfQidAdded',
    'upperCaseWordRatio',
    'proportionOfLinksAdded',
]

In [16]:
# WDVD feature group "sentence" (per the constant's name); merged into
# FEATURES_WDVD further down.
FEATURES_WDVD_SENTENCE = [
    'commentTailLength',
    'commentSitelinkSimilarity',
    'commentLabelSimilarity',
    'commentCommentSimilarity',
]

In [17]:
# WDVD feature group "statement" (per the constant's name); merged into
# FEATURES_WDVD further down.
FEATURES_WDVD_STATEMENT = [
    'propertyFreq',
    'itemValueFreq',
    'literalValueFreq',
]

In [18]:
# WDVD feature group "user" (per the constant's name); merged into
# FEATURES_WDVD further down.
# Note: 'isRegisteredUser' is also the column classify() negates to define the
# protected group for its bias metrics, so the model sees that attribute.
FEATURES_WDVD_USER = [
    'userCountryFreq',
    'userTimeZoneFreq',
    'userCityFreq',
    'userCountyFreq',
    'userRegionFreq',
    'cumUserUniqueItems',
    'userContinentFreq',
    'isRegisteredUser',
    'userFreq',
    'isPrivilegedUser',
]

In [19]:
# WDVD feature group "item" (per the constant's name); merged into
# FEATURES_WDVD further down.
FEATURES_WDVD_ITEM = [
    'logCumItemUniqueUsers',
    'logItemFreq',
]

In [20]:
# WDVD feature group "revision" (per the constant's name); merged into
# FEATURES_WDVD further down.
FEATURES_WDVD_REVISION = [
    'revisionTagsFreq',
    'revisionLanguageFreq',
    'revisionActionFreq',
    'commentLength',
    'isLatinLanguage',
    'revisionPrevActionFreq',
    'revisionSubactionFreq',
    'positionWithinSession',
]

In [21]:
# Merge the seven WDVD feature groups into one alphabetically ordered list.
FEATURES_WDVD = sorted(
    FEATURES_WDVD_CHARACTER
    + FEATURES_WDVD_WORD
    + FEATURES_WDVD_SENTENCE
    + FEATURES_WDVD_STATEMENT
    + FEATURES_WDVD_USER
    + FEATURES_WDVD_ITEM
    + FEATURES_WDVD_REVISION
)

# Displayed as a quick check of the total feature count.
len(FEATURES_WDVD)

47

In [22]:
columns = FEATURES_WDVD

# Branch 1: a bagging ensemble of 16 small random forests, each fitted on a
# 1/16 sample of the data, wrapped as a single-instance classifier.
clf = RandomForestClassifier(n_estimators=8, max_depth=32, max_features=2, random_state=1)
clf = BaggingClassifier(base_estimator=clf, n_estimators=16, max_samples=1/16, n_jobs=1, random_state=1)
clf1 = multipleinstance.SingleInstanceClassifier(base_estimator=clf, agg_func='cummean', window=1)

# Branch 2: an identically configured ensemble, wrapped as a simple
# multiple-instance classifier.
clf = RandomForestClassifier(n_estimators=8, max_depth=32, max_features=2, random_state=1)
clf = BaggingClassifier(base_estimator=clf, n_estimators=16, max_samples=1/16, n_jobs=1, random_state=1)
clf2 = multipleinstance.SimpleMultipleInstanceClassifier(base_estimator=clf, trans_func='cummin_cummax', window=1)

# Both branches are combined; classify() fits clones and displays the metrics.
clf = multipleinstance.CombinedMultipleInstanceClassifier(base_estimator1=clf1, base_estimator2=clf2)
WDVD_VAL, WDVD_TEST = classify(columns=columns, clf=clf);

Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
VAL,550798,2604,0.995492,0.518211,0.982594,550798,0.176904,0.000794,0.114982,0.000719,0.114263,159.83859
TEST,560524,2429,0.996193,0.546765,0.989712,560524,0.176391,0.000463,0.121456,0.000391,0.121065,310.737997


In [23]:
# Write the WDVD validation and test scores next to their source frames
# (file layout is defined by storage.dump_predictions).
storage.dump_predictions(df_val, WDVD_VAL, OUTPUT_DIR + 'VALIDATION_WDVD.csv.bz2')
storage.dump_predictions(df_test, WDVD_TEST, OUTPUT_DIR + 'TEST_WDVD.csv.bz2')

### ORES

In [24]:
# ORES feature group "general". Entries named `<action>_<subaction>` and
# 'identifier_changed' are created by transform_ores(); the 'numberOf*'
# columns are not created there and are presumably already in the loaded frame.
FEATURES_ORES_GENERAL = [
    # Added/removed/changed sitelinks
    'wbsetsitelink_na',
    'wbsetsitelink_add',
    'wbsetsitelink_set',
    'wbsetsitelink_remove',

    # Added/removed/changed labels
    'wbsetlabel_add',
    'wbsetlabel_set',
    'wbsetlabel_remove',
    'special_setlabel-set',

    # Added/removed/changed descriptions
    'wbsetdescription_add',
    'wbsetdescription_set',
    'wbsetdescription_remove',

    # Added/removed/changed statements
    'wbcreateclaim_na',
    'wbcreateclaim_create',
    'wbsetclaim_create',
    'wbsetclaim_update',
    'wbremoveclaims_na',
    'wbremoveclaims_remove',
    'wbsetclaim_update-rank',
    'wbsetclaimvalue_na',

    # Added/removed/changed aliases
    'wbsetaliases_add',
    'wbsetaliases_add-remove',
    'wbsetaliases_set',
    'wbsetaliases_remove',

    # Added/removed badges
    'wbsetsitelink_set-badges',
    'wbsetsitelink_add-both',
    'wbsetsitelink_set-both',

    # Added/removed qualifiers
    'wbsetqualifier_na',
    'wbsetqualifier_add',
    'wbsetqualifier_update',
    'wbsetclaim_update-qualifiers',
    'wbremovequalifiers_remove',

    # Added/removed references
    'wbsetreference_na',
    'wbsetreference_add',
    'wbsetreference_set',
    'wbremovereferences_na',
    'wbremovereferences_remove',

    # Misc
    'wbsetlabeldescriptionaliases_na',
    'wbcreateredirect_na',
    'wblinktitles_connect',
    'wblinktitles_create',

    # Count columns
    'numberOfSitelinks',
    'numberOfLabels',
    'numberOfDescriptions',
    'numberOfStatements',
    'numberOfAliases',
    'numberOfBadges',
    'numberOfQualifiers',
    'numberOfReferences',
    'numberOfProperties',

    # dataType == 'external-id' indicator built in transform_ores()
    'identifier_changed'
]

# ORES feature group "vandalism" (per the constant's name). 'en_label_touched'
# and the 'P*' indicators are created by transform_ores(); the remaining
# columns are presumably already in the loaded frame.
FEATURES_ORES_VANDALISM = [
    'proportionOfQidAdded',
    'proportionOfLinksAdded',
    'proportionOfLanguageAdded',

    # Has English label changed?
    'en_label_touched',

    # Changed properties (P21, P27, P54, P569, P18, P109, P373, P856)
    'P21',
    'P27',
    'P54',
    'P569',
    'P18',
    'P109',
    'P373',
    'P856',

    'isLivingPerson',
    'isHuman'
]

# ORES feature group "non-vandalism" (per the constant's name). All entries
# are `<action>_<subaction>` indicators created by transform_ores().
FEATURES_ORES_NON_VANDALISM = [
    # Is it a client edit?
    'clientsitelink_update',
    'clientsitelink_remove',

    # Is it a merge?
    'wbmergeitems_from',
    'wbmergeitems_to',

    # Revert, rollback, restore
    'undo_na',
    'rollback_na',
    'restore_na',

    # Is it creating a new item?
    'wbcreate_new',
    'pageCreation_na',
    'special_create-item',
    'wbsetentity_na',
    'wbeditentity_na',
    'wbeditentity_create',
    'wbeditentity_update',
    'wbeditentity_override'
]

# ORES feature group "editor" (per the constant's name). None of these columns
# is created by transform_ores(); presumably they are already in the loaded
# frame. 'isRegisteredUser' is also the attribute classify() uses to define
# the protected group for its bias metrics.
FEATURES_ORES_EDITOR = [
    'isBotUser',
    'isAdvancedUser',
    'isAdminUser',
    'isCuratorUser',
    'isRegisteredUser',
    'userSecondsSinceFirstRevisionRegistered'
]

# Union of the four ORES feature groups, in alphabetical order.
FEATURES_ORES = sorted(
    FEATURES_ORES_GENERAL
    + FEATURES_ORES_VANDALISM
    + FEATURES_ORES_NON_VANDALISM
    + FEATURES_ORES_EDITOR
)

# Guards against a feature being silently added to or dropped from a group.
assert len(FEATURES_ORES) == 85

In [25]:
def transform_ores(slice_fit, df):
    """Return a copy of `df` extended with the ORES indicator columns.

    Every added column is the result of
    ``transform(slice_fit, <column subset of the copy>, EqualsTransformer(<values>))``.
    Action indicators are named ``<revisionAction>_<revisionSubaction>``, with
    ``na`` standing in for a NaN subaction. Columns are appended in a fixed
    order, beginning with 'wbsetsitelink_na'.

    Parameters
    ----------
    slice_fit
        Passed through unchanged as first argument of every `transform` call.
    df : pandas.DataFrame
        Must provide 'revisionAction', 'revisionSubaction', 'revisionLanguage',
        'dataType' and 'property'. The frame is copied, not modified.

    Returns
    -------
    pandas.DataFrame
        The extended copy.
    """
    result = df.copy()
    actions = result[['revisionAction', 'revisionSubaction']]

    def equals(frame, *values):
        # Single point where the transformer is built and applied.
        return transform(slice_fit, frame, EqualsTransformer(values))

    def add_action_columns(pairs):
        # One column per (action, subaction) pair, appended in the given order.
        for action, subaction in pairs:
            suffix = subaction if isinstance(subaction, str) else 'na'
            result['{}_{}'.format(action, suffix)] = equals(actions, action, subaction)

    add_action_columns([
        # Added/removed/changed sitelinks
        ('wbsetsitelink', np.nan),
        ('wbsetsitelink', 'add'),
        ('wbsetsitelink', 'set'),
        ('wbsetsitelink', 'remove'),

        # Added/removed/changed labels
        ('wbsetlabel', 'add'),
        ('wbsetlabel', 'set'),
        ('wbsetlabel', 'remove'),
        ('special', 'setlabel-set'),

        # Added/removed/changed descriptions
        ('wbsetdescription', 'add'),
        ('wbsetdescription', 'set'),
        ('wbsetdescription', 'remove'),

        # Added/removed/changed statements
        ('wbcreateclaim', np.nan),
        ('wbcreateclaim', 'create'),
        ('wbsetclaim', 'create'),
        ('wbsetclaim', 'update'),
        ('wbremoveclaims', np.nan),
        ('wbremoveclaims', 'remove'),
        ('wbsetclaim', 'update-rank'),
        ('wbsetclaimvalue', np.nan),

        # Added/removed/changed aliases
        ('wbsetaliases', 'add'),
        ('wbsetaliases', 'add-remove'),
        ('wbsetaliases', 'set'),
        ('wbsetaliases', 'remove'),

        # Added/removed badges
        ('wbsetsitelink', 'set-badges'),
        ('wbsetsitelink', 'add-both'),
        ('wbsetsitelink', 'set-both'),

        # Added/removed qualifiers
        ('wbsetqualifier', np.nan),
        ('wbsetqualifier', 'add'),
        ('wbsetqualifier', 'update'),
        ('wbsetclaim', 'update-qualifiers'),
        ('wbremovequalifiers', 'remove'),

        # Added/removed references
        ('wbsetreference', np.nan),
        ('wbsetreference', 'add'),
        ('wbsetreference', 'set'),
        ('wbremovereferences', np.nan),
        ('wbremovereferences', 'remove'),

        # Misc
        ('wbsetlabeldescriptionaliases', np.nan),
        ('wbcreateredirect', np.nan),
        ('wblinktitles', 'connect'),
        ('wblinktitles', 'create'),
    ])

    result['identifier_changed'] = equals(result[['dataType']], 'external-id')

    # Has English label changed?
    result['en_label_touched'] = equals(
        result[['revisionAction', 'revisionLanguage']], 'wbsetlabel', 'en')

    # Changed properties:
    #   P21 sex or gender, P27 country of citizenship, P54 member of sports team,
    #   P569 date of birth, P18 image, P109 signature, P373 commons category,
    #   P856 official website
    for property_id in (21, 27, 54, 569, 18, 109, 373, 856):
        result['P{}'.format(property_id)] = equals(result[['property']], property_id)

    add_action_columns([
        # Is it a client edit?
        ('clientsitelink', 'update'),
        ('clientsitelink', 'remove'),

        # Is it a merge?
        ('wbmergeitems', 'from'),
        ('wbmergeitems', 'to'),

        # Revert, rollback, restore
        ('undo', np.nan),
        ('rollback', np.nan),
        ('restore', np.nan),

        # Is it creating a new item?
        ('wbcreate', 'new'),
        ('pageCreation', np.nan),
        ('special', 'create-item'),
        ('wbsetentity', np.nan),
        ('wbeditentity', np.nan),
        ('wbeditentity', 'create'),
        ('wbeditentity', 'update'),
        ('wbeditentity', 'override'),
    ])

    return result

In [26]:
# Dry run of transform_ores on the test split, used only for the checks below.
df_ores_test = transform_ores(slice(0, 0), df_test)

# Columns from 'wbsetsitelink_na' onwards are the ones transform_ores appended:
# their names must be unique, and none of them may already exist in df_test.
assert len(set(df_ores_test.loc[:, 'wbsetsitelink_na':].columns)) == len(df_ores_test.loc[:, 'wbsetsitelink_na':].columns)
assert set(df_ores_test.loc[:, 'wbsetsitelink_na':].columns).intersection(set(df_test.columns)) == set()

In [27]:
# Rebind the three splits to their ORES-extended copies.
# slice(0, 0) is an empty slice - presumably nothing needs to be fitted for
# EqualsTransformer; TODO confirm against load_wdvd.transform.
df_train = transform_ores(slice(0, 0), df_train)
df_val = transform_ores(slice(0, 0), df_val)
df_test = transform_ores(slice(0, 0), df_test)

In [28]:
# ORES baseline: a plain random forest on the ORES feature set. It is not a
# multiple-instance classifier, so classify() takes its ordinary branch.
columns = FEATURES_ORES
clf = RandomForestClassifier(
    n_estimators=80, criterion='entropy', min_samples_leaf=1, max_features='log2', n_jobs=1, random_state=1)
ORES_VAL, ORES_TEST = classify(columns=columns, clf=clf);

Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
VAL,550798,2604,0.995476,0.390388,0.941102,550798,0.143689,0.001553,0.099599,0.001487,0.098112,66.976387
TEST,560524,2429,0.995918,0.433974,0.964806,560524,0.154411,0.000958,0.114428,0.000859,0.113568,133.141375


In [29]:
# Write the ORES validation and test scores (file layout is defined by
# storage.dump_predictions).
storage.dump_predictions(df_val, ORES_VAL, OUTPUT_DIR + 'VALIDATION_ORES.csv.bz2')
storage.dump_predictions(df_test, ORES_TEST, OUTPUT_DIR + 'TEST_ORES.csv.bz2')

### FILTER

In [30]:
# The FILTER baseline uses a single input column.
FEATURES_FILTER = ['revisionTagsFreq']

In [31]:
# FILTER baseline: an unrestricted-depth random forest on FEATURES_FILTER.
columns = FEATURES_FILTER
clf = RandomForestClassifier(n_estimators=10, max_depth=None, n_jobs=1, random_state=1)
FILTER_VAL, FILTER_TEST = classify(columns=columns, clf=clf);

Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
VAL,550798,2604,0.995514,0.27377,0.883888,550798,0.120335,0.002087,0.085727,0.002016,0.083711,42.52713
TEST,560524,2429,0.995979,0.301973,0.923834,560524,0.128555,0.001539,0.097819,0.001413,0.096406,69.223894


In [32]:
# Write the FILTER validation and test scores (file layout is defined by
# storage.dump_predictions).
storage.dump_predictions(df_val, FILTER_VAL, OUTPUT_DIR + 'VALIDATION_FILTER.csv.bz2')
storage.dump_predictions(df_test, FILTER_TEST, OUTPUT_DIR + 'TEST_FILTER.csv.bz2')

## Weighting WDVD Samples

In [33]:
def wdvd_weighted(max_depth, weight):
    """Run the WDVD classifier with per-sample weights and return its scores.

    Each training/validation sample first gets the weight ``1 / n``, where
    ``n`` is the size of its (rollbackReverted, isRegisteredUser) group over
    the combined training + validation data. The weights of two groups -
    non-reverted revisions by unregistered users and reverted revisions by
    registered users - are then multiplied by `weight`.

    Side effect: a 'weight' column is written to the module-level frames
    `df_train` and `df_val`.

    Parameters
    ----------
    max_depth
        `max_depth` of the underlying random forests.
    weight
        Multiplier applied to the two groups named above.

    Returns
    -------
    tuple
        `(proba_val, proba_test)` as returned by `classify`.
    """
    def bagged_forest():
        # Fresh, unfitted ensemble; both branches use the same configuration.
        forest = RandomForestClassifier(n_estimators=8, max_depth=max_depth, max_features=2, random_state=1)
        return BaggingClassifier(base_estimator=forest, n_estimators=16, max_samples=1/16, n_jobs=1, random_state=1)

    # Inverse group size as base weight, computed over train + val together.
    pooled = pd.concat([df_train, df_val])
    inverse_group_size = (
        pooled.reset_index()
        .groupby(['rollbackReverted', 'isRegisteredUser'])['revisionId']
        .transform(lambda x: 1 / len(x))
    )
    pooled['weight'] = inverse_group_size.values

    # Scale the two selected groups by `weight`.
    reverted = pooled['rollbackReverted']
    registered = pooled['isRegisteredUser']
    pooled.loc[~reverted & ~registered, 'weight'] *= weight
    pooled.loc[reverted & registered, 'weight'] *= weight

    # Hand the weights back to the split frames, where classify() reads them.
    df_train['weight'] = pooled['weight']
    df_val['weight'] = pooled['weight']

    single_instance = multipleinstance.SingleInstanceClassifier(
        base_estimator=bagged_forest(), agg_func='cummean', window=1)
    multiple_instance = multipleinstance.SimpleMultipleInstanceClassifier(
        base_estimator=bagged_forest(), trans_func='cummin_cummax', window=1)
    combined = multipleinstance.CombinedMultipleInstanceClassifier(
        base_estimator1=single_instance, base_estimator2=multiple_instance)

    return classify(columns=FEATURES_WDVD, clf=combined, use_weights=True)

In [34]:
# Weighted variant 1: max_depth=16, group weight multiplier 8.1.
# NOTE(review): how 8.1 was chosen is not explained in this notebook.
WDVD_WEIGHTED1_VAL, WDVD_WEIGHTED1_TEST = wdvd_weighted(16, 8.1)

Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
VAL,550798,2604,0.993442,0.109354,0.942348,550798,0.031377,0.004119,0.01684,0.004092,0.012748,4.115646
TEST,560524,2429,0.993988,0.160444,0.963498,560524,0.035996,0.003621,0.018935,0.003552,0.015384,5.33132


In [35]:
# NOTE(review): the validation dump is commented out, so only the test
# predictions of this variant are written - confirm this is intentional.
# storage.dump_predictions(df_val, WDVD_WEIGHTED1_VAL, OUTPUT_DIR + 'VALIDATION_WDVD_WEIGHTED1.csv.bz2')
storage.dump_predictions(df_test, WDVD_WEIGHTED1_TEST, OUTPUT_DIR + 'TEST_WDVD_WEIGHTED1.csv.bz2')

In [36]:
# Weighted variant 2: max_depth=16, group weight multiplier 4.3.
# NOTE(review): how 4.3 was chosen is not explained in this notebook.
WDVD_WEIGHTED2_VAL, WDVD_WEIGHTED2_TEST = wdvd_weighted(16, 4.3)

Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
VAL,550798,2604,0.994352,0.200457,0.957326,550798,0.050695,0.003678,0.026648,0.003638,0.02301,7.32428
TEST,560524,2429,0.995017,0.314351,0.972571,560524,0.064599,0.002978,0.033117,0.00287,0.030247,11.537424


In [37]:
# NOTE(review): the validation dump is commented out, so only the test
# predictions of this variant are written - confirm this is intentional.
# storage.dump_predictions(df_val, WDVD_WEIGHTED2_VAL, OUTPUT_DIR + 'VALIDATION_WDVD_WEIGHTED2.csv.bz2')
storage.dump_predictions(df_test, WDVD_WEIGHTED2_TEST, OUTPUT_DIR + 'TEST_WDVD_WEIGHTED2.csv.bz2')