# FAIR-S

## Preamble

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import gc
import warnings

import numpy as np
import pandas as pd

from pandas import MultiIndex
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

In [3]:
import sys
sys.path.append('../src/')

import load_predicate_embedding
import load_wdvd
import storage

from evaluation import evaluate_proba_performance_bias

In [4]:
from transformers import FrequencyTransformer
from transformers import CumFrequencyTransformer

In [5]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

## Load Data

In [6]:
PATH_FEATURES = '../../data/features/'

In [7]:
OUTPUT_DIR = '../../data/classification/'
FILE_OUTPUT_VALIDATION_FAIR_S = OUTPUT_DIR + 'VALIDATION_FAIR_S.csv.bz2'
FILE_OUTPUT_TEST_FAIR_S = OUTPUT_DIR + 'TEST_FAIR_S.csv.bz2'

### Load Dataframe

In [8]:
fields = ['revisionId', 'isEditingTool', 'subject', 'predicate', 'object', 'superSubject', 'superObject']

dtype = {
    'revisionId': np.int,
    'isEditingTool': np.bool,
    'subject': np.int,
    'predicate': np.int,
    'object': np.int,
    'superSubject': np.int,
    'superObject': np.int,
}

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    df_train = pd.read_csv(PATH_FEATURES + '/training/features.csv.bz2', index_col=0, usecols=fields, dtype=dtype)
    df_val = pd.read_csv(PATH_FEATURES + '/validation/features.csv.bz2', index_col=0, usecols=fields, dtype=dtype)
    df_test = pd.read_csv(PATH_FEATURES + '/test/features.csv.bz2', index_col=0, usecols=fields, dtype=dtype)

len(df_train)
len(df_val)
len(df_test)

24280578

1703987

2827193

### Load WDVD Features

In [9]:
SLICE_FIT = slice(0, load_wdvd.VALIDATION_SET_START)

In [10]:
df_wdvd = load_wdvd.load_df_wdvd(use_test_set=False)

In [11]:
df_wdvd.loc[df_train.index, 'isEditingTool'] = df_train['isEditingTool']
df_wdvd.loc[df_val.index, 'isEditingTool'] = df_val['isEditingTool']
df_wdvd.loc[df_test.index, 'isEditingTool'] = df_test['isEditingTool']

### Load Predicates

In [12]:
y_all = df_wdvd['rollbackReverted'][df_wdvd['isEditingTool'] == df_wdvd['isEditingTool']].values  # filter na values for noSAT

In [13]:
N_FEATURES = 100
SCORE_FUNC = load_predicate_embedding.count_nonzero

data, meta = load_predicate_embedding.load_matrices()
load_predicate_embedding.binarize_features(data)
load_predicate_embedding.select_item_predicates_at_end_of_training_set(data, meta)

slice_fit = slice(0, meta['n_train'])
load_predicate_embedding.select_features(data, meta, y_all, slice_fit, score_func=SCORE_FUNC, k=N_FEATURES)

In [14]:
df_freq = load_predicate_embedding.frequency_encoding(data, slice_fit)

In [15]:
df_freq.index = np.concatenate([df_train.index, df_val.index, df_test.index])

df_wdvd.loc[df_train.index, 'subjectPredEmbedFrequency'] = df_freq.loc[df_train.index, 'subjectPredEmbedFrequency']
df_wdvd.loc[df_val.index, 'subjectPredEmbedFrequency'] = df_freq.loc[df_val.index, 'subjectPredEmbedFrequency']
df_wdvd.loc[df_test.index, 'subjectPredEmbedFrequency'] = df_freq.loc[df_test.index, 'subjectPredEmbedFrequency']

df_wdvd.loc[df_train.index, 'objectPredEmbedFrequency']  = df_freq.loc[df_train.index, 'objectPredEmbedFrequency']
df_wdvd.loc[df_val.index, 'objectPredEmbedFrequency']  = df_freq.loc[df_val.index, 'objectPredEmbedFrequency']
df_wdvd.loc[df_test.index, 'objectPredEmbedFrequency']  = df_freq.loc[df_test.index, 'objectPredEmbedFrequency']

df_wdvd.loc[df_train.index, 'objectOutPredEmbedFrequency']  = df_freq.loc[df_train.index, 'objectOutPredEmbedFrequency']
df_wdvd.loc[df_val.index, 'objectOutPredEmbedFrequency']  = df_freq.loc[df_val.index, 'objectOutPredEmbedFrequency']
df_wdvd.loc[df_test.index, 'objectOutPredEmbedFrequency']  = df_freq.loc[df_test.index, 'objectOutPredEmbedFrequency']

In [16]:
del(df_freq)

In [17]:
print(df_wdvd['subjectPredEmbedFrequency'].nunique())
print(df_wdvd['objectPredEmbedFrequency'].nunique())

2243
2209


### Load Super Subjects

In [18]:
df_wdvd.loc[df_train.index, 'superSubject']  = df_train['superSubject']
df_wdvd.loc[df_val.index, 'superSubject']  = df_val['superSubject']
df_wdvd.loc[df_test.index, 'superSubject']  = df_test['superSubject']

### Load Super Objects

In [19]:
df_wdvd.loc[df_train.index, 'superObject']  = df_train['superObject']
df_wdvd.loc[df_val.index, 'superObject']  = df_val['superObject']
df_wdvd.loc[df_test.index, 'superObject']  = df_test['superObject']

### New Features

In [20]:
df_wdvd['superSubjectFrequency'] = load_wdvd.transform(SLICE_FIT, df_wdvd[['superSubject']], FrequencyTransformer())
df_wdvd['superObjectFrequency']  = load_wdvd.transform(SLICE_FIT, df_wdvd[['superObject']], FrequencyTransformer())

In [21]:
df_wdvd.loc[df_train.index, 'predicate'] = df_train['predicate']
df_wdvd.loc[df_val.index, 'predicate']   = df_val['predicate']
df_wdvd.loc[df_test.index, 'predicate']  = df_test['predicate']

df_wdvd.loc[df_train.index, 'object'] = df_train['object']
df_wdvd.loc[df_val.index, 'object']   = df_val['object']
df_wdvd.loc[df_test.index, 'object']  = df_test['object']

In [22]:
df_wdvd['subjectPredicateFrequency'] = load_wdvd.transform(SLICE_FIT, df_wdvd[['itemId', 'predicate']], FrequencyTransformer())
df_wdvd['objectPredicateFrequency'] = load_wdvd.transform(SLICE_FIT, df_wdvd[['predicate', 'object']], FrequencyTransformer())

In [23]:
df_wdvd['subjectPredicateCumFrequency'] =\
    load_wdvd.transform(SLICE_FIT, df_wdvd[['itemId', 'predicate']], CumFrequencyTransformer())
df_wdvd['objectPredicateCumFrequency'] =\
    load_wdvd.transform(SLICE_FIT, df_wdvd[['predicate', 'object']], CumFrequencyTransformer())

In [24]:
columns = {
    'englishLabel': 'subjectLabel',
    'englishDescription': 'subjectDescription',
    'englishAliases': 'subjectAliases',
    'englishSitelink': 'subjectSitelink',
}

df_wdvd.rename(columns=columns, inplace=True)

In [25]:
df_wdvd['subjectLabelWordLength'] = df_wdvd['subjectLabel'].str.split().str.len().fillna(-1)
df_wdvd['subjectSitelinkWordLength'] = df_wdvd['subjectSitelink'].str.split().str.len().fillna(-1)

### Synchronize Dataframes

In [26]:
mask_nosat_train = ~df_train['isEditingTool'].values.astype(np.bool)
mask_nosat_val   = ~df_val['isEditingTool'].values.astype(np.bool)
mask_nosat_test  = ~df_test['isEditingTool'].values.astype(np.bool)

df_train = df_train[mask_nosat_train]
df_val   = df_val[mask_nosat_val]
df_test  = df_test[mask_nosat_test]

In [27]:
usecols = df_wdvd.columns.difference(df_test.columns)
df_train = df_train.merge(df_wdvd[usecols], left_index=True, right_index=True)
df_val = df_val.merge(df_wdvd[usecols], left_index=True, right_index=True)
df_test = df_test.merge(df_wdvd[usecols], left_index=True, right_index=True)
df_all = pd.concat([df_train, df_val, df_test])
print(len(df_all))

7002290


In [28]:
gc.collect()
del(df_wdvd)
gc.collect()

42

159

## Classification

### Classification Utils

In [29]:
def get_variance(clf, X, y, protected):
    metrics = pd.DataFrame()
    for estimator in clf.estimators_:
        proba   = estimator.predict_proba(X)[:, 1]
        metrics_cur = evaluate_proba_performance_bias(y, protected, proba)
        metrics = metrics.append(metrics_cur)

    aggregate = metrics.describe()
    display(aggregate)

    metrics = pd.concat([metrics, aggregate])
    return metrics

In [30]:
def classify(columns, n_estimators, max_depth, n_jobs=1, random_state=1, use_weights=False):
    y_train = df_train['rollbackReverted'].values
    y_val   = df_val['rollbackReverted'].values
    y_test  = df_test['rollbackReverted'].values
    y_train_val = np.concatenate([y_train, y_val])

    X_train  = df_train[columns].values
    X_val    = df_val[columns].values
    X_test   = df_test[columns].values
    X_train_val = np.concatenate([X_train, X_val])

    if use_weights:
        weight_train = df_train['weight'].values
        weight_val = df_val['weight'].values
        weight_train_val = np.concatenate([weight_train, weight_val])
    else:
        weight_train = None
        weight_train_val = None

    protected_val   = ~df_val['isRegisteredUser'].values
    protected_test  = ~df_test['isRegisteredUser'].values

    print(X_train.shape)

    clf_val = RandomForestClassifier(
        random_state=random_state, n_jobs=n_jobs, n_estimators=n_estimators, max_depth=max_depth)
    clf_test = clone(clf_val)

    clf_val.fit(X_train, y_train, sample_weight=weight_train)
    clf_test.fit(X_train_val, y_train_val, sample_weight=weight_train_val)

    proba_val  = clf_val.predict_proba(X_val)[:, 1]
    proba_test = clf_test.predict_proba(X_test)[:, 1]

    metrics = pd.DataFrame()
    metrics = metrics.append(evaluate_proba_performance_bias(
        y_val, protected_val, proba_val, index='VAL'))
    metrics = metrics.append(evaluate_proba_performance_bias(
        y_test, protected_test, proba_test, index='TEST'))

    display(metrics)

    return proba_val, proba_test

In [31]:
def classify_val(columns, n_estimators, max_depth, n_jobs=1, random_state=1, index=''):
    y_train = df_train['rollbackReverted'].values
    y_val   = df_val['rollbackReverted'].values

    X_train  = df_train[columns].values
    X_val    = df_val[columns].values

    protected_val   = ~df_val['isRegisteredUser'].values

    clf = RandomForestClassifier(
        random_state=random_state, n_jobs=n_jobs, n_estimators=n_estimators, max_depth=max_depth)

    clf.fit(X_train, y_train)
    proba_val   = clf.predict_proba(X_val)[:, 1]
    metrics = evaluate_proba_performance_bias(y_val, protected_val, proba_val, index=index)

    return metrics

### Define Candidate Features

In [32]:
# Source: Table 3 in Heindorf 2017 et al (WSDM Cup 2017 Overview Paper)

FEATURE_CANDIDATES_WDVD = [
    # Chracter Features
    # 'lowerCaseRatio',
    # 'upperCaseRatio',
    # 'nonLatinRatio',
    # 'latinRatio',
    # 'alphanumericRatio',
    # 'digitRatio',
    # 'punctuationRatio',
    # 'whitespaceRatio',
    # 'longestCharacterSequence',
    # 'asciiRatio',
    # 'bracketRatio',

    # Word Features
    #  'languageWordRatio',
    #  'containsLanguageWord',
    #  'lowerCaseWordRatio',
    #  'longestWord',
    #  'containsURL',
    #  'badWordRatio',
    'proportionOfQidAdded',
    # 'upperCaseWordRatio',
    # 'proportionOfLinksAdded',
    # 'proportionOfLanguageAdded',

    # Sentence Features
    'commentCommentSimilarity',
    # 'commentLabelSimilarity',
    'commentTailLength',
    # 'commentSitelinkSimilarity',

    # Statement Features
    # 'dataTypeFreq',  # It should always be dataype 'item'.
    'itemValueFreq',
    # 'hasIdentifierChanged', # Always false since always the same data type.
    # 'literalValueFreq',
    'propertyFreq',

    # User Features
    #  'userCountryFreq',
    #  'userTimeZoneFreq',
    #  'userCityFreq',
    #  'userCountyFreq',
    #  'userRegionFreq',
    #  'cumUserUniqueItems',
    #  'userContinentFreq',
    #  'isRegisteredUser',
    #  'userFreq',
    #  'isPrivilegedUser',
    #  'userSecondsSinceFirstRevision',

    # Item Features
    'logCumItemUniqueUsers',
    'logItemFreq',

    # Revision features
    # 'commentLength',
    # 'isLatinLanguage',  # used by ORES but not WDVD
    # 'numberOfIdentifiersChanged',
    'changeCount',  # also known as param1
    # 'positionWithinSession',   # includes user information (sessions consider the user)
    'revisionActionFreq',
    # 'revisionLanguageFreq',
    'revisionPrevActionFreq',
    'revisionSubactionFreq',
]

FEATURE_CANDIDATES_ORES = [
    'revisionActionFreq',
    'revisionSubactionFreq',

    'isHuman',
    'isLivingPerson',

    'numberOfAliases',
    'numberOfBadges',
    'numberOfDescriptions',
    'numberOfLabels',
    'numberOfProperties',
    'numberOfQualifiers',
    'numberOfReferences',
    'numberOfSitelinks',
    'numberOfStatements',
]

FEATURE_CANDIDATES_NEW = [
    # selected new features
    'subjectLabelWordLength',
    'subjectPredicateCumFrequency',
    'objectPredEmbedFrequency',
    'objectPredicateCumFrequency',

    # "symmetric" features to selected features
    'subjectPredEmbedFrequency',

    # experiments described in paper
    'superSubject',
    'superSubjectFrequency',
    'superObject',
    'superObjectFrequency',

    # Further features
    'subjectPredicateFrequency',
    'objectPredicateFrequency',
]

FEATURE_CANDIDATES = list(set(FEATURE_CANDIDATES_WDVD + FEATURE_CANDIDATES_ORES + FEATURE_CANDIDATES_NEW))

In [33]:
FEATURE_MAPPING = {
    'proportionOfQidAdded': 'editProportionOfTriplesAdded',
    'commentCommentSimilarity': 'editCommentCommentSimilarity',
    'commentTailLength': 'editCommentTailLength',
    'itemValueFreq': 'objectFrequency',
    'propertyFreq': 'predicateFrequency',
    'logCumItemUniqueUsers': 'subjectLogCumUniqueUsers',
    'logItemFreq': 'subjectLogFrequency',
    'isHuman': 'subjectIsHuman',
    'isLivingPerson': 'subjectIsLivingPerson',
    'changeCount': 'editChangecount',
    'revisionActionFreq': 'editActionFrequency',
    'revisionPrevActionFreq': 'editPrevActionFrequency',
    'revisionSubactionFreq': 'editSubactionFrequency',
    'numberOfAliases': 'subjectNumberOfAliases',
    'numberOfBadges': 'subjectNumberOfBadges',
    'numberOfDescriptions': 'subjectNumberOfDescriptions',
    'numberOfLabels': 'subjectNumberOfLabels',
    'numberOfProperties': 'subjectNumberOfProperties',
    'numberOfQualifiers': 'subjectNumberOfQualifiers',
    'numberOfReferences': 'subjectNumberOfReferences',
    'numberOfSitelinks': 'subjectNumberOfSitelinks',
    'numberOfStatements': 'subjectNumberOfStatements',
    'userSecondsSinceFirstRevision': 'userAge'
}

In [34]:
# Assert there is no problem with FEATURE_MAPPING.
assert (len(FEATURE_MAPPING) == len(set(FEATURE_MAPPING.values())))
[(key, FEATURE_MAPPING[key]) for key in FEATURE_MAPPING if key not in df_all.columns]
assert all([FEATURE_MAPPING[key] not in df_all.columns for key in FEATURE_MAPPING])

[]

In [35]:
FEATURE_CANDIDATES = [f if f not in FEATURE_MAPPING else FEATURE_MAPPING[f] for f in FEATURE_CANDIDATES]
FEATURE_CANDIDATES.sort()

In [36]:
df_all.rename(columns=FEATURE_MAPPING, copy=False, inplace=True)
df_train.rename(columns=FEATURE_MAPPING, copy=False, inplace=True)
df_val.rename(columns=FEATURE_MAPPING, copy=False, inplace=True)
df_test.rename(columns=FEATURE_MAPPING, copy=False, inplace=True)

In [37]:
# Assert there is no problem with FEATURE_CANDIDATES.
[feature for feature in FEATURE_CANDIDATES if feature not in df_all.columns]
assert all([feature in df_all.columns for feature in FEATURE_CANDIDATES])

[]

### Feature Ranking

In [38]:
def evaluate_features(df_train, df_val, features):  # df_val might be the test set!
    X_train = df_train[features].values
    y_train = df_train['rollbackReverted'].values.astype(np.bool)

    X_val = df_val[features].values
    y_val = df_val['rollbackReverted'].values.astype(np.bool)
    protected_val = ~df_val['isRegisteredUser'].values

    clf = RandomForestClassifier(random_state=1, n_jobs=1, n_estimators=32, max_depth=16)
    clf.fit(X_train, y_train)
    proba = clf.predict_proba(X_val)[:, 1]

    metrics = evaluate_proba_performance_bias(y_val, protected_val, proba, index=str(features))
    display(metrics)
    return metrics

In [39]:
FEATURES_FAIR_S_SUBJECT = ['subjectLabelWordLength', 'subjectLogCumUniqueUsers', 'subjectLogFrequency', 'subjectNumberOfAliases', 'subjectNumberOfLabels',  'subjectPredicateCumFrequency']

FEATURES_FAIR_S_PREDICATE = ['predicateFrequency']

FEATURES_FAIR_S_OBJECT = ['objectFrequency', 'objectPredEmbedFrequency', 'objectPredicateCumFrequency']

FEATURES_FAIR_S_EDIT = ['editActionFrequency', 'editPrevActionFrequency', 'editProportionOfTriplesAdded', 'editSubactionFrequency']


FEATURES_FAIR_S = FEATURES_FAIR_S_SUBJECT\
                + FEATURES_FAIR_S_PREDICATE\
                + FEATURES_FAIR_S_OBJECT\
                + FEATURES_FAIR_S_EDIT

FEATURES_FAIR_S.sort()

In [40]:
df_train_val = pd.concat([df_train, df_val])
del(df_train_val)

In [41]:
columns = FEATURES_FAIR_S.copy()

metrics = []

df_train_val = pd.concat([df_train, df_val])

for feature in columns:
    metrics.append(evaluate_features(df_train_val, df_test, [feature]))

metrics.append(evaluate_features(df_train_val, df_test, FEATURES_FAIR_S_SUBJECT))
metrics.append(evaluate_features(df_train_val, df_test, FEATURES_FAIR_S_PREDICATE))
metrics.append(evaluate_features(df_train_val, df_test, FEATURES_FAIR_S_OBJECT))
metrics.append(evaluate_features(df_train_val, df_test, FEATURES_FAIR_S_EDIT))

df_feature_metrics = pd.concat(metrics).sort_values(('Performance', 'ROC'), ascending=False)
df_feature_metrics.sort_values(('Performance', 'ROC'), ascending=False, inplace=True)

del(df_train_val)

Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['editActionFrequency'],560524,2429,0.995667,0.005407,0.575149,560524,0.004875,0.004321,0.004839,0.004321,0.000519,1.120059


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['editPrevActionFrequency'],560524,2429,0.995667,0.024051,0.628737,560524,0.009389,0.00422,0.007185,0.004215,0.002969,1.704369


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['editProportionOfTriplesAdded'],560524,2429,0.995667,0.021559,0.865537,560524,0.010161,0.004202,0.00837,0.004194,0.004176,1.995523


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['editSubactionFrequency'],560524,2429,0.995667,0.019646,0.851195,560524,0.009777,0.004211,0.008014,0.004204,0.00381,1.906351


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['objectFrequency'],560524,2429,0.995667,0.01033,0.64999,560524,0.00582,0.0043,0.005371,0.004298,0.001073,1.249522


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['objectPredEmbedFrequency'],560524,2429,0.995667,0.01283,0.682232,560524,0.006878,0.004276,0.006153,0.004274,0.00188,1.439824


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['objectPredicateCumFrequency'],560524,2429,0.995661,0.006575,0.613001,560524,0.004524,0.004329,0.00447,0.004329,0.000141,1.032628


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['predicateFrequency'],560524,2429,0.995667,0.011312,0.729425,560524,0.006629,0.004282,0.006239,0.004279,0.00196,1.458009


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['subjectLabelWordLength'],560524,2429,0.995667,0.008009,0.721256,560524,0.006156,0.004292,0.005938,0.004291,0.001647,1.383946


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['subjectLogCumUniqueUsers'],560524,2429,0.995667,0.051933,0.901236,560524,0.018342,0.004018,0.014042,0.003997,0.010045,3.513325


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['subjectLogFrequency'],560524,2429,0.995667,0.041746,0.880001,560524,0.015141,0.00409,0.012027,0.004073,0.007955,2.953224


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['subjectNumberOfAliases'],560524,2429,0.995667,0.025735,0.760878,560524,0.007665,0.004258,0.006233,0.004252,0.001981,1.465929


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['subjectNumberOfLabels'],560524,2429,0.995667,0.031042,0.859237,560524,0.008753,0.004234,0.007433,0.004225,0.003208,1.759186


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['subjectPredicateCumFrequency'],560524,2429,0.995667,0.007847,0.686947,560524,0.005495,0.004307,0.005269,0.004306,0.000963,1.223756


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
"['subjectLabelWordLength', 'subjectLogCumUniqueUsers', 'subjectLogFrequency', 'subjectNumberOfAliases', 'subjectNumberOfLabels', 'subjectPredicateCumFrequency']",560524,2429,0.99556,0.071116,0.907487,560524,0.020995,0.003959,0.015925,0.003933,0.011992,4.048793


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
['predicateFrequency'],560524,2429,0.995667,0.011312,0.729425,560524,0.006629,0.004282,0.006239,0.004279,0.00196,1.458009


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
"['objectFrequency', 'objectPredEmbedFrequency', 'objectPredicateCumFrequency']",560524,2429,0.99567,0.026037,0.729252,560524,0.008393,0.004242,0.006573,0.004237,0.002336,1.551301


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
"['editActionFrequency', 'editPrevActionFrequency', 'editProportionOfTriplesAdded', 'editSubactionFrequency']",560524,2429,0.995661,0.062345,0.888986,560524,0.01862,0.004012,0.013253,0.003996,0.009257,3.316427


In [42]:
df_feature_metrics

Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
"['subjectLabelWordLength', 'subjectLogCumUniqueUsers', 'subjectLogFrequency', 'subjectNumberOfAliases', 'subjectNumberOfLabels', 'subjectPredicateCumFrequency']",560524,2429,0.99556,0.071116,0.907487,560524,0.020995,0.003959,0.015925,0.003933,0.011992,4.048793
['subjectLogCumUniqueUsers'],560524,2429,0.995667,0.051933,0.901236,560524,0.018342,0.004018,0.014042,0.003997,0.010045,3.513325
"['editActionFrequency', 'editPrevActionFrequency', 'editProportionOfTriplesAdded', 'editSubactionFrequency']",560524,2429,0.995661,0.062345,0.888986,560524,0.01862,0.004012,0.013253,0.003996,0.009257,3.316427
['subjectLogFrequency'],560524,2429,0.995667,0.041746,0.880001,560524,0.015141,0.00409,0.012027,0.004073,0.007955,2.953224
['editProportionOfTriplesAdded'],560524,2429,0.995667,0.021559,0.865537,560524,0.010161,0.004202,0.00837,0.004194,0.004176,1.995523
['subjectNumberOfLabels'],560524,2429,0.995667,0.031042,0.859237,560524,0.008753,0.004234,0.007433,0.004225,0.003208,1.759186
['editSubactionFrequency'],560524,2429,0.995667,0.019646,0.851195,560524,0.009777,0.004211,0.008014,0.004204,0.00381,1.906351
['subjectNumberOfAliases'],560524,2429,0.995667,0.025735,0.760878,560524,0.007665,0.004258,0.006233,0.004252,0.001981,1.465929
['predicateFrequency'],560524,2429,0.995667,0.011312,0.729425,560524,0.006629,0.004282,0.006239,0.004279,0.00196,1.458009
['predicateFrequency'],560524,2429,0.995667,0.011312,0.729425,560524,0.006629,0.004282,0.006239,0.004279,0.00196,1.458009


In [43]:
metrics = [('Performance', 'ROC'), ('Performance', 'PR'), ('Bias', 'score_diff'), ('Bias', 'score_ratio')]


def get_group_metrics(df_feature_metrics, features):
    metrics = [('Performance', 'ROC'), ('Performance', 'PR'), ('Bias', 'score_diff'), ('Bias', 'score_ratio')]
    feature_str = [str([feature]) for feature in features]
    df_result1 = df_feature_metrics.loc[[str(features)], metrics]
    df_result2 = df_feature_metrics.loc[feature_str, metrics]
    df_result2.sort_values(('Performance', 'ROC'), ascending=False, inplace=True)
    df_result = pd.concat([df_result1, df_result2])
    return df_result


df_feature_metrics_selected = pd.concat([
    get_group_metrics(df_feature_metrics, FEATURES_FAIR_S_SUBJECT),
    get_group_metrics(df_feature_metrics, FEATURES_FAIR_S_PREDICATE),
    get_group_metrics(df_feature_metrics, FEATURES_FAIR_S_OBJECT),
    get_group_metrics(df_feature_metrics, FEATURES_FAIR_S_EDIT)
])

df_feature_metrics_selected\
    .to_latex(OUTPUT_DIR + 'table-fair-s-features.tex',
              float_format='{:.3f}'.format,
              formatters={
                  ('Bias', 'score_diff'): '{:.5f}'.format,
                  ('Bias', 'score_ratio'): '{:.2f}'.format
             })
df_feature_metrics_selected

Unnamed: 0_level_0,Performance,Performance,Bias,Bias
Unnamed: 0_level_1,ROC,PR,score_diff,score_ratio
"['subjectLabelWordLength', 'subjectLogCumUniqueUsers', 'subjectLogFrequency', 'subjectNumberOfAliases', 'subjectNumberOfLabels', 'subjectPredicateCumFrequency']",0.907487,0.071116,0.011992,4.048793
['subjectLogCumUniqueUsers'],0.901236,0.051933,0.010045,3.513325
['subjectLogFrequency'],0.880001,0.041746,0.007955,2.953224
['subjectNumberOfLabels'],0.859237,0.031042,0.003208,1.759186
['subjectNumberOfAliases'],0.760878,0.025735,0.001981,1.465929
['subjectLabelWordLength'],0.721256,0.008009,0.001647,1.383946
['subjectPredicateCumFrequency'],0.686947,0.007847,0.000963,1.223756
['predicateFrequency'],0.729425,0.011312,0.00196,1.458009
['predicateFrequency'],0.729425,0.011312,0.00196,1.458009
['predicateFrequency'],0.729425,0.011312,0.00196,1.458009


### Classification: FAIR-S

In [44]:
columns = FEATURES_FAIR_S.copy()
FAIR_S_VAL, FAIR_S_TEST = classify(columns=columns, n_jobs=1, n_estimators=32, max_depth=16)

(5890968, 14)


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
VAL,550798,2604,0.995389,0.252511,0.960936,550798,0.051877,0.003651,0.024768,0.003581,0.021187,6.916121
TEST,560524,2429,0.995827,0.316492,0.963333,560524,0.066263,0.00294,0.033748,0.002842,0.030906,11.874172


In [45]:
storage.dump_predictions(df_test, FAIR_S_TEST, FILE_OUTPUT_TEST_FAIR_S)

## Experiments

### Utils

In [46]:
def get_index(operation, column):
    return MultiIndex.from_tuples([(operation, column)], names=('Operation', 'Column'))

In [47]:
def add_remove_features(columns, feature_candidates):
    columns_to_remove = list(set(columns))
    columns_to_remove.sort()
    print('columns_to_remove=\n{}'.format(columns_to_remove))

    columns_to_add = list(set(feature_candidates).difference(columns))
    columns_to_add.sort()
    print('columns_to_add=\n{}'.format(columns_to_add))

    metrics_list = []
    metrics_baseline = classify_val(
        columns=columns,
        n_estimators=N_ESTIMATORS,
        max_depth=MAX_DEPTH,
        index=get_index('None', 'Baseline'))
    metrics_list.append(metrics_baseline)
    display(metrics_baseline)

    for column in columns_to_remove:
        columns_cur = columns[:]
        columns_cur.remove(column)
        columns_cur.sort()

        metrics = classify_val(
            columns=columns_cur,
            n_estimators=N_ESTIMATORS,
            max_depth=MAX_DEPTH,
            index=get_index('Remove', column))
        metrics_list.append(metrics)
        display(metrics)

    for column in columns_to_add:
        columns_cur = columns + [column]
        columns_cur.sort()

        metrics = classify_val(
            columns=columns_cur,
            n_estimators=N_ESTIMATORS,
            max_depth=MAX_DEPTH,
            index=get_index('Add', column))
        metrics_list.append(metrics)
        display(metrics)

    metrics = pd.concat(metrics_list)

    return metrics

In [48]:
def get_improvements(metrics):
    improvement = metrics.loc[[('None', 'Baseline')]]
    improvement = improvement.append(
        metrics[metrics[('Performance', 'ROC')] > metrics.loc[('None', 'Baseline'), ('Performance', 'ROC')]])
    improvement.sort_values(('Performance', 'ROC'), inplace=True, ascending=False)
    return improvement

In [49]:
def apply_best_improvement_to_columns(columns, improvements):
    improvement = improvements.iloc[[0]]

    operation = improvement.index.get_level_values('Operation')
    columns_to_add    = improvement[operation == 'Add'].index.get_level_values('Column')
    columns_to_remove = improvement[operation == 'Remove'].index.get_level_values('Column')

    columns_to_add    = list(columns_to_add)
    columns_to_remove = list(columns_to_remove)

    columns = list(set(columns).union(columns_to_add))
    columns = list(set(columns).difference(columns_to_remove))
    columns.sort()

    return columns

### Optimize Features

In [50]:
N_ESTIMATORS = 32
MAX_DEPTH = 16

In [51]:
columns = FEATURES_FAIR_S.copy()

In [52]:
assert (len(set(FEATURES_FAIR_S)) == len(FEATURES_FAIR_S))
assert (len(set(FEATURE_CANDIDATES)) == len(FEATURE_CANDIDATES))
for feature in FEATURES_FAIR_S:
    assert feature in FEATURE_CANDIDATES, feature

In [53]:
metrics = add_remove_features(columns, FEATURE_CANDIDATES)
df_metrics_feature_optimization = metrics.copy()

columns_to_remove=
['editActionFrequency', 'editPrevActionFrequency', 'editProportionOfTriplesAdded', 'editSubactionFrequency', 'objectFrequency', 'objectPredEmbedFrequency', 'objectPredicateCumFrequency', 'predicateFrequency', 'subjectLabelWordLength', 'subjectLogCumUniqueUsers', 'subjectLogFrequency', 'subjectNumberOfAliases', 'subjectNumberOfLabels', 'subjectPredicateCumFrequency']
columns_to_add=
['editChangecount', 'editCommentCommentSimilarity', 'editCommentTailLength', 'objectPredicateFrequency', 'subjectIsHuman', 'subjectIsLivingPerson', 'subjectNumberOfBadges', 'subjectNumberOfDescriptions', 'subjectNumberOfProperties', 'subjectNumberOfQualifiers', 'subjectNumberOfReferences', 'subjectNumberOfSitelinks', 'subjectNumberOfStatements', 'subjectPredEmbedFrequency', 'subjectPredicateFrequency', 'superObject', 'superObjectFrequency', 'superSubject', 'superSubjectFrequency']


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
,Baseline,550798,2604,0.995389,0.252511,0.960936,550798,0.051877,0.003651,0.024768,0.003581,0.021187,6.916121


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,editActionFrequency,550798,2604,0.995372,0.242291,0.945525,550798,0.050512,0.003682,0.023794,0.003622,0.020172,6.569106


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,editPrevActionFrequency,550798,2604,0.995359,0.223029,0.951828,550798,0.04606,0.003783,0.022076,0.003733,0.018343,5.914182


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,editProportionOfTriplesAdded,550798,2604,0.99533,0.224224,0.951365,550798,0.046563,0.003772,0.021917,0.003718,0.018199,5.895023


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,editSubactionFrequency,550798,2604,0.995363,0.239838,0.956333,550798,0.048565,0.003726,0.023293,0.003657,0.019635,6.36856


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,objectFrequency,550798,2604,0.995379,0.258561,0.955802,550798,0.054573,0.003589,0.02604,0.003527,0.022514,7.383327


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,objectPredEmbedFrequency,550798,2604,0.995309,0.241105,0.957997,550798,0.052433,0.003638,0.02546,0.003582,0.021879,7.108523


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,objectPredicateCumFrequency,550798,2604,0.99528,0.170887,0.953777,550798,0.035099,0.004034,0.018101,0.003981,0.01412,4.54666


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,predicateFrequency,550798,2604,0.995345,0.219378,0.95054,550798,0.045951,0.003786,0.02311,0.003728,0.019382,6.198965


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,subjectLabelWordLength,550798,2604,0.995359,0.224886,0.945112,550798,0.047052,0.003761,0.022355,0.003712,0.018643,6.022076


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,subjectLogCumUniqueUsers,550798,2604,0.995167,0.183386,0.942669,550798,0.04011,0.003919,0.019087,0.003875,0.015212,4.925581


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,subjectLogFrequency,550798,2604,0.995365,0.236307,0.956891,550798,0.048917,0.003718,0.023284,0.003656,0.019628,6.368233


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,subjectNumberOfAliases,550798,2604,0.995369,0.231876,0.954244,550798,0.047302,0.003755,0.022404,0.003697,0.018707,6.059954


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,subjectNumberOfLabels,550798,2604,0.995387,0.242789,0.954419,550798,0.050004,0.003693,0.023546,0.00363,0.019916,6.487234


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Remove,subjectPredicateCumFrequency,550798,2604,0.995381,0.219571,0.948172,550798,0.045986,0.003785,0.021958,0.003737,0.018221,5.876163


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,editChangecount,550798,2604,0.995332,0.222061,0.951128,550798,0.046279,0.003778,0.021903,0.003727,0.018176,5.87727


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,editCommentCommentSimilarity,550798,2604,0.995381,0.240099,0.957588,550798,0.048874,0.003719,0.023151,0.003655,0.019496,6.333775


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,editCommentTailLength,550798,2604,0.995358,0.233618,0.956612,550798,0.048587,0.003726,0.022788,0.00367,0.019117,6.208569


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,objectPredicateFrequency,550798,2604,0.995309,0.203798,0.9522,550798,0.041624,0.003885,0.020137,0.003835,0.016302,5.251151


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectIsHuman,550798,2604,0.995327,0.222763,0.952168,550798,0.047871,0.003742,0.023546,0.003688,0.019858,6.384457


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectIsLivingPerson,550798,2604,0.99533,0.238721,0.95193,550798,0.049756,0.003699,0.023207,0.003643,0.019565,6.370883


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectNumberOfBadges,550798,2604,0.995325,0.223555,0.951734,550798,0.046349,0.003777,0.022227,0.003724,0.018502,5.967934


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectNumberOfDescriptions,550798,2604,0.99533,0.229819,0.953171,550798,0.047344,0.003754,0.022582,0.00369,0.018892,6.119641


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectNumberOfProperties,550798,2604,0.995392,0.245141,0.954355,550798,0.049982,0.003694,0.023011,0.003633,0.019378,6.333393


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectNumberOfQualifiers,550798,2604,0.995343,0.233498,0.960393,550798,0.047804,0.003744,0.023023,0.003682,0.019341,6.252951


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectNumberOfReferences,550798,2604,0.99535,0.244828,0.944451,550798,0.051422,0.003661,0.024223,0.003604,0.020618,6.720532


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectNumberOfSitelinks,550798,2604,0.995363,0.221246,0.941336,550798,0.045831,0.003789,0.022181,0.00374,0.018442,5.931224


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectNumberOfStatements,550798,2604,0.995365,0.243543,0.954905,550798,0.050105,0.003691,0.023762,0.003633,0.020129,6.540887


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectPredEmbedFrequency,550798,2604,0.995394,0.213752,0.932474,550798,0.044035,0.00383,0.021235,0.003786,0.017449,5.609473


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,subjectPredicateFrequency,550798,2604,0.995191,0.178019,0.956148,550798,0.03742,0.003981,0.018828,0.003929,0.0149,4.792631


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,superObject,550798,2604,0.995403,0.265635,0.957191,550798,0.054632,0.003588,0.025158,0.003522,0.021636,7.14296


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,superObjectFrequency,550798,2604,0.99535,0.265116,0.95712,550798,0.054755,0.003585,0.025661,0.00352,0.022142,7.290724


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,superSubject,550798,2604,0.995336,0.234299,0.948563,550798,0.049172,0.003712,0.022984,0.003655,0.019329,6.288283


Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Add,superSubjectFrequency,550798,2604,0.995383,0.253341,0.946519,550798,0.052777,0.00363,0.024559,0.003569,0.02099,6.880751


In [54]:
df_metrics_feature_optimization.to_csv(OUTPUT_DIR + 'fair-s-feature-optimization.csv')
df_metrics_feature_optimization

Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
,Baseline,550798,2604,0.995389,0.252511,0.960936,550798,0.051877,0.003651,0.024768,0.003581,0.021187,6.916121
Remove,editActionFrequency,550798,2604,0.995372,0.242291,0.945525,550798,0.050512,0.003682,0.023794,0.003622,0.020172,6.569106
Remove,editPrevActionFrequency,550798,2604,0.995359,0.223029,0.951828,550798,0.04606,0.003783,0.022076,0.003733,0.018343,5.914182
Remove,editProportionOfTriplesAdded,550798,2604,0.99533,0.224224,0.951365,550798,0.046563,0.003772,0.021917,0.003718,0.018199,5.895023
Remove,editSubactionFrequency,550798,2604,0.995363,0.239838,0.956333,550798,0.048565,0.003726,0.023293,0.003657,0.019635,6.36856
Remove,objectFrequency,550798,2604,0.995379,0.258561,0.955802,550798,0.054573,0.003589,0.02604,0.003527,0.022514,7.383327
Remove,objectPredEmbedFrequency,550798,2604,0.995309,0.241105,0.957997,550798,0.052433,0.003638,0.02546,0.003582,0.021879,7.108523
Remove,objectPredicateCumFrequency,550798,2604,0.99528,0.170887,0.953777,550798,0.035099,0.004034,0.018101,0.003981,0.01412,4.54666
Remove,predicateFrequency,550798,2604,0.995345,0.219378,0.95054,550798,0.045951,0.003786,0.02311,0.003728,0.019382,6.198965
Remove,subjectLabelWordLength,550798,2604,0.995359,0.224886,0.945112,550798,0.047052,0.003761,0.022355,0.003712,0.018643,6.022076


In [55]:
improvements = get_improvements(metrics)
improvements

Unnamed: 0_level_0,Unnamed: 1_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,Unnamed: 1_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
Operation,Column,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
,Baseline,550798,2604,0.995389,0.252511,0.960936,550798,0.051877,0.003651,0.024768,0.003581,0.021187,6.916121
