# Embedding Optimization

## Preamble

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import collections
import gc
import logging
import warnings

import numpy as np
import scipy.sparse as sp
import pandas as pd

from scipy.sparse import hstack

from sklearn.linear_model import LogisticRegression

In [3]:
import sys
sys.path.append('../src/')

import load_wdvd
import load_predicate_embedding
import storage

In [4]:
from evaluation import evaluate_proba_performance_bias

In [5]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.set_option('display.width', 100)
pd.set_option('display.max_rows', 200)

## Load Data

In [6]:
PATH_FEATURES = '../../data/features/'

In [7]:
OUTPUT_DIR = '../../data/classification/'
FILE_OUTPUT_PREDICTIONS_TEST = OUTPUT_DIR + 'TEST_FAIR_E.csv.bz2'
FILE_OUTPUT_PREDICTIONS_VAL = OUTPUT_DIR + 'VAL_FAIR_E.csv.bz2'
FILE_OUTPUT_METRICS = OUTPUT_DIR + 'table-fair-e-feature-optimization.tex'

### Load Dataframe

In [8]:
fields = ['revisionId', 'isEditingTool']

dtype = {
    'revisionId': np.int,
    'isEditingTool': np.bool,
    'subject': np.int,
    'predicate': np.int,
    'object': np.int,
    'superSubject': np.int,
    'superObject': np.int,
}

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    df_train = pd.read_csv(PATH_FEATURES + '/training/features.csv.bz2', index_col=0, usecols=fields, dtype=dtype)
    df_val = pd.read_csv(PATH_FEATURES + '/validation/features.csv.bz2', index_col=0, usecols=fields, dtype=dtype)
    df_test = pd.read_csv(PATH_FEATURES + '/test/features.csv.bz2', index_col=0, usecols=fields, dtype=dtype)

len(df_train)
len(df_val)
len(df_test)

24280578

1703987

2827193

In [9]:
(~df_train['isEditingTool']).sum()
(~df_val['isEditingTool']).sum()
(~df_test['isEditingTool']).sum()

5890968

550798

560524

### Load WDVD Features

In [10]:
df_wdvd = load_wdvd.load_df_wdvd()

### Synchronize DataFrame

In [11]:
df_wdvd.loc[df_train.index, 'isEditingTool'] = df_train['isEditingTool']
df_wdvd.loc[df_val.index, 'isEditingTool'] = df_val['isEditingTool']
df_wdvd.loc[df_test.index, 'isEditingTool'] = df_test['isEditingTool']

In [12]:
df_train = df_wdvd.loc[df_train.index]
df_val = df_wdvd.loc[df_val.index]
df_test = df_wdvd.loc[df_test.index]
df_all = pd.concat([df_train, df_val, df_test])
print(len(df_all))

28811758


In [13]:
gc.collect()
del(df_wdvd)
gc.collect()

1201319

107

In [14]:
# backup data
df_all_sat = df_all.copy()

### Load Properties

In [15]:
# restore backed up data
df_all = df_all_sat.copy()

In [16]:
y_all = df_all['rollbackReverted'].values

In [17]:
%autoreload

In [18]:
N_FEATURES = 100  # 100 is better than 50 and 100 better than 200 in terms of ROC-AUC
SCORE_FUNC = load_predicate_embedding.count_nonzero

data, meta = load_predicate_embedding.load_matrices()
load_predicate_embedding.binarize_features(data)

load_predicate_embedding.select_item_predicates_at_end_of_training_set(data, meta)

slice_fit = slice(0, meta['n_train'])
load_predicate_embedding.select_features(data, meta, y_all, slice_fit, score_func=SCORE_FUNC, k=N_FEATURES)

In [19]:
meta['n_train']
meta['n_val']
meta['n_test']

24280578

1703987

2827193

In [20]:
for key, X in data.items():
    print('{}: {}'.format(key, X.shape))

X_S_all: (28811758, 100)
X_P_all: (28811758, 100)
X_O_all: (28811758, 100)
X_OO_all: (28811758, 100)


### Exclude Editing Tools

In [21]:
locals().update(meta)

In [22]:
protected_all = ~df_all['isRegisteredUser'].values.astype(np.bool)

In [23]:
mask_train = np.zeros_like(protected_all)
mask_val = np.zeros_like(protected_all)
mask_test = np.zeros_like(protected_all)

In [24]:
mask_train[0:n_train] = 1
mask_val[n_train:n_train + n_val] = 1
mask_test[n_train + n_val:n_train + n_val + n_test] = 1

In [25]:
mask_train.shape[0]
n_train + n_val + n_test

28811758

28811758

In [26]:
mask_all = ~df_all['isEditingTool'].values.astype(np.bool)
mask_all.sum()

7002290

In [27]:
y_all = y_all[mask_all]
protected_all = protected_all[mask_all]
df_all = df_all[mask_all]

mask_train = mask_train[mask_all]
mask_val = mask_val[mask_all]
mask_test = mask_test[mask_all]

for key, X in data.items():
    data[key] = X[mask_all]

In [28]:
for key in data:
    print('{}: {}'.format(key, data[key].shape))

X_S_all: (7002290, 100)
X_P_all: (7002290, 100)
X_O_all: (7002290, 100)
X_OO_all: (7002290, 100)


## Compute Feature Interactions

In [29]:
def combine_features(X_1, X_2):
    """for speed, the second vector X_2 should have the most features."""
    out_cols = []

    print('combining...')

    for col_idx in range(X_1.shape[1]):
        out_col = X_1[:, col_idx].multiply(X_2)
        out_cols.append(out_col)

    print('stacking...')
    result = hstack(out_cols, format='csc')
    print('stacking...done.')
    return result


def combine_feature_names(feature_names1, feature_names2):
    out_feature_names = []

    for f1_idx in range(len(feature_names1)):
        for f2_idx in range(len(feature_names2)):
            out_feature_names.append('{} {}'.format(feature_names1[f1_idx], feature_names2[f2_idx]))

    return out_feature_names

In [30]:
def classify(y_train, X_train, X_val, C=1e4, max_iter=100):  # we set C to max value of LogisticRegressionCV
    clf = LogisticRegression(
        random_state=1,
        solver='sag',
        C=C,
        max_iter=max_iter,
    )

    clf = clf.fit(X_train, y_train)
    proba_val = clf.predict_proba(X_val)[:, 1]
    return clf, proba_val

In [31]:
def compute_feature_interactions(d, m):
    logging.debug('X_SP_all...')
    d['X_SP_all']  = sp.hstack([d['X_S_all'], d['X_P_all']], format='csc')
    m['X_SP_all']  = np.hstack([m['X_S_all'], m['X_P_all']])
    logging.debug('X_SO_all...')
    d['X_SO_all']  = sp.hstack([d['X_S_all'], d['X_O_all']], format='csc')
    m['X_SO_all']  = np.hstack([m['X_S_all'], m['X_O_all']])
    logging.debug('X_PO_all...')
    d['X_PO_all']  = sp.hstack([d['X_P_all'], d['X_O_all']], format='csc')
    m['X_PO_all']  = np.hstack([m['X_P_all'], m['X_O_all']])
    logging.debug('X_SPO_all...')
    d['X_SPO_all'] = sp.hstack([d['X_P_all'], d['X_P_all'], d['X_O_all']], format='csc')
    m['X_SPO_all'] = np.hstack([m['X_P_all'], m['X_P_all'], m['X_O_all']])

    logging.debug('SxP...')
    d['X_SxP_all']   = combine_features(     d['X_S_all'], d['X_P_all'])
    m['X_SxP_all']   = combine_feature_names(m['X_S_all'], m['X_P_all'])
    logging.debug('SxO...')
    d['X_SxO_all']   = combine_features(     d['X_S_all'], d['X_O_all'])
    m['X_SxO_all']   = combine_feature_names(m['X_S_all'], m['X_O_all'])
    logging.debug('PxO...')
    d['X_PxO_all']   = combine_features(     d['X_P_all'], d['X_O_all'])
    m['X_PxO_all']   = combine_feature_names(m['X_P_all'], m['X_O_all'])
    logging.debug('SxPxO...')
    d['X_SxPxO_all'] = combine_features(     d['X_O_all'], d['X_SxP_all'])  # for speed, second vector should have more features
    m['X_SxPxO_all'] = combine_feature_names(m['X_O_all'], m['X_SxP_all'])

    logging.debug('X_SxO_PxO_all...')
    d['X_SxO_PxO_all'] = sp.hstack([d['X_SxO_all'], d['X_PxO_all']], format='csc')
    m['X_SxO_PxO_all'] = np.hstack([m['X_SxO_all'], m['X_PxO_all']])
    logging.debug('X_SxP_PxO_all...')
    d['X_SxP_PxO_all'] = sp.hstack([d['X_SxP_all'], d['X_PxO_all']], format='csc')
    m['X_SxP_PxO_all'] = np.hstack([m['X_SxP_all'], m['X_PxO_all']])
    logging.debug('X_SxP_SxO_all...')
    d['X_SxP_SxO_all'] = sp.hstack([d['X_SxP_all'], d['X_SxO_all']], format='csc')
    m['X_SxP_SxO_all'] = np.hstack([m['X_SxP_all'], m['X_SxO_all']])

In [32]:
def dict_tocsc(data):
    for name in data:
        data[name] = data[name].tocsc()

In [33]:
def dict_tocsr(data):
    for name in data:
        data[name] = data[name].tocsr()

In [34]:
dict_tocsc(data)

In [35]:
locals().update(data)

In [36]:
compute_feature_interactions(data, meta)
locals().update(data)

combining...
stacking...
stacking...done.
combining...
stacking...
stacking...done.
combining...
stacking...
stacking...done.
combining...
stacking...
stacking...done.


In [37]:
dict_tocsr(data)

In [38]:
y_all.shape
X_S_all.shape
mask_train.shape

(7002290,)

(7002290, 100)

(7002290,)

## Statistics

In [39]:
# How many embeddings end up to be 0 because we only take the top 100 predicates?
df_statistics = pd.DataFrame()

df_statistics.loc['S', 'nonzero'] = np.count_nonzero(np.sum(X_S_all, axis=1))
df_statistics.loc['P', 'nonzero'] = np.count_nonzero(np.sum(X_P_all, axis=1))
df_statistics.loc['O', 'nonzero'] = np.count_nonzero(np.sum(X_O_all, axis=1))
df_statistics.loc['SxP', 'nonzero'] = np.count_nonzero(np.sum(X_SxP_all, axis=1))
df_statistics.loc['PxO', 'nonzero'] = np.count_nonzero(np.sum(X_PxO_all, axis=1))
df_statistics.loc['SxP_PxO', 'nonzero'] = np.count_nonzero(np.sum(X_SxP_PxO_all, axis=1))
df_statistics['proportion'] = df_statistics['nonzero'] / X_S_all.shape[0]

formatters = {
    'nonzero': '{:,.0f}',
    'proportion': '{:.1%}'
}
df_statistics.style.format(formatters)

Unnamed: 0,nonzero,proportion
S,6452325,92.1%
P,5988266,85.5%
O,6111397,87.3%
SxP,5492194,78.4%
PxO,5329102,76.1%
SxP_PxO,5913664,84.5%


## Experiments

### Feature Selection

In [40]:
datasets = collections.OrderedDict()

datasets['SxP_PxO'] = X_SxP_PxO_all
datasets['SxP_SxO'] = X_SxP_SxO_all
datasets['SxO_PxO'] = X_SxO_PxO_all

datasets['SxPxO'] = X_SxPxO_all
datasets['SxP'] = X_SxP_all
datasets['SxO'] = X_SxO_all
datasets['PxO'] = X_PxO_all

datasets['SPO'] = X_SPO_all
datasets['SP'] = X_SP_all
datasets['SO'] = X_SO_all
datasets['PO'] = X_PO_all

datasets['S'] = X_S_all
datasets['P'] = X_P_all
datasets['O'] = X_O_all

In [41]:
metrics = []
for index, X in datasets.items():
    _, scores = classify(y_all[mask_train], X[mask_train], X[mask_val])
    metrics_cur = evaluate_proba_performance_bias(y_all[mask_val], protected_all[mask_val], scores, index=index)
    display(metrics_cur)
    metrics.append(metrics_cur)

df_metrics = pd.concat(metrics)



Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SxP_PxO,550798,2604,0.995154,0.111525,0.848233,550798,0.024509,0.004276,0.012339,0.004256,0.008083,2.89933


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SxP_SxO,550798,2604,0.995209,0.09488,0.740849,550798,0.021797,0.004338,0.011426,0.00432,0.007106,2.645027


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SxO_PxO,550798,2604,0.995241,0.068035,0.721034,550798,0.017181,0.004443,0.010094,0.00443,0.005664,2.278549


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SxPxO,550798,2604,0.995234,0.073458,0.674694,550798,0.018638,0.00441,0.010776,0.004398,0.006378,2.450327


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SxP,550798,2604,0.995145,0.024617,0.718301,550798,0.008591,0.004639,0.006727,0.004636,0.002092,1.451206


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SxO,550798,2604,0.995176,0.028162,0.628964,550798,0.00924,0.004625,0.006919,0.00462,0.002298,1.497369


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
PxO,550798,2604,0.995165,0.046818,0.733455,550798,0.012868,0.004542,0.007681,0.004534,0.003147,1.694122


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SPO,550798,2604,0.995278,0.067787,0.738702,550798,0.016399,0.004461,0.008625,0.004451,0.004174,1.937837


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SP,550798,2604,0.995269,0.037838,0.689554,550798,0.011552,0.004572,0.008138,0.004566,0.003572,1.782454


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SO,550798,2604,0.995281,0.03406,0.718794,550798,0.009935,0.004609,0.007016,0.004602,0.002414,1.524444


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
PO,550798,2604,0.995278,0.067789,0.738742,550798,0.016399,0.004461,0.008625,0.004451,0.004174,1.937795


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
S,550798,2604,0.995272,0.020034,0.642548,550798,0.007267,0.00467,0.005875,0.004663,0.001212,1.259943


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
P,550798,2604,0.995272,0.008018,0.659121,550798,0.005754,0.004704,0.005508,0.004703,0.000805,1.171081


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
O,550798,2604,0.995272,0.011113,0.663364,550798,0.005684,0.004706,0.005131,0.004704,0.000427,1.090662


In [42]:
display(df_metrics)

Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
SxP_PxO,550798,2604,0.995154,0.111525,0.848233,550798,0.024509,0.004276,0.012339,0.004256,0.008083,2.89933
SxP_SxO,550798,2604,0.995209,0.09488,0.740849,550798,0.021797,0.004338,0.011426,0.00432,0.007106,2.645027
SxO_PxO,550798,2604,0.995241,0.068035,0.721034,550798,0.017181,0.004443,0.010094,0.00443,0.005664,2.278549
SxPxO,550798,2604,0.995234,0.073458,0.674694,550798,0.018638,0.00441,0.010776,0.004398,0.006378,2.450327
SxP,550798,2604,0.995145,0.024617,0.718301,550798,0.008591,0.004639,0.006727,0.004636,0.002092,1.451206
SxO,550798,2604,0.995176,0.028162,0.628964,550798,0.00924,0.004625,0.006919,0.00462,0.002298,1.497369
PxO,550798,2604,0.995165,0.046818,0.733455,550798,0.012868,0.004542,0.007681,0.004534,0.003147,1.694122
SPO,550798,2604,0.995278,0.067787,0.738702,550798,0.016399,0.004461,0.008625,0.004451,0.004174,1.937837
SP,550798,2604,0.995269,0.037838,0.689554,550798,0.011552,0.004572,0.008138,0.004566,0.003572,1.782454
SO,550798,2604,0.995281,0.03406,0.718794,550798,0.009935,0.004609,0.007016,0.004602,0.002414,1.524444


In [43]:
df_metrics_out = df_metrics[[
    ('Performance', 'PR'),
    ('Performance', 'ROC'),
    ('Bias', 'score_diff'),
    ('Bias', 'score_ratio')
]]

formatters = {
    ('Performance', 'PR'): '{:.3f}'.format,
    ('Performance', 'ROC'): '{:.3f}'.format,
    ('Bias', 'score_diff'): '{:.4f}'.format,
    ('Bias', 'score_ratio'): '{:.2f}'.format
}

df_metrics_out.to_latex(FILE_OUTPUT_METRICS, formatters=formatters)

df_metrics_out.style.format(formatters)

Unnamed: 0_level_0,Performance,Performance,Bias,Bias
Unnamed: 0_level_1,PR,ROC,score_diff,score_ratio
SxP_PxO,0.112,0.848,0.0081,2.9
SxP_SxO,0.095,0.741,0.0071,2.65
SxO_PxO,0.068,0.721,0.0057,2.28
SxPxO,0.073,0.675,0.0064,2.45
SxP,0.025,0.718,0.0021,1.45
SxO,0.028,0.629,0.0023,1.5
PxO,0.047,0.733,0.0031,1.69
SPO,0.068,0.739,0.0042,1.94
SP,0.038,0.69,0.0036,1.78
SO,0.034,0.719,0.0024,1.52


### Parameter Tuning

In [44]:
metrics_params = []

for i in range(-4, 5):
    C = 10**i

    print(C)
    _, scores = classify(y_all[mask_train], X_SxP_PxO_all[mask_train], X_SxP_PxO_all[mask_val], C=C)
    metrics_cur = evaluate_proba_performance_bias(y_all[mask_val], protected_all[mask_val], scores, index=str(C))

    display(metrics_cur)
    metrics_params.append(metrics_cur)

df_metrics_params = pd.concat(metrics_params)

0.0001


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
0.0001,550798,2604,0.995272,0.058867,0.782984,550798,0.015555,0.00448,0.009056,0.004471,0.004585,2.025528


0.001


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
0.001,550798,2604,0.995272,0.120084,0.824154,550798,0.025893,0.004244,0.012607,0.004227,0.00838,2.982596


0.01


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
0.01,550798,2604,0.995303,0.168714,0.854571,550798,0.035118,0.004033,0.016106,0.004005,0.0121,4.020938


0.1


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
0.1,550798,2604,0.995323,0.173329,0.865217,550798,0.036313,0.004006,0.016486,0.003976,0.01251,4.146519


1




Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
1,550798,2604,0.995292,0.148556,0.863511,550798,0.03119,0.004123,0.014792,0.004097,0.010695,3.610396


10


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
10,550798,2604,0.995203,0.120209,0.852438,550798,0.025885,0.004244,0.01282,0.004223,0.008597,3.03551


100


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
100,550798,2604,0.995162,0.112425,0.848731,550798,0.024649,0.004273,0.012403,0.004253,0.008151,2.916656


1000


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
1000,550798,2604,0.995154,0.111593,0.848266,550798,0.024508,0.004276,0.012335,0.004256,0.008079,2.898205


10000


Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
10000,550798,2604,0.995154,0.111525,0.848233,550798,0.024509,0.004276,0.012339,0.004256,0.008083,2.89933


In [45]:
df_metrics_params

Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
0.0001,550798,2604,0.995272,0.058867,0.782984,550798,0.015555,0.00448,0.009056,0.004471,0.004585,2.025528
0.001,550798,2604,0.995272,0.120084,0.824154,550798,0.025893,0.004244,0.012607,0.004227,0.00838,2.982596
0.01,550798,2604,0.995303,0.168714,0.854571,550798,0.035118,0.004033,0.016106,0.004005,0.0121,4.020938
0.1,550798,2604,0.995323,0.173329,0.865217,550798,0.036313,0.004006,0.016486,0.003976,0.01251,4.146519
1.0,550798,2604,0.995292,0.148556,0.863511,550798,0.03119,0.004123,0.014792,0.004097,0.010695,3.610396
10.0,550798,2604,0.995203,0.120209,0.852438,550798,0.025885,0.004244,0.01282,0.004223,0.008597,3.03551
100.0,550798,2604,0.995162,0.112425,0.848731,550798,0.024649,0.004273,0.012403,0.004253,0.008151,2.916656
1000.0,550798,2604,0.995154,0.111593,0.848266,550798,0.024508,0.004276,0.012335,0.004256,0.008079,2.898205
10000.0,550798,2604,0.995154,0.111525,0.848233,550798,0.024509,0.004276,0.012339,0.004256,0.008083,2.89933


## Evaluation on Test Set

In [46]:
mask_train_val = mask_train | mask_val
y_all[mask_train_val].shape

(6441766,)

In [47]:
clf, scores = classify(y_all[mask_train_val], X_SxP_PxO_all[mask_train_val], X_SxP_PxO_all[mask_test])
evaluate_proba_performance_bias(y_all[mask_test], protected_all[mask_test], scores)

FAIR_E_TEST = scores
storage.dump_predictions(df_all[mask_test], scores, FILE_OUTPUT_PREDICTIONS_TEST)



Unnamed: 0_level_0,Performance,Performance,Performance,Performance,Performance,Bias,Bias,Bias,Bias,Bias,Bias,Bias
Unnamed: 0_level_1,n_samples,n_positive,ACC,PR,ROC,n_samples,p_mean,np_mean,bp_mean,bnp_mean,score_diff,score_ratio
,560524,2429,0.995656,0.177,0.864932,560524,0.037805,0.003581,0.01976,0.003524,0.016235,5.60696


## Analysis of Example

### Example of Non-Vandalism

In [48]:
# <Alejandro Cuello (Q15924626), occupation (P106), actor (Q33999)>
# REVISION_ID = 318323765
REVISION_ID = 325717121

In [49]:
row_index = df_all.index.get_loc(REVISION_ID)
row = data['X_SxP_PxO_all'][row_index].todense()
row_coefs = np.squeeze(np.array(np.multiply(row, clf.coef_)))
feature_names = meta['X_SxP_PxO_all']

In [50]:
np.sum(row)

20

In [51]:
df_coefs = pd.DataFrame(data={
    'row_coef': row_coefs,
    'feature': feature_names})
df_coefs = df_coefs.sort_values('row_coef', ascending=False)
df_coefs = df_coefs[['feature', 'row_coef']]
df_coefs.tail(10)

Unnamed: 0,feature,row_coef
7924,S735 P106,-0.074602
524,S21 P106,-0.105342
12480,P106 O971,-0.14624
12411,P106 O69,-0.14731
12492,P106 O1423,-0.224475
12448,P106 O360,-0.308088
12485,P106 O1269,-0.412319
12414,P106 O101,-0.582247
12407,P106 O39,-1.074514
12416,P106 O106,-2.617238


### Example of Vandalism

In [52]:
# <Steve Jobs(Q19837), instance of(P31), Animal(Q729)>
REVISION_ID = 312432941

In [53]:
row_index = df_all.index.get_loc(REVISION_ID)
row = data['X_SxP_PxO_all'][row_index].todense()
row_coefs = np.squeeze(np.array(np.multiply(row, clf.coef_)))
feature_names = meta['X_SxP_PxO_all']

In [54]:
np.sum(row)

38

In [55]:
df_coefs = pd.DataFrame(data={
    'row_coef': row_coefs,
    'feature': feature_names})
df_coefs = df_coefs.sort_values('row_coef', ascending=False)
df_coefs = df_coefs[['feature', 'row_coef']]
df_coefs.head(20)

Unnamed: 0,feature,row_coef
10965,P31 O689,2.254391
1109,S31 P31,2.145446
10933,P31 O171,1.320339
6809,S460 P31,0.81411
309,S19 P31,0.806282
909,S27 P31,0.755346
8409,S910 P31,0.747771
109,S9 P31,0.74526
10983,P31 O1074,0.64515
10984,P31 O1204,0.628263


### Analysis of All Coefficients

In [56]:
df_coefs = pd.DataFrame(data={'coef': clf.coef_.reshape(-1), 'feature': meta['X_SxP_PxO_all']})
df_coefs = df_coefs.sort_values('coef', ascending=False)
df_coefs = df_coefs[['feature', 'coef']]
df_coefs.head(10)

Unnamed: 0,feature,coef
8450,S910 P197,6.349102
16797,P462 O1889,6.03794
10650,P26 O364,5.930705
12143,P102 O272,5.629406
2958,S123 P361,5.534869
10106,P17 O37,5.399539
12233,P103 O171,5.383566
11108,P40 O47,5.354032
16357,P413 O466,5.318285
13805,P161 O31,5.284308


In [57]:
df_coefs.tail(10)

Unnamed: 0,feature,coef
13125,P138 O138,-4.785703
6177,S360 P734,-4.879696
12466,P106 O703,-4.945783
2614,S106 P54,-5.073921
14536,P180 O180,-5.301099
19187,P1346 O1346,-5.650331
15344,P276 O276,-5.810466
1658,S50 P361,-5.843258
3834,S150 P150,-6.46499
1268,S36 P463,-7.397714
