Multilabel Baselines
===



In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.append("../annotation_data")

In [3]:
from responsibility import *

In [32]:
import itertools
import os
import subprocess
from collections import Counter

import numpy as np
import pandas as pd
import scipy
import sklearn
import sklearn.metrics
import sklearn.model_selection

In [5]:
import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl

In [15]:
def get_scores(y_true, y_pred):
    """Compute support-weighted precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, in any format accepted by ``sklearn.metrics``.
    y_pred : array-like
        Predicted labels, same format and shape as ``y_true``.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``, each computed with ``average='weighted'``.
    """
    # `beta` must be passed by keyword: recent scikit-learn releases declare it
    # keyword-only, so the positional form raises a TypeError there.
    weighted_f1_score = sklearn.metrics.fbeta_score(y_true, y_pred, beta=1, average='weighted')
    weighted_p_score = sklearn.metrics.precision_score(y_true, y_pred, average='weighted')
    weighted_r_score = sklearn.metrics.recall_score(y_true, y_pred, average='weighted')
    return weighted_p_score, weighted_r_score, weighted_f1_score

In [41]:
def format_float(val):
    """Render a non-negative score as a two-decimal string without the leading zero.

    Values of 0.995 and above are capped at ".99" so that every cell keeps the
    same ".dd" width. Anything that is not >= 0 (negatives, NaN) raises
    ``ValueError``.
    """
    if val >= 0.995:
        return ".99"
    if 0 <= val < 0.995:
        # "0.dd" -> ".dd"
        return f"{val:.2f}"[1:]
    raise ValueError("Negatives not handled.")

## Responsibilities

In [6]:
# Label subset used for the responsibility baselines.
# NOTE(review): `high_irr_responsibility_labels` is not defined in this notebook;
# presumably it comes from the `responsibility` star import and holds the labels
# with high inter-rater reliability — confirm in that module.
resp_subset = high_irr_responsibility_labels

In [7]:
# Obtain the annotated responsibility data for the chosen label subset via the
# project helper, then display how many rows it has.
# NOTE(review): the meaning of conflict_score_cost=0.1 is defined by the helper,
# not here — confirm against its implementation.
annotated_df = get_annotated_responsibility_df_fixed(conflict_score_cost=0.1, resp_subset=resp_subset)
len(annotated_df)

1895

In [105]:
n_samples = len(annotated_df)
n_classes = len(resp_subset)
# One "<label>_score" column per responsibility label.
score_columns = [resp_label + "_score" for resp_label in resp_subset]
# Take every row and select by column only. `.loc` is label-based, so indexing
# the rows with range(n_samples) is only correct when the index is exactly
# 0..n_samples-1; selecting all rows does not depend on the index at all.
y_train_score = annotated_df.loc[:, score_columns].values
# Binarize: a label counts as present when its score is above 0.5.
y_true = (y_train_score > 0.5).astype(int)
y_true.shape

(1895, 6)

In [17]:
# All-zeros matrix with the same shape as y_true (a "predict no label" matrix).
# NOTE(review): `baseline` is not referenced by any other cell visible in this notebook.
baseline = np.zeros(y_true.shape)

In [31]:
def get_pred_matrix_from_row(pred_row, y_true):
    """Repeat a single label row once per sample so the result has ``y_true``'s shape.

    ``pred_row`` is tiled ``y_true.shape[0]`` times and reshaped to
    ``y_true.shape``; a row whose length does not match the number of columns
    makes the reshape fail with ``ValueError``.
    """
    n_rows = y_true.shape[0]
    repeated = np.tile(pred_row, n_rows)
    return repeated.reshape(y_true.shape)

In [62]:
def hamming_loss(y_true):
    """Score the per-label majority baseline and format it as a LaTeX table row.

    Every sample is predicted to have exactly those labels that are positive in
    at least half of the rows of ``y_true``. The weighted precision, recall and
    F1 of that constant prediction are returned as ``"P & R & F1 \\\\"``.
    """
    n_rows = y_true.shape[0]
    positive_rate = y_true.T.sum(axis=1) / n_rows
    majority_labels = positive_rate >= 0.5
    y_pred = get_pred_matrix_from_row(majority_labels, y_true)
    cells = [format_float(score) for score in get_scores(y_true, y_pred)]
    return " & ".join(cells) + " \\\\"

In [63]:
def subset_accuracy(y_true):
    """Score the most-frequent-label-set baseline and format it as a LaTeX table row.

    The label combination (whole row) that occurs most often in ``y_true`` is
    predicted for every sample. The weighted precision, recall and F1 of that
    constant prediction are returned as ``"P & R & F1 \\\\"``.
    """
    labelset_ranking = Counter(map(tuple, y_true)).most_common()
    most_frequent_labelset = labelset_ranking[0][0]
    y_pred = get_pred_matrix_from_row(most_frequent_labelset, y_true)
    cells = [format_float(score) for score in get_scores(y_true, y_pred)]
    return " & ".join(cells) + " \\\\"

In [104]:
def f_measure(y_true):
    """Find the constant label set with the best weighted F1 and format its scores.

    Every non-empty subset of label columns is tried as a constant prediction
    for all samples (2**n_labels - 1 candidates, so this is only practical for a
    handful of labels). The first subset reaching the highest weighted F1 wins;
    it is printed, and its weighted precision, recall and F1 are returned as a
    LaTeX table row ``"P & R & F1 \\\\"``.
    """
    n_labels = y_true.shape[1]
    # Enumerate subsets by increasing size; this order decides which subset is
    # kept when several reach the same score.
    candidate_subsets = itertools.chain.from_iterable(
        itertools.combinations(range(n_labels), size)
        for size in range(1, n_labels + 1)
    )

    max_f1_score = -1
    max_f1_subset = None
    for subset in candidate_subsets:
        subset_arr = np.zeros(n_labels)
        subset_arr[list(subset)] = 1

        y_pred = get_pred_matrix_from_row(subset_arr, y_true)
        # `beta` must be passed by keyword: recent scikit-learn releases declare
        # it keyword-only, so the positional form raises a TypeError there.
        weighted_f1_score = sklearn.metrics.fbeta_score(y_true, y_pred, beta=1, average='weighted')
        if weighted_f1_score > max_f1_score:
            max_f1_score = weighted_f1_score
            max_f1_subset = subset_arr

    print("Best label set:", max_f1_subset)
    y_pred = get_pred_matrix_from_row(max_f1_subset, y_true)
    p, r, f1 = get_scores(y_true, y_pred)
    p = format_float(p)
    r = format_float(r)
    f1 = format_float(f1)
    return f"{p} & {r} & {f1} \\\\"

In [106]:


# Print one LaTeX table row per baseline, in a fixed order.
for baseline_fn in (hamming_loss, subset_accuracy, f_measure):
    print(baseline_fn(y_true))

.70 & .86 & .77 \\
.70 & .86 & .77 \\
Best label set: [1. 1. 1. 1. 1. 1.]
.72 & .99 & .80 \\


## Phases

In [87]:
from phase import *

In [89]:
# Location of the phase-classification working directory on the shared machine.
working_dir_phase = '/home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/classification/phases/vw'
# Fail early, and say which path is missing, if the shared data is not mounted.
assert os.path.exists(working_dir_phase), f"Phase working directory not found: {working_dir_phase}"
phases_df_filepath = os.path.join(working_dir_phase, 'full_df.pkl')
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted location.
phases_df = pd.read_pickle(phases_df_filepath)
# Keep only the annotated rows. The filtered frame keeps the original index of
# `phases_df`; it is not renumbered.
annotated_df_phase = phases_df[phases_df.is_annotated]
# Drop the reference to the full frame; only the annotated slice is used from here on.
del phases_df
len(annotated_df_phase)

9336

In [107]:
n_samples = len(annotated_df_phase)
n_classes = len(phase_labels)
# One "<label>_score" column per phase label.
score_columns = [phase_label + "_score" for phase_label in phase_labels]
# Take every row and select by column only. `annotated_df_phase` is a filtered
# slice that keeps its parent's index, and `.loc` is label-based, so indexing
# the rows with range(n_samples) would only pick the right rows if the
# annotated rows happened to carry the labels 0..n_samples-1.
y_train_score = annotated_df_phase.loc[:, score_columns].values
# Binarize: a label counts as present when its score is above 0.5.
y_true = (y_train_score > 0.5).astype(int)
y_true.shape

(9336, 4)

In [108]:
# Print one LaTeX table row per baseline, in a fixed order.
for baseline_fn in (hamming_loss, subset_accuracy, f_measure):
    print(baseline_fn(y_true))

.74 & .86 & .79 \\
.74 & .86 & .79 \\
Best label set: [1. 1. 1. 1.]
.74 & .99 & .81 \\
