In [4]:
import ast, random, pickle, pandas as pd, numpy as np

# Creating a Random Baseline

We don't want the baseline to be uniformly random; instead, it should reflect the distribution of labels found in the corpus. Therefore, we compute the relative frequency of each label in the training set and use it as the probability with which that label is (independently) predicted for every test paragraph.

In [5]:
# Read the train and test splits.
train_cleaned = pd.read_csv('data/dpm_train.csv')
test = pd.read_csv('data/dpm_test.csv')


def _labels_to_array(frame, column):
    """Parse a column of stringified label lists into an array of label arrays."""
    raw_cells = frame.loc[:, column].to_numpy()
    return np.array([np.array(ast.literal_eval(cell)) for cell in raw_cells])


# The CSV stores every label vector as a string, so each cell is parsed back
# into a list before stacking (lower level & higher level, train & test).
y_train_low = _labels_to_array(train_cleaned, 'label')
y_train_high = _labels_to_array(train_cleaned, 'higher level label')

y_test_low = _labels_to_array(test, 'label')
y_test_high = _labels_to_array(test, 'higher level label')

In [6]:
# Relative frequency of every label in the training set:
# column-wise label sums divided by the number of training rows.
n_train = len(train_cleaned)
dist_low = y_train_low.sum(axis=0) / n_train
dist_high = y_train_high.sum(axis=0) / n_train

print('Distribution of lower level labels:',dist_low)
print('Distribution of higher level labels:',dist_high)

Distribution of lower level labels: [0.70184049 0.19263804 0.2196319  0.22453988 0.19386503 0.45889571
 0.03680982]
Distribution of higher level labels: [0.71288344 0.36809816 0.54601227]


We now create a function that will generate random predictions in the shapes that we need for the lower/higher level labels:

In [7]:
def baseline_predictions(input_len, label_dist, seed=11):
    """ Function that creates distribution-based random predictions for our higher/lower level labels.

    Every label is drawn independently for every sample: label ``l`` is
    predicted as 1 when a uniform draw from [0, 1) is below ``label_dist[l]``
    and as 0 otherwise.

    Input:
        input_len: The length of our dataset (i.e. how many predictions have to be generated).
        label_dist: The distribution of labels (higher/lower level), one probability per label.
        seed: Seed for the private random generator. The default (11) yields the
            same draws as seeding the global ``random`` module with 11.
    Output:
        pred_collection: An integer array of shape (input_len, len(label_dist))
            containing "random" 0/1 predictions for every input paragraph.
    """
    # A private generator keeps the results reproducible without resetting the
    # global `random` state that other notebook cells may rely on.
    rng = random.Random(seed)
    n_labels = len(label_dist)
    # Draw order matters for reproducibility: sample by sample, label by label.
    pred_collection = [
        [1 if rng.random() < label_dist[l] else 0 for l in range(n_labels)]
        for _ in range(input_len)
    ]
    # The explicit reshape keeps the result 2-D even when input_len is 0.
    return np.asarray(pred_collection, dtype=int).reshape(input_len, n_labels)

Get "random" predictions for lower and higher level labels:

In [8]:
# One prediction row per test paragraph, for each label granularity.
input_len = len(test)

pred_low = baseline_predictions(input_len=input_len, label_dist=dist_low)
pred_high = baseline_predictions(input_len=input_len, label_dist=dist_high)

In [9]:
# save predictions for later comparison
# `with` closes each file even if pickle.dump raises, so no handle is leaked.
with open('data/pred_low_random.obj','wb') as pklobj:
    pickle.dump(pred_low,pklobj)
with open('data/pred_high_random.obj','wb') as pklobj:
    pickle.dump(pred_high,pklobj)

# Evaluation

In [10]:
# The evaluation helpers are imported here and then reloaded, so that edits
# made to ext_evaluation are picked up without restarting the kernel.
import dontpatronizeme.ext_evaluation
from importlib import reload
reload(dontpatronizeme.ext_evaluation)

# Sanity check: column-wise sums, i.e. how often each label was predicted.
print('Sum lower level predictions: ',pred_low.sum(axis=0))
print('Sum higher level predictions: ',pred_high.sum(axis=0))

Sum lower level predictions:  [120  36  44  35  36  78   6]
Sum higher level predictions:  [124  63 100]


In [11]:
# Score the random lower-level predictions against the lower-level test labels.
# NOTE(review): 'll' presumably selects the lower-level label names — confirm in ext_evaluation.
dontpatronizeme.ext_evaluation.evaluate(y_test_low, pred_low, 'll')

Unbalanced Power Relations
Accuracy: 0.632768361581921
Precision: 0.825
Recall: 0.6923076923076923
F1 Score: 0.752851711026616
Confusion Matrix: (tn, fp / fn, tp)
[[13 21]
 [44 99]]
--------------------------------------------------
Shallow Solution
Accuracy: 0.6779661016949152
Precision: 0.25
Recall: 0.23076923076923078
F1 Score: 0.24000000000000002
Confusion Matrix: (tn, fp / fn, tp)
[[111  27]
 [ 30   9]]
--------------------------------------------------
Presupposition
Accuracy: 0.6892655367231638
Precision: 0.38636363636363635
Recall: 0.37777777777777777
F1 Score: 0.38202247191011235
Confusion Matrix: (tn, fp / fn, tp)
[[105  27]
 [ 28  17]]
--------------------------------------------------
Authority Voice
Accuracy: 0.632768361581921
Precision: 0.22857142857142856
Recall: 0.17391304347826086
F1 Score: 0.19753086419753085
Confusion Matrix: (tn, fp / fn, tp)
[[104  27]
 [ 38   8]]
--------------------------------------------------
Metaphor
Accuracy: 0.7005649717514124
Precision: 0.

In [12]:
# Score the random higher-level predictions against the higher-level test labels.
# NOTE(review): 'hl' presumably selects the higher-level label names — confirm in ext_evaluation.
dontpatronizeme.ext_evaluation.evaluate(y_test_high, pred_high, 'hl')

The saviour
Accuracy: 0.6384180790960452
Precision: 0.8467741935483871
Recall: 0.7
F1 Score: 0.7664233576642335
Confusion Matrix: (tn, fp / fn, tp)
[[  8  19]
 [ 45 105]]
--------------------------------------------------
The expert
Accuracy: 0.536723163841808
Precision: 0.42857142857142855
Recall: 0.3698630136986301
F1 Score: 0.39705882352941174
Confusion Matrix: (tn, fp / fn, tp)
[[68 36]
 [46 27]]
--------------------------------------------------
The poet
Accuracy: 0.5480225988700564
Precision: 0.63
Recall: 0.5943396226415094
F1 Score: 0.6116504854368932
Confusion Matrix: (tn, fp / fn, tp)
[[34 37]
 [43 63]]
--------------------------------------------------
F1 Score Average: 0.5917108888768462


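It is somewhat surprising to see how well our random baseline performs. Because it predicts every label at roughly its corpus frequency, frequent labels such as "Unbalanced Power Relations" or "The saviour" reach a high F1 score by chance alone, whereas rare labels score poorly. This sets a higher standard for our actual models to compare to than we expected.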