In [18]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from fairlearn.metrics import MetricFrame, selection_rate, count

def _load_split(pickle_path):
    """Load one pickled data split and derive the model inputs from it.

    Parameters
    ----------
    pickle_path : str
        Path to a pickle file readable by ``pd.read_pickle``. The loaded
        object is indexed by the 'TFIDF' and 'Sentiment' columns below.

    Returns
    -------
    tuple
        ``(data, features, labels)`` where ``data`` is the loaded object,
        ``features`` is a plain list built from the 'TFIDF' column, and
        ``labels`` is 1 where 'Sentiment' equals 'positive' and 0 otherwise.
    """
    # Hand pandas the path rather than an open() handle: pandas then opens and
    # closes the file itself, so no file descriptor is left dangling.
    # CAUTION: unpickling can execute arbitrary code -- only load trusted files.
    data = pd.read_pickle(pickle_path)

    # NOTE(review): the files are named *_emb.pkl but the column read here is
    # 'TFIDF' -- confirm this is the intended feature representation.
    features = list(data['TFIDF'])

    # Multiplying the boolean mask by 1 turns True/False into 1/0.
    labels = (data['Sentiment'] == 'positive') * 1
    return data, features, labels


train_data, train_X, train_y = _load_split("sentence-transformers/train_emb.pkl")
dev_data, dev_X, dev_y = _load_split("sentence-transformers/dev_emb.pkl")



### Prediction results

In [19]:
# Train a logistic-regression classifier (up to 500 solver iterations) on the
# training split; fit() returns the estimator, so build-and-train is chained.
lr = LogisticRegression(max_iter=500).fit(train_X, train_y)

# Predicted labels for the dev split; later cells reuse this array.
labels_predict = lr.predict(dev_X)

# Show the per-class report first, then the confusion matrix, one per line.
print(
    classification_report(dev_y, labels_predict),
    confusion_matrix(dev_y, labels_predict),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.71      0.68      0.69      2000
           1       0.69      0.72      0.70      2000

    accuracy                           0.70      4000
   macro avg       0.70      0.70      0.70      4000
weighted avg       0.70      0.70      0.70      4000

[[1358  642]
 [ 565 1435]]


### Bias Evaluation

#### 1. False positive rate + False negative rate

In [20]:
# Confusion matrix over the whole dev split (not broken down by group).
# The rates below treat row 0 as the actual-negative examples and row 1 as the
# actual-positive examples, with columns giving the predicted class.
confusion = confusion_matrix(dev_y, labels_predict)

# False positive rate: fraction of row 0 that falls in column 1.
print(confusion[0])
false_positive_rate = confusion[0, 1] / confusion[0].sum()
print("False positive rate:", false_positive_rate)

# False negative rate: fraction of row 1 that falls in column 0.
print(confusion[1])
false_negative_rate = confusion[1, 0] / confusion[1].sum()
print("False negative rate:", false_negative_rate)

[1358  642]
False positive rate: 0.321
[ 565 1435]
False negative rate: 0.2825


#### 2. Accuracy + Selection rate

In [21]:
# Metrics evaluated both overall and per value of the 'Demographic' column.
multi_metrics = {
    'accuracy': accuracy_score,
    'selection_rate': selection_rate,
    'count': count,
}

# MetricFrame applies every metric to the full dev split and to each subgroup
# identified by the sensitive feature.
gm = MetricFrame(
    metrics=multi_metrics,
    y_true=dev_y,
    y_pred=labels_predict,
    sensitive_features=dev_data['Demographic'],
)

# Overall values first, then the per-group table.
print(gm.overall, gm.by_group, sep="\n")

accuracy          0.69825
selection_rate    0.51925
count                4000
dtype: object
            accuracy selection_rate count
Demographic                              
AAE           0.6645         0.5655  2000
SAE            0.732          0.473  2000


#### 3. Demographic parity difference

In [22]:
from fairlearn.metrics import demographic_parity_difference

# Single summary number for the dev predictions, grouped by the 'Demographic'
# column; despite its name, `group_metrics` is formatted below as one scalar.
group_metrics = demographic_parity_difference(
    y_true=dev_y,
    y_pred=labels_predict,
    sensitive_features=dev_data['Demographic'],
)

# Caption line, then the value rounded to two decimal places.
print(
    "Difference between selection rate",
    "Demographic parity difference: {}".format(round(group_metrics, 2)),
    sep="\n",
)

Difference between selection rate
Demographic parity difference: 0.09
