**Imports**

In [1]:
from itertools import chain

import numpy as np
import pandas as pd
import torch
from huggingface_hub import login
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer

In [2]:
# Select the compute device: 'cuda' when a CUDA GPU is available, otherwise 'cpu'.
# eval_total moves every input batch to this device.

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [4]:
# Notebook-wide switch: True evaluates the category models (5 outputs),
# False evaluates the per-symptom models (21 outputs).
cats = False

**Test data**

In [5]:
# Test sentences with a 'Gender' column, per-symptom label columns and per-category label columns.
testdata = pd.read_csv("datasets/gendered_test_with_categories.csv")

In [6]:
#Beck's symptom categories
# Each list holds positional column indices into testdata (the symptom label
# columns sit at positions 3-23). The category evaluation cells subtract 3 from
# these values to index into a 21-wide prediction vector.

test_affective = [3, 6, 12, 15]
test_motivational = [4, 11]
test_cognitive = [14, 21]
test_cog_distortions = [5, 7, 8, 9, 10, 16]
test_behavioral = [13, 17, 19, 22]
test_physiological = [18, 20, 23]

# Parallel lists: test_symptom_cat[k] belongs to test_symptom_cat_names[k];
# the names are also used as column names of testdata.
test_symptom_cat = [test_affective, test_motivational, test_cognitive, test_cog_distortions, test_behavioral, test_physiological]
test_symptom_cat_names = ['Affective', 'Motivational', 'Cognitive', 'Cog_distortions', 'Behavioral', 'Physiological']

In [7]:
#Splitting into male and female subsets
# reset_index() (without drop=True) renumbers the rows from 0 and keeps the old
# row labels as a leading 'index' column, so every original column sits one
# position further right in these two frames than in testdata.
male = testdata[testdata['Gender']==0].reset_index()
female = testdata[testdata['Gender']==1].reset_index()

In [8]:
#login()

**Load models**

In [9]:
class MBERTClass(torch.nn.Module):
    """MentalBERT encoder followed by dropout and a single linear output layer.

    The width of the output layer follows the notebook-level ``cats`` flag:
    5 logits when ``cats`` is truthy, 21 logits otherwise. The class must be
    defined under this name before the pickled models are loaded with
    ``torch.load``.
    """

    def __init__(self):
        super().__init__()
        self.l1 = AutoModel.from_pretrained("mental/mental-bert-base-uncased")
        self.l2 = torch.nn.Dropout(0.2)
        # Single if/else so the head is always defined, whatever value `cats` holds.
        self.l3 = torch.nn.Linear(768, 5 if cats else 21)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw logits (no sigmoid applied) for one batch.

        Args:
            ids: token id tensor passed to the encoder.
            mask: attention mask tensor.
            token_type_ids: segment id tensor.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used (presumably the pooled sentence representation - TODO confirm).
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.l3(self.l2(pooled))

In [10]:
def _load_model(path):
    """Load one pickled model and place it on the notebook's `device`.

    torch.load unpickles a whole model object, which can execute arbitrary
    code: only load checkpoint files from a trusted source.
    NOTE(review): recent PyTorch releases default torch.load to
    weights_only=True, which rejects whole-model pickles; pass
    weights_only=False there if loading fails - confirm against the installed version.
    """
    # map_location keeps the model on the same device eval_total sends batches to,
    # including the case of GPU-saved checkpoints opened on a CPU-only machine.
    return torch.load(path, map_location=device)


if cats:
    category_model = _load_model('models/category-actual-fine-tuned-mBERT.pt')
    syn_model = _load_model('models/all-syn-eda-mBERT.pt')
    trans_model = _load_model('models/all-back_trans-mBERT.pt')
    sample_model = _load_model('models/all-sample-mBERT.pt')
    models = [category_model, syn_model, trans_model, sample_model]
    model_names = ['category_model', 'syn_model', 'trans_model', 'sample_model']
else:
    repl_model = _load_model('models/actual-fine-tuned-mBERT.pt')
    gendered_model = _load_model('models/gendered-mBERT.pt')
    aug_model = _load_model('models/irr-syn-eda-3-mBERT.pt')
    syn_model = _load_model('models/fail_social_syn_eda-mBERT.pt')
    trans_model = _load_model('models/fail_social_back_trans-mBERT.pt')
    sample_model = _load_model('models/fail_social_sample-mBERT.pt')
    models = [repl_model, gendered_model, aug_model, syn_model, trans_model, sample_model]
    model_names = ['repl_model', 'gendered_model', 'aug_model', 'syn_model', 'trans_model', 'sample_model']

**Create testing sets**

In [11]:
# The 21 symptom label columns (positions 3-23) used as targets when cats is False.
testdata.columns[3:24]

Index(['Sadness', 'Pessimism', 'Sense_of_failure', 'Loss_of_Pleasure',
       'Guilty_feelings', 'Sense_of_punishment', 'Self-dislike',
       'Self-incrimination', 'Suicidal_ideas', 'Crying', 'Agitation',
       'Social_withdrawal', 'Indecision', 'Feelings_of_worthlessness',
       'Loss_of_energy', 'Change_of_sleep', 'Irritability',
       'Changes_in_appetite', 'Concentration_difficulty',
       'Tiredness_or_fatigue', 'Loss_of_interest_in_sex'],
      dtype='object')

In [12]:
# Target columns, taken once from testdata: the five category columns at
# positions 26-30 when cats is set, otherwise the 21 symptom columns at 3-23.
label_columns = list(testdata.columns[26:31] if cats else testdata.columns[3:24])

# Selecting by name gives the same columns in male/female as the original
# hand-shifted positions (4:25 / 27:32) did, without depending on the extra
# 'index' column that reset_index() inserted in those frames.
testdata['list'] = testdata[label_columns].values.tolist()
male['list'] = male[label_columns].values.tolist()
female['list'] = female[label_columns].values.tolist()

# Two-column frames (sentence text + label vector) consumed by CustomDataset.
new_test = testdata[['Sentence', 'list']].copy()
new_male_test = male[['Sentence', 'list']].copy()
new_female_test = female[['Sentence', 'list']].copy()

In [13]:
# Token length every sentence is padded/truncated to by CustomDataset.
MAX_LEN = 128
# Tokenizer for the same checkpoint name that MBERTClass loads its encoder from.
tokenizer = AutoTokenizer.from_pretrained("mental/mental-bert-base-uncased")

# Fix the PyTorch random seed.
torch.manual_seed(10)

<torch._C.Generator at 0x2371f411b10>

In [14]:
#class to tokenize the data and create the dataset for the model

class CustomDataset(Dataset):
    """Tokenises sentences on demand and pairs them with their label vectors.

    Args:
        dataframe: frame with a 'Sentence' column (text) and a 'list' column
            (one list of label values per row).
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: length every sentence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        # Bracket access: attribute access (.list) only works while the column
        # name does not collide with a DataFrame attribute.
        self.text = dataframe['Sentence']
        self.targets = dataframe['list']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at *position* ``index``.

        Returns:
            dict with 'ids', 'mask', 'token_type_ids' (long tensors) and
            'targets' (float tensor).
        """
        # .iloc makes the lookup positional, so frames whose index is not
        # 0..n-1 (e.g. a filtered subset) still work with a DataLoader.
        text = " ".join(str(self.text.iloc[index]).split())  # collapse whitespace runs

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [15]:
# One dataset per evaluation group (all rows, male rows, female rows),
# all sharing the same tokenizer and maximum length.
testing_set, male_testing_set, female_testing_set = (
    CustomDataset(frame, tokenizer, MAX_LEN)
    for frame in (new_test, new_male_test, new_female_test)
)

In [16]:
# Names of the loaded models, in the same order as `models`.
model_names

['repl_model',
 'gendered_model',
 'aug_model',
 'syn_model',
 'trans_model',
 'sample_model']

**Evaluation and fairness measures**

In [17]:
# Shared loader settings. shuffle=False keeps predictions in the row order of
# the source frames, which the later cells rely on when indexing outputs by row.
test_params = dict(batch_size=32, shuffle=False, num_workers=0)

test_loader, male_test_loader, female_test_loader = (
    DataLoader(dataset, **test_params)
    for dataset in (testing_set, male_testing_set, female_testing_set)
)

In [18]:
#evaluation total

def eval_total(model, test_loader):
    """Run `model` over `test_loader`, print and return multi-label metrics.

    Logits are passed through a sigmoid and thresholded at 0.5.

    Returns:
        (targets, outputs, accuracy, f1_micro, f1_macro, f1_weighted,
        precision_macro, recall_macro), where `targets` is a list of int lists
        and `outputs` is a 0/1 integer numpy array.
    """
    model.eval()

    collected_targets, collected_probs = [], []
    with torch.no_grad():
        for batch in test_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            labels = batch['targets'].to(device, dtype=torch.float)
            logits = model(ids, mask, token_type_ids)
            collected_targets.extend(labels.cpu().detach().numpy().tolist())
            collected_probs.extend(torch.sigmoid(logits).cpu().detach().numpy().tolist())

    # Binarise predictions; cast the float labels back to ints.
    outputs = (np.array(collected_probs) >= 0.5).astype(int)
    targets = [[int(value) for value in row] for row in collected_targets]

    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro, f1_score_macro, f1_score_weighted = (
        metrics.f1_score(targets, outputs, average=averaging, zero_division=0.0)
        for averaging in ('micro', 'macro', 'weighted')
    )
    precision_macro = precision_score(targets, outputs, average='macro', zero_division=0.0)
    recall_macro = recall_score(targets, outputs, average='macro', zero_division=0.0)

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"F1 Score (Weighted) = {f1_score_weighted}")
    print(f"Precision (Macro) = {precision_macro}")
    print(f"Recall (Macro) = {recall_macro}")

    return (targets, outputs, accuracy, f1_score_micro, f1_score_macro,
            f1_score_weighted, precision_macro, recall_macro)

In [19]:
#Categories model F1 Score per category

def category_performance(targets, outputs):
    """Print and collect accuracy and F1 for each category column.

    Column names are read from testdata starting at position 26.

    Returns:
        (all_accuracy, all_f1_score): two lists with one value per category.
    """
    all_accuracy, all_f1_score = [], []
    for position in range(len(targets[0])):
        truth = [row[position] for row in targets]
        predicted = [row[position] for row in outputs]
        accuracy = metrics.accuracy_score(truth, predicted)
        f1_score = metrics.f1_score(truth, predicted, zero_division=0.0)

        print('\nCategory: ', testdata.columns[position + 26])
        print(f"Accuracy Score = {accuracy}")
        print(f"F1 Score = {f1_score}")

        all_accuracy.append(accuracy)
        all_f1_score.append(f1_score)
    return all_accuracy, all_f1_score

def symptom_performance(targets, outputs):
    """Print per-symptom accuracy, F1, precision, recall and (when defined) AUC.

    Column names are read from testdata starting at position 3. The AUC is
    computed from whatever values `outputs` holds; the callers in this notebook
    pass thresholded 0/1 predictions.

    Returns:
        (all_accuracy, all_f1_score): two lists with one value per symptom.
    """
    all_accuracy = []
    all_f1_score = []
    for i in range(len(targets[0])):
        symptom_targets = [symptom[i] for symptom in targets]
        symptom_outputs = [symptom[i] for symptom in outputs]
        accuracy = metrics.accuracy_score(symptom_targets, symptom_outputs)
        f1_score = metrics.f1_score(symptom_targets, symptom_outputs, zero_division = 0.0)
        print('\nSymptom: ', testdata.columns[i+3])
        print(f"Accuracy Score = {accuracy}")
        print(f"F1 Score = {f1_score}")

        precision = precision_score(symptom_targets, symptom_outputs, zero_division = 0.0)
        recall = recall_score(symptom_targets, symptom_outputs, zero_division = 0.0)

        print(f"Precision = {precision}")
        print(f"Recall = {recall}")

        try:
            auc = roc_auc_score(symptom_targets, symptom_outputs)
            print(f"AUC = {auc}")
        except (ValueError, TypeError):
            # `except ValueError or TypeError` evaluates to `except ValueError`;
            # the tuple form catches both. The AUC line is deliberately skipped
            # when the score cannot be computed for this symptom.
            pass
        all_accuracy.append(accuracy)
        all_f1_score.append(f1_score)
    return all_accuracy, all_f1_score

In [20]:
#function that returns the TPR and FPR rate, as well as the number of positive targets
def TPR_FPR(targets, outputs, smoothing=1):
    """Return the smoothed true/false positive rates and the positive count.

    Args:
        targets: sequence of 0/1 ground-truth labels.
        outputs: sequence of 0/1 predictions, same length as `targets`.
        smoothing: constant added to numerator and denominator of both rates
            (default 1, the value that was previously hard-coded). It keeps the
            rates defined and non-zero when a group has no positives or no
            negatives. With smoothing=0 an empty denominator raises
            ZeroDivisionError.

    Returns:
        (TP_rate, FP_rate, P) where
        TP_rate = (TP + smoothing) / (TP + FN + smoothing),
        FP_rate = (FP + smoothing) / (FP + TN + smoothing),
        P = TP + FN (number of positive targets).

    Raises:
        ValueError: if `targets` and `outputs` differ in length.
    """
    if len(targets) != len(outputs):
        raise ValueError(
            f"targets and outputs must have the same length, got {len(targets)} and {len(outputs)}"
        )

    TP = FN = FP = TN = 0
    for target, output in zip(targets, outputs):
        # Pairs holding anything other than 0/1 are not counted.
        if target == 1:
            if output == 1:
                TP += 1
            elif output == 0:
                FN += 1
        elif target == 0:
            if output == 1:
                FP += 1
            elif output == 0:
                TN += 1

    TP_rate = (TP + smoothing) / (FN + TP + smoothing)
    FP_rate = (FP + smoothing) / (FP + TN + smoothing)
    P = TP + FN

    return TP_rate, FP_rate, P

In [21]:
#category model equal opportunity and equalized odds
def category_equal_odds(male_targets, female_targets, male_outputs, female_outputs):
    """Print and collect female/male rate ratios for each category column.

    For every column, TPR_FPR is evaluated separately on the male and female
    rows. "Equal opportunity" is the female TP rate divided by the male TP
    rate, and "FP Difference" is the same ratio for the FP rates.

    Returns:
        (equal_opp_all, FP_diff_all, P_all): per-category TP-rate ratios,
        FP-rate ratios and positive-target counts (male + female).
    """
    def column(rows, position):
        return [row[position] for row in rows]

    equal_opp_all, FP_diff_all, P_all = [], [], []
    for position in range(len(male_targets[0])):
        print('Category: ', testdata.columns[position + 26])

        male_TP_rate, male_FP_rate, male_P = TPR_FPR(
            column(male_targets, position), column(male_outputs, position))
        female_TP_rate, female_FP_rate, female_P = TPR_FPR(
            column(female_targets, position), column(female_outputs, position))

        equal_opp = female_TP_rate / male_TP_rate
        FP_diff = female_FP_rate / male_FP_rate

        P_all.append(male_P + female_P)
        equal_opp_all.append(equal_opp)
        FP_diff_all.append(FP_diff)

        print('Equal opportunity: ', equal_opp)
        print('FP Difference: ', FP_diff)

    return equal_opp_all, FP_diff_all, P_all

def symptom_equal_odds(male_targets, female_targets, male_outputs, female_outputs):
    """Print and collect female/male rate ratios for each symptom column.

    For every column, TPR_FPR is evaluated separately on the male and female
    rows. "Equal opportunity" is the female TP rate divided by the male TP
    rate, and "FPR Diff" is the same ratio for the FP rates.

    Returns:
        (equal_opp_all, FP_diff_all, P_all): per-symptom TP-rate ratios,
        FP-rate ratios and positive-target counts (male + female).
    """
    def column(rows, position):
        return [row[position] for row in rows]

    equal_opp_all, FP_diff_all, P_all = [], [], []
    for position in range(len(male_targets[0])):
        print('Symptom: ', testdata.columns[position + 3])

        male_TP_rate, male_FP_rate, male_P = TPR_FPR(
            column(male_targets, position), column(male_outputs, position))
        female_TP_rate, female_FP_rate, female_P = TPR_FPR(
            column(female_targets, position), column(female_outputs, position))

        equal_opp = female_TP_rate / male_TP_rate
        FP_diff = female_FP_rate / male_FP_rate

        P_all.append(male_P + female_P)
        FP_diff_all.append(FP_diff)
        equal_opp_all.append(equal_opp)

        print('Equal opportunity: ', equal_opp)
        print('FPR Diff: ', FP_diff)
    return equal_opp_all, FP_diff_all, P_all

In [22]:
# Number of rows in the full test set.
len(testdata)

208

In [24]:
# Evaluate every loaded model on the full, male and female test sets, then print
# per-label performance and the female/male fairness ratios.
# After the loop, targets/outputs (and the male_/female_ variants) hold the
# results of the LAST model; the cells below read them.
for model_name, model in zip(model_names, models):
    print(model_name)

    print('Total')
    (targets, outputs, accuracy, f1_score_micro, f1_score_macro,
     f1_score_weighted, precision_macro, recall_macro) = eval_total(model, test_loader)
    print('Male performance')
    (male_targets, male_outputs, male_accuracy, male_f1_score_micro, male_f1_score_macro,
     male_f1_score_weighted, male_precision_macro, male_recall_macro) = eval_total(model, male_test_loader)
    print('Female performance')
    (female_targets, female_outputs, female_accuracy, female_f1_score_micro, female_f1_score_macro,
     female_f1_score_weighted, female_precision_macro, female_recall_macro) = eval_total(model, female_test_loader)

    if cats:
        print('Total')
        all_accuracy, all_f1_score = category_performance(targets, outputs)
        print('Male')
        male_all_accuracy, male_all_f1_score = category_performance(male_targets, male_outputs)
        print('Female')
        female_all_accuracy, female_all_f1_score = category_performance(female_targets, female_outputs)
        all_equal_opp, all_fp_diff, P_all = category_equal_odds(male_targets, female_targets, male_outputs, female_outputs)
    else:
        print('Total')
        all_accuracy, all_f1_score = symptom_performance(targets, outputs)
        print('Male')
        male_all_accuracy, male_all_f1_score = symptom_performance(male_targets, male_outputs)
        print('Female')
        female_all_accuracy, female_all_f1_score = symptom_performance(female_targets, female_outputs)
        all_equal_opp, all_fp_diff, P_all = symptom_equal_odds(male_targets, female_targets, male_outputs, female_outputs)

        # Average of the per-symptom ratios, each weighted by that symptom's
        # number of positive targets.
        len_P = sum(P_all)
        FP_category_add = sum(fp_ratio * positives for fp_ratio, positives in zip(all_fp_diff, P_all))
        equ_opp_category_add = sum(tpr_ratio * positives for tpr_ratio, positives in zip(all_equal_opp, P_all))

        FP_diffs = FP_category_add/len_P
        equ_opps = equ_opp_category_add/len_P
        print('FPR Difference: ', FP_diffs)
        print('Equal Opportunity: ', equ_opps)

repl_model
Total
Accuracy Score = 0.7980769230769231
F1 Score (Micro) = 0.8
F1 Score (Macro) = 0.5319605520035766
F1 Score (Weighted) = 0.7714181665611801
Precision (Macro) = 0.5730385487528344
Recall (Macro) = 0.5165260263086351
Male performance
Accuracy Score = 0.7738095238095238
F1 Score (Micro) = 0.8285714285714286
F1 Score (Macro) = 0.5640223092604044
F1 Score (Weighted) = 0.8157537964680821
Precision (Macro) = 0.5691609977324262
Recall (Macro) = 0.5791819291819292
Female performance
Accuracy Score = 0.8145161290322581
F1 Score (Micro) = 0.768
F1 Score (Macro) = 0.47942873300016153
F1 Score (Weighted) = 0.7129037364331481
Precision (Macro) = 0.49561087061087067
Recall (Macro) = 0.48494152046783623
Total

Symptom:  Sadness
Accuracy Score = 0.9519230769230769
F1 Score = 0.7916666666666666
Precision = 0.76
Recall = 0.8260869565217391
AUC = 0.8968272620446534

Symptom:  Pessimism
Accuracy Score = 0.9951923076923077
F1 Score = 0.9333333333333333
Precision = 0.875
Recall = 1.0
AUC = 0.9

KeyboardInterrupt: 

In [None]:
#category model weighted macro equal odds

# Reads all_fp_diff / all_equal_opp / P_all as left by the evaluation loop above
# (values of the last evaluated model). The names previously used here
# (FP_diff_all, equal_opp_all) are not defined at this point on a fresh run.
len_P = 0
FP_category_add = 0
equ_opp_category_add = 0

for fp_ratio, tpr_ratio, positives in zip(all_fp_diff, all_equal_opp, P_all):
    print(FP_category_add)  # running weighted sum before this label is added
    FP_category_add += fp_ratio*positives
    equ_opp_category_add += tpr_ratio*positives
    len_P += positives

# Weighted means: each ratio counts in proportion to its number of positive targets.
FP_diffs = FP_category_add/len_P
equ_opps = equ_opp_category_add/len_P
print('FPR Difference: ', FP_diffs)
print('Equal Opportunity: ', equ_opps)

In [19]:
#performance per symptom

# Uses targets/outputs of the last model evaluated in the loop above.
for i in range(len(targets[0])):
    symptom_targets = [row[i] for row in targets]
    symptom_outputs = [row[i] for row in outputs]

    f1_score = metrics.f1_score(symptom_targets, symptom_outputs, zero_division=0.0)
    precision = precision_score(symptom_targets, symptom_outputs, zero_division=0.0)
    recall = recall_score(symptom_targets, symptom_outputs, zero_division=0.0)

    print('\nSymptom: ', testdata.columns[i+3])
    print(f"F1 Score = {f1_score}")
    print(f"Precision = {precision}")
    print(f"Recall = {recall}")

    # The AUC line is skipped when roc_auc_score rejects this symptom's labels.
    try:
        auc = roc_auc_score(symptom_targets, symptom_outputs)
    except ValueError:
        continue
    print(f"AUC = {auc}")


Symptom:  Sadness
F1 Score = 0.7916666666666666
Precision = 0.76
Recall = 0.8260869565217391
AUC = 0.8968272620446534

Symptom:  Pessimism
F1 Score = 0.9333333333333333
Precision = 0.875
Recall = 1.0
AUC = 0.9975124378109453

Symptom:  Sense_of_failure
F1 Score = 0.8571428571428571
Precision = 0.8571428571428571
Recall = 0.8571428571428571
AUC = 0.9260838663823739

Symptom:  Loss_of_Pleasure
F1 Score = 0.8620689655172413
Precision = 0.8333333333333334
Recall = 0.8928571428571429
AUC = 0.9325396825396824

Symptom:  Guilty_feelings
F1 Score = 0.0
Precision = 0.0
Recall = 0.0
AUC = 0.5

Symptom:  Sense_of_punishment
F1 Score = 0.0
Precision = 0.0
Recall = 0.0

Symptom:  Self-dislike
F1 Score = 0.6666666666666666
Precision = 0.8333333333333334
Recall = 0.5555555555555556
AUC = 0.7752652149637075

Symptom:  Self-incrimination
F1 Score = 0.0
Precision = 0.0
Recall = 0.0

Symptom:  Suicidal_ideas
F1 Score = 0.9411764705882353
Precision = 1.0
Recall = 0.8888888888888888
AUC = 0.94444444444444

**Categories evaluation**

In [20]:
#Categories evaluation

# A category counts as predicted for a row when at least one of its member
# symptoms is predicted. test_symptom_cat holds testdata column positions, so
# 3 is subtracted to index into the prediction vector.
for category_name, symptoms in zip(test_symptom_cat_names, test_symptom_cat):
    symptom_targets = testdata[category_name]
    symptom_outputs = [
        int(any(outputs[ins][i-3] == 1 for i in symptoms))
        for ins in range(len(testdata))
    ]

    print('Category: ', category_name)

    accuracy = metrics.accuracy_score(symptom_targets, symptom_outputs)
    f1_score_micro = metrics.f1_score(symptom_targets, symptom_outputs, average='micro', zero_division = 0.0)
    f1_score_macro = metrics.f1_score(symptom_targets, symptom_outputs, average='macro', zero_division = 0.0)
    f1_score_weighted = metrics.f1_score(symptom_targets, symptom_outputs, average='weighted', zero_division = 0.0)
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"F1 Score (Weighted) = {f1_score_weighted}")

    precision_micro = precision_score(symptom_targets, symptom_outputs, average='micro', zero_division = 0.0)
    precision_macro = precision_score(symptom_targets, symptom_outputs, average='macro', zero_division = 0.0)
    recall_micro = recall_score(symptom_targets, symptom_outputs, average='micro', zero_division = 0.0)
    recall_macro = recall_score(symptom_targets, symptom_outputs, average='macro', zero_division = 0.0)

    print(f"Precision (Micro) = {precision_micro}")
    print(f"Precision (Macro) = {precision_macro}")
    print(f"Recall (Micro) = {recall_micro}")
    print(f"Recall (Macro) = {recall_macro}")

    # Guarded like the per-symptom cell: one category without a usable AUC must
    # not abort the remaining categories.
    try:
        auc_micro = roc_auc_score(symptom_targets, symptom_outputs, average='micro')
        print(f"AUC = {auc_micro}")
    except ValueError:
        print("AUC = undefined (roc_auc_score rejected these labels)")

Category:  Affective
Accuracy Score = 0.8894230769230769
F1 Score (Micro) = 0.8894230769230769
F1 Score (Macro) = 0.8632478632478633
F1 Score (Weighted) = 0.8914365548980933
Precision (Micro) = 0.8894230769230769
Precision (Macro) = 0.8516349977905435
Recall (Micro) = 0.8894230769230769
Recall (Macro) = 0.87825311942959
AUC = 0.87825311942959
Category:  Motivational
Accuracy Score = 0.9855769230769231
F1 Score (Micro) = 0.9855769230769231
F1 Score (Macro) = 0.9477167993297025
F1 Score (Weighted) = 0.9857908220811448
Precision (Micro) = 0.9855769230769231
Precision (Macro) = 0.9348958333333333
Recall (Micro) = 0.9855769230769231
Recall (Macro) = 0.9614853195164076
AUC = 0.9614853195164076
Category:  Cognitive
Accuracy Score = 0.9807692307692307
F1 Score (Micro) = 0.9807692307692307
F1 Score (Macro) = 0.8838637632607482
F1 Score (Weighted) = 0.9797491732165098
Precision (Micro) = 0.9807692307692307
Precision (Macro) = 0.9299999999999999
Recall (Micro) = 0.9807692307692307
Recall (Macro) 

In [21]:
#categories evaluation male

# A category counts as predicted for a row when at least one of its member
# symptoms is predicted. test_symptom_cat holds testdata column positions, so
# 3 is subtracted to index into the prediction vector.
for category_name, symptoms in zip(test_symptom_cat_names, test_symptom_cat):
    male_symptom_targets = male[category_name]
    male_symptom_outputs = [
        int(any(male_outputs[ins][i-3] == 1 for i in symptoms))
        for ins in range(len(male))
    ]

    print('Category: ', category_name)

    accuracy = metrics.accuracy_score(male_symptom_targets, male_symptom_outputs)
    f1_score_micro = metrics.f1_score(male_symptom_targets, male_symptom_outputs, average='micro', zero_division = 0.0)
    f1_score_macro = metrics.f1_score(male_symptom_targets, male_symptom_outputs, average='macro', zero_division = 0.0)
    f1_score_weighted = metrics.f1_score(male_symptom_targets, male_symptom_outputs, average='weighted', zero_division = 0.0)
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"F1 Score (Weighted) = {f1_score_weighted}")

    precision_micro = precision_score(male_symptom_targets, male_symptom_outputs, average='micro', zero_division = 0.0)
    precision_macro = precision_score(male_symptom_targets, male_symptom_outputs, average='macro', zero_division = 0.0)
    recall_micro = recall_score(male_symptom_targets, male_symptom_outputs, average='micro', zero_division = 0.0)
    recall_macro = recall_score(male_symptom_targets, male_symptom_outputs, average='macro', zero_division = 0.0)

    print(f"Precision (Micro) = {precision_micro}")
    print(f"Precision (Macro) = {precision_macro}")
    print(f"Recall (Micro) = {recall_micro}")
    print(f"Recall (Macro) = {recall_macro}")

    # Guarded like the per-symptom cell: one category without a usable AUC must
    # not abort the remaining categories.
    try:
        auc_micro = roc_auc_score(male_symptom_targets, male_symptom_outputs, average='micro')
        print(f"AUC = {auc_micro}")
    except ValueError:
        print("AUC = undefined (roc_auc_score rejected these labels)")

Category:  Affective
Accuracy Score = 0.9047619047619048
F1 Score (Micro) = 0.9047619047619048
F1 Score (Macro) = 0.8908382066276803
F1 Score (Weighted) = 0.906618397846468
Precision (Micro) = 0.9047619047619048
Precision (Macro) = 0.8783699059561129
Recall (Micro) = 0.9047619047619048
Recall (Macro) = 0.9091525423728815
AUC = 0.9091525423728815
Category:  Motivational
Accuracy Score = 0.9761904761904762
F1 Score (Micro) = 0.9761904761904762
F1 Score (Macro) = 0.922077922077922
F1 Score (Weighted) = 0.9761904761904762
Precision (Micro) = 0.9761904761904762
Precision (Macro) = 0.922077922077922
Recall (Micro) = 0.9761904761904762
Recall (Macro) = 0.922077922077922
AUC = 0.922077922077922
Category:  Cognitive
Accuracy Score = 1.0
F1 Score (Micro) = 1.0
F1 Score (Macro) = 1.0
F1 Score (Weighted) = 1.0
Precision (Micro) = 1.0
Precision (Macro) = 1.0
Recall (Micro) = 1.0
Recall (Macro) = 1.0
AUC = 1.0
Category:  Cog_distortions
Accuracy Score = 1.0
F1 Score (Micro) = 1.0
F1 Score (Macro) = 

In [22]:
#categories evaluation female

# A category counts as predicted for a row when at least one of its member
# symptoms is predicted. test_symptom_cat holds testdata column positions, so
# 3 is subtracted to index into the prediction vector.
for category_name, symptoms in zip(test_symptom_cat_names, test_symptom_cat):
    female_symptom_targets = female[category_name]
    female_symptom_outputs = [
        int(any(female_outputs[ins][i-3] == 1 for i in symptoms))
        for ins in range(len(female))
    ]

    print('Category: ', category_name)

    accuracy = metrics.accuracy_score(female_symptom_targets, female_symptom_outputs)
    f1_score_micro = metrics.f1_score(female_symptom_targets, female_symptom_outputs, average='micro', zero_division = 0.0)
    f1_score_macro = metrics.f1_score(female_symptom_targets, female_symptom_outputs, average='macro', zero_division = 0.0)
    f1_score_weighted = metrics.f1_score(female_symptom_targets, female_symptom_outputs, average='weighted', zero_division = 0.0)
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"F1 Score (Weighted) = {f1_score_weighted}")

    precision_micro = precision_score(female_symptom_targets, female_symptom_outputs, average='micro', zero_division = 0.0)
    precision_macro = precision_score(female_symptom_targets, female_symptom_outputs, average='macro', zero_division = 0.0)
    recall_micro = recall_score(female_symptom_targets, female_symptom_outputs, average='micro', zero_division = 0.0)
    recall_macro = recall_score(female_symptom_targets, female_symptom_outputs, average='macro', zero_division = 0.0)

    print(f"Precision (Micro) = {precision_micro}")
    print(f"Precision (Macro) = {precision_macro}")
    print(f"Recall (Micro) = {recall_micro}")
    print(f"Recall (Macro) = {recall_macro}")

    # Guarded like the per-symptom cell: one category without a usable AUC must
    # not abort the remaining categories.
    try:
        auc_micro = roc_auc_score(female_symptom_targets, female_symptom_outputs, average='micro')
        print(f"AUC = {auc_micro}")
    except ValueError:
        print("AUC = undefined (roc_auc_score rejected these labels)")

Category:  Affective
Accuracy Score = 0.8951612903225806
F1 Score (Micro) = 0.8951612903225806
F1 Score (Macro) = 0.8586832646620497
F1 Score (Weighted) = 0.8957403066029065
Precision (Micro) = 0.8951612903225806
Precision (Macro) = 0.8548387096774193
Recall (Micro) = 0.8951612903225806
Recall (Macro) = 0.8627659574468085
AUC = 0.8627659574468085
Category:  Motivational
Accuracy Score = 0.9758064516129032
F1 Score (Micro) = 0.9758064516129032
F1 Score (Macro) = 0.8935622317596567
F1 Score (Weighted) = 0.9750519174858093
Precision (Micro) = 0.9758064516129032
Precision (Macro) = 0.92002442002442
Recall (Micro) = 0.9758064516129032
Recall (Macro) = 0.8706896551724138
AUC = 0.8706896551724138
Category:  Cognitive
Accuracy Score = 0.9758064516129032
F1 Score (Micro) = 0.9758064516129032
F1 Score (Macro) = 0.6938271604938271
F1 Score (Weighted) = 0.9686977299880524
Precision (Micro) = 0.9758064516129032
Precision (Macro) = 0.9878048780487805
Recall (Micro) = 0.9758064516129032
Recall (Macro

In [21]:
#per symptom

#collecting the values for later aggregated category evaluation
# symptom_equal_odds runs the same per-symptom loop this cell used to spell out
# inline (same printed lines) and returns the three lists in this order.
equal_opps_all, FP_diff_all, P_all = symptom_equal_odds(
    male_targets, female_targets, male_outputs, female_outputs
)

Symptom:  Sadness
Equal opportunity:  0.9545454545454547
FPR Diff:  0.6260869565217392
Symptom:  Pessimism
Equal opportunity:  1.0
FPR Diff:  1.3278688524590165
Symptom:  Sense_of_failure
Equal opportunity:  0.75
FPR Diff:  1.3278688524590165
Symptom:  Loss_of_Pleasure
Equal opportunity:  0.85
FPR Diff:  0.5377358490566038
Symptom:  Guilty_feelings
Equal opportunity:  0.25
FPR Diff:  0.6967213114754098
Symptom:  Sense_of_punishment
Equal opportunity:  1.0
FPR Diff:  0.68
Symptom:  Self-dislike
Equal opportunity:  1.3125
FPR Diff:  0.3237704918032787
Symptom:  Self-incrimination
Equal opportunity:  1.0
FPR Diff:  0.68
Symptom:  Suicidal_ideas
Equal opportunity:  0.8571428571428571
FPR Diff:  0.6890756302521008
Symptom:  Crying
Equal opportunity:  1.5
FPR Diff:  0.6451612903225806
Symptom:  Agitation
Equal opportunity:  1.25
FPR Diff:  0.6532258064516129
Symptom:  Social_withdrawal
Equal opportunity:  0.6666666666666666
FPR Diff:  0.6422764227642277
Symptom:  Indecision
Equal opportunity

In [68]:
#category model weighted macro equal odds

# Reads FP_diff_all / equal_opps_all / P_all from the "per symptom" cell above.
# This cell previously referenced `equal_opp_all`, a name no cell defines at
# notebook level (the per-symptom cell creates `equal_opps_all`).
len_P = 0
test_index = []
FP_category_add = 0
equ_opp_category_add = 0

for i in range(len(targets[0])):
    print(FP_category_add)  # running weighted sum before label i is added
    FP_category_add += FP_diff_all[i]*P_all[i]
    equ_opp_category_add += equal_opps_all[i]*P_all[i]
    len_P += P_all[i]

# Weighted means: each ratio counts in proportion to its number of positive targets.
FP_diffs = FP_category_add/len_P
equ_opps = equ_opp_category_add/len_P
print('FPR Difference: ', FP_diffs)
print('Equal Opportunity: ', equ_opps)

0
48.63157894736842
88.63157894736842
91.89604175728579
107.69091355215758
FPR Difference:  0.9912853609945992
Equal Opportunity:  0.9296659580787677


In [47]:
#micro equal odds

# For each category, pool the labels of all member symptoms (one symptom column
# after another) and compute a single pair of rates per gender on the pooled data.
for category_name, symptom_columns in zip(test_symptom_cat_names, test_symptom_cat):
    test_index = list(symptom_columns)

    # Column positions refer to testdata, hence the -3 shift into the label vectors.
    male_symptom_targets = [row[j-3] for j in test_index for row in male_targets]
    female_symptom_targets = [row[j-3] for j in test_index for row in female_targets]
    male_symptom_outputs = [row[j-3] for j in test_index for row in male_outputs]
    female_symptom_outputs = [row[j-3] for j in test_index for row in female_outputs]

    print('Category: ', category_name)
    print('Symptoms: ', testdata.columns[test_index])

    # The third value returned by TPR_FPR is the count of positive targets.
    male_TP_rate, male_FP_rate, male_TP = TPR_FPR(male_symptom_targets, male_symptom_outputs)
    female_TP_rate, female_FP_rate, female_TP = TPR_FPR(female_symptom_targets, female_symptom_outputs)

    equal_opp = female_TP_rate/male_TP_rate
    FP_diff = female_FP_rate/male_FP_rate
    print('Equal opportunity: ', equal_opp)
    print('FPR Difference: ', FP_diff)

Category:  Affective
Symptoms:  Index(['Sadness', 'Loss_of_Pleasure', 'Crying', 'Indecision'], dtype='object')
Equal opportunity:  0.892156862745098
FPR Difference:  0.5726600985221675
Category:  Motivational
Symptoms:  Index(['Pessimism', 'Suicidal_ideas'], dtype='object')
Equal opportunity:  0.9
FPR Difference:  1.35
Category:  Cognitive
Symptoms:  Index(['Social_withdrawal', 'Concentration_difficulty'], dtype='object')
Equal opportunity:  0.4
FPR Difference:  0.6612244897959184
Category:  Cog_distortions
Symptoms:  Index(['Sense_of_failure', 'Guilty_feelings', 'Sense_of_punishment',
       'Self-dislike', 'Self-incrimination', 'Feelings_of_worthlessness'],
      dtype='object')
Equal opportunity:  0.6818181818181818
FPR Difference:  0.6680272108843538
Category:  Behavioral
Symptoms:  Index(['Agitation', 'Loss_of_energy', 'Irritability', 'Tiredness_or_fatigue'], dtype='object')
Equal opportunity:  1.0606060606060608
FPR Difference:  0.3333333333333333
Category:  Physiological
Symptom

In [48]:
# Macro equal odds: for every symptom category, print the unweighted mean of the
# per-symptom values stored in FP_diff_all and equal_opps_all.

# Pair each category with its display name directly instead of keeping a
# hand-incremented index in sync with the loop.
for category_name, category_columns in zip(test_symptom_cat_names, test_symptom_cat):
    # Category entries are testdata column positions (position 3 is 'Sadness').
    test_index = list(category_columns)
    FP_category_add = 0
    equ_opp_category_add = 0
    for column in category_columns:
        # column - 3 presumably maps a testdata column position onto the
        # per-symptom result lists -- TODO confirm against the cell that builds them.
        # Accumulation is kept explicit so the float summation order is unchanged.
        FP_category_add += FP_diff_all[column - 3]
        equ_opp_category_add += equal_opps_all[column - 3]
    len_category = len(test_index)
    FP_diffs = FP_category_add/len_category
    equ_opps = equ_opp_category_add/len_category
    print('Category: ', category_name)
    print('Symptoms: ', testdata.columns[test_index])
    print('FPR Difference: ', FP_diffs)
    print('Equal Opportunity: ', equ_opps)

Category:  Affective
Symptoms:  Index(['Sadness', 'Loss_of_Pleasure', 'Crying', 'Indecision'], dtype='object')
FPR Difference:  0.6264263518440834
Equal Opportunity:  0.8886363636363637
Category:  Motivational
Symptoms:  Index(['Pessimism', 'Suicidal_ideas'], dtype='object')
FPR Difference:  1.0084722413555587
Equal Opportunity:  0.9285714285714286
Category:  Cognitive
Symptoms:  Index(['Social_withdrawal', 'Concentration_difficulty'], dtype='object')
FPR Difference:  0.6626016260162603
Equal Opportunity:  0.5
Category:  Cog_distortions
Symptoms:  Index(['Sense_of_failure', 'Guilty_feelings', 'Sense_of_punishment',
       'Self-dislike', 'Self-incrimination', 'Feelings_of_worthlessness'],
      dtype='object')
FPR Difference:  0.726931077031553
Equal Opportunity:  0.8854166666666666
Category:  Behavioral
Symptoms:  Index(['Agitation', 'Loss_of_energy', 'Irritability', 'Tiredness_or_fatigue'], dtype='object')
FPR Difference:  0.4723198694972889
Equal Opportunity:  1.0337962962962963
Cat

In [49]:
# Weighted macro equal odds: for every symptom category, print the mean of the
# per-symptom values in FP_diff_all and equal_opps_all, weighted by P_all
# (presumably the per-symptom positive counts -- TODO confirm).

# Pair each category with its display name directly instead of keeping a
# hand-incremented index in sync with the loop.
for category_name, category_columns in zip(test_symptom_cat_names, test_symptom_cat):
    # Category entries are testdata column positions (position 3 is 'Sadness').
    test_index = list(category_columns)
    len_P = 0
    FP_category_add = 0
    equ_opp_category_add = 0
    for column in category_columns:
        # column - 3 presumably maps a testdata column position onto the
        # per-symptom result lists -- TODO confirm.
        weight = P_all[column - 3]
        FP_category_add += FP_diff_all[column - 3]*weight
        equ_opp_category_add += equal_opps_all[column - 3]*weight
        len_P += weight
    # NOTE(review): the guard tests the weighted sum, not the divisor len_P. It
    # avoids dividing by a zero total weight only because every term is then
    # zero too, and it also reports 0 when the sum is NaN. Kept as-is so the
    # printed results do not change.
    FP_diffs = FP_category_add/len_P if FP_category_add > 0 else 0
    equ_opps = equ_opp_category_add/len_P if equ_opp_category_add > 0 else 0
    print('Category: ', category_name)
    print('Symptoms: ', testdata.columns[test_index])
    print('FPR Difference: ', FP_diffs)
    print('Equal Opportunity: ', equ_opps)

Category:  Affective
Symptoms:  Index(['Sadness', 'Loss_of_Pleasure', 'Crying', 'Indecision'], dtype='object')
FPR Difference:  0.5902955908324437
Equal Opportunity:  0.9250757575757577
Category:  Motivational
Symptoms:  Index(['Pessimism', 'Suicidal_ideas'], dtype='object')
FPR Difference:  0.9685476649676266
Equal Opportunity:  0.9196428571428571
Category:  Cognitive
Symptoms:  Index(['Social_withdrawal', 'Concentration_difficulty'], dtype='object')
FPR Difference:  0.6533628972653364
Equal Opportunity:  0.5757575757575757
Category:  Cog_distortions
Symptoms:  Index(['Sense_of_failure', 'Guilty_feelings', 'Sense_of_punishment',
       'Self-dislike', 'Self-incrimination', 'Feelings_of_worthlessness'],
      dtype='object')
FPR Difference:  0.7318878900052882
Equal Opportunity:  0.9505208333333334
Category:  Behavioral
Symptoms:  Index(['Agitation', 'Loss_of_energy', 'Irritability', 'Tiredness_or_fatigue'], dtype='object')
FPR Difference:  0.48599144533284316
Equal Opportunity:  1.145

**Exploration of high equal odds/equal opportunity results**

In [52]:
# Print sentence, 'Tiredness_or_fatigue' column and 'Gender' column for every
# test row whose prediction vector has a 1 at index 19. Index 19 corresponds to
# testdata column 22 ('Tiredness_or_fatigue') under the column - 3 offset used
# by the category loops in this notebook.
for row_pos, predicted in enumerate(outputs):
    if predicted[19] != 1:
        continue
    row = testdata.iloc[row_pos]
    print(row['Sentence'], row['Tiredness_or_fatigue'], row['Gender'])

I'm sorry this is so badly written i just pulled an all nighter and now I'm tired lol 1 1
Omfg I feel enlightened 0 1
i burnt out and i still am unmotivated and feeling like shit. 1 0
I have sleeping problems and thats why i usually skip school(thats why im stressed all the time). 1 0
Unmotivated, lazy, over emotional, feeling like shit, i over eat or eat too little, nothing is enjoyable or interesting. 1 0
If i have a good night sleep, I love to wake up early and do my own projects or something but if i dont get a good night sleep(more likely to happen) im unmotivated an lazy and feeling like shit. 1 0
Also i dont feel anxious when i meditate. 0 0
At the moment i dont have any good meds and i feel super awful and unmotivated. 0 0
I’m tired, and a little dizzy I’m hoping they subside. 0 0


In [53]:
# Display every test row whose 'Tiredness_or_fatigue' column equals 1.
testdata.loc[testdata['Tiredness_or_fatigue'].eq(1)]

Unnamed: 0,Sentence,Label,Severity,Sadness,Pessimism,Sense_of_failure,Loss_of_Pleasure,Guilty_feelings,Sense_of_punishment,Self-dislike,...,Loss_of_energy,Change_of_sleep,Irritability,Changes_in_appetite,Concentration_difficulty,Tiredness_or_fatigue,Loss_of_interest_in_sex,Subject,Gender,list
33,I'm sorry this is so badly written i just pull...,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,Subject 6900,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
62,i burnt out and i still am unmotivated and fee...,1,2,1,1,0,1,0,0,0,...,1,0,0,0,1,1,0,Subject 2961,0,"[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, ..."
74,I have sleeping problems and thats why i usual...,1,1,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,Subject 2961,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
81,"Unmotivated, lazy, over emotional, feeling lik...",1,1,0,0,0,1,0,0,0,...,0,0,0,1,0,1,0,Subject 2961,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
88,"If i have a good night sleep, I love to wake u...",1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,Subject 2961,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
94,Also i started smoking because i coudnt handle...,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,Subject 2961,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [79]:
def TP_FP_FN(targets, outputs):
    """Count true positives, false positives and false negatives for one binary label.

    Args:
        targets: Sequence of ground-truth values, where 1 marks a positive and
            0 a negative.
        outputs: Sequence of predicted values, aligned element-wise with
            ``targets``.

    Returns:
        A tuple ``(TP, FP, FN)`` of integer counts. Pairs whose values are not
        0/1 in one of the three counted combinations are ignored.

    Raises:
        ValueError: If ``targets`` and ``outputs`` differ in length. Counting
            over misaligned sequences would silently produce wrong metrics.
    """
    if len(targets) != len(outputs):
        raise ValueError(
            "targets and outputs must have the same length, got "
            f"{len(targets)} and {len(outputs)}"
        )

    TP = 0
    FP = 0
    FN = 0
    for target, output in zip(targets, outputs):
        if target == 1 and output == 1:
            TP += 1
        elif target == 1 and output == 0:
            FN += 1
        elif target == 0 and output == 1:
            FP += 1
    return TP, FP, FN

In [86]:
# Per symptom category, sum the TP / FP / FN counts returned by TP_FP_FN over
# the category's symptoms, separately for the male_* and female_* data.

# Pair each category with its display name directly instead of keeping a
# hand-incremented index in sync with the loop.
for category_name, category_columns in zip(test_symptom_cat_names, test_symptom_cat):
    # Category entries are testdata column positions (position 3 is 'Sadness').
    test_index = list(category_columns)
    male_TP = male_FP = male_FN = 0
    female_TP = female_FP = female_FN = 0
    for column in category_columns:
        # column - 3 presumably maps a testdata column position onto the
        # label/prediction vectors -- TODO confirm.
        label_idx = column - 3
        male_symptom_targets = [row[label_idx] for row in male_targets]
        male_symptom_outputs = [row[label_idx] for row in male_outputs]

        female_symptom_targets = [row[label_idx] for row in female_targets]
        female_symptom_outputs = [row[label_idx] for row in female_outputs]

        male_symptom_TP, male_symptom_FP, male_symptom_FN = TP_FP_FN(male_symptom_targets, male_symptom_outputs)
        female_symptom_TP, female_symptom_FP, female_symptom_FN = TP_FP_FN(female_symptom_targets, female_symptom_outputs)

        male_TP += male_symptom_TP
        male_FP += male_symptom_FP
        male_FN += male_symptom_FN
        female_TP += female_symptom_TP
        female_FP += female_symptom_FP
        female_FN += female_symptom_FN
    print('Category: ', category_name)
    print('Symptoms: ', testdata.columns[test_index])
    print('Male TP, FP, FN: ', male_TP, male_FP, male_FN)
    print('Female TP, FP, FN: ', female_TP, female_FP, female_FN)

Category:  affective
Symptoms:  Index(['Sadness', 'Loss_of_Pleasure', 'Crying', 'Indecision'], dtype='object')
Male TP, FP, FN:  23 6 4
Female TP, FP, FN:  25 5 8
Category:  motivational
Symptoms:  Index(['Pessimism', 'Suicidal_ideas'], dtype='object')
Male TP, FP, FN:  7 0 0
Female TP, FP, FN:  8 1 1
Category:  cognitive
Symptoms:  Index(['Social_withdrawal', 'Concentration_difficulty'], dtype='object')
Male TP, FP, FN:  7 0 0
Female TP, FP, FN:  1 0 3
Category:  cog_distortions
Symptoms:  Index(['Sense_of_failure', 'Guilty_feelings', 'Sense_of_punishment',
       'Self-dislike', 'Self-incrimination', 'Feelings_of_worthlessness'],
      dtype='object')
Male TP, FP, FN:  11 1 3
Female TP, FP, FN:  5 1 5
Category:  behavioral
Symptoms:  Index(['Agitation', 'Loss_of_energy', 'Irritability', 'Tiredness_or_fatigue'], dtype='object')
Male TP, FP, FN:  10 5 3
Female TP, FP, FN:  9 2 2
Category:  physiological
Symptoms:  Index(['Change_of_sleep', 'Changes_in_appetite', 'Loss_of_interest_in_se

In [82]:
# TP / FP / FN counts for a single symptom, computed separately on the male_*
# and female_* data. Index 19 corresponds to testdata column 22
# ('Tiredness_or_fatigue') under the column - 3 offset used by the category loops.
symptom_idx = 19

male_symptom_targets = [labels[symptom_idx] for labels in male_targets]
male_symptom_outputs = [preds[symptom_idx] for preds in male_outputs]
male_symptom_TP, male_symptom_FP, male_symptom_FN = TP_FP_FN(male_symptom_targets, male_symptom_outputs)

female_symptom_targets = [labels[symptom_idx] for labels in female_targets]
female_symptom_outputs = [preds[symptom_idx] for preds in female_outputs]
female_symptom_TP, female_symptom_FP, female_symptom_FN = TP_FP_FN(female_symptom_targets, female_symptom_outputs)

In [83]:
# Show the most recently computed female TP, FP and FN counts (in that order).
print(female_symptom_TP, female_symptom_FP, female_symptom_FN)

1 1 0


**Analysis of wrong predictions**

In [54]:
# Collect every test sentence whose full prediction vector differs from its
# target vector. Each record is [sentence, target, prediction-as-list].
sentences = testdata['Sentence']
wrong_predictions = []

for i, output in enumerate(outputs):
    target = targets[i]
    if np.array_equal(output, target):
        # Exact match on every label: not a wrong prediction.
        continue
    wrong_predictions.append([sentences[i], target, output.tolist()])

In [55]:
# Number of test sentences with at least one mismatching label.
len(wrong_predictions)

42

In [56]:
# Tabulate the mismatches; each record in wrong_predictions is
# [sentence, target, prediction], matching the three column names below.
wrong_predictions_df = pd.DataFrame(wrong_predictions, columns=['Sentence', 'Target', 'Prediction'])

In [57]:
# Print how often each distinct value occurs in every column of the
# wrong-prediction table.
def _as_hashable(cell):
    """Return list / ndarray cells as tuples so they can be counted; pass others through."""
    if isinstance(cell, np.ndarray):
        cell = cell.tolist()
    return tuple(cell) if isinstance(cell, list) else cell


for name in wrong_predictions_df.columns:
    # 'Prediction' holds Python lists (built with output.tolist()), which are
    # unhashable: calling value_counts() on them directly raises TypeError.
    # String columns such as 'Sentence' pass through unchanged.
    count = wrong_predictions_df[name].map(_as_hashable).value_counts()
    print(count)

Sentence
I try to pay attention, I really do, but I just naturally zone out in conversations a lot or forget to do things.                                                                                      1
(im not suicidal tho, and i feel sad and broken reading your message.                                                                                                                                  1
I cant finnish anything [help please]                                                                                                                                                                  1
If i have a good night sleep, I love to wake up early and do my own projects or something but if i dont get a good night sleep(more likely to happen) im unmotivated an lazy and feeling like shit.    1
Also i dont feel anxious when i meditate.                                                                                                                                                  