**Imports**

In [26]:
import pandas as pd
import numpy as np
import torch
from huggingface_hub import login

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, roc_auc_score, multilabel_confusion_matrix, ConfusionMatrixDisplay

from itertools import chain

from scipy import stats

import math

try:
    import cPickle as pickle
except ImportError:  # Python 3.x
    import pickle

In [27]:
# Choose the compute device once: the GPU when CUDA is available, otherwise the CPU.
# Later cells read the module-level `device` string when moving batches.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [28]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

**Test data**

In [29]:
import io

class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that rebuilds pickled torch storages on the CPU.

    Whenever the pickle stream asks for ``torch.storage._load_from_bytes``,
    a replacement loader is returned that calls ``torch.load`` with
    ``map_location='cpu'``. Every other lookup is delegated to
    ``pickle.Unpickler`` unchanged.

    NOTE(review): presumably used so a history pickled in a CUDA session can
    be opened on a CPU-only machine -- confirm against how the file was saved.
    Unpickling can execute arbitrary code; only use this on trusted files.
    """

    def find_class(self, module, name):
        """Resolve ``module.name`` for the unpickler, intercepting torch's byte loader."""
        is_torch_byte_loader = (module, name) == ('torch.storage', '_load_from_bytes')
        if not is_torch_byte_loader:
            return super().find_class(module, name)

        def _load_on_cpu(b):
            # Same call the stream would have made, but forced onto the CPU.
            return torch.load(io.BytesIO(b), map_location='cpu')

        return _load_on_cpu

In [96]:
# Open the pickled cross-validation history for the model being evaluated.
# The evaluation loop below reads history['val'] (row positions per fold)
# and history['models'] (one state dict per fold).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('9-CV-Category-history.p', 'rb') as history_file:
    history = CPU_Unpickler(history_file).load()

In [97]:
# The training file depends on which model history was loaded above, because
# the fold indices in the history are row positions into this frame.
# Swap in 'all_CV_sample.csv' when that is the data set the model was built on.
TRAIN_CSV = 'all-gendered.csv'
train = pd.read_csv(TRAIN_CSV)

In [None]:
#login()

**Load models**

In [11]:
class MBERTClass(torch.nn.Module):
    """Pretrained encoder followed by dropout and a linear multi-label head.

    The attribute names ``l1``/``l2``/``l3`` are kept as-is because saved
    state dicts are keyed by them.

    Args:
        num_labels: Number of output logits. Defaults to 6 (the category
            model); pass 21 for the larger label set instead of editing the
            class.
        model_name: Hugging Face checkpoint used for the encoder.
        dropout: Dropout probability applied before the linear head.
    """

    def __init__(self, num_labels=6, model_name="mental/mental-bert-base-uncased", dropout=0.2):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Size the head from the encoder config instead of hard-coding 768,
        # so a different encoder width does not silently mismatch.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (pre-sigmoid) logits for a batch of encoded sentences."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used (for BERT models this is the pooled output -- TODO confirm
        # if a non-BERT checkpoint is ever passed as model_name).
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

**Create testing sets**

In [12]:
# Tokenisation settings shared by every dataset built below.
MAX_LEN = 128  # sequences are padded/truncated to this many tokens
TOKENIZER_NAME = "mental/mental-bert-base-uncased"
SEED = 10

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

torch.manual_seed(SEED)

<torch._C.Generator at 0x1c3ff435af0>

In [13]:
#class to tokenize the data and create the dataset for the model

class CustomDataset(Dataset):
    """Torch dataset that tokenises the ``Sentence`` column on access.

    Args:
        dataframe: Frame with a ``Sentence`` column (text) and a ``list``
            column (one list of label values per row).
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length every encoded sentence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        # Bracket access avoids any clash between column names and DataFrame attributes.
        self.text = dataframe['Sentence']
        self.targets = dataframe['list']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded sentence and float targets for the row at position ``index``."""
        # DataLoader samplers hand out positions 0..len-1, so look rows up by
        # position. Label-based lookup (``series[index]``) raises KeyError or
        # returns the wrong row when the frame's index is not 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

**Evaluation and fairness measures**

In [14]:
#evaluation total

def eval_total(model, test_loader):
    """Run ``model`` over ``test_loader`` and print/return multi-label metrics.

    Each batch is moved to the module-level ``device``. Predictions are the
    sigmoid of the model output thresholded at 0.5.

    Args:
        model: Callable as ``model(ids, mask, token_type_ids)``.
        test_loader: Iterable of batches with ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` tensors.

    Returns:
        Tuple ``(targets, outputs, accuracy, f1_micro, f1_macro, f1_weighted,
        precision_macro, recall_macro)``; ``targets`` is a list of int lists
        and ``outputs`` an integer numpy array of 0/1 predictions.
    """
    model.eval()

    collected_targets = []
    collected_probabilities = []
    with torch.no_grad():
        for batch in test_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            batch_targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)

            collected_targets.extend(batch_targets.cpu().tolist())
            collected_probabilities.extend(torch.sigmoid(logits).cpu().tolist())

    # Binarise the probabilities and cast the float targets back to ints.
    outputs = (np.array(collected_probabilities) >= 0.5).astype(int)
    targets = [[int(value) for value in row] for row in collected_targets]

    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro, f1_score_macro, f1_score_weighted = (
        metrics.f1_score(targets, outputs, average=averaging, zero_division=0.0)
        for averaging in ('micro', 'macro', 'weighted')
    )
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"F1 Score (Weighted) = {f1_score_weighted}")

    precision_macro = precision_score(targets, outputs, average='macro', zero_division=0.0)
    recall_macro = recall_score(targets, outputs, average='macro', zero_division=0.0)
    print(f"Precision (Macro) = {precision_macro}")
    print(f"Recall (Macro) = {recall_macro}")

    return (
        targets,
        outputs,
        accuracy,
        f1_score_micro,
        f1_score_macro,
        f1_score_weighted,
        precision_macro,
        recall_macro,
    )

In [15]:
#Categories model F1 Score per category

def category_performance(targets, outputs, category_names=None):
    """Print and collect accuracy and F1 for each label column separately.

    Args:
        targets: Sequence of per-sample label rows (0/1 values).
        outputs: Sequence of per-sample prediction rows, aligned with ``targets``.
        category_names: Optional names to print, one per label column. When
            omitted, falls back to the module-level ``testdata`` frame's
            columns from position 26 onward (the previous hard-wired behaviour).

    Returns:
        ``(all_accuracy, all_f1_score)``, two lists with one entry per label
        column, or ``None`` (after printing 'No Targets') when ``targets`` is
        empty or ``None``. Callers that unpack the result must check for
        emptiness first.
    """
    # Explicit emptiness check instead of a bare ``except`` around targets[0],
    # which would also hide unrelated errors and KeyboardInterrupt.
    if targets is None or len(targets) == 0:
        print('No Targets')
        return None
    print(targets[0])

    if category_names is None:
        category_names = testdata.columns[26:]

    all_accuracy = []
    all_f1_score = []
    for i in range(len(targets[0])):
        # Slice out column i across all samples.
        category_targets = [row[i] for row in targets]
        category_outputs = [row[i] for row in outputs]
        accuracy = metrics.accuracy_score(category_targets, category_outputs)
        f1_score = metrics.f1_score(category_targets, category_outputs, zero_division=0.0)
        print('\nCategory: ', category_names[i])
        print(f"Accuracy Score = {accuracy}")
        print(f"F1 Score = {f1_score}")
        all_accuracy.append(accuracy)
        all_f1_score.append(f1_score)
    return all_accuracy, all_f1_score

In [16]:
#function that returns the TPR and FPR rate, as well as the number of positive targets
def TPR_FPR(targets, outputs):
    """Return smoothed true/false positive rates and the positive-target count.

    Both rates use add-one smoothing: 1 is added to the numerator and to the
    denominator, so the rates are defined even when a class is absent.

    Args:
        targets: Sequence of 0/1 ground-truth labels.
        outputs: Sequence of 0/1 predictions; its length drives the iteration.

    Returns:
        ``(TP_rate, FP_rate, P)`` where ``P`` is TP + FN.
    """
    smoothing = 1

    # Pair each prediction with its target, indexed by the prediction list.
    pairs = [(targets[idx], outputs[idx]) for idx in range(len(outputs))]

    true_pos = sum(1 for actual, predicted in pairs if actual == 1 and predicted == 1)
    false_neg = sum(1 for actual, predicted in pairs if actual == 1 and predicted == 0)
    false_pos = sum(1 for actual, predicted in pairs if actual == 0 and predicted == 1)
    true_neg = sum(1 for actual, predicted in pairs if actual == 0 and predicted == 0)

    positives = true_pos + false_neg
    tp_rate = (true_pos + smoothing) / (false_neg + true_pos + smoothing)
    fp_rate = (false_pos + smoothing) / (false_pos + true_neg + smoothing)

    return tp_rate, fp_rate, positives

In [17]:
#category model equal opportunity and equalized odds
def category_equal_odds(male_targets, female_targets, male_outputs, female_outputs, category_names=None):
    """Compare male and female TPR/FPR for every label column.

    For each column the smoothed rates come from ``TPR_FPR``. Both reported
    fairness values are male/female *ratios*: ``equal_opp`` is the TPR ratio
    and ``FP_diff`` (printed as 'FPR Difference') is the FPR ratio.

    Args:
        male_targets, female_targets: Per-sample label rows for each group.
        male_outputs, female_outputs: Per-sample prediction rows for each group.
        category_names: Optional names to print, one per label column. When
            omitted, falls back to the module-level ``testdata`` frame's
            columns from position 26 onward (the previous hard-wired behaviour).

    Returns:
        ``(all_equal_opp, all_fp_diff, all_m_TP, all_m_FP, all_f_TP, all_f_FP,
        P_all, m_P_all, f_P_all)``, nine lists with one entry per label column,
        or ``None`` (after printing 'No Targets') when ``male_targets`` is
        empty or ``None``.
    """
    # Explicit emptiness check instead of a bare ``except`` around male_targets[0],
    # which would also hide unrelated errors and KeyboardInterrupt.
    if male_targets is None or len(male_targets) == 0:
        print('No Targets')
        return None
    print(male_targets[0])

    if category_names is None:
        category_names = testdata.columns[26:]

    P_all = []
    m_P_all = []
    f_P_all = []
    all_m_TP = []
    all_m_FP = []
    all_f_TP = []
    all_f_FP = []
    all_equal_opp = []
    all_fp_diff = []

    for i in range(len(male_targets[0])):
        print('Category: ', category_names[i])
        # Slice out column i for each group.
        male_category_targets = [row[i] for row in male_targets]
        female_category_targets = [row[i] for row in female_targets]
        male_category_outputs = [row[i] for row in male_outputs]
        female_category_outputs = [row[i] for row in female_outputs]

        male_TP_rate, male_FP_rate, male_P = TPR_FPR(male_category_targets, male_category_outputs)
        female_TP_rate, female_FP_rate, female_P = TPR_FPR(female_category_targets, female_category_outputs)

        P_all.append(male_P + female_P)
        m_P_all.append(male_P)
        f_P_all.append(female_P)

        # The smoothing in TPR_FPR keeps the female rates strictly positive,
        # so these divisions cannot hit zero.
        equal_opp = male_TP_rate / female_TP_rate
        FP_diff = male_FP_rate / female_FP_rate
        print('Equal opportunity: ', equal_opp)
        print('FPR Difference: ', FP_diff)

        all_equal_opp.append(equal_opp)
        all_fp_diff.append(FP_diff)
        all_m_TP.append(male_TP_rate)
        all_m_FP.append(male_FP_rate)
        all_f_TP.append(female_TP_rate)
        all_f_FP.append(female_FP_rate)
    return all_equal_opp, all_fp_diff, all_m_TP, all_m_FP, all_f_TP, all_f_FP, P_all, m_P_all, f_P_all

In [98]:
# Evaluate every cross-validation fold: overall metrics, metrics per gender,
# per-category metrics, and the male/female fairness measures.
all_targets = []
all_outputs = []
performance = {'targets':[], 'outputs':[], 'prec':[], 'recall':[], 'm_acc':[], 'm_f1_micro':[], 'm_f1_macro':[],
'm_f1_weighted': [], 'm_prec':[], 'm_recall':[], 'f_acc':[], 'f_f1_micro':[], 
'f_f1_macro':[], 'f_f1_weighted': [], 'f_prec':[], 'f_recall':[], 'category_acc':[],
'category_f1':[], 'm_category_acc':[], 'm_category_f1':[], 'f_category_acc':[],
'f_category_f1':[], 'category_equ_opp': [], 'category_fpr':[], 'Male_TP':[], 
'Male_FP':[], 'Female_TP': [], 'Female_FP':[], 'P':[], 'm_P': [], 'f_P': []}

# Loader settings are identical for every fold and every subgroup.
test_params = {'batch_size': 32,
            'shuffle': False,
            'num_workers': 0
            }


def _make_loader(frame):
    """Wrap the Sentence/list columns of ``frame`` in a non-shuffled DataLoader."""
    dataset = CustomDataset(frame[['Sentence', 'list']].copy(), tokenizer, MAX_LEN)
    return DataLoader(dataset, **test_params)


def _nan_list(length):
    """Fresh list of NaN placeholders, used when a metric cannot be computed for a fold."""
    return [float('nan')] * length


def _category_scores(group_targets, group_outputs, n_categories, group_name):
    """Per-category accuracy/F1 for one group, or NaN placeholders when the group is empty."""
    if len(group_targets) == 0:
        print(f'No {group_name} targets')
        return _nan_list(n_categories), _nan_list(n_categories)
    return category_performance(group_targets, group_outputs)


for i in range(len(history['val'])):
    print(i)
    # Rows of this fold's validation split; positions come from the saved history.
    testdata = train.iloc[history['val'][i]].copy()
    testdata = testdata.reset_index(drop=True)

    # Categories: the six label columns sit at positions 26..31.
    label_columns = testdata.columns[26:32]
    n_categories = len(label_columns)
    testdata['list'] = testdata[label_columns].values.tolist()

    # drop=True keeps the column layout identical to testdata, so no shifted
    # column offsets are needed for the subgroups.
    male = testdata[testdata['Gender']==0].reset_index(drop=True)
    female = testdata[testdata['Gender']==1].reset_index(drop=True)

    cv_model = MBERTClass()
    cv_model.load_state_dict(history['models'][i])
    # eval_total moves every batch to `device`; the model must live there too,
    # otherwise inference fails with a device mismatch when CUDA is available.
    cv_model.to(device)

    test_loader = _make_loader(testdata)
    male_test_loader = _make_loader(male)
    female_test_loader = _make_loader(female)

    targets, outputs, accuracy, f1_score_micro, f1_score_macro, f1_score_weighted, precision_macro, recall_macro = eval_total(cv_model, test_loader)
    male_targets, male_outputs, male_accuracy, male_f1_score_micro, male_f1_score_macro, male_f1_score_weighted, male_precision_macro, male_recall_macro = eval_total(cv_model, male_test_loader)
    female_targets, female_outputs, female_accuracy, female_f1_score_micro, female_f1_score_macro, female_f1_score_weighted, female_precision_macro, female_recall_macro = eval_total(cv_model, female_test_loader)

    # An empty subgroup yields NaN placeholders. Previously a bare `except`
    # swallowed the failure and the values of the *previous* fold were
    # appended again (or a NameError was raised on the first fold).
    all_accuracy, all_f1_score = _category_scores(targets, outputs, n_categories, 'overall')
    male_all_accuracy, male_all_f1_score = _category_scores(male_targets, male_outputs, n_categories, 'male')
    female_all_accuracy, female_all_f1_score = _category_scores(female_targets, female_outputs, n_categories, 'female')

    if len(male_targets) > 0 and len(female_targets) > 0:
        (all_equal_opp, all_fp_diff, male_TP_rate, male_FP_rate, female_TP_rate,
         female_FP_rate, P_all, m_P_all, f_P_all) = category_equal_odds(
            male_targets, female_targets, male_outputs, female_outputs)
    else:
        # Fairness ratios need both groups; record NaN for this fold instead.
        (all_equal_opp, all_fp_diff, male_TP_rate, male_FP_rate, female_TP_rate,
         female_FP_rate, P_all, m_P_all, f_P_all) = (_nan_list(n_categories) for _ in range(9))

    all_targets.append(targets)
    all_outputs.append(outputs)
    
    performance['prec'].append(precision_macro)
    performance['recall'].append(recall_macro)
    performance['category_acc'].append(all_accuracy)
    performance['category_f1'].append(all_f1_score)

    performance['m_acc'].append(male_accuracy)
    performance['m_f1_macro'].append(male_f1_score_macro)
    performance['m_f1_micro'].append(male_f1_score_micro)
    performance['m_f1_weighted'].append(male_f1_score_weighted)
    performance['m_prec'].append(male_precision_macro)
    performance['m_recall'].append(male_recall_macro)
    performance['m_category_acc'].append(male_all_accuracy)
    performance['m_category_f1'].append(male_all_f1_score)

    performance['f_acc'].append(female_accuracy)
    performance['f_f1_macro'].append(female_f1_score_macro)
    performance['f_f1_micro'].append(female_f1_score_micro)
    performance['f_f1_weighted'].append(female_f1_score_weighted)
    performance['f_prec'].append(female_precision_macro)
    performance['f_recall'].append(female_recall_macro)
    performance['f_category_acc'].append(female_all_accuracy)
    performance['f_category_f1'].append(female_all_f1_score)

    performance['category_equ_opp'].append(all_equal_opp)
    performance['category_fpr'].append(all_fp_diff)

    performance['Male_TP'].append(male_TP_rate)
    performance['Male_FP'].append(male_FP_rate)
    performance['Female_TP'].append(female_TP_rate)
    performance['Female_FP'].append(female_FP_rate)
    performance['P'].append(P_all)
    performance['m_P'].append(m_P_all)
    performance['f_P'].append(f_P_all)

0


Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Score = 0.4605809128630705
F1 Score (Micro) = 0.6095238095238096
F1 Score (Macro) = 0.448161318521527
F1 Score (Weighted) = 0.6043637490852081
Precision (Macro) = 0.4932332988598973
Recall (Macro) = 0.42898255398255386
Accuracy Score = 0.8360655737704918
F1 Score (Micro) = 0.6470588235294118
F1 Score (Macro) = 0.3333333333333333
F1 Score (Weighted) = 0.6296296296296295
Precision (Macro) = 0.34722222222222215
Recall (Macro) = 0.3222222222222222
Accuracy Score = 0.3333333333333333
F1 Score (Micro) = 0.6069246435845214
F1 Score (Macro) = 0.44654291519045614
F1 Score (Weighted) = 0.6022118270061299
Precision (Macro) = 0.495436875038114
Recall (Macro) = 0.42631613175950306
[1, 0, 0, 1, 0, 0]

Category:  Affective
Accuracy Score = 0.7178423236514523
F1 Score = 0.7017543859649122

Category:  Motivational
Accuracy Score = 0.8464730290456431
F1 Score = 0.5842696629213483

Category:  Cognitive
Accuracy Score = 0.9253112033195021
F1 Score = 0.3076923076923077

Category:  Cog_distortions


Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Score = 0.835820895522388
F1 Score (Micro) = 0.3783783783783784
F1 Score (Macro) = 0.18055555555555555
F1 Score (Weighted) = 0.38636363636363635
Precision (Macro) = 0.1323529411764706
Recall (Macro) = 0.28571428571428575
Accuracy Score = 0.8524590163934426
F1 Score (Micro) = 0.5833333333333334
F1 Score (Macro) = 0.23137254901960783
F1 Score (Weighted) = 0.5197860962566846
Precision (Macro) = 0.19444444444444442
Recall (Macro) = 0.28571428571428575
Accuracy Score = 0.821917808219178
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0
F1 Score (Weighted) = 0.0
Precision (Macro) = 0.0
Recall (Macro) = 0.0
[1, 0, 0, 0, 0, 0]

Category:  Affective
Accuracy Score = 0.8955223880597015
F1 Score = 0.4166666666666667

Category:  Motivational
Accuracy Score = 0.9626865671641791
F1 Score = 0.0

Category:  Cognitive
Accuracy Score = 1.0
F1 Score = 0.0

Category:  Cog_distortions
Accuracy Score = 0.9925373134328358
F1 Score = 0.0

Category:  Behavioral
Accuracy Score = 0.9850746268656716
F1 Score

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Score = 0.8194444444444444
F1 Score (Micro) = 0.6923076923076923
F1 Score (Macro) = 0.5944444444444444
F1 Score (Weighted) = 0.6935897435897436
Precision (Macro) = 0.6722222222222222
Recall (Macro) = 0.5727513227513227
Accuracy Score = 0.9347826086956522
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0
F1 Score (Weighted) = 0.0
Precision (Macro) = 0.0
Recall (Macro) = 0.0
Accuracy Score = 0.6153846153846154
F1 Score (Micro) = 0.7346938775510204
F1 Score (Macro) = 0.6103174603174603
F1 Score (Weighted) = 0.7265567765567766
Precision (Macro) = 0.6944444444444443
Recall (Macro) = 0.5727513227513227
[0, 1, 0, 0, 0, 0]

Category:  Affective
Accuracy Score = 0.8888888888888888
F1 Score = 0.6666666666666666

Category:  Motivational
Accuracy Score = 0.9583333333333334
F1 Score = 0.6666666666666666

Category:  Cognitive
Accuracy Score = 1.0
F1 Score = 0.0

Category:  Cog_distortions
Accuracy Score = 0.9583333333333334
F1 Score = 0.4

Category:  Behavioral
Accuracy Score = 0.97222222222222

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Score = 0.8604651162790697
F1 Score (Micro) = 0.6129032258064516
F1 Score (Macro) = 0.4348583877995642
F1 Score (Weighted) = 0.5857895846510647
Precision (Macro) = 0.4473684210526316
Recall (Macro) = 0.48095238095238096
Accuracy Score = 0.3684210526315789
F1 Score (Micro) = 0.6785714285714286
F1 Score (Macro) = 0.47301587301587295
F1 Score (Weighted) = 0.6556067588325654
Precision (Macro) = 0.561111111111111
Recall (Macro) = 0.48095238095238096
Accuracy Score = 0.9454545454545454
F1 Score (Micro) = 0.0
F1 Score (Macro) = 0.0
F1 Score (Weighted) = 0.0
Precision (Macro) = 0.0
Recall (Macro) = 0.0
[1, 1, 1, 0, 0, 0]

Category:  Affective
Accuracy Score = 0.937984496124031
F1 Score = 0.7647058823529411

Category:  Motivational
Accuracy Score = 0.9534883720930233
F1 Score = 0.4

Category:  Cognitive
Accuracy Score = 0.9689922480620154
F1 Score = 0.3333333333333333

Category:  Cog_distortions
Accuracy Score = 0.9612403100775194
F1 Score = 0.4444444444444444

Category:  Behavioral
Ac

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Score = 0.5529411764705883
F1 Score (Micro) = 0.5862068965517241
F1 Score (Macro) = 0.3842072157122618
F1 Score (Weighted) = 0.5641176914068996
Precision (Macro) = 0.4296743571256909
Recall (Macro) = 0.3636166832324328
Accuracy Score = 0.42424242424242425
F1 Score (Micro) = 0.5675675675675675
F1 Score (Macro) = 0.35714285714285715
F1 Score (Weighted) = 0.5034632034632035
Precision (Macro) = 0.4557017543859649
Recall (Macro) = 0.3229166666666667
Accuracy Score = 0.583941605839416
F1 Score (Micro) = 0.5925925925925926
F1 Score (Macro) = 0.3914562443486997
F1 Score (Weighted) = 0.5834538473245097
Precision (Macro) = 0.4134944213375586
Recall (Macro) = 0.3864219114219114
[0, 0, 0, 1, 0, 0]

Category:  Affective
Accuracy Score = 0.7941176470588235
F1 Score = 0.7286821705426356

Category:  Motivational
Accuracy Score = 0.8529411764705882
F1 Score = 0.5454545454545454

Category:  Cognitive
Accuracy Score = 0.9352941176470588
F1 Score = 0.0

Category:  Cog_distortions
Accuracy Score =

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Score = 0.8070175438596491
F1 Score (Micro) = 0.7169811320754716
F1 Score (Macro) = 0.5435045574580458
F1 Score (Weighted) = 0.6875877610761332
Precision (Macro) = 0.6468253968253969
Recall (Macro) = 0.5537878787878788
Accuracy Score = 0.4864864864864865
F1 Score (Micro) = 0.7272727272727273
F1 Score (Macro) = 0.5539682539682539
F1 Score (Weighted) = 0.6957771787960468
Precision (Macro) = 0.6676406926406927
Recall (Macro) = 0.5482323232323232
Accuracy Score = 0.961038961038961
F1 Score (Micro) = 0.5714285714285714
F1 Score (Macro) = 0.13333333333333333
F1 Score (Weighted) = 0.8
Precision (Macro) = 0.1111111111111111
Recall (Macro) = 0.16666666666666666
[1, 1, 1, 1, 1, 0]

Category:  Affective
Accuracy Score = 0.9210526315789473
F1 Score = 0.7906976744186046

Category:  Motivational
Accuracy Score = 0.956140350877193
F1 Score = 0.6153846153846154

Category:  Cognitive
Accuracy Score = 0.956140350877193
F1 Score = 0.2857142857142857

Category:  Cog_distortions
Accuracy Score = 0

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Score = 0.7794117647058824
F1 Score (Micro) = 0.6666666666666666
F1 Score (Macro) = 0.48594771241830065
F1 Score (Weighted) = 0.6423423423423423
Precision (Macro) = 0.6589635854341737
Recall (Macro) = 0.4859477124183007
Accuracy Score = 0.7794117647058824
F1 Score (Micro) = 0.6666666666666666
F1 Score (Macro) = 0.48594771241830065
F1 Score (Weighted) = 0.6423423423423423
Precision (Macro) = 0.6589635854341737
Recall (Macro) = 0.4859477124183007
Accuracy Score = nan
F1 Score (Micro) = 0.0
F1 Score (Macro) = nan
F1 Score (Weighted) = nan
Precision (Macro) = nan
Recall (Macro) = nan
[1, 0, 0, 0, 0, 0]

Category:  Affective
Accuracy Score = 0.9411764705882353
F1 Score = 0.8823529411764706

Category:  Motivational
Accuracy Score = 0.9705882352941176
F1 Score = 0.6666666666666666

Category:  Cognitive
Accuracy Score = 0.9705882352941176
F1 Score = 0.5

Category:  Cog_distortions
Accuracy Score = 0.9411764705882353
F1 Score = 0.0

Category:  Behavioral
Accuracy Score = 0.897058823529

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Score = 0.7261904761904762
F1 Score (Micro) = 0.6976744186046512
F1 Score (Macro) = 0.5519841269841269
F1 Score (Weighted) = 0.6782446311858078
Precision (Macro) = 0.6121794871794872
Recall (Macro) = 0.5108858858858859
Accuracy Score = 0.7261904761904762
F1 Score (Micro) = 0.6976744186046512
F1 Score (Macro) = 0.5519841269841269
F1 Score (Weighted) = 0.6782446311858078
Precision (Macro) = 0.6121794871794872
Recall (Macro) = 0.5108858858858859
Accuracy Score = nan
F1 Score (Micro) = 0.0
F1 Score (Macro) = nan
F1 Score (Weighted) = nan
Precision (Macro) = nan
Recall (Macro) = nan
[1, 0, 0, 0, 0, 0]

Category:  Affective
Accuracy Score = 0.8214285714285714
F1 Score = 0.7619047619047619

Category:  Motivational
Accuracy Score = 0.9761904761904762
F1 Score = 0.75

Category:  Cognitive
Accuracy Score = 0.9642857142857143
F1 Score = 0.0

Category:  Cog_distortions
Accuracy Score = 0.9642857142857143
F1 Score = 0.4

Category:  Behavioral
Accuracy Score = 0.9642857142857143
F1 Score = 

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Score = 0.6744186046511628
F1 Score (Micro) = 0.6363636363636364
F1 Score (Macro) = 0.3680555555555556
F1 Score (Weighted) = 0.6412037037037037
Precision (Macro) = 0.4086586586586587
Recall (Macro) = 0.36798941798941803
Accuracy Score = 0.5333333333333333
F1 Score (Micro) = 0.7333333333333333
F1 Score (Macro) = 0.27619047619047615
F1 Score (Weighted) = 0.7193277310924371
Precision (Macro) = 0.3148148148148148
Recall (Macro) = 0.24621212121212122
Accuracy Score = 0.704225352112676
F1 Score (Micro) = 0.6
F1 Score (Macro) = 0.3458689458689459
F1 Score (Weighted) = 0.6127512127512127
Precision (Macro) = 0.3630952380952381
Recall (Macro) = 0.37400793650793646
[1, 0, 0, 0, 0, 0]

Category:  Affective
Accuracy Score = 0.7906976744186046
F1 Score = 0.75

Category:  Motivational
Accuracy Score = 0.9418604651162791
F1 Score = 0.0

Category:  Cognitive
Accuracy Score = 0.9767441860465116
F1 Score = 0.5

Category:  Cog_distortions
Accuracy Score = 0.9302325581395349
F1 Score = 0.625

Cate

In [99]:
# Flatten the per-fold results into one row-per-sample list each.
# Assigning (rather than appending to the existing lists) keeps this cell
# idempotent: re-running it no longer duplicates every row.
performance['targets'] = list(chain.from_iterable(all_targets))
performance['outputs'] = list(chain.from_iterable(all_outputs))

In [104]:
#creating file with the fairness measures
#with open('CV_category_perf..p', 'wb') as fp:
#    pickle.dump(performance, fp, protocol=pickle.HIGHEST_PROTOCOL)