In [3]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl (250 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/250.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wandb>=0.10.32 (from sim

In [4]:
# Import all the library that is necessary for analysis
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from argparse import ArgumentParser
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, train_test_split
from scipy.stats import pearsonr, spearmanr
import warnings
import pandas as pd
from sys import exit
import logging
import torch
warnings.filterwarnings("ignore")
##########################################

##### Code

In [3]:
#!pip install huggingface_hub



In [1]:
#from huggingface_hub import login
#login("")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
# Define functions that is necessary for analysis
def pearson_corr(preds, labels):
    return pearsonr(preds, labels)[0]

def spearman_corr(preds, labels):
    return spearmanr(preds, labels)[0]

def accuracy(preds, labels):
    return sum([p == l for p, l in zip(preds, labels)]) /len(labels)

def precision(preds, labels):
    return precision_score(y_true=labels, y_pred=preds)

def recall(preds, labels):
    return recall_score(y_true=labels, y_pred=preds)

def f1(preds, labels):
    return f1_score(y_true=labels, y_pred=preds)

In [6]:
#First fine-tuning roberta batch=8
def train(colname, train_df, eval_df, text_cols,
          output_dir, model="roberta", num_labels=2,
          num_train_epochs=5,
          train_batch_size=8, gradient_accumulation_steps=2,
          max_seq_length=512,
          cross_validate=False,
          balance_labels=True):
    print("Train size: %d" % len(train_df))
    print("Eval size: %d" % len(eval_df))

    print(train_df.head())
    print(eval_df.head())

    print("Is CUDA available? " + str(torch.cuda.is_available()))

    if balance_labels:
        most_common = train_df["labels"].value_counts().idxmax()
        print("Most common label is: %s" % most_common)
        most_common_df = train_df[train_df["labels"]==most_common]
        concat_list = [most_common_df]
        for label, group in train_df[train_df["labels"]!=most_common].groupby("labels"):
            concat_list.append(group.sample(replace=True, n=len(most_common_df)))
        train_df = pd.concat(concat_list)
        print("Train size: %d" % len(train_df))
        print(train_df["labels"].value_counts())

    # Shuffle training data
    train_df = train_df.sample(frac=1)
    save_dir = output_dir + "/" + colname + "_train_size=" + str(len(train_df))

    model_args = ClassificationArgs()
    model_args.reprocess_input_data = True
    model_args.overwrite_output_dir = True
    model_args.evaluate_during_training = True  # change if needed
    model_args.max_seq_length = int(max_seq_length / len(text_cols))
    model_args.num_train_epochs = num_train_epochs #
    model_args.evaluate_during_training_steps = int(len(train_df) / train_batch_size) # after each epoch
    model_args.save_steps = int(len(train_df) / train_batch_size)# added
    model_args.save_eval_checkpoints = True
    model_args.save_model_every_epoch = True
    model_args.wandb_project = colname
    model_args.train_batch_size = train_batch_size #
    model_args.output_dir = save_dir
    model_args.best_model_dir = save_dir +"/best_model"
    model_args.cache_dir = save_dir + "/cache"
    model_args.tensorboard_dir = save_dir + "/tensorboard"
    model_args.regression = num_labels == 1
    model_args.gradient_accumulation_steps = gradient_accumulation_steps
    model_args.wandb_kwargs = {"reinit": True}
    model_args.fp16 = False
    model_args.fp16_opt_level = "O0"
    model_args.no_cache = False
    #model_args.no_save = cross_validate
    model_args.save_optimizer_and_scheduler = True
    model_args.metric_for_best_model = 'eval_f1'


    model = ClassificationModel(model.split("-")[0], model,
                                use_cuda=torch.cuda.is_available(),
                                num_labels=num_labels,
                                args=model_args)

    model.train_model(train_df,
                      eval_df=eval_df,
                      accuracy=accuracy,
                      precision=precision,
                      recall=recall,
                      f1=f1,
                      args={"use_multiprocessing": False,
                            "process_count": 1,
                            "use_multiprocessing_for_evaluation": False,
                              "push_to_hub" : True},)
    return model

def predict(fname, model_path, model=None,
            model_type="roberta-base", predict_list=None,
          index_list=None, index_colname="index"):

    print(model_path)

    if model is None:
        model = ClassificationModel(
            model_type.split("-")[0], model_path
        )

    preds, outputs = model.predict(predict_list)
    with open(model_path + '/' + fname + '_preds.txt', 'w') as f:
        f.write(f"{index_colname}\tpred\n")
        for index, pred in zip(index_list, preds):
            f.write(f"{index}\t{pred}\n")

In [None]:
#data preparation for focusing question
#df = pd.read_csv('paired_annotations.csv').sample(frac=1)
#split 20% of data as testing data. Test data following the same distribution with training data.
#train_data_fq, test_data_fq = train_test_split(df, test_size=0.2, random_state=42, stratify=df['focusing_question'])
#test_data_fq.to_csv('test_data_fq.csv', index=False)
#train_data_fq.to_csv('train_data_fq.csv', index=False)

In [8]:
train_data_fq = pd.read_csv('train_data_fq.csv').sample(frac=1)
train_data = train_data_fq[~train_data_fq['focusing_question'].isnull()] #change to high_uptake later
model_type = 'roberta-base'#model selected
text_cols = 'student_text,teacher_text'.split(",") #seperate students text and teachers text
output_dir = 'ncte_roberta_8'
model = None
train_data = train_data.rename(columns={text_cols[0]: 'text_a',
                                                    text_cols[1]: 'text_b',
                                                    'focusing_question': 'labels'})
cols = ['text_a', 'text_b','labels']

In [9]:
n = 5
kf = KFold(n_splits=n, random_state=42, shuffle=True)
k = 0

for train_index, val_index in kf.split(train_data):
    print("Split %d" % k)
    output_dir_k = output_dir + "/" + 'focusing_question' + "_k%d" % k

    train_df = train_data.iloc[train_index] #traning data
    eval_df = train_data.iloc[val_index] #validation data

    model = train('focusing_question', train_df, eval_df, text_cols, output_dir=output_dir_k,
                  model=model_type, num_train_epochs=5, balance_labels=True,
                  cross_validate=True)

    predict_list = eval_df[["text_a", "text_b"]].values.tolist()#

    index_list = eval_df[["text_a", "text_b", "labels"]].values.tolist()#updated

    fname = 'focusing_question' + "_" + 'paired_annotations'+ "_split_%d" % k

    predict(fname,
            output_dir_k,
            model,
            model_type,
            predict_list=predict_list,
            index_list=index_list,
            index_colname='index')


    k += 1
    train_df, eval_df = train_test_split(train_data, test_size=0.2)

#model = train('focusing_question', train_df, eval_df, text_cols, output_dir, model_type, num_train_epochs=5, balance_labels=True)


Split 0
Train size: 1502
Eval size: 376
     exchange_idx  OBSID                                             text_a  \
1107     2661_153   2661      I’m gonna go with the International Dateline.   
765       208_108    208                Don’t we never add the denominator?   
439        248_83    248               ‘Cause there’s already a zero there?   
1785      4426_59   4426  You have to make a group of four fourths and t...   
1726      189_316    189                                 I didn’t get that.   

                                                 text_b  student_on_task  \
1107  The International Dateline.  Good.  As we’re g...                1   
765   We don’t add denominators, and we’re gonna thi...                1   
439                       Okay.  What else?  Student B?                1   
1785  Okay.  So you’re saying that seven fourths cou...                1   
1726  Now this is – I’m saying this assuming that wa...                1   

      teacher_on_task  high_

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 5:   0%|          | 0/318 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/318 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/318 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/318 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/318 [00:00<?, ?it/s]

ncte_roberta_8/focusing_question_k0


  0%|          | 0/47 [00:00<?, ?it/s]

Split 1
Train size: 1502
Eval size: 376
     exchange_idx  OBSID                                             text_a  \
1107     2661_153   2661      I’m gonna go with the International Dateline.   
765       208_108    208                Don’t we never add the denominator?   
439        248_83    248               ‘Cause there’s already a zero there?   
1785      4426_59   4426  You have to make a group of four fourths and t...   
1726      189_316    189                                 I didn’t get that.   

                                                 text_b  student_on_task  \
1107  The International Dateline.  Good.  As we’re g...                1   
765   We don’t add denominators, and we’re gonna thi...                1   
439                       Okay.  What else?  Student B?                1   
1785  Okay.  So you’re saying that seven fourths cou...                1   
1726  Now this is – I’m saying this assuming that wa...                1   

      teacher_on_task  high_

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Training loss,▅▃▂█▂▁▁▁▁▁▁▁▁▁▁
accuracy,▁▇▇████
auprc,█▆▆▁▂▂▁
auroc,█▄▄▃▁▁▁
eval_loss,▁▃▃▆▆▆█
f1,▁▆▆▅██▇
fn,▁▇▇███▇
fp,█▂▂▁▁▁▁
global_step,▁▁▂▂▂▃▃▄▄▄▄▅▅▅▆▆▇▇▇▇██
lr,██▇▇▆▆▅▅▄▃▃▂▂▂▁

0,1
Training loss,0.00013
accuracy,0.84574
auprc,0.43928
auroc,0.79998
eval_loss,1.22863
f1,0.5
fn,28.0
fp,30.0
global_step,795.0
lr,0.0


Running Epoch 0 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

ncte_roberta_8/focusing_question_k1


  0%|          | 0/47 [00:00<?, ?it/s]

Split 2
Train size: 1502
Eval size: 376
     exchange_idx  OBSID                                             text_a  \
765       208_108    208                Don’t we never add the denominator?   
1726      189_316    189                                 I didn’t get that.   
1312      4490_54   4490                             Number eight is $4.35.   
716      2541_452   2541                              I’m on twelve, right?   
1670      678_188    678  Because I started writing like that and I didn...   

                                                 text_b  student_on_task  \
765   We don’t add denominators, and we’re gonna thi...                1   
1726  Now this is – I’m saying this assuming that wa...                1   
1312   That is correct, $4.35.  Number nine, Student B?                1   
716           No, you did – what did you do, Student K?                0   
1670  Okay.  The good news is that you figured out t...                1   

      teacher_on_task  high_

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Training loss,█▇▄▃▂▁▇▁▁▁▁▁▁▁▁
accuracy,▁▆▆▆██▇
auprc,▂██▁▅▅▄
auroc,▁▂▂▂▇▇█
eval_loss,▁▄▄▇▇▇█
f1,▂██▁██▄
fn,▄▁▁█▂▂▅
fp,█▅▅▁▃▃▂
global_step,▁▁▂▂▂▃▃▄▄▄▄▅▅▅▆▆▇▇▇▇██
lr,██▇▇▆▆▅▅▄▃▃▂▂▂▁

0,1
Training loss,0.00019
accuracy,0.8484
auprc,0.50665
auroc,0.82554
eval_loss,1.10756
f1,0.47706
fn,33.0
fp,24.0
global_step,795.0
lr,0.0


Running Epoch 0 of 5:   0%|          | 0/320 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/320 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/320 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/320 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/320 [00:00<?, ?it/s]

ncte_roberta_8/focusing_question_k2


  0%|          | 0/47 [00:00<?, ?it/s]

Split 3
Train size: 1503
Eval size: 375
     exchange_idx  OBSID                                             text_a  \
1107     2661_153   2661      I’m gonna go with the International Dateline.   
765       208_108    208                Don’t we never add the denominator?   
439        248_83    248               ‘Cause there’s already a zero there?   
1785      4426_59   4426  You have to make a group of four fourths and t...   
831      1098_303   1098  This is the entry.  And that’s the attic.  And...   

                                                 text_b  student_on_task  \
1107  The International Dateline.  Good.  As we’re g...                1   
765   We don’t add denominators, and we’re gonna thi...                1   
439                       Okay.  What else?  Student B?                1   
1785  Okay.  So you’re saying that seven fourths cou...                1   
831      Now, this is the outside of your house, right?                1   

      teacher_on_task  high_

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Training loss,█▃▃▁▂▂▁▅▁▁▁▁▁▁▁▁
accuracy,▁▆▅▂▅▅█
auprc,███▅▁▁▁
auroc,█▅▅▄▂▂▁
eval_loss,▁▃▂▇██▇
f1,█▇▅▃▁▁▁
fn,▁▅▅▄▆▆█
fp,█▄▄▆▃▃▁
global_step,▁▁▂▂▂▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇███
lr,██▇▇▆▆▅▅▄▄▃▃▂▂▁▁

0,1
Training loss,0.00015
accuracy,0.83777
auprc,0.43503
auroc,0.73327
eval_loss,1.38452
f1,0.45045
fn,39.0
fp,22.0
global_step,800.0
lr,0.0


Running Epoch 0 of 5:   0%|          | 0/317 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/317 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/317 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/317 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/317 [00:00<?, ?it/s]

ncte_roberta_8/focusing_question_k3


  0%|          | 0/47 [00:00<?, ?it/s]

Split 4
Train size: 1503
Eval size: 375
     exchange_idx  OBSID                                             text_a  \
1107     2661_153   2661      I’m gonna go with the International Dateline.   
439        248_83    248               ‘Cause there’s already a zero there?   
1785      4426_59   4426  You have to make a group of four fourths and t...   
1726      189_316    189                                 I didn’t get that.   
831      1098_303   1098  This is the entry.  And that’s the attic.  And...   

                                                 text_b  student_on_task  \
1107  The International Dateline.  Good.  As we’re g...                1   
439                       Okay.  What else?  Student B?                1   
1785  Okay.  So you’re saying that seven fourths cou...                1   
1726  Now this is – I’m saying this assuming that wa...                1   
831      Now, this is the outside of your house, right?                1   

      teacher_on_task  high_

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Training loss,█▃▄▁▂▇▁▁▁▁▁▁▁▁▁
accuracy,█▁▁█▇▇▆
auprc,█▃▃▁▄▄▃
auroc,▅██▂▁▁▁
eval_loss,▁▃▃▅███
f1,▅▄▄▃██▁
fn,▇▁▁█▅▅▇
fp,▂██▁▃▃▂
global_step,▁▁▂▂▂▃▃▄▄▄▄▅▅▅▆▆▇▇▇▇██
lr,█▇▇▆▆▆▅▄▄▃▃▂▂▂▁

0,1
Training loss,0.0005
accuracy,0.864
auprc,0.43942
auroc,0.70546
eval_loss,0.99087
f1,0.42697
fn,30.0
fp,21.0
global_step,790.0
lr,0.0


Running Epoch 0 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

ncte_roberta_8/focusing_question_k4


  0%|          | 0/47 [00:00<?, ?it/s]

Save the best model to huggingface

In [11]:
from transformers import AutoModel, AutoTokenizer, pipeline

#the path to the specific model you want to save
model_name = "/content/ncte_roberta_8/focusing_question_k2/focusing_question_train_size=2558/best_model"
model = AutoModel.from_pretrained("/content/ncte_roberta_8/focusing_question_k2/focusing_question_train_size=2558/best_model")  # Load your pre-trained model
tokenizer = AutoTokenizer.from_pretrained("/content/ncte_roberta_8/focusing_question_k2/focusing_question_train_size=2558/best_model")  # Load your tokenizer

pipeline("text-classification", model=model, tokenizer=tokenizer)  # Required to initialize the repository

repo_name = "ncte_roberta_8"

# Create a new repository
model.push_to_hub(
    repo_name,
    private=True,  # Set to True if you want to make the repository private
    use_filename=True
)
tokenizer.push_to_hub(repo_name, private=True)

Some weights of RobertaModel were not initialized from the model checkpoint at /content/ncte_roberta_8/focusing_question_k2/focusing_question_train_size=2558/best_model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'RobertaModel' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'D

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/baldwin23/ncte_roberta_8/commit/80afcf86da560cff8868eae3a86ea79c62051f61', commit_message='Upload tokenizer', commit_description='', oid='80afcf86da560cff8868eae3a86ea79c62051f61', pr_url=None, pr_revision=None, pr_num=None)

Test the model with the testing data

In [12]:
# load and organize the test data
test_data_fq = pd.read_csv('test_data_fq.csv').sample(frac=1)
test_data = test_data_fq[~test_data_fq['focusing_question'].isnull()]
text_cols = 'student_text,teacher_text'.split(",") #seperate students text and teachers text
test_data = test_data.rename(columns={text_cols[0]: 'text_a',
                                                    text_cols[1]: 'text_b',
                                                    'focusing_question': 'labels'})
cols = ['text_a', 'text_b','labels']

In [15]:
# the path to the best model checkpoint
best_model_path = "/content/ncte_roberta_8/focusing_question_k2/focusing_question_train_size=2558/best_model" #from colab
output_dir_k = 'ncte_roberta_8'
# Load the model from the checkpoint
model = ClassificationModel('roberta', best_model_path, use_cuda=torch.cuda.is_available())

predict_list = test_data[["text_a", "text_b"]].values.tolist()

index_list = test_data[["text_a", "text_b", "labels"]].values.tolist()

predict('best_model',
            output_dir_k,
            model,
            model_type,
            predict_list=predict_list,
            index_list=index_list,
            index_colname='index') #predict the outcome with test data

ncte_roberta_8


  0%|          | 0/59 [00:00<?, ?it/s]

In [19]:
df_predictions = pd.read_csv("/content/ncte_roberta_8/best_model_preds.txt", sep='\t')
#df_predictions.head()
print("*F1 = ", f1(df_predictions['pred'], test_data['labels']))
print("accuracy = ", accuracy(df_predictions['pred'], test_data['labels']))
print("precision = ", precision(df_predictions['pred'], test_data['labels']))
print("recall = ", recall(df_predictions['pred'], test_data['labels']))

*F1 =  0.4830917874396135
accuracy =  0.7723404255319148
precision =  0.37037037037037035
recall =  0.6944444444444444
