Reinforcement Learning for Large Language Models

Winter 23/24 Semester

Final Group Project

Kateryna Smykovska, Jakob Schmitter, Suvi Lehtosalo, Megan Horikawa

This notebook was made by Megan Horikawa

7b Chat - https://colab.research.google.com/drive/1D6Sw8CzEnVXWA1DoX-4lx9XJe3u6XIym?usp=sharing

GITHUB : https://github.com/cpllab/syntaxgym-core  

Dataset page on HF:https://huggingface.co/datasets/cpllab/syntaxgym

Webpage: https://cpllab.github.io/syntaxgym-core/quickstart.html   

Metric Card: https://huggingface.co/spaces/cpllab/syntaxgym


In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers
!pip install accelerate
!pip install bitsandbytes
!pip install sentencepiece
!pip install torch
!pip install datasets
!pip install evaluate
!pip install huggingface_hub


Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl (102.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.0
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [

In [None]:
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer
import torch

import datasets
import evaluate
import numpy as np
import pandas as pd
import json
import re

from google.colab import userdata
my_secret_key = userdata.get('HF_TOKEN')

In [None]:
# initalize Model

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf",token=my_secret_key)

model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_4bit=True,
    device_map="auto",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    token=my_secret_key
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [None]:
# build the syntax gym dataset
import json
from datasets import load_dataset
import numpy as np


# use datasets for the following tasks: Subordination, Subordination orc-orc, Reflexive prep fem& masc,cleft, center_embeding,

subordination_dataset = load_dataset("cpllab/syntaxgym", "subordination", split='test', trust_remote_code=True)

subordination_orc_dataset = load_dataset("cpllab/syntaxgym", "subordination_orc-orc", split='test', trust_remote_code=True)

center_embed_dataset = load_dataset("cpllab/syntaxgym", "center_embed",split='test', trust_remote_code=True)

reflexive_prep_fem_dataset = load_dataset("cpllab/syntaxgym", "reflexive_prep_fem", split='test', trust_remote_code=True)

reflexive_prep_masc_dataset = load_dataset("cpllab/syntaxgym", "reflexive_prep_masc", split='test', trust_remote_code=True)

cleft_dataset = load_dataset("cpllab/syntaxgym", "cleft", split='test', trust_remote_code=True)

# make a list of the datasets to iterate through?

dataset_list = []

dataset_list.append(subordination_dataset)
dataset_list.append(subordination_orc_dataset)
dataset_list.append(center_embed_dataset)
dataset_list.append(reflexive_prep_fem_dataset)
dataset_list.append(reflexive_prep_masc_dataset)
dataset_list.append(cleft_dataset)




In [None]:
# define log likelihood function (taken from homework with small changes for llama)

def get_log_prob_of_completion(
        model,
        tokenizer,
        prompt,
        completion,
        device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
):
        """
        Convenience function for computing the log probability of a completion
        given a prompt.
        """
        # tokenize the prompt and the completion
        # truncate so as to fit into to maximal context window of llama2
        # which is 2048 tokens
        input_ids = tokenizer(
                prompt + completion,
                return_tensors='pt',
                truncation=True,
                max_length=2048,
        )['input_ids'].to(device)

        # separately tokenize prompt
        # so as to access the logits for the completion only
        # when scoring the completion
        input_ids_prompt = tokenizer(
                prompt,
                return_tensors='pt',
                truncation=True,
                max_length=2048
        )['input_ids'].to(device)

        # create attention mask and position ids
        attention_mask = (input_ids != tokenizer.eos_token_id).to(dtype=torch.int64)
        position_ids = attention_mask.cumsum(-1)-1
        # get the logits for the completion
        with torch.no_grad():
                out = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        position_ids=position_ids
                )

        # get the logits of the completion
        # for that, make a tensor of the logits
        # for the completion only
        # in particular, we shift the indices by one to the left to access logits of the
        # actual sequence tokens
        logits_completion = out.logits[:, :-1]
        logits_completion = logits_completion.squeeze()
        # get the log probabilities for the completion
        log_probs = torch.nn.functional.log_softmax(
                logits_completion,
                dim=-1
        )
        # retrieve the logit corresponding to the actual completion tokens
        try:
                log_completion_tokens = log_probs.gather(
                        dim=-1,
                        index=input_ids[:, 1:].squeeze().unsqueeze(-1)
                )
        except:
                log_completion_tokens = log_probs.gather(
                        dim=-1,
                        index=input_ids[:, 1:].unsqueeze(-1)
                )

        continuationConditionalLogProbs = log_completion_tokens[
                (input_ids_prompt.shape[-1]-1):
        ]
        completion_log_prob = torch.mean(
                continuationConditionalLogProbs
        ).cpu()

        return completion_log_prob

In [None]:
# initalize lists for the data to be put into a dataframe later

# label of the sentence within the task
condition_names = []
# this is the subdataset name
suite_names = []

prompts = []
completions = []
log_prob_list = []
predictions = []

# index of the item within the dataset this will
item_index_list = []
# item index within the dataset that will increment after each lineitem is processed
item_index = 0

#regex patter for parsing the prediction arguments
regex_pattern = r'\((\d+);%([^%]+)%\)'

# start loop for processing the data in each dataset

for dataset in dataset_list:
  # index value to use to incriment through values in each dataset
  i = 0

  print(f'Currently working on {dataset}')
  # now begin iterating through each entry of the dataset
  while i < len(dataset):
    # parse the data from json format of each individual entry in dataset
    data = json.loads(json.dumps(dataset[i]))
    suite_name = data['suite_name']
    prediction = data['predictions']
    conditions = data['conditions']

    #after stepping into conditions section can pull content
    condition_name= conditions['condition_name']
    content = conditions['content']
    regions = conditions['regions']


    # parse regions of the sentence and return nested list of sentences
    region_list = []
    for item in regions:
      region_list.append(item['content'])

    # parse prediction into tuple of indices and sentence label
    index_condition_name = re.findall(regex_pattern, prediction[0])

    # convert nested sentences list and condition names into a dictionary
    # Keys are the condition names and values are the nested sentences.
    sent_dictionary = {condition_name[i]: region_list[i] for i in range (len(condition_name))}

    # go through first pair in the index_condition_name tuple
    for pair in index_condition_name:

      suite_names.append(suite_name)
      predictions.append(prediction)


      # get index
      index = int(pair[0]) -1
      # get category name
      category_name = pair[1]
      condition_names.append(category_name)


      # retrive the correct sentence from the dictionary
      sentence = sent_dictionary.get(category_name)

      # concatenate a string up until the index to create prompt
      prompt = ""
      completion = sentence[index]
      idx = 0
      while idx < index:
        prompt += " " + sentence[idx]
        idx += 1
      # here the log probability given the prompt and completion would then be calculated and added to a list
      prompts.append(prompt)
      completions.append(completion)
      # log probability function
      log_prob = get_log_prob_of_completion(model, tokenizer, prompt, completion)
      log_prob_list.append(log_prob)

      #add index of item in the dataset
      item_index_list.append(item_index)
      #increment item_index
    item_index += 1
    print(f'task {i} finished')
    #increment i to go to next line item in the dataset
    i+=1

Currently working on Dataset({
    features: ['suite_name', 'item_number', 'conditions', 'predictions'],
    num_rows: 23
})
task 0 finished
task 1 finished
task 2 finished
task 3 finished
task 4 finished
task 5 finished
task 6 finished
task 7 finished
task 8 finished
task 9 finished
task 10 finished
task 11 finished
task 12 finished
task 13 finished
task 14 finished
task 15 finished
task 16 finished
task 17 finished
task 18 finished
task 19 finished
task 20 finished
task 21 finished
task 22 finished
Currently working on Dataset({
    features: ['suite_name', 'item_number', 'conditions', 'predictions'],
    num_rows: 23
})
task 0 finished
task 1 finished
task 2 finished
task 3 finished
task 4 finished
task 5 finished
task 6 finished
task 7 finished
task 8 finished
task 9 finished
task 10 finished
task 11 finished
task 12 finished
task 13 finished
task 14 finished
task 15 finished
task 16 finished
task 17 finished
task 18 finished
task 19 finished
task 20 finished
task 21 finished
task 

In [None]:
# make pandas dataframe

df = pd.DataFrame(list(zip(item_index_list, suite_names,condition_names,prompts ,completions, log_prob_list, predictions)), columns = ['task','suite name', 'condition' , 'prompt','completion','log_prob', 'prediction'])


# next I need to evaluate the predictions.....


df.head(40)



Unnamed: 0,task,suite name,condition,prompt,completion,log_prob,prediction
0,0,subordination,sub_no-matrix,As the doctor studied the book,.,tensor(-5.1427),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...
1,0,subordination,no-sub_no-matrix,The doctor studied the book,.,tensor(-1.8918),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...
2,0,subordination,sub_matrix,As the doctor studied the book,", the nurse walked into the room .",tensor(-2.4924),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...
3,0,subordination,no-sub_matrix,The doctor studied the book,", the nurse walked into the room .",tensor(-3.5332),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...
4,1,subordination,sub_no-matrix,After the man shot the bird,.,tensor(-5.0617),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...
5,1,subordination,no-sub_no-matrix,The man shot the bird,.,tensor(-1.6878),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...
6,1,subordination,sub_matrix,After the man shot the bird,", he loaded his gun .",tensor(-3.2634),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...
7,1,subordination,no-sub_matrix,The man shot the bird,", he loaded his gun .",tensor(-3.7068),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...
8,2,subordination,sub_no-matrix,Because the students did not like the material,.,tensor(-2.2049),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...
9,2,subordination,no-sub_no-matrix,The students did not like the material,.,tensor(-1.3439),[((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%)...


In [None]:
# export to csv

df.to_csv('/content/drive/My Drive/RLProject/SyntaxGym_7b.csv', index=False)

CSV Located here: https://drive.google.com/file/d/1TqoFyd-8RQWPQ39shdzh8TmH8y2gPOh1/view?usp=drive_link

Next will evaluate results based on predictions given in datasets

In [1]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

#import results

Mounted at /content/drive


In [2]:
df = pd.read_csv('/content/drive/My Drive/RLProject/SyntaxGym_7b.csv')
# convert tensors in dataframe

def remove_tensor(df, column_name):
    # Define a regular expression pattern to extract the number inside the parenthesis
    pattern = r'tensor\((-?\d+\.\d+)\)'

    # Extract the number from the column using the defined pattern
    df['temp'] = df[column_name].str.extract(pattern)

    # Replace the 'tensor()' part with an empty string
    df[column_name] = df[column_name].str.replace(r'tensor\((-?\d+\.\d+)\)', r'\1')

    # Drop the temporary column
    df.drop(columns=['temp'], inplace=True)

    return df

df = remove_tensor(df, 'log_prob')

df.head()

  df[column_name] = df[column_name].str.replace(r'tensor\((-?\d+\.\d+)\)', r'\1')


Unnamed: 0,task,suite name,condition,prompt,completion,log_prob,prediction
0,0,subordination,sub_no-matrix,As the doctor studied the book,.,-5.1427,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...
1,0,subordination,no-sub_no-matrix,The doctor studied the book,.,-1.8918,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...
2,0,subordination,sub_matrix,As the doctor studied the book,", the nurse walked into the room .",-2.4924,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...
3,0,subordination,no-sub_matrix,The doctor studied the book,", the nurse walked into the room .",-3.5332,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...
4,1,subordination,sub_no-matrix,After the man shot the bird,.,-5.0617,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...


In [3]:
# make seperate dataframes for each subset since things don't seem to be going well generally moving through df

suite_names = []

subordination_df = df[df['suite name']== 'subordination']
subordination_orc_df = df[df['suite name']== 'subordination_orc-orc']
center_embed_df =df[df['suite name']== 'center_embed']
reflexive_prep_fem_df = df[df['suite name']== 'reflexive_prep_fem']
reflexive_prep_masc_df = df[df['suite name']== 'reflexive_prep_masc']
cleft_df = df[df['suite name']== 'cleft']

suite_names.append(subordination_df)
suite_names.append(subordination_orc_df)
suite_names.append(center_embed_df)
suite_names.append(reflexive_prep_fem_df)
suite_names.append(reflexive_prep_masc_df)
suite_names.append(cleft_df)

# lists to make another df to store evaluatation of accuracy.
suite_name_list_df = []
prediction_list_df = []
results_list_df =[]


# make a method to take in the task dataframe and return true or false depending on prediction. The prediction for each condition is the same for each task
def subordination_eval(task_df):
  # [(sub_no-matrix > no-sub_no-matrix) & (sub_matrix < no-sub_matrix)]
  log_prob_sub_no_matrix = task_df.loc[task_df['condition'] == 'sub_no-matrix', 'log_prob'].iloc[0]
  log_prob_no_sub_no_matrix = task_df.loc[task_df['condition'] == 'no-sub_no-matrix', 'log_prob'].iloc[0]
  log_prob_sub_matrix = task_df.loc[task_df['condition'] == 'sub_matrix', 'log_prob'].iloc[0]
  log_prob_no_sub_matrix = task_df.loc[task_df['condition'] == 'no-sub_matrix', 'log_prob'].iloc[0]
  result = (log_prob_sub_no_matrix > log_prob_no_sub_no_matrix) & (log_prob_sub_matrix < log_prob_no_sub_matrix)
  return result

def subordination_orc_eval(task_df):
  #['((5;%sub_no-matrix%) > (5;%no-sub_no-matrix%) ) & ((5;%sub_matrix%) < (5;%no-sub_matrix%) )']
  log_prob_sub_no_matrix = task_df.loc[task_df['condition'] == 'sub_no-matrix', 'log_prob'].iloc[0]
  log_prob_no_sub_no_matrix = task_df.loc[task_df['condition'] == 'no-sub_no-matrix', 'log_prob'].iloc[0]
  log_prob_sub_matrix = task_df.loc[task_df['condition'] == 'sub_matrix', 'log_prob'].iloc[0]
  log_prob_no_sub_matrix = task_df.loc[task_df['condition'] == 'no-sub_matrix', 'log_prob'].iloc[0]
  result = (log_prob_sub_no_matrix > log_prob_no_sub_no_matrix) & (log_prob_sub_matrix < log_prob_no_sub_matrix)
  return result

def center_embed_eval(task_df):
  #['( (6;%plaus%) + (7;%plaus%) ) < ( (6;%implaus%) + (7;%implaus%) )']
  log_prob_plaus1 = task_df.loc[task_df['condition'] == 'plaus', 'log_prob'].iloc[0]
  log_prob_plaus2 = task_df.loc[task_df['condition'] == 'plaus', 'log_prob'].iloc[1]
  log_prob_implaus1 =task_df.loc[task_df['condition'] == 'implaus', 'log_prob'].iloc[0]
  log_prob_implaus2 =task_df.loc[task_df['condition'] == 'implaus', 'log_prob'].iloc[1]
  result = (log_prob_plaus1 + log_prob_plaus2 ) < (log_prob_implaus1 + log_prob_implaus2)
  return result

def reflexive_prep_eval(task_df):
  # ['( (7;%match_sing%) < (7;%mismatch_sing%) ) & ( (7;%match_plural%) < (7;%mismatch_plural%) )'] same for fem and masc
  log_prob_match_sing = task_df.loc[task_df['condition'] == 'match_sing', 'log_prob'].iloc[0]
  log_prob_mismatch_sing = task_df.loc[task_df['condition'] == 'mismatch_sing', 'log_prob'].iloc[0]
  log_prob_match_plural =task_df.loc[task_df['condition'] == 'match_plural', 'log_prob'].iloc[0]
  log_prob_mismatch_plural =task_df.loc[task_df['condition'] == 'mismatch_plural', 'log_prob'].iloc[0]
  result = (log_prob_match_sing < log_prob_mismatch_sing ) & (log_prob_match_plural < log_prob_mismatch_plural)
  return result

def cleft_eval(task_df):
  #['(6np_mismatch-6np_match%)+(((5;%vp_mismatch%)+(6;%vp_mismatch%))-((5;%vp_match%)+(6;%vp_match%)))>0']
  log_prob_np_mismatch = task_df.loc[task_df['condition'] == 'np_mismatch', 'log_prob'].iloc[0]
  log_prob_np_match = task_df.loc[task_df['condition'] == 'np_match', 'log_prob'].iloc[0]
  log_prob_vp_mismatch1 = task_df.loc[task_df['condition'] == 'vp_mismatch', 'log_prob'].iloc[0]
  log_prob_vp_mismatch2 = task_df.loc[task_df['condition'] == 'vp_mismatch', 'log_prob'].iloc[1]
  log_prob_vp_match1 = task_df.loc[task_df['condition'] == 'vp_match', 'log_prob'].iloc[0]
  log_prob_vp_match2 = task_df.loc[task_df['condition'] == 'vp_match', 'log_prob'].iloc[1]

  #apparently need to convert to numerical values
  log_prob_np_mismatch = pd.to_numeric(log_prob_np_mismatch)
  log_prob_np_match = pd.to_numeric(log_prob_np_match)
  log_prob_vp_mismatch1 = pd.to_numeric(log_prob_vp_mismatch1)
  log_prob_vp_mismatch2 = pd.to_numeric(log_prob_vp_mismatch2)
  log_prob_vp_match1 = pd.to_numeric(log_prob_vp_match1)
  log_prob_vp_match2 = pd.to_numeric(log_prob_vp_match2)

  result = (log_prob_np_mismatch - log_prob_np_match) + ((log_prob_vp_mismatch1 + log_prob_vp_mismatch2) - (log_prob_vp_match1 + log_prob_vp_match2)) > 0
  return result

In [4]:
#initalize counter for task #
task_counter = 0

another_index = 0


for df in suite_names:
  #go through each of the dataframes with index resetting to 0 each time.
  another_index = 0

  while another_index < df['task'].nunique():
    # go task by task

    task = df[df['task'] == task_counter]

    #next need to evaluate predictions based on suite name
    if task['suite name'].iat[0] == 'subordination':
      accuracy_eval = subordination_eval(task)
      results_list_df.append(accuracy_eval)
    if task['suite name'].iat[0] == 'subordination_orc-orc':
      accuracy_eval = subordination_orc_eval(task)
      results_list_df.append(accuracy_eval)
    if task['suite name'].iat[0] == 'center_embed':
      accuracy_eval = center_embed_eval(task)
      results_list_df.append(accuracy_eval)
    if task['suite name'].iat[0] == 'reflexive_prep_fem' or task['suite name'].iat[0] == 'reflexive_prep_masc' :
      accuracy_eval = reflexive_prep_eval(task)
      results_list_df.append(accuracy_eval)
    if task['suite name'].iat[0] == 'cleft':
      accuracy_eval = cleft_eval(task)
      results_list_df.append(accuracy_eval)

    #add prediction and suite_name to lists
    suite_name_list_df.append(task['suite name'].iat[0])
    prediction_list_df.append(task['prediction'].iat[0])

    #increment task #
    task_counter +=1
    another_index +=1

print('done!')



done!


In [5]:


eval_df = pd.DataFrame(list(zip(suite_name_list_df, prediction_list_df,results_list_df)), columns = ['suite name','prediction', 'correct_prediction'])

eval_df.head(50)

Unnamed: 0,suite name,prediction,correct_prediction
0,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,True
1,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,True
2,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,True
3,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,True
4,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,True
5,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,True
6,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,True
7,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,True
8,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,False
9,subordination,['((3;%sub_no-matrix%) > (3;%no-sub_no-matrix%...,True


In [6]:

eval_df.to_csv('/content/drive/My Drive/RLProject/SyntaxGym_7b_eval.csv', index=False)

Evaluation CSV: https://drive.google.com/file/d/1--lYrSMH-_T-MZysLZ5ylRaFq1-CPI_Y/view?usp=drive_link


In [11]:
accuracy_df = eval_df.groupby('correct_prediction').count()
accuracy_df

Unnamed: 0_level_0,suite name,prediction
correct_prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
False,87,87
True,65,65


In [12]:
print(65/(144+8))

0.4276315789473684


Accuracy, Recall, F1 Score
