# Processing Datasets

In [1]:
!pip install datasets

from datasets import load_dataset
import requests
import json
import pandas as pd
from datasets import Dataset

import numpy as np
import torch
from torch.utils.data import DataLoader
import logging
from tqdm import tqdm
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import RandomSampler, SequentialSampler

import time
import datetime
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re
import copy



# Experiments

In [7]:
# hyperparams
# Training/evaluation settings shared by the rest of the notebook.
seed_val = 42
batch_size = 2
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8
sample_every = 100

# Seed every random number generator used in the notebook (Python, NumPy,
# torch CPU and all CUDA devices) from the single `seed_val` above.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Configuration of the stock 'gpt2' checkpoint, with hidden-state outputs disabled.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# torch Dataset
class GPT2Dataset(torch.utils.data.Dataset):
  '''Tokenized prompt/answer pairs exposed as a torch Dataset.

  Each record of `txt_list` must provide the string fields 'prompt' and
  'answer'. The text handed to the tokenizer is
  '<|startoftext|>' + prompt + answer, and the tokenizer is asked to truncate
  and pad to `max_length`.

  Indexing returns a tuple `(input_ids, attention_mask)` of tensors built from
  the tokenizer's 'input_ids' and 'attention_mask' outputs.
  '''

  def __init__(self, txt_list, tokenizer, max_length=768):
    self.tokenizer = tokenizer

    # Tokenize every record once, up front, in input order.
    encoded_records = [
        tokenizer(
            '<|startoftext|>' + record['prompt'] + record['answer'],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )
        for record in txt_list
    ]

    self.input_ids = [torch.tensor(enc['input_ids']) for enc in encoded_records]
    self.attn_masks = [torch.tensor(enc['attention_mask']) for enc in encoded_records]

  def __len__(self):
    '''Number of tokenized records.'''
    return len(self.input_ids)

  def __getitem__(self, idx):
    '''Return `(input_ids, attention_mask)` for the record at position `idx`.'''
    return self.input_ids[idx], self.attn_masks[idx]

# helper methods
def train_valid_test(task_folder='multiArith/'):
  '''Split a task's easy-cot data into train/valid/test with an 8:1:1 ratio.

  Reads "easy_cot/<task_folder>data.json" through `load_dataset`, wraps the
  result in a pandas DataFrame and re-creates a `Dataset` from it. The split is
  order-preserving (`shuffle=False`): the first 80% of rows form the training
  set and the remaining 20% is halved into validation and test.

  Returns the 'train' column of each of the three splits, in the order
  (train, valid, test).

  NOTE(review): the 'train' column presumably comes from the split name of the
  object returned by `load_dataset` -- confirm against the data file layout.
  '''
  loaded = load_dataset("json", data_files="easy_cot/" + task_folder + "data.json")
  full_set = Dataset.from_pandas(pd.DataFrame(data=loaded))

  # First cut: 80% train, 20% held out.
  outer_split = full_set.train_test_split(test_size=0.2, shuffle=False, seed=42)
  # Second cut: halve the held-out part into validation and test.
  inner_split = outer_split['test'].train_test_split(test_size=0.5, shuffle=False, seed=42)

  train_part = outer_split['train']
  valid_part = inner_split['train']
  test_part = inner_split['test']
  return train_part['train'], valid_part['train'], test_part['train']


def save_model(model, tokenizer, output_dir):
  '''Save `model` and `tokenizer` into `output_dir` via their `save_pretrained`.

  The directory (including missing parents) is created when needed. If `model`
  has a `module` attribute (e.g. it is wrapped by a parallel container), the
  wrapped object is the one that gets saved.
  '''
  # exist_ok=True avoids the check-then-create race of testing for existence first.
  os.makedirs(output_dir, exist_ok=True)

  print("Saving model to %s" % output_dir)

  # Unwrap a container that exposes the real model as `.module`.
  model_to_save = getattr(model, 'module', model)
  model_to_save.save_pretrained(output_dir)
  tokenizer.save_pretrained(output_dir)


def load_model(output_dir):
  '''Load a GPT-2 LM-head model and its tokenizer from `output_dir`.

  The model is moved to the module-level `device` before being returned.
  Returns the tuple `(model, tokenizer)`.
  '''
  restored_model = GPT2LMHeadModel.from_pretrained(output_dir)
  restored_tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
  restored_model.to(device)
  return restored_model, restored_tokenizer

def extract_answer(output, is_generated=False):
  '''Pull the final answer that follows the first '-->' marker in `output`.

  With `is_generated=False` the answer is the shortest text between '-->' and
  the next '<' on the same line. With `is_generated=True` the answer is
  everything after '-->' up to the end of that line.

  Returns the captured text (not stripped), or None when the pattern does not
  match.
  '''
  marker_pattern = r'-->(.*)' if is_generated else r'-->(.*?)<'
  found = re.search(marker_pattern, output)
  return found.group(1) if found else None

def get_accuracy(model, tokenizer, test_set):
  '''Fraction of samples whose generated answer matches the reference answer.

  For every sample (a mapping with 'prompt' and 'answer'), the prompt is
  encoded, up to 100 new tokens are sampled from `model`, and the answer is
  extracted from the generated continuation with `extract_answer`. A sample
  counts as correct when the stripped extracted answer equals the stripped
  reference answer.

  Generation uses `do_sample=True`, so repeated calls can give different
  results.

  Returns a float in [0, 1]; 0.0 when `test_set` is empty.
  '''
  model.eval()
  correct = 0
  total = 0

  for sample in test_set:
    prompt = sample['prompt']
    # May be None when the reference has no '-->...<' marker; such a sample
    # can never be counted as correct.
    answer = extract_answer(sample['answer'])

    input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)
    prompt_token_count = input_ids.shape[-1]

    sample_outputs = model.generate(
        input_ids,
        do_sample=True,
        max_new_tokens=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    for sample_output in sample_outputs:
      # Drop the prompt at token level before decoding. Slicing the decoded
      # text by len(prompt) characters misaligns whenever decoding does not
      # reproduce the prompt character for character.
      continuation = tokenizer.decode(sample_output[prompt_token_count:], skip_special_tokens=True)
      extracted_answer = extract_answer(continuation, is_generated=True)

      if (answer is not None
          and extracted_answer is not None
          and extracted_answer.strip() == answer.strip()):
        correct += 1

      total += 1

  # Guard against an empty test set instead of dividing by zero.
  return correct / total if total else 0.0


def three_run(model, tokenizer, test_set, num_runs=1):
  '''Evaluate `model` on `test_set` `num_runs` times and report the mean accuracy.

  The default of one run keeps the cost of the previous implementation, which
  evaluated once while reporting "over 3 runs". The printed message now states
  the number of runs actually performed.

  Args:
    model, tokenizer, test_set: forwarded unchanged to `get_accuracy`.
    num_runs: number of evaluation passes to average; must be >= 1.

  Returns:
    The mean accuracy over the runs (also printed).

  Raises:
    ValueError: if `num_runs` is smaller than 1.
  '''
  if num_runs < 1:
    raise ValueError("num_runs must be at least 1")

  accuracies = [get_accuracy(model, tokenizer, test_set) for _ in range(num_runs)]
  avg_accuracy = sum(accuracies) / num_runs
  print(f'Average accuracy over {num_runs} run(s): {avg_accuracy}')
  return avg_accuracy
    
def run_test(model, tokenizer, valid, test):
  '''Print accuracy results of `model` on the validation and test splits.

  The model is switched to eval mode, then `three_run` is called for `valid`
  (skipped when `valid` is None) and for `test`, each preceded by a label and
  framed by start/end banner lines.
  '''
  model.eval()
  print("")
  print("======== Start of Results ========")

  # Validation is optional; the test split is always evaluated, and last.
  labelled_splits = []
  if valid is not None:
    labelled_splits.append(('validation--> ', valid))
  labelled_splits.append(('test--> ', test))

  for label, split in labelled_splits:
    print(label, end='')
    three_run(model, tokenizer, split)

  print("======== End of Results ========")

## Baseline 2. Auto-CoT for in-context learning

In [29]:
def test_all(task_folder='multiArith/', model_dir='/kaggle/input/easycot_coinflip/other/hf/1/coinFlip',
             max_samples=168, max_context_words=600):
    '''Compare Fine-tune-CoT, Auto-CoT and Easy-CoT on one task's held-out data.

    Three evaluations are printed through `run_test`:
      * Fine-tune-CoT: model loaded from `model_dir`, original prompts.
      * Auto-CoT: stock 'gpt2' model, prompts prefixed with demo context.
      * Easy-CoT: model loaded from `model_dir`, prompts prefixed with demo context.

    Args:
        task_folder: task sub-folder (with trailing slash) under
            "/kaggle/input/easy-cot/easy_cot/" holding data.json and demos.json.
        model_dir: directory passed to `load_model`.
        max_samples: evaluate at most this many held-out samples, taken from
            the start of the held-out part. None keeps all of them.
        max_context_words: demos are appended to the context, in file order,
            while the whitespace-separated word count stays within this budget.

    NOTE(review): the default `model_dir` points at a coinFlip directory while
    the default `task_folder` is 'multiArith/' -- callers should pass both.
    NOTE(review): the held-out part is the last 20% of the data (no further
    validation/test halving as in `train_valid_test`) -- confirm this is intended.
    NOTE(review): the budget counts words, not tokens -- confirm the resulting
    prompts plus the generated tokens fit the model's context window.
    '''
    default_configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
    default_model = GPT2LMHeadModel.from_pretrained("gpt2", config=default_configuration).to(device)
    default_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
    model, tokenizer = load_model(model_dir)

    data_root = "/kaggle/input/easy-cot/easy_cot/" + task_folder

    # Held-out split: last 20% of the data in file order (no shuffling).
    raw = load_dataset("json", data_files=data_root + "data.json", field="data")
    data = Dataset.from_pandas(pd.DataFrame(data=raw))
    train_test_split = data.train_test_split(test_size=0.2, shuffle=False, seed=42)
    plain_test = train_test_split['test']['train'][:max_samples]
    print(plain_test[0])

    # Demonstrations used as the in-context prefix.
    raw = load_dataset("json", data_files=data_root + "demos.json", field="data")
    demos = raw['train']

    # Greedily append demos until the next one would exceed the word budget.
    context = ""
    for sample in demos:
        candidate = context + sample['demo'] + "\n"
        if len(candidate.split()) > max_context_words:
            break
        context = candidate

    # Build the context-augmented samples as new dicts so the original-prompt
    # samples stay untouched (no in-place mutation, no deep copy required).
    augmented_test = [dict(sample, prompt=context + sample['prompt']) for sample in plain_test]

    print(len(context.split()))
    print("")
    print("Running Fine-tune-CoT...")
    run_test(model, tokenizer, None, plain_test)
    print("")
    print("Running Auto-CoT...")
    run_test(default_model, default_tokenizer, None, augmented_test)
    print("")
    print("Running Easy-CoT...")
    run_test(model, tokenizer, None, augmented_test)
    print("")
  

In [8]:
# Run the Fine-tune-CoT / Auto-CoT / Easy-CoT comparison on the coinFlip task.
test_all('coinFlip/','/kaggle/input/easycot_coinflip/other/hf/1/coinFlip')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1 [00:00<?, ?it/s]

{'answer': 'Gabino does not flip the coin, so the coin remains heads up. Kayla flips the coin, so the coin is now tails up. Laurie does not flip the coin, so the coin remains tails up. Familia flips the coin, so the coin is now heads up again. Therefore, the answer is yes, the coin is still heads up.-->yes<|endoftext|>', 'cluster': 6, 'prompt': 'A coin is heads up. Gabino does not flip the coin. Kayla flips the coin. Laurie does not flip the coin. Familia flips the coin. Is the coin still heads up? Note that "flip" here means "reverse"###'}


  0%|          | 0/1 [00:00<?, ?it/s]

552

Running Fine-tune-CoT...

test--> Average accuracy over 3 runs: 0.6363636363636364

Running Auto-CoT...

test--> Average accuracy over 3 runs: 0.1717171717171717

Running Easy-CoT...

test--> Average accuracy over 3 runs: 0.5555555555555556



In [9]:
# Run the Fine-tune-CoT / Auto-CoT / Easy-CoT comparison on the lastLetter task.
test_all('lastLetter/','/kaggle/input/easycot_letter/other/hf/1/lastLetter')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-74debde25acb3cc5/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-74debde25acb3cc5/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

{'answer': 'The last letter of "Deanna" is "a". The last letter of "Terri" is "i". The last letter of "Gabriela" is "a". The last letter of "Jonah" is "h". So the final answer would be "aijah".-->aiah<|endoftext|>', 'cluster': 3, 'prompt': 'Take the last letters of each words in "Deanna Terri Gabriela Jonah" and concatenate them###'}
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-0c71972977e96315/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-0c71972977e96315/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

196

Running Fine-tune-CoT...

test--> Average accuracy over 3 runs: 0.05

Running Auto-CoT...

test--> Average accuracy over 3 runs: 0.0

Running Easy-CoT...

test--> Average accuracy over 3 runs: 0.0



In [19]:
# Run the Fine-tune-CoT / Auto-CoT / Easy-CoT comparison on the commonSenseQA task.
test_all('commonSenseQA/','/kaggle/input/easycot_csqa/other/hf/1/commonSenseQA')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1 [00:00<?, ?it/s]

{'answer': 'A squirrel is a small rodent that is found in trees. They are also found in forests. They are not found on streets because they are afraid of cars. They are also not found in yards because they are afraid of dogs. The only other place they might be found is in a park.-->A<|endoftext|>', 'cluster': 7, 'prompt': 'Where would a brave squirrel be found? Answer choices: (A) street, (B) forest, (C) tree, (D) yard, (E) park###'}


  0%|          | 0/1 [00:00<?, ?it/s]

581

Running Fine-tune-CoT...

test--> Average accuracy over 3 runs: 0.13333333333333333

Running Auto-CoT...

test--> Average accuracy over 3 runs: 0.05

Running Easy-CoT...

test--> Average accuracy over 3 runs: 0.125



In [31]:
# Run the Fine-tune-CoT / Auto-CoT / Easy-CoT comparison on the strategyQA task.
test_all('strategyQA/','/kaggle/input/easycot_sqa/other/hf/1/strategyQA')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1 [00:00<?, ?it/s]

{'answer': 'Cuba Libre is an alcoholic drink, and alcohol can interfere with sleep. So, no, Cuba Libre is not likely to help with insomnia.-->No<|endoftext|>', 'cluster': 5, 'prompt': 'Would Cuba Libre consumption help with insomnia###'}


  0%|          | 0/1 [00:00<?, ?it/s]

394

Running Fine-tune-CoT...

test--> Average accuracy over 3 runs: 0.23809523809523808

Running Auto-CoT...

test--> Average accuracy over 3 runs: 0.08928571428571429

Running Easy-CoT...

test--> Average accuracy over 3 runs: 0.0



In [22]:
# Run the Fine-tune-CoT / Auto-CoT / Easy-CoT comparison on the multiArith task.
test_all('multiArith/','/kaggle/input/easycot_ma/other/hf/1/multiArith')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1 [00:00<?, ?it/s]

{'answer': "We know that Bianca scored 45 points. We know that everyone else scored 6 points each. We know that the total score was 75 points. We want to know how many players were on Bianca's team. We can set up an equation to solve this problem. x + 6 + 6 + ... + 6 = 75 x + 6(n-1) = 75 x = 75 - 6(n-1) x = 75 - 6n + 6 x = 69 - 6n We want to find out how many-->5<|endoftext|>", 'cluster': 7, 'prompt': "Bianca's team won their dodgeball game and scored 75 points total. If Bianca scored 45 of the points and everyone else scored 6 points each, how many players were on her team###"}


  0%|          | 0/1 [00:00<?, ?it/s]

556

Running Fine-tune-CoT...

test--> Average accuracy over 3 runs: 0.01680672268907563

Running Auto-CoT...

test--> Average accuracy over 3 runs: 0.03361344537815126

Running Easy-CoT...

test--> Average accuracy over 3 runs: 0.058823529411764705



In [33]:
# Run the Fine-tune-CoT / Auto-CoT / Easy-CoT comparison on the gsm8k task.
test_all('gsm8k/','/kaggle/input/easycot_gsm8k/other/hf/1/gsm8k')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/1 [00:00<?, ?it/s]

{'answer': 'Arnel has ten boxes of pencils. He keeps ten pencils and shares the remaining pencils equally with his five friends. If his friends got eight pencils each, that means Arnel gave them 40 pencils in total. If he gave them 40 pencils in total, that means he had 50 pencils to begin with. So each box of pencils has 5 pencils in it.-->5<|endoftext|>', 'cluster': 7, 'prompt': 'Arnel had ten boxes of pencils with the same number of pencils in each box.  He kept ten pencils and shared the remaining pencils equally with his five friends. If his friends got eight pencils each, how many pencils are in each box###'}


  0%|          | 0/1 [00:00<?, ?it/s]

537

Running Fine-tune-CoT...

test--> Average accuracy over 3 runs: 0.017857142857142856

Running Auto-CoT...

test--> Average accuracy over 3 runs: 0.005952380952380952

Running Easy-CoT...

test--> Average accuracy over 3 runs: 0.02976190476190476

