In [None]:
!pip install datasets

from datasets import load_dataset
import requests
import json
import pandas as pd
from datasets import Dataset

import numpy as np
import torch
from torch.utils.data import DataLoader
import logging
from tqdm import tqdm
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import RandomSampler, SequentialSampler

import time
import datetime
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re
import copy



# Processing Datasets

## Raw Datasets

In [None]:
def load_json(dir='fine_tune_cot/', dataset='coin_flip.json'):
  '''Download a JSON file from the easy_cot GitHub repository and parse it.

  Args:
    dir: sub-folder of ``datasets/`` in the repository, including the
      trailing slash (the URL is built by plain concatenation).
    dataset: file name of the JSON document inside that folder.

  Returns:
    The decoded JSON document.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  '''
  url = 'https://raw.githubusercontent.com/ice188/easy_cot/main/datasets/' + dir + dataset
  # Fail loudly on a missing file instead of handing an error page to the
  # JSON parser, and do not hang forever on a stalled connection.
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  return json.loads(response.text)

def train_test_valid(raw_data, test_size=0.2):
  '''Split a dataset into train / valid / test parts.

  ``test_size`` is the fraction held out from training; the held-out part is
  then halved into validation and test. The default of 0.2 gives an 8:1:1
  split.

  Args:
    raw_data: object exposing ``train_test_split(test_size=...)`` that
      returns a mapping with ``'train'`` and ``'test'`` entries.
    test_size: fraction of ``raw_data`` kept out of the training split.

  Returns:
    A ``(train, valid, test)`` tuple.
  '''
  # The original ignored ``test_size`` and always passed 0.2.
  data = raw_data.train_test_split(test_size=test_size)
  train = data['train']
  held_out = data['test'].train_test_split(test_size=0.5)
  return train, held_out['train'], held_out['test']

def load_data(file_name):
  '''Fetch a raw zero_shot_cot dataset and return its (train, valid, test) splits.

  Every file except ``multiarith.json`` is unwrapped from its ``'examples'``
  key before being turned into a Dataset; the split itself is delegated to
  ``train_test_valid`` with its default ratio.
  '''
  records = load_json('zero_shot_cot/', file_name)
  if file_name != 'multiarith.json':
    records = records['examples']
  frame = pd.DataFrame(data=records)
  return train_test_valid(Dataset.from_pandas(frame))

In [None]:
# cqa (huggingFace)
cqa_train = load_dataset("tau/commonsense_qa", split="train")
cqa_valid = load_dataset("tau/commonsense_qa", split="validation")
cqa_test  = load_dataset("tau/commonsense_qa", split="test")

# gsm8k (huggingFace)
# Only the train and test splits are loaded here, so the test split is halved
# into validation and test. The split is seeded so that both halves contain
# the same examples on every run and results stay comparable.
gsm8k_train = load_dataset("gsm8k", 'main', split="train")
gsm8k_test_data = load_dataset("gsm8k", 'main', split="test")
gsm8k_test_data = gsm8k_test_data.train_test_split(test_size=0.5, seed=42)
gsm8k_valid = gsm8k_test_data['train']
gsm8k_test = gsm8k_test_data['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.39k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/160k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9741 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [None]:
# Download the four raw zero_shot_cot datasets and split each one into
# train / valid / test with load_data.
coin_train, coin_valid, coin_test = load_data('coin_flip.json')
letter_train, letter_valid, letter_test = load_data('last_letter.json')
ma_train, ma_valid, ma_test = load_data('multiarith.json')
sqa_train, sqa_valid, sqa_test = load_data('strategy_qa.json')

In [None]:
# Show the first record of the strategy_qa training split.
sqa_train[0]

{'input': "Were there eight humans on Noah's Ark?",
 'target_scores': {'No': 0, 'Yes': 1},
 'target': "Yes. Noah only took his family aboard the Ark. Noah brought his wife, three sons, and his sons' wives. Four couples lived on the Ark, eight total people."}

## Fine-tune-CoT Base Dataset
Courtesy of https://github.com/itsnamgyu/reasoning-teacher, reasoning chains generated with the default teacher model text-davinci-002 are available to use as the base dataset.

In [None]:
# Fine-tune-CoT base data, one variable per task, taken from the 'data' field
# of each downloaded file.
# format: {'index':[{'sample_index', 'completion_index', 'question', 'answer','reasoning_prompt', 'reasoning_completion','prompt','completion'}]}
coinFlip = load_json(dataset='coin_flip.json')['data']
lastLetter = load_json(dataset='last_letter.json')['data']
commonSenseQA = load_json(dataset='commonsense_qa.json')['data']
strategyQA = load_json(dataset='strategy_qa.json')['data']
gsm8k = load_json(dataset='gsm8k.json')['data']
multiArith = load_json(dataset='multiarith.json')['data']

## Generating Easy-CoT Datasets
Code adapted from https://github.com/amazon-science/auto-cot/blob/main/run_demo.py

In [None]:
!pip install sentence_transformers

In [None]:
import random
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import json
import matplotlib.pyplot as plt
import argparse

# Notebook-level Auto-CoT settings.
# NOTE(review): run_auto_cot declares its own defaults for the same names
# (e.g. random_seed = 192 there vs 129 here) and the calls in this notebook do
# not pass these values, so they appear to be unused — TODO confirm.
task = 'gsm8k'    # choices=["gsm8k", "commonsense_qa","multiarith", "strategy_qa", "coin_flip", "last_letter"]
max_ra_len = 5    # maximum reasoning chain length
encoder = 'all-MiniLM-L6-v2' # sentence encoder for clustering
sampling = 'center'          # whether to sample the cluster center first
random_seed = 129
silent = True

def run_auto_cot(dataset, task='gsm8k', save_dir='easy_cot/', max_ra_len=5, random_seed=192, encoder='all-MiniLM-L6-v2', sampling='center'):
  '''Cluster the questions of a base dataset and write the Easy-CoT files.

  Questions are embedded with a sentence encoder and grouped with k-means.
  Inside each cluster the samples are visited from closest to farthest from
  the centroid: the first one that passes the demo filter (question of at
  most 60 words, rationale of at most ``max_ra_len`` lines that ends with a
  period, non-empty answer) becomes that cluster's in-context demo, and
  every other sample becomes a fine-tuning record.

  Args:
    dataset: mapping ``index -> [sample, ...]``; only the first sample of
      each entry is used, and its ``'question'``, ``'answer'`` and
      ``'reasoning_completion'`` fields are read.
    task: task name; selects the number of clusters (and, for
      ``'last_letter'``, overrides ``max_ra_len`` with 7).
    save_dir: prefix the output file names are appended to, normally a
      directory path ending in ``/``. Missing directories are created.
    max_ra_len: maximum number of rationale lines allowed in a demo.
    random_seed: seed for k-means.
    encoder: name of the SentenceTransformer model.
    sampling: accepted for interface compatibility; not read by this function.

  Writes:
    ``<save_dir>demos.json``: list of ``{"demo", "cluster"}``.
    ``<save_dir>data.json``: list of ``{"prompt", "answer", "cluster"}``.
  '''
  sentence_encoder = SentenceTransformer(encoder)

  if task == "last_letter":
    max_ra_len = 7
    num_clusters = 4
  # The original compared against the misspelled "commonsens_qa", so the
  # 7-cluster setting never applied; the old spelling is still accepted.
  elif task in ("commonsense_qa", "commonsens_qa"):
    num_clusters = 7
  elif task == "strategy_qa":
    num_clusters = 6
  else:
    num_clusters = 8

  q_prefix, q_suffix = 'Q: ', '\nA:'
  question = []
  rationale = []
  answer = []

  for idx in dataset.keys():
    sample = dataset[idx][0]
    question.append(q_prefix + sample['question'] + q_suffix)
    rationale.append(sample['reasoning_completion'])
    answer.append(sample['answer'])

  # run auto-cot
  corpus_embeddings = sentence_encoder.encode(question)
  clustering_model = KMeans(n_clusters=num_clusters, random_state=random_seed, n_init='auto')
  clustering_model.fit(corpus_embeddings)
  dist = clustering_model.transform(corpus_embeddings)

  # Per cluster: distance of each member to its own centroid, and the
  # member's position in the question/rationale/answer lists.
  clustered_dists = [[] for _ in range(num_clusters)]
  clustered_idx = [[] for _ in range(num_clusters)]
  for sentence_id, cluster_id in enumerate(clustering_model.labels_):
    clustered_dists[cluster_id].append(dist[sentence_id][cluster_id])
    clustered_idx[cluster_id].append(sentence_id)

  datas = []
  demos = []

  for i in range(num_clusters):
    # Members ordered from closest to farthest from the centroid.
    by_distance = sorted(range(len(clustered_dists[i])), key=lambda j: clustered_dists[i][j])
    get_demo = True  # one demo for each cluster

    for min_idx in by_distance:
      sample_id = clustered_idx[i][min_idx]
      c_rationale = rationale[sample_id].strip()
      a = answer[sample_id].strip()
      q = question[sample_id]
      # Collapse all whitespace (including newlines) to single spaces.
      r = " ".join(c_rationale.split())

      # Strip exactly the 'Q: ' prefix and '\nA:' suffix. The original used
      # q[3:-4], which also cut off the last character of the question.
      p = q[len(q_prefix):-len(q_suffix)] + "###"
      c = i + 1

      is_demo = (
          get_demo
          and len(q.strip().split()) <= 60
          and len(c_rationale.replace("\n\n", "\n").split("\n")) <= max_ra_len
          # endswith is safe on an empty rationale, unlike c_rationale[-1]
          and c_rationale.endswith(".")
          and a != ""
      )

      if is_demo:
        # demo for icl
        demos.append({"demo": p + r + "-->" + a, "cluster": c})
        get_demo = False
      else:
        # data for fine-tuning
        datas.append({"prompt": p, "answer": r + '-->' + a + '<|endoftext|>', "cluster": c})

  # save_dir is used as a plain prefix, so derive the directory from the
  # final path and make sure it exists before writing.
  out_dir = os.path.dirname(save_dir + 'demos.json')
  if out_dir:
    os.makedirs(out_dir, exist_ok=True)

  with open(save_dir + 'demos.json', 'w', encoding="utf-8") as write_f:
    json.dump(demos, write_f, indent=4, ensure_ascii=False)

  with open(save_dir + 'data.json', 'w', encoding="utf-8") as write_f:
    json.dump(datas, write_f, indent=4, ensure_ascii=False)



In [None]:
# Generate demos.json / data.json for every task. Task names follow the
# choices listed with the Auto-CoT settings ('multiarith', not 'multi_arith').
run_auto_cot(coinFlip, 'coin_flip', 'easy_cot/coinFlip/')
run_auto_cot(lastLetter, 'last_letter', 'easy_cot/lastLetter/')
run_auto_cot(multiArith, 'multiarith', 'easy_cot/multiArith/')
run_auto_cot(gsm8k, 'gsm8k', 'easy_cot/gsm8k/')
run_auto_cot(commonSenseQA, 'commonsense_qa', 'easy_cot/commonSenseQA/')
run_auto_cot(strategyQA, 'strategy_qa', 'easy_cot/strategyQA/')

## Convert JSON files to Hugging Face datasets

In [None]:
# Load the generated MultiArith files through the "json" loader; the records
# are read from the 'train' split in the cells below.
data = load_dataset("json", data_files="easy_cot/multiArith/data.json")
demo = load_dataset("json", data_files="easy_cot/multiArith/demos.json")

In [None]:
# First fine-tuning record.
data['train'][0]

In [None]:
# First in-context demo.
demo['train'][0]

# Experiments

In [None]:
# hyperparams
# These module-level names are read as globals by the functions defined below
# (e.g. batch_size, epochs, learning_rate, warmup_steps, epsilon, sample_every,
# device and configuration in finetune_model).
batch_size = 2
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Seed every random number generator used in the notebook with the same value.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
epochs = 5
learning_rate = 5e-4
warmup_steps = 1e2  # passed as num_warmup_steps to the linear schedule
epsilon = 1e-8      # passed as eps to AdamW
sample_every = 100  # print a sampled generation every this many training steps
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# torch Dataset
class GPT2Dataset(torch.utils.data.Dataset):
  '''Tokenised prompt+answer records, each padded/truncated to ``max_length``.

  Every record of ``txt_list`` must provide ``'prompt'`` and ``'answer'``;
  the text fed to the tokenizer is ``'<|startoftext|>' + prompt + answer``.
  Indexing returns an ``(input_ids, attention_mask)`` pair of tensors.
  '''

  def __init__(self, txt_list, tokenizer, max_length=768):
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for record in txt_list:
      text = '<|startoftext|>' + record['prompt'] + record['answer']
      encoded = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
      self.input_ids.append(torch.tensor(encoded['input_ids']))
      self.attn_masks.append(torch.tensor(encoded['attention_mask']))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return (self.input_ids[idx], self.attn_masks[idx])

# helper methods
def train_valid_test(task_folder='multiArith/'):
  '''Split the Easy-CoT ``data.json`` of a task into train / valid / test (8:1:1).

  Both cuts are made with ``shuffle=False``, so the three parts are
  contiguous slices of the file in its stored order. What is returned is the
  ``'train'`` column of each part, not the Dataset objects themselves.
  '''
  loaded = load_dataset("json", data_files="easy_cot/" + task_folder + "data.json")
  table = Dataset.from_pandas(pd.DataFrame(data=loaded))

  # First cut: 80% train, 20% held out.
  first_cut = table.train_test_split(test_size=0.2, shuffle=False, seed=42)
  # Second cut: halve the held-out part into validation and test.
  second_cut = first_cut['test'].train_test_split(test_size=0.5, shuffle=False, seed=42)

  train_part = first_cut['train']
  valid_part = second_cut['train']
  test_part = second_cut['test']
  return train_part['train'], valid_part['train'], test_part['train']


def save_model(model, tokenizer, output_dir):
  '''Write the model and tokenizer to ``output_dir`` via ``save_pretrained``.

  The directory is created when it does not exist yet. If ``model`` exposes
  a ``module`` attribute, that inner object is the one that gets saved.
  '''
  dir_missing = not os.path.exists(output_dir)
  if dir_missing:
    os.makedirs(output_dir)

  print("Saving model to %s" % output_dir)

  # Save the wrapped model when a wrapper exposes it as `.module`.
  unwrapped = getattr(model, 'module', model)
  unwrapped.save_pretrained(output_dir)
  tokenizer.save_pretrained(output_dir)


def load_model(output_dir):
  '''Load a saved GPT-2 LM-head model and its tokenizer from ``output_dir``.

  The model is moved to the notebook-level ``device`` before being returned
  together with the tokenizer as ``(model, tokenizer)``.
  '''
  lm = GPT2LMHeadModel.from_pretrained(output_dir)
  tok = GPT2Tokenizer.from_pretrained(output_dir)
  lm.to(device)
  return lm, tok

def extract_answer(output, is_generated=False):
  '''Return the text that follows the first ``-->`` marker, or None.

  For reference strings (``is_generated=False``) the answer stops at the
  next ``<`` and a ``<`` is required; for generated text the answer runs to
  the end of the line.
  '''
  pattern = r'-->(.*)' if is_generated else r'-->(.*?)<'
  found = re.search(pattern, output)
  return found.group(1) if found else None

def get_accuracy(model, tokenizer, test_set):
  '''Exact-match accuracy of sampled generations on ``test_set``.

  For each sample the prompt is passed to ``model.generate`` with sampling
  enabled (so repeated calls can give different scores). The decoded text
  after the prompt is searched for ``-->`` and whatever follows is compared,
  after stripping whitespace, with the reference answer parsed from
  ``sample['answer']``.

  Args:
    model: model exposing ``eval()`` and ``generate(...)``.
    tokenizer: tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
    test_set: iterable of records with ``'prompt'`` and ``'answer'``.

  Returns:
    ``correct / total`` as a float, or ``0.0`` when ``test_set`` is empty.
    A record whose reference answer cannot be parsed counts as incorrect.
  '''
  model.eval()
  correct = 0
  total = 0

  for sample in test_set:
    prompt = sample['prompt']
    # None when the reference has no '-->...<' span; handled in the comparison.
    answer = extract_answer(sample['answer'])
    len_prompt = len(prompt)

    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)

    # NOTE(review): max_length presumably counts the prompt tokens as well, so
    # very long prompts may leave little or no room for new tokens — confirm.
    sample_outputs = model.generate(
                                    generated,
                                    do_sample=True,
                                    top_k=50,
                                    max_length = 300,
                                    top_p=0.95,
                                    num_return_sequences=1,
                                    pad_token_id=tokenizer.eos_token_id
                                    )

    for sample_output in sample_outputs:
      decoded = tokenizer.decode(sample_output, skip_special_tokens=True)
      # Drop the echoed prompt by character count before looking for '-->'.
      extracted_answer = extract_answer(decoded[len_prompt:], is_generated=True)

      if (answer is not None and extracted_answer is not None
          and extracted_answer.strip() == answer.strip()):
        correct += 1
      total += 1

  # An empty test set used to raise ZeroDivisionError.
  if total == 0:
    return 0.0
  return correct/total

def three_run(model, tokenizer, test_set, n_runs=1):
  '''Evaluate ``get_accuracy`` ``n_runs`` times and report the mean.

  The default of one run keeps the cost of the previous implementation,
  which evaluated once but printed "over 3 runs". Pass ``n_runs=3`` for the
  three-run average the name refers to.

  Args:
    model, tokenizer, test_set: forwarded unchanged to ``get_accuracy``.
    n_runs: number of evaluations to average; must be at least 1.

  Returns:
    The mean accuracy over the runs (also printed).

  Raises:
    ValueError: if ``n_runs`` is smaller than 1.
  '''
  if n_runs < 1:
    raise ValueError('n_runs must be at least 1')

  total_accuracy = 0
  for _ in range(n_runs):
    total_accuracy += get_accuracy(model, tokenizer, test_set)
  avg_accuracy = total_accuracy / n_runs

  # Report the number of runs that were actually performed.
  print(f'Average accuracy over {n_runs} run(s): {avg_accuracy}')
  return avg_accuracy

def run_test(model, tokenizer, valid, test):
  '''Print the accuracy report for the validation (optional) and test sets.

  ``valid`` may be None, in which case only the test set is evaluated. Each
  evaluation is delegated to ``three_run``, which prints the score right
  after the ``validation--> `` / ``test--> `` label.
  '''
  print("")

  labelled_sets = []
  if valid is not None:
    labelled_sets.append(('validation--> ', valid))
  labelled_sets.append(('test--> ', test))

  for label, examples in labelled_sets:
    print(label, end='')
    three_run(model, tokenizer, examples)

  print("======== End of Results ========")

## Baseline 1. Fine-tune-CoT for fine-tuning small models
Training code adapted from https://colab.research.google.com/drive/13dZVYEOMhXhkXWfvSMVM1TTtUDrT6Aeh?usp=sharing#scrollTo=gFsCTp_mporB

In [None]:
def finetune_model(task_folder='multiArith/', model_dir='./model_saved/'):
  '''Fine-tune GPT-2 small on the Easy-CoT data of one task, save and evaluate it.

  Steps: build the tokenizer with the extra start/pad tokens, split the
  task's data with ``train_valid_test``, train for ``epochs`` epochs with
  AdamW and a linear warm-up schedule, save the model and tokenizer to
  ``model_dir``, then report accuracy on the validation and test splits.

  Hyperparameters (``batch_size``, ``epochs``, ``learning_rate``,
  ``warmup_steps``, ``epsilon``, ``sample_every``), ``configuration`` and
  ``device`` are read from the notebook-level globals.

  Args:
    task_folder: sub-folder of ``easy_cot/`` holding the task's ``data.json``.
    model_dir: directory the fine-tuned model and tokenizer are saved to.

  Returns:
    The test split, as before. Previously a stray ``return test`` placed
    right after the scheduler was created ended the function before any
    training, saving or evaluation happened.
  '''

  def format_time(elapsed):
    '''Render a duration in seconds as h:mm:ss.'''
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

  tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')

  # data split
  train, valid, test = train_valid_test(task_folder)
  train_dataset = GPT2Dataset(train, tokenizer)

  train_dataloader = DataLoader(
              train_dataset,
              sampler = RandomSampler(train_dataset),
              batch_size = batch_size
          )

  # load pretrained model and scheduler
  model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
  # The tokenizer gained new special tokens, so the embedding matrix must grow.
  model.resize_token_embeddings(len(tokenizer))

  optimizer = AdamW(model.parameters(),
                    lr = learning_rate,
                    eps = epsilon
                  )
  total_steps = len(train_dataloader) * epochs

  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = warmup_steps,
                                              num_training_steps = total_steps)

  # train
  total_t0 = time.time()
  model = model.to(device)

  for epoch_i in range(0, epochs):

      # ========================================
      #               Training
      # ========================================

      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
      print('Training...')

      t0 = time.time()

      total_train_loss = 0

      model.train()

      for step, batch in enumerate(train_dataloader):

          b_input_ids = batch[0].to(device)
          b_masks = batch[1].to(device)
          # Every sample is padded to the dataset's max_length; label the
          # padded positions -100 so the loss ignores them instead of being
          # dominated by pad-token predictions.
          b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

          model.zero_grad()

          outputs = model(  b_input_ids,
                            labels=b_labels,
                            attention_mask = b_masks,
                            token_type_ids=None
                          )

          loss = outputs[0]

          batch_loss = loss.item()
          total_train_loss += batch_loss

          # Get sample every x batches.
          if step % sample_every == 0 and not step == 0:

              elapsed = format_time(time.time() - t0)
              print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

              model.eval()

              sample_outputs = model.generate(
                                      bos_token_id=random.randint(1,30000),
                                      do_sample=True,
                                      top_k=50,
                                      max_length = 200,
                                      top_p=0.95,
                                      num_return_sequences=1,
                                      pad_token_id=tokenizer.pad_token_id
                                  )
              for i, sample_output in enumerate(sample_outputs):
                    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

              model.train()

          loss.backward()

          optimizer.step()

          scheduler.step()

      # Calculate the average loss over all of the batches.
      avg_train_loss = total_train_loss / len(train_dataloader)

      # Measure how long this epoch took.
      training_time = format_time(time.time() - t0)

      print("")
      print("  Average training loss: {0:.2f}".format(avg_train_loss))
      print("  Training epoch took: {:}".format(training_time))

  print("")
  print("Training complete!")
  print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
  save_model(model, tokenizer, model_dir)

  # test
  run_test(model, tokenizer, valid, test)

  return test

In [None]:
def load_and_test(task_folder='multiArith/', model_dir='./multiarith_model/'):
  '''Load a saved fine-tuned model and report its validation and test accuracy.

  Args:
    task_folder: sub-folder of ``easy_cot/`` holding the task's ``data.json``.
    model_dir: directory written earlier by ``save_model``.
  '''
  model, tokenizer = load_model(model_dir)

  # data split; the training part is not needed for evaluation, and no
  # GPT2Dataset is built because run_test consumes the raw records.
  _, valid, test = train_valid_test(task_folder)

  # test
  run_test(model, tokenizer, valid, test)

### experiment playground

In [None]:
# Run the fine-tuning entry point on the MultiArith data; the model directory
# passed here is finetuned_model/multiArith/.
finetune_model(task_folder='multiArith/', model_dir='finetuned_model/multiArith/')

## Baseline 2. Auto-CoT for in-context learning + Easy-CoT testing

In [None]:
def test_all(task_folder='multiArith/', model_dir='/finetuned_model/multiArith',
             data_root='/kaggle/input/easy-cot/easy_cot/', demo_root='easy_cot/',
             data_field='data'):
    '''Compare Fine-tune-CoT, Auto-CoT and Easy-CoT on one task's test split.

    * Fine-tune-CoT: fine-tuned model, original prompts.
    * Auto-CoT: pretrained GPT-2, prompts prefixed with the cluster demos.
    * Easy-CoT: fine-tuned model, prompts prefixed with the cluster demos.

    The test split is the last 20% of ``data.json`` (unshuffled).

    Args:
        task_folder: task sub-folder under both roots, with trailing slash.
        model_dir: directory of the saved fine-tuned model and tokenizer.
        data_root: folder prefix for ``<task_folder>data.json``. The default
            keeps the previously hard-coded Kaggle location; pass
            ``'easy_cot/'`` to use files written by ``run_auto_cot``.
        demo_root: folder prefix for ``<task_folder>demos.json``.
        data_field: JSON field holding the records in ``data.json``; pass
            ``None`` when the file is a top-level list.
    '''
    default_model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration).to(device)
    default_tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
    model, tokenizer = load_model(model_dir)

    # test split
    raw = load_dataset("json", data_files=data_root + task_folder + "data.json", field=data_field)
    data = Dataset.from_pandas(pd.DataFrame(data=raw))
    train_test_split = data.train_test_split(test_size=0.2, shuffle=False, seed=42)
    test_plain = list(train_test_split['test']['train'])

    # load demos
    demos = load_dataset("json", data_files=demo_root + task_folder + "demos.json")['train']
    context = "".join(sample['demo'] + "\n" for sample in demos)

    # Build the context-augmented records as new dicts rather than mutating
    # the loaded ones, so the original prompts stay intact without a deepcopy
    # and the result does not depend on how the dataset hands out rows.
    test_with_context = [dict(sample, prompt=context + sample['prompt']) for sample in test_plain]

    print("")
    print("Running Fine-tune-CoT...")
    # The fine-tuned model is paired with the tokenizer saved alongside it.
    run_test(model, tokenizer, None, test_plain)
    print("Running Auto-CoT...")
    run_test(default_model, default_tokenizer, None, test_with_context)
    print("Running Easy-CoT...")
    run_test(model, tokenizer, None, test_with_context)


### experiment playground

In [None]:
# Compare the three methods on the coin-flip task.
# see result.ipynb for cell outputs
test_all('coinFlip/','finetuned_model/coinFlip/')