# Installing Required Libraries

In [None]:
! pip install -q transformers

In [None]:
! pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
import opendatasets as od
import pandas as pd
import numpy as np
import time
import gc
import json

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
import pickle
from google.colab import drive

# Model

In [None]:
#Tokenizer and model are both loaded from the same Flan-T5 checkpoint
checkpoint = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
#Move the model weights to the GPU
model = model.to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
#This function takes the given text/prompt, encodes it, feeds it to the model and then returns the decoded model output
def generate(input_text, max_length=100):
  """Generate a response for one prompt using the notebook-level tokenizer and model.

  Args:
    input_text: Prompt text to encode and feed to the model.
    max_length: Value forwarded to ``model.generate`` as ``max_length``.
      Defaults to 100, the value that used to be hard-coded here.

  Returns:
    The first generated sequence decoded to text, with special tokens removed.
  """
  # Send the inputs to whichever device the model is on instead of assuming "cuda"
  input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
  output = model.generate(input_ids, max_length=max_length)
  return tokenizer.decode(output[0], skip_special_tokens=True)

# Data

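Kaggle credentials: enter your own Kaggle username and API key when the download cell prompts for them (see http://bit.ly/kaggle-creds). <br>
Do not store the API key in the notebook itself.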

In [None]:
#Download the dataset from Kaggle (enter your Kaggle username and API key when prompted)
od.download("https://www.kaggle.com/datasets/konradb/chain-of-thought-collection/data/CoT_collection.json")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: sallyzhao
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/konradb/chain-of-thought-collection
Downloading chain-of-thought-collection.zip to ./chain-of-thought-collection


100%|██████████| 1.17G/1.17G [00:16<00:00, 75.4MB/s]





In [None]:
#Path of the JSON file inside the dataset folder; it is opened and parsed when the data is loaded
file_path = "./chain-of-thought-collection/CoT_collection.json"

In [None]:
# The JSON file is parsed into memory in one go; its records are then converted to
# DataFrames in fixed-size chunks (originally done to avoid a long runtime, and
# consequently a runtime disconnect)
# Initialize an empty list to store processed chunks
chunks = []
chunk_size = 1000  # Number of items to process per chunk

# The file handle is only needed while parsing, so it is closed before the conversion loop.
# The encoding is given explicitly instead of relying on the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)  # Load the entire file as a dictionary

# Convert the dictionary to a list of (key, value) pairs so it can be sliced
items = list(data.items())

# Process in chunks
for i in range(0, len(items), chunk_size):
    chunk = items[i:i+chunk_size]

    # Each row holds the fields of one value plus its key under "id"
    chunk_df = pd.DataFrame([{"id": key, **value} for key, value in chunk])
    chunks.append(chunk_df)  # Append the chunk DataFrame

In [None]:
# Concatenate all chunks into a single DataFrame; ignore_index=True gives the result one
# continuous row index instead of repeating each chunk's own index
cot_df = pd.concat(chunks, ignore_index=True)

In [None]:
# Randomly sample 10 prompts; the fixed seed makes a re-run draw the same rows
SAMPLE_SEED = 42
sample_df = cot_df.sample(n=10, random_state=SAMPLE_SEED)
# Preview the first rows of the sample
sample_df.head()

Unnamed: 0,id,source,target,rationale,config,task,prompt
1290510,1547335,"Q: A solution is what type of mixture, where t...",homogeneous,A solution is a type of mixture where particle...,none,sciq,Multiple Choice (Closed Book)
1498474,353222,The site of the battle is located near U.S. Hi...,Don Mueller,"In order to answer the question ""Who was the U...",none,drop,2
605290,661704,"In this task, you are given an ambiguous quest...",No,The clarification statement is not related to ...,task_227,clariq,none
39817,1674455,Find the topic. \nQuestion: What did Valerian...,Religion_in_ancient_Rome,"The given answer ""subversive foreign cult"" is ...",none,squad_v2,Topic Prediction - Question and Answer Pair
1768764,1100106,"In this task, you are given a sentence from th...",True,The sentence describes the finding that IFN-γ ...,task_1164,coda_19,none


## Save the random sample

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#The sample is saved as a pickle file so that the same sample can be used across all our experiments which would allow for a fair comparison of results
# NOTE(review): "sample_df.pickle" is relative to the working directory, not the mounted Drive path - confirm this is the intended location
# The context manager closes the file even if pickle.dump raises
with open("sample_df.pickle", "wb") as sample_df_pickle:
    pickle.dump(sample_df, sample_df_pickle)

# Inference time

In [None]:
#Create lists that will store the model's responses and inference time for each prompt in the sample
responses_q5 = []
inf_times = []

#Only the last MAX_PROMPT_CHARS characters of each prompt are sent to the model, in case it overflows
MAX_PROMPT_CHARS = 400

# NOTE(review): the first measurement may include one-time warm-up cost - consider a discarded warm-up call before timing
#Loop over the sample prompts
for prompt in sample_df["source"]:
  truncated_prompt = prompt[-MAX_PROMPT_CHARS:]
  #Record inference time with perf_counter, a high-resolution clock meant for measuring
  #intervals; time.time() follows the system clock, which can be adjusted mid-run
  start_time = time.perf_counter()
  response = generate(truncated_prompt)
  end_time = time.perf_counter()
  #Store results
  responses_q5.append(response)
  inf_times.append(end_time - start_time)

print(inf_times)

[3.0589399337768555, 0.17417025566101074, 0.11562705039978027, 0.16708111763000488, 0.12141776084899902, 0.12113070487976074, 1.1127870082855225, 0.36274194717407227, 0.12082076072692871, 0.89927077293396]
