# Installing Required Libraries

In [None]:
! pip install -q transformers

In [None]:
! pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
import opendatasets as od
import pandas as pd
import numpy as np
import time
import gc
import json
import torch

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
import pickle
from google.colab import drive

In [None]:
!pip install triton

Collecting triton
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.1.0


# Model

In [None]:
# Load the Flan-T5 tokenizer and model from the same checkpoint, then place the model on the GPU
checkpoint = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
model = model.to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
#This function takes the given text/prompt, encodes it, feeds it to the model and then returns the decoded model output
def generate(model, input_text, max_length=100):
  """Generate a text response for a single prompt.

  Args:
    model: Model exposing `parameters()` and `generate(input_ids, max_length=...)`.
    input_text: Prompt string to encode with the notebook-level `tokenizer`.
    max_length: Value forwarded to `model.generate` as `max_length` (default 100,
      the value that was previously hard-coded).

  Returns:
    The first generated sequence decoded to a string, with special tokens removed.
  """
  input_ids = tokenizer(input_text, return_tensors="pt").input_ids
  # Follow the model's own device instead of assuming "cuda", so the inputs always
  # end up wherever the model weights live.
  first_param = next(model.parameters(), None)
  if first_param is not None:
    input_ids = input_ids.to(first_param.device)
  output = model.generate(input_ids, max_length=max_length)
  return tokenizer.decode(output[0], skip_special_tokens=True)

# Data

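Kaggle credentials are needed for the download below. Do not store real credentials in the notebook; enter your own when prompted. (The API key that was previously published here should be revoked and regenerated.)<br>
Kaggle username: `<your-kaggle-username>`<br>
Key: `<your-kaggle-api-key>`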

In [None]:
#Fetch the Chain-of-Thought collection from Kaggle (enter your Kaggle username and API key when prompted)
cot_dataset_url = "https://www.kaggle.com/datasets/konradb/chain-of-thought-collection/data/CoT_collection.json"
od.download(cot_dataset_url)

Skipping, found downloaded files in "./chain-of-thought-collection" (use force=True to force download)


In [None]:
#Path of the dataset JSON inside the ./chain-of-thought-collection folder; it is opened and parsed when the data is loaded below
file_path = "./chain-of-thought-collection/CoT_collection.json"

In [None]:
# The data needs to be processed in chunks to avoid a long runtime, and consequently a runtime disconnect.
# NOTE: json.load still reads the whole file into memory in one go; only the DataFrame
# construction further down is chunked.
chunks = []  # One DataFrame per processed chunk
chunk_size = 1000  # Number of items to process per chunk

# Parse the JSON file. The encoding is stated explicitly so the result does not depend on the
# platform default, and the handle is closed as soon as parsing is done.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)  # Load the entire file as a dictionary

# Convert the dictionary to a list of (key, value) pairs so it can be sliced
items = list(data.items())

# Process in chunks
for i in range(0, len(items), chunk_size):
    chunk = items[i:i+chunk_size]

    # Each record becomes one row: the dictionary key is stored as "id" and the
    # fields of the value are spread into columns next to it
    chunk_df = pd.DataFrame([{"id": key, **value} for key, value in chunk])
    chunks.append(chunk_df)  # Append the chunk DataFrame

In [None]:
# Concatenate all chunks into a single DataFrame; ignore_index=True discards the
# per-chunk indices and numbers the rows of the combined frame from scratch
cot_df = pd.concat(chunks, ignore_index=True)

# Load the random sample

In [None]:
# Mount Google Drive at /content/drive so files stored there can be opened by path
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#A random sample of the dataset was previously saved as a pickle file to ensure consistency across experiments; the saved sample will be used here
pickle_path = "/content/drive/MyDrive/HPML Project/"
# NOTE(security): pickle.load can execute arbitrary code while deserializing; only load
# pickle files that you created yourself or otherwise trust.
# The context manager closes the file handle as soon as the sample has been read.
with open(pickle_path + 'sample_df.pickle', 'rb') as infile:
    sample_df = pickle.load(infile)

In [None]:
# Preview the first rows of the loaded sample
sample_df.head()

Unnamed: 0,id,source,target,rationale,config,task,prompt
1290510,1547335,"Q: A solution is what type of mixture, where t...",homogeneous,A solution is a type of mixture where particle...,none,sciq,Multiple Choice (Closed Book)
1498474,353222,The site of the battle is located near U.S. Hi...,Don Mueller,"In order to answer the question ""Who was the U...",none,drop,2
605290,661704,"In this task, you are given an ambiguous quest...",No,The clarification statement is not related to ...,task_227,clariq,none
39817,1674455,Find the topic. \nQuestion: What did Valerian...,Religion_in_ancient_Rome,"The given answer ""subversive foreign cult"" is ...",none,squad_v2,Topic Prediction - Question and Answer Pair
1768764,1100106,"In this task, you are given a sentence from th...",True,The sentence describes the finding that IFN-γ ...,task_1164,coda_19,none


# Inference time

In [None]:
#Measure per-prompt generation latency for every torch.compile() mode
# NOTE(review): `generate` is not defined on the wrapper returned by torch.compile(), so
# `compiled_model.generate` is presumably resolved on the original module and may end up running
# the uncompiled forward pass - confirm; if so, compile `model.forward` instead.
# NOTE(review): the first timed prompt of a mode also contains any one-time warm-up or
# compilation cost, so totals are not pure steady-state latency.
use_cuda = torch.cuda.is_available()

for mode in ["default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]:
  #Clear compiler state left over from the previous mode so every mode starts from the same point
  torch.compiler.reset()
  #Compile the model using the current mode
  compiled_model = torch.compile(model, mode=mode)
  #Create a list to store the model's inference time for each prompt in the sample
  inf_times = []

  #Loop over the sample prompts
  for prompt in sample_df["source"]:
    truncated_prompt = prompt[-400:] # keep only the last 400 characters in case it overflows
    #Make sure no earlier GPU work is still pending when the clock starts
    if use_cuda:
      torch.cuda.synchronize()
    #Record inference time with a monotonic, high-resolution clock
    start_time = time.perf_counter()
    generate(compiled_model, truncated_prompt)
    #Wait for all queued GPU work to finish before stopping the clock
    if use_cuda:
      torch.cuda.synchronize()
    end_time = time.perf_counter()
    #Store results
    inf_times.append(end_time - start_time)

  print(f"With mode {mode}:")
  print(inf_times)
  print(sum(inf_times))

With mode default:
[2.9285085201263428, 0.12691307067871094, 0.10618710517883301, 0.1588575839996338, 0.08421921730041504, 0.05346035957336426, 0.699059247970581, 0.20413422584533691, 0.07903361320495605, 0.7847964763641357]
5.22516942024231
With mode reduce-overhead:
[0.1466538906097412, 0.1069638729095459, 0.06228065490722656, 0.1465466022491455, 0.09828805923461914, 0.07962799072265625, 0.6926250457763672, 0.18719816207885742, 0.10437273979187012, 0.5164504051208496]
2.141007423400879
With mode max-autotune:
[0.10953187942504883, 0.07515597343444824, 0.06440901756286621, 0.07607054710388184, 0.053212881088256836, 0.052812814712524414, 0.5171701908111572, 0.1642749309539795, 0.05546879768371582, 0.5823173522949219]
1.7504243850708008
With mode max-autotune-no-cudagraphs:
[0.10691165924072266, 0.0784604549407959, 0.05929446220397949, 0.07324457168579102, 0.05315756797790527, 0.05660438537597656, 0.49998974800109863, 0.17525196075439453, 0.06762957572937012, 0.4555950164794922]
1.62613