# Installing Required Libraries

In [None]:
! pip install -q transformers

In [None]:
! pip install opendatasets



In [None]:
import opendatasets as od
import pandas as pd
import numpy as np
import time
import gc
import json

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
import pickle
from google.colab import drive

# Mount Google Drive at /content/drive/ so files stored there can be read from this runtime
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0


# Data

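Kaggle Username: `<your Kaggle username>`<br>
Key: `<your Kaggle API key>` (redacted — never store real credentials in a notebook; create your own token under Kaggle → Settings → API)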

In [None]:
# Download the dataset from Kaggle; opendatasets prompts for a Kaggle username and API key,
# so enter your own credentials at the prompt instead of hardcoding them in the notebook
od.download("https://www.kaggle.com/datasets/konradb/chain-of-thought-collection/data/CoT_collection.json")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: sallyzhao
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/konradb/chain-of-thought-collection
Downloading chain-of-thought-collection.zip to ./chain-of-thought-collection


100%|██████████| 1.17G/1.17G [00:58<00:00, 21.7MB/s]





In [None]:
# Location of the JSON file inside the directory the download was written to;
# this path is what the loading step opens to read the actual data
file_path = "./chain-of-thought-collection/CoT_collection.json"

In [None]:
# Build the dataset as a list of small DataFrames, one per chunk of records.
# NOTE: json.load() parses the whole file into memory in a single call; only the
# conversion of records into DataFrames is done chunk by chunk.
chunks = []        # one DataFrame per processed chunk
chunk_size = 1000  # Number of items to process per chunk

# Read with an explicit encoding and close the file as soon as parsing is done,
# rather than keeping the handle open during the DataFrame conversion.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)  # top-level JSON object mapping each key to a record

# Materialise the (key, record) pairs so they can be sliced by position
items = list(data.items())

# Process in chunks
for i in range(0, len(items), chunk_size):
    chunk = items[i:i+chunk_size]

    # One row per record: the record's own fields plus its key under "id"
    chunk_df = pd.DataFrame([{"id": key, **value} for key, value in chunk])
    chunks.append(chunk_df)

In [None]:
# Stitch the per-chunk frames into one DataFrame, discarding each chunk's own
# index in favour of a single fresh one
cot_df = pd.concat(objs=chunks, ignore_index=True)

# Load the random sample

In [None]:
# The sample is saved as a pickle file so that the same sample can be used across all our
# experiments, which allows for a fair comparison of results.
pickle_path = "/content/drive/MyDrive/HPML Project/"

# Open through a context manager so the file handle is closed as soon as loading finishes.
# SECURITY: pickle.load can execute arbitrary code while deserialising; only load
# pickle files that you created yourself or otherwise fully trust.
with open(pickle_path + 'sample_df.pickle', 'rb') as infile:
    sample_df = pickle.load(infile)

In [None]:
# Preview the first rows of the loaded sample
sample_df.head()

Unnamed: 0,id,source,target,rationale,config,task,prompt
1290510,1547335,"Q: A solution is what type of mixture, where t...",homogeneous,A solution is a type of mixture where particle...,none,sciq,Multiple Choice (Closed Book)
1498474,353222,The site of the battle is located near U.S. Hi...,Don Mueller,"In order to answer the question ""Who was the U...",none,drop,2
605290,661704,"In this task, you are given an ambiguous quest...",No,The clarification statement is not related to ...,task_227,clariq,none
39817,1674455,Find the topic. \nQuestion: What did Valerian...,Religion_in_ancient_Rome,"The given answer ""subversive foreign cult"" is ...",none,squad_v2,Topic Prediction - Question and Answer Pair
1768764,1100106,"In this task, you are given a sentence from th...",True,The sentence describes the finding that IFN-γ ...,task_1164,coda_19,none


# Models

In [None]:
def generate(model, input_text, max_length=100):
  """Encode `input_text`, run it through `model` and return the decoded output.

  The module-level `tokenizer` is used for both encoding and decoding.

  Args:
    model: model object exposing `.device` and `.generate(input_ids, max_length=...)`.
    input_text: the text/prompt to feed to the model.
    max_length: forwarded to `model.generate`; defaults to 100.

  Returns:
    The first generated sequence, decoded to a string with special tokens skipped.
  """
  # Put the inputs on whatever device the model itself is on instead of assuming "cuda"
  input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
  output = model.generate(input_ids, max_length=max_length)
  return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Load the Flan-T5 tokenizer, the original model and reduced-precision versions of the model
import torch
from transformers import BitsAndBytesConfig

# Single source of truth for the checkpoint shared by the tokenizer and every model variant
MODEL_NAME = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# Baseline: weights loaded with the default dtype and moved to the GPU explicitly
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to("cuda")
# Half-precision (float16) weights
model_fp16 = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, device_map="auto", torch_dtype=torch.float16)
# 8-bit and 4-bit quantized weights. The settings are passed through
# quantization_config because the bare load_in_8bit / load_in_4bit keyword
# arguments to from_pretrained are deprecated.
model_int8 = T5ForConditionalGeneration.from_pretrained(
    MODEL_NAME, device_map="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True)
)
model_int4 = T5ForConditionalGeneration.from_pretrained(
    MODEL_NAME, device_map="auto", quantization_config=BitsAndBytesConfig(load_in_4bit=True)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `Bit

# Inference time

In [None]:
# Time every model variant (each has weights of a different precision) on the same sample prompts
for m in [model, model_fp16, model_int8, model_int4]:
  # Per-prompt inference time, in seconds, for the current model
  inf_times = []

  # Loop over the sample prompts
  for prompt in np.array(sample_df["source"]):
    truncated_prompt = prompt[-400:]  # keep only the last 400 characters in case it overflows
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    response_128 = generate(m, truncated_prompt)
    end_time = time.perf_counter()
    # Store results
    inf_times.append(end_time - start_time)

  # NOTE(review): the first timed call may include one-off start-up cost; consider
  # an untimed warm-up call per model if the totals are to be compared directly.
  print(inf_times)       # individual timings
  print(sum(inf_times))  # total time over the whole sample
  print()

[1.822930097579956, 0.08257293701171875, 0.05627584457397461, 0.07435131072998047, 0.06304502487182617, 0.06487226486206055, 0.5561370849609375, 0.19888687133789062, 0.0538334846496582, 0.47102904319763184]
3.4439339637756348

[0.45842599868774414, 0.11093592643737793, 0.07559061050415039, 0.1022336483001709, 0.06895112991333008, 0.07449197769165039, 0.7344677448272705, 0.23816275596618652, 0.07538509368896484, 0.6316535472869873]
2.570298433303833

[0.5242321491241455, 0.2589993476867676, 0.2377164363861084, 0.26098132133483887, 0.2038278579711914, 0.22847437858581543, 1.8113822937011719, 0.6424469947814941, 0.24937868118286133, 1.8599977493286133]
6.277437210083008





[0.3303236961364746, 0.21607351303100586, 0.15540766716003418, 0.22592711448669434, 1.122720718383789, 0.3211066722869873, 0.1860504150390625, 0.49634790420532227, 0.1904432773590088, 1.163017988204956]
4.407418966293335

