In [None]:
# Colab setup: install bitsandbytes (needed by the 4-bit BitsAndBytesConfig used when the
# model is loaded) and upgrade the PyTorch stack.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different releases.
!pip install bitsandbytes
!pip install --upgrade torch torchvision torchaudio



In [None]:
# Hugging Face `datasets` provides the load_dataset / DatasetDict names imported in the next cell.
!pip install datasets



In [None]:
import torch
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset, DatasetDict

In [None]:
# Mount Google Drive at /content/drive so the assignment folder on Drive is reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
folder_path = '/content/drive/My Drive/GT/cs6220' # Change the path to the folder where the assignment is stored in Google Drive.

# Show the files in the folder. The listing has to be printed explicitly: a bare
# expression in the middle of a cell is evaluated and then silently discarded.
print('Files in the folder -', os.listdir(folder_path))

# Work from the assignment folder so that relative paths such as ./outputs resolve
# inside Drive.
os.chdir(folder_path)

print('Current working directory -', os.getcwd())

Current working directory - /content/drive/My Drive/GT/cs6220


In [None]:
# Sanity check that a CUDA GPU is visible; the model below is mapped onto device 0.
torch.cuda.is_available()

True

In [None]:
# Quantization setup adapted from:
# https://medium.com/@rakeshrajpurohit/model-quantization-with-hugging-face-transformers-and-bitsandbytes-integration-b4c9983e8996
path = "openai-community/openai-gpt"
tokenizer = AutoTokenizer.from_pretrained(path)
tokenizer.padding_side = 'right'

# Use a single dtype for the 4-bit compute path and for the non-quantized weights.
# Training later runs with bf16=True, so bfloat16 is used here as well; previously the
# quantized matmuls computed in float16 while everything else was bfloat16.
compute_dtype = torch.bfloat16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store the linear-layer weights in 4 bits
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# The empty-string key maps the whole model onto GPU 0.
device_map = {"": 0}

model = AutoModelForCausalLM.from_pretrained(
    path,
    device_map=device_map,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Request the "train" split directly instead of indexing the returned DatasetDict.
ds = load_dataset("ndavidson/sat-math-chain-of-thought", split="train")

# System prompt placed at the start of every conversation built by create_conversation.
system_msg = "Answer the following math problem. Please provide the correct answer and corresponding letter."

def create_conversation(sample):
  """Turn one dataset row into a three-turn chat record.

  Args:
    sample: mapping with at least the keys "question" and "answer".

  Returns:
    A dict with a single key "messages" holding, in order, the system turn
    (the module-level `system_msg`), the user turn (the question) and the
    assistant turn (the answer).
  """
  system_turn = {"role": "system", "content": system_msg}
  user_turn = {"role": "user", "content": sample["question"]}
  assistant_turn = {"role": "assistant", "content": sample["answer"]}
  return {"messages": [system_turn, user_turn, assistant_turn]}

# Keep only the rows whose `is_correct` flag is True.
ds = ds.filter(lambda example: example["is_correct"] == True)
print(f"There are {len(ds)} correct questions and answers")

# Replace every raw column with the single chat-style "messages" column.
ds = ds.map(create_conversation, remove_columns=ds.column_names, batched=False)

# Fixed seed so the train/test/validation membership is identical on every run.
SPLIT_SEED = 42

# 90% train, 10% held out; the held-out part is halved below into
# test and validation (5% of the data each).
dataset_split = ds.train_test_split(test_size=0.1, seed=SPLIT_SEED)

train_dataset = dataset_split['train']

test_val_split = dataset_split['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
test_dataset = test_val_split['train']
valid_dataset = test_val_split['test']

# Despite its name, train_ds bundles all three splits.
train_ds = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'valid': valid_dataset
})

There are 32494 correct questions and answers


In [None]:
# TRL supplies setup_chat_format and SFTTrainer, which the following cells import.
!pip install trl



In [None]:
from trl import setup_chat_format

# setup_chat_format returns a (model, tokenizer) pair prepared for chat-formatted
# training; on first run it reports that new embedding rows were initialised.
# Because the cell rebinds `model` and `tokenizer` from their own previous values, it
# is guarded so that re-running it does not apply the chat setup a second time to an
# already-prepared tokenizer.
if getattr(tokenizer, "chat_template", None) is None:
    model, tokenizer = setup_chat_format(model, tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:
# Inspect the architecture; the module names printed here (c_attn, c_proj, c_fc, ...)
# are the names the LoRA `target_modules` setting refers to.
print(model)

OpenAIGPTLMHeadModel(
  (transformer): OpenAIGPTModel(
    (tokens_embed): Embedding(40480, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (attn): Attention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (lm_head): Linear(in_features=768,

In [None]:
from peft import LoraConfig

# LoRA adapter settings handed to the trainer.
# In the printed model, "c_attn" appears in the attention block and "c_proj" appears in
# both the attention block and the MLP; "c_fc" is not targeted.
# NOTE(review): the embeddings resized by setup_chat_format are not listed here —
# confirm whether they need to be trained/saved as well.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules={"c_attn", "c_proj"},
)

In [None]:
from transformers import TrainingArguments

# Relative path: resolved against the current working directory set earlier.
output_dir = './outputs'

args = TrainingArguments(
    output_dir=output_dir,              # where checkpoints and the final model are written
    num_train_epochs=1,                 # superseded by max_steps below
    per_device_train_batch_size=1,      # sequences per device per forward/backward pass
    gradient_accumulation_steps=4,      # 4 micro-batches are accumulated per optimizer step
    gradient_checkpointing=False,
    optim="adamw_torch_fused",
    logging_steps=25,                   # log the training loss every 25 steps
    save_strategy="steps",              # checkpoint by step count (see save_steps)
    learning_rate=1e-3,
    bf16=True,                          # bfloat16 mixed precision; requires a GPU with bf16 support
    max_grad_norm=0.3,                  # gradient-clipping threshold
    # NOTE(review): with lr_scheduler_type="constant" the learning rate stays fixed, so
    # warmup_ratio likely has no effect; use "constant_with_warmup" if warmup is intended.
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    max_steps=1000,                     # overrides num_train_epochs
    save_steps=100,                     # write a checkpoint every 100 steps
    overwrite_output_dir=True           # real boolean; the string 'True' only worked because any non-empty string is truthy
)

In [None]:
from trl import SFTTrainer

# Length of each packed training sequence; 512 equals the size of the model's
# position-embedding table shown by print(model).
max_seq_length = 512

# The conversations are rendered with the tokenizer's chat template, which already
# carries the special tokens, so no extra special or separator tokens are requested.
sft_dataset_options = {
    "add_special_tokens": False,
    "append_concat_token": False,
}

trainer = SFTTrainer(
    model=model,
    args=args,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=train_ds['train'],
    max_seq_length=max_seq_length,
    packing=True,
    dataset_kwargs=sft_dataset_options,
)

# Run fine-tuning; checkpoints are written under args.output_dir every save_steps steps.
# Nothing in this cell uploads to the Hub.
trainer.train()

# Persist the final model to the output directory.
trainer.save_model()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss


KeyboardInterrupt: 

In [None]:
# huggingface_hub provides `login`, called in the next cell to authenticate before
# anything is pushed to the Hub.
!pip install huggingface_hub
from huggingface_hub import login





In [None]:
# Interactive Hugging Face login (renders a widget in the notebook); needed before the
# push_to_hub calls in the next cell.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
repo_name = "MichaelHu03/CS6220-GPT"

# Upload the tokenizer that was used for training.
tokenizer.push_to_hub(repo_name)

# Upload the fine-tuned model held by the trainer, not the bare `model` variable:
# the trainer was built with a peft_config, so its model is the one carrying the
# trained LoRA adapter.
# NOTE(review): this uploads PEFT adapter files, which consumers load through peft on
# top of the base checkpoint — confirm that is the artifact the repo should contain.
trainer.model.push_to_hub(repo_name)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/125M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MichaelHu03/CS6220-GPT/commit/4a2fd601beb80d61fad4b875bd404ce014dd799c', commit_message='Upload model', commit_description='', oid='4a2fd601beb80d61fad4b875bd404ce014dd799c', pr_url=None, pr_revision=None, pr_num=None)