In [None]:
# If you are using PyTorch backendpy
!pip install torch==2.0.1
!pip install transformers @ git+https://github.com/huggingface/transformers@de9255de27abfcae4a1f816b904915f0b1e23cd9
#lightning @ git+https://github.com/Lightning-AI/lightning@master
!pip install tokenizers==0.13.3
!pip install peft
!pip install jsonargparse[signatures]  # CLI
!pip install bitsandbytes==0.39.1 # quantize
!pip install accelerate @ git+https://github.com/huggingface/accelerate@e0f5e030098aada5e112708eee3537475dea3a83
!pip install datasets==2.13.1  # quantize/gptq.py
!pip install zstandard==0.19.0  # prepare_redpajama.py
!pip install scipy
!pip install loralib==0.1.1
!pip install einops==0.6.1

In [2]:
import pandas as pd

# from google.colab import drive
# drive.mount('/content/drive')

# Path to the training CSV, relative to the notebook's working directory.
# (If the Drive mount above is enabled, point this at the Drive copy instead.)
file_path = 'train_samples.csv'

# Number of leading rows kept for this fine-tuning run.
N_TRAIN_SAMPLES = 100

# Read the CSV into a DataFrame and keep only the first N_TRAIN_SAMPLES rows
# (positional slice, so the default 0..N-1 index labels are preserved).
training_data = pd.read_csv(file_path).iloc[:N_TRAIN_SAMPLES]

In [None]:
# Preview the first three rows of the loaded subset.
training_data.head(3)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub id of the base checkpoint, defined once so the model and the tokenizer
# are always loaded from the same repository.
model_name = "tiiuae/falcon-7b-instruct"

# Load the base model with 4-bit quantized weights. trust_remote_code=True allows
# transformers to run the modelling code shipped inside the checkpoint repository.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
import torch
# Release unused memory held by PyTorch's CUDA caching allocator.
torch.cuda.empty_cache()

In [6]:
# `gen_config` is the model's own generation config object, not a copy, so the
# assignments below also change model.generation_config.
gen_config = model.generation_config
gen_config.max_new_tokens = 200
# Deterministic (greedy) decoding is requested explicitly. The previous
# `temperature = 0.0` is not a valid sampling temperature and only has an effect
# when sampling is enabled, so it did not express the intent reliably.
gen_config.do_sample = False
gen_config.num_return_sequences = 1
# Pad with EOS and stop at EOS.
gen_config.pad_token_id = tokenizer.eos_token_id
gen_config.eos_token_id = tokenizer.eos_token_id

In [7]:
# Use the 'question' field of the row labelled 0 as a smoke-test prompt.
test_prompt = training_data.loc[0, 'question']

# Tokenize to PyTorch tensors and move them to the device the model lives on.
encoding = tokenizer(test_prompt, return_tensors= "pt").to(model.device)

In [8]:
# Display the raw prompt text.
test_prompt

'Prompt: "The formulation includes 450.0 kg of GGBFS and Metakaolin powder at a ratio of 90.0/10.0. 1716.0 kg of aggregate, and water-to-powder ratio is 0.33. Powder oxide composition is: SiO2: 34.38, Al2O3: 17.87, Fe2O3: 0.75, CaO: 36.8. The amount of the dry fraction of Na2O from the activator solution is 15.44 kg. The amount of dry fraction in the activator of SiO2 is 30.87 kg, the amount of Na(OH) is 53.0 kg, with a molarity of 14.0. The mix also includes 22.5 kg of superplasticizer. and 54.0 kg of extra water.The compressive strength for a cylindrical sample (100.0 mm/200.0 mm) after 7 days was? answer"'

In [9]:
# Execute for testing purposes
# import torch

# with torch.inference_mode():
#    outputs = model.generate(input_ids = encoding.input_ids, attention_mask = encoding.attention_mask,generation_config = gen_config )
# print(tokenizer.decode(outputs[0], skip_special_tokens = True))

In [11]:
import transformers
from datasets import load_dataset, Dataset
import pandas as pd


def gen_prompt(text_input):
    """Build one training string of the form ``"<question>: <answer>"``.

    ``text_input`` is indexed with the keys ``"question"`` and ``"answer"``.
    Leading and trailing whitespace of the combined string is stripped.
    """
    question = text_input["question"]
    answer = text_input["answer"]
    combined = f"{question}: {answer}"
    return combined.strip()

def gen_and_tok_prompt(text_input):
    """Render a record with ``gen_prompt`` and tokenize the resulting string.

    Returns whatever the module-level ``tokenizer`` returns when called on the
    rendered text with padding and truncation enabled.
    """
    return tokenizer(gen_prompt(text_input), padding=True, truncation=True)

# X = {
#     'question': ['a', 'c', 'e'],
#     'answer': ['b', 'd', 'f']
# }

# Work on a copy so the original training_data frame is left untouched.
df = training_data.copy()

# Build a Dataset holding only the 'question' and 'answer' columns.
data = Dataset.from_pandas(df[['question', 'answer']])

In [12]:
# Use the EOS token for padding. gen_and_tok_prompt calls the tokenizer with
# padding=True, which needs a pad token (presumably none is defined by default
# for this tokenizer -- verify).
tokenizer.pad_token = tokenizer.eos_token

In [13]:
# Tokenize every record; `data` is rebound to the mapped dataset.
data = data.map(gen_and_tok_prompt)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [14]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing, then let PEFT prepare the quantized model for training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [15]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()``, sums ``numel()`` over all
    parameters and over those with ``requires_grad`` set, and prints both
    totals plus the trainable share in percent.

    A model without any parameters reports ``trainable%: 0.0`` instead of
    raising ``ZeroDivisionError``.

    NOTE(review): the counts are whatever ``numel()`` reports for each stored
    tensor; for quantized weights this may differ from the nominal parameter
    count -- confirm before quoting these numbers.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio so an empty model does not divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [16]:
# Release unused cached CUDA memory before attaching the LoRA adapters.
torch.cuda.empty_cache()

In [17]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings: rank 16, alpha 32, 5% dropout, applied to the modules
# named "query_key_value"; bias="none" leaves bias terms out of the adapter.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with the LoRA adapters and report how many parameters are trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 4718592 || all params: 3613463424 || trainable%: 0.13058363808693696


In [None]:
# torch.enable_grad()

In [None]:
# data

In [None]:
# Load the TensorBoard notebook extension and point it at the run logs under results/.
%load_ext tensorboard
%tensorboard --logdir results/runs

In [19]:
# Training hyper-parameters. With a per-device batch size of 1 and 4 gradient
# accumulation steps, each optimizer step covers 4 examples per device.
# Checkpoints are written once per epoch under "results", and metrics are
# reported to TensorBoard.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=4,
    logging_steps=25,
    output_dir="results", # give the location where you want to store checkpoints
    save_strategy='epoch',
    optim="paged_adamw_8bit",
    lr_scheduler_type = 'cosine',
    warmup_ratio = 0.05,
    report_to='tensorboard'
)

# Trainer over the tokenized dataset; the language-modeling collator is created
# with mlm=False (no masked-language-modeling objective).
trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Turn off the model's cache for training (see inline note), then run the training loop.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [21]:
# Save the PEFT-wrapped model; the inference cells load it back from this path.
model.save_pretrained('results/concrete_falcon')

In [2]:
import torch
# Release unused cached CUDA memory before loading the model again for inference.
torch.cuda.empty_cache()

In [None]:
from peft import PeftConfig, PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Read the saved adapter config to find out which base checkpoint it belongs to.
# NOTE: this rebinds `config` (previously the LoraConfig) and, below, `model`.
config = PeftConfig.from_pretrained('results/concrete_falcon')
# Reload that base checkpoint with 4-bit quantized weights.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_4bit=True,
#     device_map='auto',
    trust_remote_code=True,

)

# Tokenizer of the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path)

# Attach the saved adapter from results/concrete_falcon to the base model.
model_inf = PeftModel.from_pretrained(model, 'results/concrete_falcon')


In [None]:
import torch

# NOTE(review): `test_prompt` comes from the data-loading cells earlier in the
# notebook; after a kernel restart those cells must be re-run first.
# The prompt uses the same "<question>:" prefix as the training strings;
# .strip() removes the trailing space.
prompt = f"""{test_prompt}: """.strip()

# encode the prompt
encoding = tokenizer(prompt, return_tensors= "pt").to(model.device)

# Make sure dropout layers (including the LoRA dropout) are inactive for inference.
model_inf.eval()

gen_config = model_inf.generation_config
gen_config.max_new_tokens = 200
# Deterministic (greedy) decoding is requested explicitly. The previous
# `temperature = 0.0` is not a valid sampling temperature and only has an effect
# when sampling is enabled.
gen_config.do_sample = False
gen_config.num_return_sequences = 1
gen_config.pad_token_id = tokenizer.eos_token_id
gen_config.eos_token_id = tokenizer.eos_token_id

# do the inference -- through the adapter-wrapped model (`model_inf`), which is
# the object this section builds for generation, rather than the bare `model`.
with torch.inference_mode():
    outputs = model_inf.generate(input_ids = encoding.input_ids,
                                 attention_mask = encoding.attention_mask,
                                 generation_config = gen_config )
print(tokenizer.decode(outputs[0], skip_special_tokens = True ))