## Setting up

In [6]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl


In [7]:
from huggingface_hub import login
login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
import warnings 
warnings.filterwarnings('ignore')

In [9]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch,torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import AutoModelForCausalLM,AutoTokenizer,BitsAndBytesConfig,TrainingArguments,pipeline

2024-08-02 18:43:49.846749: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-02 18:43:49.846859: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-02 18:43:49.976734: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Loading and processing the dataset

In [10]:
df = pd.read_csv("/kaggle/input/sovai-docs/code_prompts_balanced.csv",sep=';',index_col = "Unnamed: 0")
df.head()

Unnamed: 0,code,value
0,sov.data('wikipedia/views'),retrieve comprehensive Wikipedia views data
1,sov.data('corprisk/events'),Retrieve data on corporate risk events.
2,sov.data('factors/alternative'),Obtain alternative factor details
3,"sov.data('news/daily', start_date='2017-03-30'...",fetch news data for specified tickers
4,sov.data('factors/alternative'),Fetch data on alternative market factors


In [11]:
# Shuffle the DataFrame
df = df.sample(frac=1, random_state=85).reset_index(drop=True)

In [12]:
# Define the prompt generation functions
def generate_prompt(row):
    return f"""Convert the description into functional and efficient code. Return only the code without any additional explanations or comments.\n\nDESCRIPTION:{row.value}\n\nCODE:{row.code}<|eot_id|>""".strip()

def generate_test_prompt(row):
     return f"""Convert the description into functional and efficient code. Return only the code without any additional explanations or comments.\n\nDESCRIPTION:{row.value}\n\nCODE:""".strip()

In [13]:
# # Generate prompts for training data
df.loc[:,'text'] = df.apply(generate_prompt, axis=1)

In [14]:
# Convert to datasets
ds = Dataset.from_pandas(df[["text"]])
ds['text'][3]

"Convert the description into functional and efficient code. Return only the code without any additional explanations or comments.\n\nDESCRIPTION:fetch relative ratios insights for TSLA and META since 2018\n\nCODE:sov.data('ratios/relative', start_date='2018-01-01', tickers=['TSLA', 'META'])<|eot_id|>"

## Loading the model and tokenizer

In [15]:
base_model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

In [16]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [17]:
model.config.use_cache = False
model.config.pretraining_tp = 1

In [18]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [19]:
tokenizer.eos_token

'<|eot_id|>'

### **Few-shot prompt** 

In [20]:
pipe = pipeline(task="text-generation", 
                model=model,
                tokenizer=tokenizer, 
                max_new_tokens=64, 
                temperature=0.1)

In [21]:
prompt=''
for p in df.loc[:3,'text']:
    prompt+=p+"\n\n"

prompt+=generate_test_prompt(df.iloc[10])
prompt

"Convert the description into functional and efficient code. Return only the code without any additional explanations or comments.\n\nDESCRIPTION:Investigate potential accounting misstatements or fraud risks\n\nCODE:sov.data('corprisk/accounting')<|eot_id|>\n\nConvert the description into functional and efficient code. Return only the code without any additional explanations or comments.\n\nDESCRIPTION:Obtain corporate risk details\n\nCODE:sov.data('corprisk/risks')<|eot_id|>\n\nConvert the description into functional and efficient code. Return only the code without any additional explanations or comments.\n\nDESCRIPTION:retrieve relative ratios for TSLA and META from 2018\n\nCODE:sov.data('ratios/relative', start_date='2018-01-01', tickers=['TSLA', 'META'])<|eot_id|>\n\nConvert the description into functional and efficient code. Return only the code without any additional explanations or comments.\n\nDESCRIPTION:fetch relative ratios insights for TSLA and META since 2018\n\nCODE:sov.d

In [22]:
result = pipe(prompt)[0]['generated_text']

In [23]:
print(generate_test_prompt(df.iloc[10]))
'sov'+result.split('sov')[1]#.split('CODE:')[-1]

Convert the description into functional and efficient code. Return only the code without any additional explanations or comments.

DESCRIPTION:Access past weekly financial performance for major firms from 2018

CODE:


"sov.data('corprisk/accounting')<|eot_id|>\n\nConvert the description into functional and efficient code. Return only the code without any additional explanations or comments.\n\nDESCRIPTION:Obtain corporate risk details\n\nCODE:"

## Extracting the linear modules names

In [24]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [25]:
modules = find_all_linear_names(model)
modules

['up_proj', 'k_proj', 'v_proj', 'down_proj', 'gate_proj', 'q_proj', 'o_proj']

## Setting up the model

In [26]:
output_dir="llama-3.1-fine-tuned-model-on-sovai-doc"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

training_arguments = TrainingArguments(
    output_dir='.',                    # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    logging_steps=10,                         
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
#     eval_strategy="steps",              # save checkpoint every epoch
    eval_steps = 0.2,
    report_to='none',
    push_to_hub=True
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=ds,
#     eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=512,
    packing=False,
    dataset_kwargs={
    "add_special_tokens": False,
    "append_concat_token": False,
    }
)

Map:   0%|          | 0/1026 [00:00<?, ? examples/s]

## Model Training

In [27]:
# Train model
trainer.train()

Step,Training Loss
10,2.9203
20,0.882
30,0.6115
40,0.4994
50,0.4638
60,0.4478
70,0.448
80,0.4056
90,0.3727
100,0.4131


TrainOutput(global_step=128, training_loss=0.6631644256412983, metrics={'train_runtime': 1117.9912, 'train_samples_per_second': 0.918, 'train_steps_per_second': 0.114, 'total_flos': 2162043204108288.0, 'train_loss': 0.6631644256412983, 'epoch': 0.9980506822612085})

In [28]:
trainer.push_to_hub('ikram98ai/llama-sovai-doc')

CommitInfo(commit_url='https://huggingface.co/snowdere/working/commit/48ecc9d15aae80c8780852e1ee4d7ecd6394850f', commit_message='ikram98ai/llama-sovai-doc', commit_description='', oid='48ecc9d15aae80c8780852e1ee4d7ecd6394850f', pr_url=None, pr_revision=None, pr_num=None)

## Saving the model and tokenizer

In [29]:
# # Save trained model and tokenizer
# trainer.save_model(output_dir)
# tokenizer.save_pretrained(output_dir)

## Testing model after fine-tuning 

In [None]:
# from peft import PeftModel, PeftConfig
# from transformers import AutoModelForCausalLM

# config = PeftConfig.from_pretrained("ikram98ai/working")
# base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# peft_model = PeftModel.from_pretrained(base_model, "ikram98ai/working")

In [None]:
# pipe = pipeline(task="text-generation", 
#                     model=peft_model,
#                     tokenizer=tokenizer, 
#                     max_new_tokens=32, 
#                     temperature=0.1)
# y_pred = []
# for i in tqdm(range(5)):
#     prompt = generate_test_prompt(df.iloc[i])
#     result = pipe(prompt)
#     answer = result[0]['generated_text']
#     y_pred.append(answer)
    
# for true_code,model_code in list(zip(df.iloc[:5]['code'],y_pred)):
#     print('\nReal Code: ',true_code)
#     print('\nModel Code: ','sov'+model_code.split('sov')[1])