## Rough

In [3]:
# Project Steps:
# Data Collection: Gather a domain-specific dataset, potentially using web scraping or public APIs.
# Model Preparation: Select a base LLM and prepare it for fine-tuning.
# Fine-Tuning & Adapters: Fine-tune the model and implement the adapters.
# Quantization: Apply quantization techniques and compare different configurations.
# RAG Integration: Set up the vector database and integrate RAG into the system.
# Evaluation & Testing: Design evaluation metrics, test the model, and analyze hallucinations.
# Optimization: Experiment with prompt engineering, and explore resource-performance trade-offs.
# Documentation: Document the process, findings, and potential improvements.

## Training

### Imports

In [22]:
from config import config_dict
import tools

In [2]:
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from transformers import Trainer, DataCollatorForLanguageModeling

# from transformers import LlamaTokenizer, LlamaForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from adapters import AdapterConfig

  from .autonotebook import tqdm as notebook_tqdm


### Load data, model, tokenizer

In [29]:
# # Load the tokenizer and model
# model_name = "meta-llama/Llama-2-7b"  # Replace with the exact model path
# tokenizer = LlamaTokenizer.from_pretrained(model_name)
# model = LlamaForConditionalGeneration.from_pretrained(model_name)


# Checkpoint identifier shared by the tokenizer and the causal-LM weights.
model_name = 'gpt2'

# Tokenizer first: its pad token is aliased to the end-of-sequence token so
# that batches can be padded (GPT-2 defines no dedicated pad token by default).
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Pre-trained causal language model that the later cells fine-tune.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Intended for ``Dataset.map(tokenize_function, batched=True)``.

    Args:
        examples: Mapping passed in by ``Dataset.map``; only its ``"text"``
            entry is read and handed to the module-level ``tokenizer``.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # No padding at this stage: padding every example to the model's maximum
    # length makes each training/eval step process mostly pad tokens.
    # The DataCollatorForLanguageModeling used by the Trainer pads each batch
    # to its longest member instead.
    return tokenizer(examples["text"], truncation=True)

# Read the prepared splits back from the location recorded in the project config.
dataset_dict = load_from_disk(config_dict['prepare_dataset']['dataset_save_path'])

# Pull out the three splits by name: train, validation, test.
train_dataset, val_dataset, test_dataset = (
    dataset_dict[split_name] for split_name in ('train', 'validation', 'test')
)

# Tokenize each split; batched=True hands tokenize_function a batch per call.
train_dataset_tokenized, val_dataset_tokenized, test_dataset_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


In [30]:
# Baseline report for the base model before LoRA is applied: the helper prints
# a per-parameter table (layer name, # parameters, # trainable, layer type).
# NOTE(review): batch_size=4 is presumably used for a memory estimate and
# differs from per_device_train_batch_size=1 used for training — confirm
# against tools.print_model_parameters_and_memory.
tools.print_model_parameters_and_memory(model, batch_size=4)

Layer Name                                                                                           # Parameters         # Trainable     Layer Type          
transformer.wte.weight                                                                               38597376             38597376        Parameter           
transformer.wpe.weight                                                                               786432               786432          Parameter           
transformer.h.0.ln_1.weight                                                                          768                  768             Parameter           
transformer.h.0.ln_1.bias                                                                            768                  768             Parameter           
transformer.h.0.attn.c_attn.weight                                                                   1769472              1769472         Parameter           
transformer.h.0.attn.c_attn.bias              

### LoRA

In [31]:
# Set up the LoRA configuration
lora_config = LoraConfig(
    r=8,  # rank of the low-rank update matrices (lora_A: 768 -> 8, lora_B: 8 -> out)
    lora_alpha=16,  # LoRA scaling numerator; the update is scaled by lora_alpha / r
    lora_dropout=0.1,  # dropout applied on the LoRA branch only
    target_modules=["c_attn", "c_fc", "c_proj"],  # Target GPT-2's attention and MLP layers
    # GPT-2 implements these projections as Conv1D, which stores weights as
    # (fan_in, fan_out). PEFT documents fan_in_fan_out=True for this layout;
    # with the default (False) it emits a warning and flips the flag itself,
    # so setting it explicitly keeps the result and removes the warning.
    fan_in_fan_out=True,
    task_type=TaskType.CAUSAL_LM,  # Task type: Causal Language Modeling
)

# Wrap the base model with LoRA adapters; base weights are frozen and only the
# adapter matrices stay trainable.
# NOTE: `model` is rebound here, so re-running this cell without first
# re-running the model-loading cell wraps an already-wrapped model.
model = get_peft_model(model, lora_config)
model



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_

In [32]:
# Same report after LoRA wrapping: base-model parameters now appear under the
# `base_model.model.` prefix with 0 trainable parameters.
# NOTE(review): batch_size=4 is presumably used for a memory estimate and
# differs from per_device_train_batch_size=1 used for training — confirm
# against tools.print_model_parameters_and_memory.
tools.print_model_parameters_and_memory(model, batch_size=4)

Layer Name                                                                                           # Parameters         # Trainable     Layer Type          
base_model.model.transformer.wte.weight                                                              38597376             0               Parameter           
base_model.model.transformer.wpe.weight                                                              786432               0               Parameter           
base_model.model.transformer.h.0.ln_1.weight                                                         768                  0               Parameter           
base_model.model.transformer.h.0.ln_1.bias                                                           768                  0               Parameter           
base_model.model.transformer.h.0.attn.c_attn.base_layer.weight                                       1769472              0               Parameter           
base_model.model.transformer.h.0.attn.c_attn.b

In [33]:
# import bitsandbytes as bnb

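# # Example of quantizing specific layers
# # (NOTE: the snippet below uses bnb.nn.Int8Params, i.e. 8-bit storage, not 4-bit;
# #  4-bit would need bnb.nn.Params4bit instead)
# # model = model.to('cuda')  # Ensure the model is on GPU if available

# # Apply quantization to the targeted layers of the model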
# for name, module in model.named_modules():
#     if any(target in name for target in ["c_attn", "c_fc", "c_proj"]):
#         quantized_module = bnb.nn.Int8Params(module.weight, requires_grad=True)
#         module.weight = quantized_module

In [34]:
# # Configure Adapter
# adapter_config = AdapterConfig(
#     input_dim=model.config.hidden_size,
#     output_dim=model.config.hidden_size,
#     adapter_dim=512,  # Dimension of the adapter layer
#     activation="relu"  # Activation function for the adapter
# )
# model.add_adapter("my_adapter", adapter_config)

### Train args

In [36]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Training schedule.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    # Reporting cadence, in steps. No eval_steps is given; the recorded run
    # shows a validation loss at every 10th step, i.e. at the logging interval.
    logging_steps=10,
    eval_strategy='steps',
    save_steps=500,
)

# Batch builder for causal language modelling: mlm=False turns off masked-LM
# token masking, so labels are derived from the input ids themselves.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


# Assemble the Trainer: LoRA-wrapped model, run configuration, batch collation,
# and the tokenized train/validation splits.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=val_dataset_tokenized,
    train_dataset=train_dataset_tokenized,
)

### Train

In [37]:
# Start training
# Runs the fine-tuning loop configured in `training_args`; the progress table
# reports the training loss and the loss on the validation split passed to the
# Trainer as `eval_dataset`.
trainer.train()

Step,Training Loss,Validation Loss
10,3.0862,2.754037
20,2.7647,2.746497
30,3.4989,2.737292
40,2.8927,2.729145
50,3.8055,2.723517
60,4.34,2.721076
70,3.9859,2.717703
80,3.1217,2.714619
90,3.1909,2.709363
100,3.4118,2.705761


ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [text]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

### Eval

In [38]:
# Score the fine-tuned model on the held-out test split and show the metrics.
# The metrics dict is kept in `test_results` for later inspection.
test_results = trainer.evaluate(eval_dataset=test_dataset_tokenized)
print(test_results)

{'eval_loss': 3.7191321849823, 'eval_runtime': 37.1388, 'eval_samples_per_second': 0.296, 'eval_steps_per_second': 0.054, 'epoch': 3.0}


### Save

In [40]:
# Persist the tokenizer and the PEFT-wrapped model side by side in one directory.
_lora_save_dir = './fine-tuned-model-lora'
tokenizer.save_pretrained(_lora_save_dir)
# model.save_adapter('./my_adapter', "my_adapter")  # variant for the `adapters` library (unused)
model.save_pretrained(_lora_save_dir)