Initializing

In [None]:
# Install Unsloth straight from its GitHub repository (pip fetches the git URL itself; nothing is cloned manually),
# then install the remaining training packages with --no-deps so pip does not resolve their dependencies.
# NOTE(review): none of these installs is version-pinned, so a fresh run may pick up different releases.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
# Import necessary libraries
from unsloth import FastLanguageModel
import torch
import json
import pandas as pd
from transformers import AutoTokenizer
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import nltk
from datasets import load_metric
import matplotlib.pyplot as plt
import time

In [None]:
# Configuration settings shared by the model-loading cell and the trainer cell
max_seq_len = 4096  # forwarded as max_seq_length to from_pretrained and to SFTTrainer
data_type = None  # Auto-detection (forwarded as dtype to from_pretrained)
use_quantization = True  # 4-bit quantization for reduced memory usage (forwarded as load_in_4bit)

In [None]:
# Model name: checkpoint identifier handed to FastLanguageModel.from_pretrained
# NOTE(review): the "bnb-4bit" suffix suggests an already-quantized checkpoint — confirm on the model hub.
model_name_4bit = "unsloth/llama-3-8b-bnb-4bit"

In [None]:
# Load model and tokenizer
# Every argument comes from the configuration cells above; the call returns the pair
# (model, tokenizer) that all later cells use.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_4bit,
    max_seq_length=max_seq_len,
    dtype=data_type,
    load_in_4bit=use_quantization,
)

Configuration

In [None]:
# Configure PEFT model: wrap the loaded base model with LoRA adapters
# NOTE(review): this cell rebinds `model` to the wrapped model, so re-running it without first
# re-running the loading cell would pass an already-wrapped model to get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # module names that receive adapters
    lora_alpha=16,
    lora_dropout=0,  # Optimized for 0
    bias="none",    # Optimized for "none"
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,  # same value as the seed given to TrainingArguments
    use_rslora=False,  # Support for rank stabilized LoRA
    loftq_config=None, # LoftQ support
)

Data Preprocessing and Cleaning

In [None]:
# Load JSON datasets
def load_json_data(file_path):
    """Parse the JSON document stored at ``file_path`` and return the decoded object.

    The file is decoded as 'utf-8-sig', so a leading byte-order mark is skipped
    when present and plain UTF-8 files are read unchanged.
    """
    with open(file_path, mode="r", encoding='utf-8-sig') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Both source files are read from fixed paths under /content and must already exist there.
auto_eval_topics_tree = load_json_data("/content/2022_automatic_evaluation_topics_tree_v1.0.json")
# NOTE(review): additional_data is later concatenated with a list of instruction/input/response
# records, so it is presumably a list of dicts with those keys — verify against the file.
additional_data = load_json_data("/content/2022_mixed_initiative_question_answer_pool.json")

# Extract and process data
def process_data(topics_tree):
    """Flatten conversation topics into parallel instruction / input / response lists.

    One row is produced for every turn whose ``participant`` is ``'User'``:

    * instruction -- the parent turn's ``automatic_rewritten_utterance``, falling back to
      its ``utterance`` and then to ``''`` (also ``''`` when the parent is unknown).
    * input -- the user turn's own text, with the same fallbacks.
    * response -- the ``response`` of the first ``'System'`` turn (in list order) whose
      ``parent`` equals this turn's ``number``, or ``''`` when there is none.

    Args:
        topics_tree: iterable of topic dicts; each has a ``'turn'`` list whose items carry
            at least ``'number'`` and ``'participant'``.

    Returns:
        Tuple ``(instructions, inputs, responses)`` of three equally long lists.

    Raises:
        KeyError: if a topic lacks ``'turn'`` or a turn lacks ``'number'`` / ``'participant'``.
    """
    def best_utterance(turn):
        # Prefer the rewritten utterance; fall back to the raw one, then to ''.
        return turn.get('automatic_rewritten_utterance', turn.get('utterance', ''))

    instructions, inputs, responses = [], [], []
    for topic in topics_tree:
        turns = topic['turn']
        turn_dict = {turn['number']: turn for turn in turns}

        # Index the first System reply per parent in one pass, instead of rescanning
        # the whole turn list for every user turn (which is quadratic per topic).
        first_system_reply = {}
        for turn in turns:
            if turn['participant'] == 'System':
                first_system_reply.setdefault(turn.get('parent'), turn)

        for turn in turns:
            if turn['participant'] != 'User':
                continue
            parent_turn = turn_dict.get(turn.get('parent'), {})
            response_turn = first_system_reply.get(turn['number'], {})
            instructions.append(best_utterance(parent_turn))
            inputs.append(best_utterance(turn))
            responses.append(response_turn.get('response', ''))
    return instructions, inputs, responses

instructions, inputs, responses = process_data(auto_eval_topics_tree)

# Create DataFrame
df = pd.DataFrame({'instruction': instructions, 'input': inputs, 'response': responses})
# Wherever no instruction was found, fall back to the input text. Series.where works
# column-wise, so it also handles an empty frame (a row-wise df.apply(..., axis=1) returns
# a DataFrame rather than a Series in that case and the column assignment fails).
df['instruction'] = df['instruction'].where(df['instruction'] != '', df['input'])

# Combine and save data
combined_data = df.to_dict(orient='records') + additional_data
output_file = "/content/combined_instruction_input_response_data.json"
with open(output_file, "w", encoding='utf-8-sig') as f:  # Use 'utf-8-sig' to handle BOM
    json.dump(combined_data, f, ensure_ascii=False, indent=4)

# Load combined data and remove duplicates
# NOTE(review): rows are de-duplicated on 'instruction' alone, so records sharing an
# instruction but differing in input/response keep only the first one — confirm this is intended.
combined_data = load_json_data(output_file)
df = pd.DataFrame(combined_data)
df_unique = df.drop_duplicates(subset='instruction')
unique_data = df_unique.to_dict(orient='records')
cleaned_output_file = "/content/cleaned_combined_instruction_input_response_data.json"
with open(cleaned_output_file, "w", encoding='utf-8-sig') as f:  # Use 'utf-8-sig' to handle BOM
    json.dump(unique_data, f, ensure_ascii=False, indent=4)


In [None]:
# Display last few entries
def display_last_entries(data, num_entries=5):
    """Pretty-print the last ``num_entries`` items of ``data`` as indented JSON.

    Args:
        data: sequence of JSON-serialisable entries.
        num_entries: number of trailing entries to print. Zero or a negative
            count prints nothing; a count larger than ``len(data)`` prints everything.
    """
    # data[-0:] is the whole sequence, so a non-positive count needs an explicit guard.
    if num_entries <= 0:
        return
    for entry in data[-num_entries:]:
        print(json.dumps(entry, indent=4))

# Preview the tail of combined_data, i.e. the merged records as reloaded from disk before de-duplication.
display_last_entries(combined_data, num_entries=5)

Data Preparation for Training

In [None]:
# Convert to HuggingFace Dataset and prepare data
def prepare_dataset(file_path):
    """Read a JSON file holding a list of records and wrap it in a ``Dataset``.

    The file is decoded as 'utf-8-sig', matching the encoding used when the
    cleaned data was written.
    """
    with open(file_path, encoding='utf-8-sig') as source:
        records = json.loads(source.read())
    return Dataset.from_list(records)


dataset = prepare_dataset(cleaned_output_file)

In [None]:
# Formatting prompt function
# Prompt layout with three positional slots, filled in this order: instruction, input, response.
# For generation the response slot is filled with an empty string so the model continues from there.
prompt_template = """Kindly have a look at the instruction and give response.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every rendered training example
EOS_TOKEN = tokenizer.eos_token


def format_prompts(data):
    """Render a batch of records into complete training prompts.

    ``data`` is a batch mapping with parallel ``'instruction'``, ``'input'`` and
    ``'response'`` columns. Each row is substituted into ``prompt_template`` and
    terminated with ``EOS_TOKEN``; the rendered strings are returned under ``"text"``.
    """
    rendered = []
    rows = zip(data['instruction'], data['input'], data['response'])
    for instruction_part, input_part, response_part in rows:
        rendered.append(prompt_template.format(instruction_part, input_part, response_part) + EOS_TOKEN)
    return {"text": rendered}


dataset = dataset.map(format_prompts, batched=True)

# Split dataset: 80 % train / 20 % test with a fixed random_state so the split is repeatable
df = dataset.to_pandas()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# After the shuffle each frame carries the original row labels as a non-range index;
# preserve_index=False keeps from_pandas from storing it as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

Model Inference Before Training

In [None]:
# Baseline generation from the not-yet-fine-tuned model
FastLanguageModel.for_inference(model)

# Slots are (instruction, input, response); the response slot stays empty so the model writes it.
baseline_prompt = prompt_template.format("What is COP26?", "", "")
inputs = tokenizer([baseline_prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
decoded_outputs = tokenizer.batch_decode(outputs)
for output in decoded_outputs:
    # Turn any literal backslash-n sequences into real line breaks before printing
    print(output.replace("\\n", "\n"))

Model Training

In [None]:
# Training configuration
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 sequences x 4 accumulation steps = 8 sequences per optimizer step per device
    warmup_steps=5,
    max_steps=24,  # training stops after 24 optimizer steps
    learning_rate=2e-5,
    fp16=not is_bfloat16_supported(),  # exactly one of fp16 / bf16 ends up enabled
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,  # same value as random_state passed to get_peft_model
    output_dir="outputs",
    # NOTE(review): get_peft_model was already given use_gradient_checkpointing="unsloth";
    # confirm that enabling checkpointing here as well is intended.
    gradient_checkpointing=True
)

In [None]:
# Initialize the trainer (this cell only constructs it; training is started in a later cell)
# Only the training split is passed; test_dataset is not supplied as an evaluation set here.
# NOTE(review): some TRL releases expect dataset_text_field / max_seq_length / packing on an
# SFTConfig instead of as SFTTrainer keyword arguments — confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",  # column produced by format_prompts
    max_seq_length=max_seq_len,
    dataset_num_proc=2,
    packing=False,
    args=training_args
)

In [None]:
# GPU stats: record the device and the memory already reserved before training starts,
# so the stats cell after training can report the difference.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)  # bytes -> GB (1024-based)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
for stats_line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(stats_line)

In [None]:
# Run the fine-tuning and measure its wall-clock duration with time.time().
# trainer_stats is kept because the following stats cell reads its metrics.
start_time = time.time()
trainer_stats = trainer.train()
end_time = time.time()
time_taken = end_time - start_time
print(f"Time taken for this instruction to run is: {time_taken:.2f} seconds")

In [None]:
# Final memory and time stats
# Peak reserved memory after training, and how much of it was added on top of the
# pre-training snapshot (start_gpu_memory), both also expressed as a share of max_memory.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = [
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for report_line in report_lines:
    print(report_line)

Model Inference After Training

In [None]:
FastLanguageModel.for_inference(model)

# Reuse the prompt_template defined in the prompt-formatting cell so the prompt has exactly
# the layout the model was fine-tuned on. A second copy with a newline after the response
# slot would make the prompt end in "### Response:\n\n", which never occurs in the training
# text, and would also overwrite the global template that format_prompts reads.

# Define the instruction, input, and empty response
instruction = "What is COP26?"
input_text = "What is COP26?"  # same text as the instruction
response = ""  # left blank so the model generates the answer

# Format the prompt
formatted_prompt = prompt_template.format(instruction, input_text, response)

# Tokenize the inputs
inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")

# Measure the time taken for the model to generate the output
start_time = time.time()

# Generate the outputs
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

end_time = time.time()
time_taken = end_time - start_time

# Decode the outputs (prompt plus generated continuation)
decoded_outputs = tokenizer.batch_decode(outputs)

# Print the time taken
print(f"Time taken for this instruction to run is : {time_taken:.2f} seconds")

# Print the decoded outputs with proper newlines
for output in decoded_outputs:
    print(output.replace("\\n", "\n"))

Model Evaluation

In [None]:
# Install the packages used for BLEU and ROUGE scoring
# NOTE(review): `evaluate` is installed here but never imported in the visible notebook; the
# metrics are loaded through `datasets.load_metric`, which is imported in the top import cell
# (i.e. before this install runs) — confirm the install order and which API is meant to be used.
!pip install rouge_score
!pip install datasets evaluate

In [None]:
# Ensure nltk data is downloaded
nltk.download('punkt')
# Newer NLTK releases look up the tokenizer data under 'punkt_tab'; on older releases this
# request only reports that the package is unknown and does not raise.
nltk.download('punkt_tab')

dataset = test_dataset

# Evaluate on at most 100 test examples; a fixed range(100) fails when the split is smaller.
limited_dataset = dataset.select(range(min(100, len(dataset))))

# Function to generate predictions with attention mask and pad token id
def generate_predictions(input_text, max_new_tokens=128):
    """Generate the model's continuation for ``input_text`` and return only the new text.

    Args:
        input_text: prompt string; it is truncated to at most 128 tokens.
        max_new_tokens: generation budget counted on top of the prompt, so a prompt that
            fills the 128-token truncation limit still leaves room to generate.

    Returns:
        The decoded continuation with special tokens removed. The prompt tokens are
        sliced off first, so the metrics compare the reference with the generated text only.
    """
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = inputs.to("cuda")
    prompt_length = inputs.input_ids.shape[1]
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns the prompt followed by the new tokens; keep only the latter.
    generated_tokens = outputs[0][prompt_length:]
    predicted_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return predicted_text

# Prepare lists for predictions and references
predictions = []
references = []

# Generate predictions for the dataset
for example in limited_dataset:
    input_text = example['input']
    reference_text = example['response']
    predicted_text = generate_predictions(input_text)
    predictions.append(predicted_text)
    references.append([reference_text])  # BLEU expects a list of references per prediction

# Tokenize predictions and references for BLEU
predictions_tokenized = [nltk.word_tokenize(pred) for pred in predictions]
references_tokenized = [[nltk.word_tokenize(ref) for ref in ref_group] for ref_group in references]

# Load evaluation metrics
rouge_metric = load_metric("rouge")
bleu_metric = load_metric("bleu")

# Calculate BLEU score
bleu_score = bleu_metric.compute(predictions=predictions_tokenized, references=references_tokenized)

# Calculate ROUGE score
# ROUGE is given the raw reference strings. Each example has exactly one reference, so the
# singleton group is unwrapped; calling " ".join on the string itself would insert a space
# between every character and corrupt the reference.
rouge_references = [ref_group[0] for ref_group in references]
rouge_score = rouge_metric.compute(predictions=predictions, references=rouge_references)

# Extract mid F1 scores for ROUGE
rouge1_mid_f1 = rouge_score['rouge1'].mid.fmeasure
rouge2_mid_f1 = rouge_score['rouge2'].mid.fmeasure
rougeL_mid_f1 = rouge_score['rougeL'].mid.fmeasure
rougeLsum_mid_f1 = rouge_score['rougeLsum'].mid.fmeasure

# Calculate the average ROUGE score (unweighted mean of the four F1 values)
average_rouge = (rouge1_mid_f1 + rouge2_mid_f1 + rougeL_mid_f1 + rougeLsum_mid_f1) / 4

# Extract BLEU score
average_bleu = bleu_score['bleu']

print(f"Average BLEU Score: {average_bleu}")
print(f"Average ROUGE Score: {average_rouge}")

In [None]:
# Plotting the scores: one bar per metric, drawn through the explicit figure/axes interface
labels = ['BLEU', 'ROUGE']
scores = [average_bleu, average_rouge]
x = range(len(labels))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x, scores, width=0.4, label='Model Scores', align='center')

ax.set_xlabel('Metrics')
ax.set_ylabel('Scores')
ax.set_title('Llama 3 8b -Average BLEU and ROUGE Scores')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

plt.show()