In [None]:
import os
import shutil

# Folder that is emptied before the notebook starts producing outputs
dir_path = '/kaggle/working/'

# Delete every entry inside the folder (the folder itself stays).
# Real sub-directories are removed recursively; regular files and symlinks
# (including symlinks that point at directories) are simply unlinked.
for entry_name in os.listdir(dir_path):
    file_path = os.path.join(dir_path, entry_name)
    try:
        if os.path.isdir(file_path) and not os.path.islink(file_path):
            shutil.rmtree(file_path)
        elif os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
    except Exception as e:
        # Best effort: report the entry that could not be removed and continue
        print(f'Failed to delete {file_path}. Reason: {e}')



In [None]:
%%capture
# Upgrade the quantisation / fine-tuning libraries; %%capture hides pip's output.
# NOTE(review): versions are not pinned, so a newer release may change the APIs used below — consider pinning.
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl

In [None]:
%%capture
# Upgrade `datasets`, which provides load_dataset used further down (output hidden by %%capture).
%pip install -U datasets

In [None]:
%%capture
# Upgrade experiment tracking (wandb), plotting (matplotlib/scipy) and metric (evaluate) packages.
%pip install -U wandb
%pip install -U matplotlib scipy
%pip install -U evaluate

In [None]:
#%pip install --user --force-reinstall --no-deps numpy==1.23

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, DataCollatorForLanguageModeling
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb
from datasets import load_dataset
from trl import SFTTrainer

import numpy as np
import evaluate

In [None]:
# Read the API tokens from Kaggle's secret store so they are not hard-coded in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")  # used for the Hugging Face CLI login
secret_wandb = user_secrets.get_secret("wandb")  # used for wandb.login

In [None]:
# Log in to the Hugging Face Hub; IPython substitutes the Python variable `secret_hf` into the shell command.
!huggingface-cli login --token $secret_hf

In [None]:
# Monitoring the LLM: authenticate with Weights & Biases and open the run used for this session
wandb.login(key = secret_wandb)
run = wandb.init(
    project='Fine tuning mistral 7B', 
    job_type="training", 
    anonymous="allow",
    name="IT-Era-Run-v2",
    resume="allow",
)

In [None]:
#base_model = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"
#replace the base when >3
#base_model= "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"
#dataset_name = "/kaggle/input/essay-train-v2"
#new_model = "mistral_7b_AES_v2_max-steps"

In [None]:
# Run configuration: base checkpoint identifier, dataset folders, and the name for the saved adapter
base_model="mistralai/Mistral-7B-Instruct-v0.2" 
train_dataset_name="/kaggle/input/essay-train-v3"
test_dataset_name="/kaggle/input/essay-validate"  # loaded as the evaluation set used during training
new_model = "mistral_7b_AES_v2"  # target passed to save_pretrained after training

In [None]:
#Importing the dataset
# Both folders are read with split="train" (presumably the only split each one exposes — confirm).
train_data = load_dataset(train_dataset_name, split="train")
validate_data =load_dataset(test_dataset_name, split="train")

In [None]:
# Display the freshly loaded training dataset object
train_data

In [None]:
# Display the freshly loaded validation dataset object
validate_data

In [None]:
# Load base model(Mistral 7B)
# Quantisation settings: 4-bit NF4 weights, bfloat16 compute dtype, double quantisation disabled
bnb_config = BitsAndBytesConfig(  
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)

# Load the checkpoint with the quantisation config above; device_map="auto" leaves device placement to the library
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,  # NOTE(review): lets code shipped with the checkpoint run — confirm it is needed here
)

model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()  # enabled here in addition to gradient_checkpointing=True in the training arguments

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
# Pad with a token that is NOT the EOS token. DataCollatorForLanguageModeling (used for training
# below) sets the label of every pad-token position to -100, so reusing EOS as the pad token would
# also mask the genuine end-of-sequence token and the model would never be trained to stop.
# Fall back to EOS only when the tokenizer defines no unknown token.
tokenizer.pad_token = tokenizer.unk_token if tokenizer.unk_token is not None else tokenizer.eos_token
tokenizer.add_eos_token = True
# Show the resulting BOS/EOS settings as the cell output
tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
def generate_prompt(data_point):
    """Build the instruction-formatted training text for one dataset row.

    The text has the shape ``<s>[INST]{instruction} [/INST]{output}</s>``; when the row
    carries extra context in ``input`` it is appended to the instruction on a new line.

    :param data_point: dict with "instruction" and "output" entries and an optional
        "input" entry (missing, ``None`` or empty means "no extra context")
    :return: str: the formatted prompt text (not tokenized)
    """
    # NOTE(review): the text already contains the literal <s> / </s> markers — make sure the
    # tokenizer is not configured to add BOS/EOS a second time when this text is tokenized.
    context = data_point.get("input")
    # Samples with additional context info.
    if context:
        request = f"""{data_point["instruction"]}\n{context}"""
    # Without
    else:
        request = data_point["instruction"]
    # Both variants end the same way: no stray space between the answer and </s>
    return f"""<s>[INST]{request} [/INST]{data_point["output"]}</s>"""

#add the "text" column (the formatted prompt of every row) to both datasets
train_text_column = [generate_prompt(data_point) for data_point in train_data]
test_text_column = [generate_prompt(data_point) for data_point in validate_data]

# NOTE: add_column returns a new dataset, so the names are rebound to the extended datasets
train_data = train_data.add_column("text", train_text_column)
validate_data = validate_data.add_column("text", test_text_column)

In [None]:
import matplotlib.pyplot as plt

# Character length of every formatted prompt in the "text" column.
# These are string lengths, not token counts — the text has not been tokenized at this point.
sequences = train_data['text']
lengths = [len(text) for text in sequences]

#print(lengths)

# Plotting the histogram
plt.figure(figsize=(10, 6))
plt.hist(lengths, bins=20, alpha=0.7)
plt.xlabel('Length of text (characters)')
plt.ylabel('Frequency')
plt.title('Distribution of prompt lengths (characters)')
plt.show()

In [None]:
max_seq_length = 3300
col_to_delete = ['input', 'instruction', 'output', 'filename']


def tokenize_text(samples):
    """Tokenize a batch of prompts, truncating/padding each one to ``max_seq_length`` tokens.

    ``add_special_tokens=False`` is passed because the prompt text already contains the literal
    ``<s>`` / ``</s>`` markers; letting the tokenizer add BOS/EOS as well would duplicate them
    in every sequence.

    :param samples: batch dict with a "text" list, as handed over by ``Dataset.map(batched=True)``
    :return: the tokenizer output for the batch
    """
    return tokenizer(
        samples["text"],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
        add_special_tokens=False,
    )


# Same tokenization for both datasets; the raw columns are dropped afterwards
train_data = train_data.map(tokenize_text, batched=True, remove_columns=col_to_delete)
validate_data = validate_data.map(tokenize_text, batched=True, remove_columns=col_to_delete)


# Labels start out as a copy of the input ids
train_data = train_data.add_column("labels", train_data['input_ids'])
validate_data = validate_data.add_column("labels", validate_data ['input_ids'])

In [None]:
# Switch both datasets to the "torch" output format (set in place on the dataset objects)
train_data.set_format("torch")
validate_data.set_format("torch")

In [None]:
import matplotlib.pyplot as plt

# Every row was padded to max_seq_length during tokenization, so len(input_ids) is the same
# constant for all rows. Count the positions flagged in the attention mask instead to get the
# real (non-padding) token length of each sequence.
sequences = train_data['input_ids']
lengths = [int(x['attention_mask'].sum()) for x in train_data]

#print(lengths)

# Plotting the histogram
plt.figure(figsize=(10, 6))
plt.hist(lengths, bins=20, alpha=0.7)
plt.xlabel('Number of non-padding tokens')
plt.ylabel('Frequency')
plt.title('Distribution of tokenized sequence lengths (padding excluded)')
plt.show()

In [None]:
# Display the training dataset again after tokenization and column changes
train_data

In [None]:
# Display the validation dataset again after tokenization and column changes
validate_data

In [None]:
# Save the dataset so that a resumed run can simply reuse it next time
#dataset.save_to_disk('/kaggle/working/my_dataset')

In [None]:
# With more than one visible GPU, report the count and mark the model as model-parallel
if not torch.cuda.device_count() <= 1:
    print(torch.cuda.device_count())
    for flag_name in ("is_parallelizable", "model_parallel"):
        setattr(model, flag_name, True)

In [None]:
#Adding the adapters in the layers

# LoRA settings: rank 64, alpha 16, dropout 0.1, no bias training;
# adapters are attached to the q/k/v/o and gate projection modules listed in target_modules
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)
# Prepare the quantised model for training, then wrap it with the LoRA adapters
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run
training_arguments=TrainingArguments(
    output_dir = "Mistral_AES_v2",
    warmup_steps=1,                      # NOTE(review): presumably has no effect with lr_scheduler_type="constant" — confirm
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    #max_steps=500,
    num_train_epochs=7,                  # Number of passes over the training set
    weight_decay=0.001,
    learning_rate=2.5e-5,                # Want a small lr for finetuning
    fp16=False,                          #might need to set this to true
    bf16=False,
    optim="paged_adamw_32bit",
    logging_steps=50,                    # Report the training loss every 50 steps
    logging_dir="/kaggle/working/logs",  # Directory for storing logs
    save_strategy="epoch",               # Save a checkpoint at the end of every epoch
    #save_steps=287, #287                # Only meaningful with a step-based save strategy
    evaluation_strategy="epoch",         # Evaluate at the end of every epoch
    eval_steps=50,                       # NOTE(review): presumably unused while evaluation_strategy="epoch" — confirm
    do_eval=True,                        # Run evaluation on the eval dataset
    report_to="wandb",                   # Comment this out if you don't want to use Weights & Biases
    run_name="IT_Era_Run_Epoch",         # Name of the W&B run (optional)
    lr_scheduler_type="constant",        # Keep the learning rate fixed for the whole run
    load_best_model_at_end=True,         # Needs matching save/evaluation strategies (both "epoch" here)
    save_total_limit=8,                  # Keep at most 8 checkpoints on disk
    do_predict=True,
)

In [None]:
#metric = evaluate.load("accuracy")

#def compute_metrics(eval_pred):
#    logits, labels = eval_pred
#    predictions = np.argmax(logits, axis=-1)
#    return metric.compute(predictions=predictions, references=labels)

# Metrics are loaded once and reused, instead of being re-loaded on every evaluation pass
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the HF `evaluate` metric called ``name``, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute precision, F1 and accuracy from the Trainer's evaluation output.

    Accepted prediction layouts:

    * raw logits with one more dimension than the labels (arg-maxed here);
    * already arg-maxed token ids;
    * a tuple whose first element is one of the above, e.g. the ``(pred_ids, labels)`` pair
      returned by ``preprocess_logits_for_metrics``.

    For sequence labels (2-D or more) the predictions are shifted by one position, because the
    output at position ``t`` of a causal LM is the prediction for token ``t + 1``. Positions
    labelled ``-100`` are ignored, and precision/F1 use ``average="weighted"`` since token ids
    form a multi-class problem. For 1-D labels the metrics keep their default averaging, so
    plain classification inputs behave as before.

    :param eval_pred: pair ``(predictions, labels)`` as passed by the Trainer
    :return: dict with the keys "precision", "f1-score" and "accuracy"
    """
    predictions, labels = eval_pred
    # Output of preprocess_logits_for_metrics: keep only the predicted ids
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Raw logits carry an extra trailing vocabulary/class axis
    if predictions.ndim == labels.ndim + 1:
        predictions = np.argmax(predictions, axis=-1)

    sequence_labels = labels.ndim >= 2
    if sequence_labels:
        # Align the prediction made at position t with the label at position t + 1
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]

    # Drop ignored positions; boolean indexing also flattens to 1-D
    keep = labels != -100
    predictions = predictions[keep]
    labels = labels[keep]

    averaging = {"average": "weighted"} if sequence_labels else {}
    precision = _get_metric("precision").compute(predictions=predictions, references=labels, **averaging)["precision"]
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels, **averaging)["f1"]
    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]
    # The trainer is expecting a dictionary where the keys are the metrics names and the values are the scores.
    return {"precision": precision, "f1-score": f1, 'accuracy': accuracy}

def preprocess_logits_for_metrics(logits, labels):
    """Reduce the logits to predicted token ids before they are accumulated for evaluation.

    Keeping only the arg-max ids avoids storing the full logits of every evaluation batch.

    :param logits: either the logits tensor itself or a tuple/list whose first element is the
        logits tensor (the case when the model returns additional outputs)
    :param labels: label tensor of the batch, passed through unchanged
    :return: tuple ``(pred_ids, labels)``
    """
    # Only unwrap real containers: indexing a plain tensor with [0] would silently pick the
    # first sample of the batch instead of the logits.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels

In [None]:
# Language-modeling collator with masked-LM disabled (mlm=False), i.e. the causal-LM setting
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

max_seq_length = 3300  # same value that was used when tokenizing the datasets

# Assemble the supervised fine-tuning trainer from the model, datasets and settings defined above
trainer = SFTTrainer(
  model=model,
  peft_config=peft_config,
  max_seq_length=max_seq_length,
  tokenizer=tokenizer,
  #packing=False,
  args=training_arguments,
  dataset_text_field="text",
  train_dataset=train_data,
  eval_dataset=validate_data,
  data_collator=collator,
  # Metric hooks are currently disabled:
  #compute_metrics=compute_metrics,
  #preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

In [None]:
#trainer.train(resume_from_checkpoint="/kaggle/working/Mistral_AES_v2/checkpoint-574") 

In [None]:
# Start fine-tuning from scratch (no checkpoint is passed, so nothing is resumed)
trainer.train() 

In [None]:
# Finish the Weights & Biases run that was opened with wandb.init
wandb.finish()

In [None]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
wandb.finish()  # NOTE(review): wandb.finish() is also called in the previous cell — presumably redundant here
model.config.use_cache = True  # re-enable the cache that was switched off for training
model.eval()  # switch the model to evaluation mode

In [None]:
# Read the best checkpoint recorded in the trainer state and display it
best_model_checkpoint = trainer.state.best_model_checkpoint
best_model_checkpoint 

In [None]:
#try:
#    trainer.model.push_to_hub(new_model, use_temp_dir=False)
#except:
#    print("An exception occurred")

In [None]:
#!1eval_results = trainer.evaluate()

# Print the accuracy
#print(eval_results)