In [None]:
import os
# Process-wide settings, applied ahead of the torch / transformers imports in the
# cells below: restrict CUDA to device 0 and set TOKENIZERS_PARALLELISM to "false".
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
# Record the installed PyTorch version in the notebook output.
print("pytorch version {}".format(torch.__version__))

In [None]:
# Prefer the first CUDA device; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"working on {device}")

In [None]:
# Switch off the memory-efficient and the flash scaled-dot-product-attention
# kernels for this process.
# NOTE(review): presumably a workaround for a kernel/GPU incompatibility —
# confirm it is still required.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [None]:
# Input CSV with the labelled sentences.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
filename = "/scratch/lf93/iw/df_all_group3.csv"

In [None]:
# Load the whole CSV file into a DataFrame.
df = pd.read_csv(filename)

In [None]:
# Rename the columns positionally; pandas raises a length-mismatch error if the
# CSV does not have exactly four columns.
df.columns = ["id", "label", "text", "group"]

In [None]:
# Display the frame for a quick visual sanity check of the renamed columns.
df

In [None]:
# Per-label (stratified) split: 60% training, 20% testing, 20% evaluation
train_size = 0.6
test_size = 0.2

# Per-label partitions, collected here and concatenated after the loop
X_train_list = []
X_test_list = []
X_eval_list = []

# Split each label separately so every class is divided in the same proportions
for label in df['label'].unique():
    label_data = df[df.label == label]
    train, temp = train_test_split(label_data, train_size=train_size, random_state=42)
    # Halve the remaining 40% into 20% test and 20% evaluation.
    # The second part is named `eval_split`, not `eval`: this loop runs at notebook
    # scope, so binding `eval` would shadow the builtin for every later cell.
    test, eval_split = train_test_split(temp, test_size=0.5, random_state=42)
    
    X_train_list.append(train)
    X_test_list.append(test)
    X_eval_list.append(eval_split)

# Concatenate the per-label parts; only the training set is shuffled (fixed seed)
X_train = pd.concat(X_train_list).sample(frac=1, random_state=10).reset_index(drop=True)
X_test = pd.concat(X_test_list).reset_index(drop=True)
X_eval = pd.concat(X_eval_list).reset_index(drop=True)

def generate_prompt(data_point, include_label=False):
    """Build the classification instruction for one Javanese sentence.

    Args:
        data_point: Mapping (e.g. a DataFrame row) with a ``"text"`` entry and,
            when ``include_label`` is true, a ``"label"`` entry.
        include_label: When true, ``" = <label>"`` is appended to the prompt.

    Returns:
        str: The instruction text followed by the bracketed sentence, optionally
        followed by the label suffix.
    """
    # Continuation lines keep their eight-space indent and each instruction
    # line keeps its trailing space; both are part of the prompt text.
    prompt_lines = (
        "Analyze the Javanese sentence enclosed in square brackets. Determine if it is ngoko, ngoko alus, krama, or krama alus, ",
        "        and return the answer as the corresponding text label: 0 (ngoko), 1 (ngoko alus), 2 (krama), or 3 (krama alus). ",
        "        Provide only the integer label without any additional explanation. ",
        f"        [{data_point['text']}]",
    )
    instruction = "\n".join(prompt_lines)
    if not include_label:
        return instruction
    return instruction + f" = {data_point['label']}"


# Generate the prompts for training and evaluation
#X_train = pd.DataFrame(X_train.apply(lambda x: generate_prompt(x, include_label=True), axis=1), columns=["text"])
#X_eval = pd.DataFrame(X_eval.apply(lambda x: generate_prompt(x, include_label=True), axis=1), columns=["text"])

# Extract labels for the test set and generate test prompts
#y_true = X_test.label
#X_test = pd.DataFrame(X_test.apply(lambda x: generate_prompt(x, include_label=False), axis=1), columns=["text"])

# Convert DataFrames to Hugging Face Datasets
#train_data = Dataset.from_pandas(X_train)
#eval_data = Dataset.from_pandas(X_eval)

In [None]:
# Write the held-out test split to the working directory, without the index column.
X_test.to_csv("df_test4.csv", index=False)

In [None]:
# Fold the evaluation split into the training split: the fine-tuning dataset is
# built from this combined frame, so only the test split stays held out.
df_combined = pd.concat([X_train, X_eval], ignore_index=True)

In [None]:
# Wrap the combined frame as a Hugging Face Dataset for the trainer.
train_data = Dataset.from_pandas(df_combined)

In [None]:
# Display the dataset's repr as a quick check of what was created.
train_data

In [None]:
# Print one complete test sentence, selected by position
row_index = 1  # Change this to the index of the row you want to inspect
print(X_test["text"].iloc[row_index])

In [None]:
# Spot-check one training sentence; position 1000 must exist, otherwise .iloc
# raises IndexError.
print(df_combined.iloc[1000]['text'])

In [None]:
# Base checkpoint to fine-tune.
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
# Credentials must never live in the notebook: the Hugging Face access token is
# read from the HF_TOKEN environment variable (None when unset). Any token that
# was previously committed here should be revoked.
access_token = os.environ.get("HF_TOKEN")
# Directory passed as cache_dir when loading the model and tokenizer.
cache_dir = "/scratch/lf93/iwand/.cache/huggingface/transformers"

In [None]:
# Half precision (torch.float16) is used both as the 4-bit compute dtype and as
# the torch_dtype passed when loading the model.
compute_dtype = getattr(torch, "float16")

# 4-bit NF4 quantisation without double quantisation; compute runs in compute_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base model with the quantisation config and place it on `device`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
    token=access_token, cache_dir=cache_dir
)

# Config tweaks applied to the loaded model before training.
# NOTE(review): use_cache=False may also affect the generate() calls made by
# predict() — confirm it is wanted for inference.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Maximum sequence length; the same value is handed to SFTTrainer further down.
max_seq_length = 512 #2048
# NOTE(review): max_seq_length is forwarded to the tokenizer as an extra keyword
# here — verify that it has any effect on the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length, token=access_token, cache_dir=cache_dir)
# Use the EOS token id for padding.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
import re

# Build a text-generation pipeline around the already-loaded model and tokenizer.
# This rebinds the name `pipeline`, shadowing the `pipeline` factory imported from
# transformers in the import cell; afterwards the factory is only reachable as
# `transformers.pipeline`.
# NOTE(review): model_kwargs asks for bfloat16 while the model was loaded in
# float16; presumably these kwargs (and device_map) are ignored for a
# pre-instantiated model — confirm against the installed transformers version.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer = tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map=device,
)

def predict(test, pipeline, unparsed_label=-1):
    """Classify every sentence in ``test["text"]`` with the model held by ``pipeline``.

    Each text is sent as the user turn of a chat (after a fixed system message),
    a reply is sampled, and the first standalone digit 0-3 found in the reply is
    taken as the predicted label. Prompt and reply are printed for inspection.

    Args:
        test: DataFrame (or mapping) whose ``"text"`` entry iterates over the
            texts to classify.
        pipeline: Text-generation pipeline; its ``model`` and ``tokenizer``
            attributes are used for generation.
        unparsed_label: Value recorded when no label 0-3 can be found in the
            reply. Defaults to ``-1`` so the result stays a homogeneous list of
            integers that scikit-learn metrics accept; pass ``None`` to get the
            previous behaviour.

    Returns:
        list: One entry per input text — an ``int`` in 0-3, or ``unparsed_label``.
    """
    # Use the objects carried by the pipeline argument rather than the notebook
    # globals, so the function evaluates whatever pipeline it is handed.
    tokenizer = pipeline.tokenizer
    model = pipeline.model

    # Generation stops at the regular EOS token or at the `<|eot_id|>` token.
    # Built once: the ids do not change between prompts.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    label_pattern = re.compile(r'\b[0-3]\b')

    y_pred = []
    for prompt in tqdm(test["text"]):
        messages = [
            {"role": "system", "content": "You are an AI assistant specialized in Javanese language classification."},
            {"role": "user", "content": prompt},
        ]
        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Sampling is enabled, so repeated runs can yield different replies.
        outputs = model.generate(
            input_ids,
            max_new_tokens=256,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )

        # Keep only the newly generated tokens (everything after the prompt).
        response = outputs[0][input_ids.shape[-1]:]
        decoded_response = tokenizer.decode(response, skip_special_tokens=True).strip()

        # Print the response for verification
        print(f"prompt: {prompt}")
        print(f"Model Response: {decoded_response}")

        # First standalone digit 0-3 in the reply is the predicted label.
        match = label_pattern.search(decoded_response)
        y_pred.append(int(match.group()) if match else unparsed_label)

    return y_pred
    

In [None]:
# Baseline: label the test split with the model before any fine-tuning is run.
y_pred = predict(X_test, pipeline)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Ground-truth labels of the test split. The only earlier assignment of `y_true`
# is commented out, so without this line the metrics below fail with a NameError
# on a fresh kernel.
y_true = X_test["label"]

# Evaluate: overall accuracy plus support-weighted precision / recall / F1
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_true, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
import gc
# Run the Python garbage collector, then ask PyTorch to release its cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Shell out to nvidia-smi to show the current GPU state.
!nvidia-smi

In [None]:
from sklearn.metrics import classification_report
# Per-class report for the baseline (pre-fine-tuning) predictions.
print(classification_report(y_true, y_pred))

In [None]:
# Store each test sentence next to its true and predicted label
evaluation = pd.DataFrame(dict(text=X_test["text"], y_true=y_true, y_pred=y_pred))
evaluation.to_csv("test_predictions.csv", index=False)

In [None]:
from sklearn.metrics import (accuracy_score, 
                             recall_score, 
                             precision_score, 
                             f1_score)

from transformers import EarlyStoppingCallback, IntervalStrategy

def compute_metrics(p, average="weighted"):
    """Compute accuracy, precision, recall and F1 from a ``(predictions, labels)`` pair.

    Args:
        p: Pair ``(pred, labels)``. ``pred`` holds per-class scores and is reduced
            with ``argmax`` over axis 1; ``labels`` holds the true class ids.
            NOTE(review): this assumes ``pred`` is laid out as (samples, classes) —
            confirm against what the trainer actually passes for a causal LM.
        average: Averaging mode forwarded to the precision / recall / F1 scorers.
            The scikit-learn default (``"binary"``) raises for targets with more
            than two classes, and this task has four labels; ``"weighted"``
            matches the evaluation cells of this notebook.

    Returns:
        dict: Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average=average)
    precision = precision_score(y_true=labels, y_pred=pred, average=average)
    f1 = f1_score(y_true=labels, y_pred=pred, average=average)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# Directory used for the training output and for the saved model and tokenizer.
output_dir="/scratch/lf93/iw/trained_weights"

# LoRA adapter settings: rank 64, alpha 16, no dropout, biases left untouched,
# applied to the attention and MLP projection modules listed in target_modules.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
)

# Hyper-parameters for the fine-tuning run. With a per-device batch of 1 and
# 8 accumulation steps, each optimizer update covers 8 examples per device.
training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory where training outputs are written
    num_train_epochs=5,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # micro-batches accumulated before each optimizer update
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,                             # NOTE(review): confirm how the installed transformers version treats 0 here
    logging_steps=25,                         # log every 25 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,                                # fp16 mixed precision, matching the float16 compute dtype
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,                             # no step limit; num_train_epochs decides the length
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=False,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    # Evaluation during training is disabled (no eval dataset is passed to the trainer):
    #evaluation_strategy="steps",              # evaluate every eval_steps
    #load_best_model_at_end = True,
    #eval_steps = 25,
    #metric_for_best_model = 'accuracy',
)

# Supervised fine-tuning trainer: receives the quantised model together with the
# LoRA config and trains on the "text" column of train_data, one example per
# sequence (packing disabled), capped at max_seq_length.
# NOTE(review): the cells that wrapped each sentence in a prompt ending with its
# label are commented out, so train_data["text"] is whatever the CSV holds —
# confirm that this text already contains the target label.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    #eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    },
    # Metric computation and early stopping are disabled together with evaluation:
    #compute_metrics=compute_metrics,
    #callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the trained model and the tokenizer into output_dir.
# NOTE(review): with a peft_config this presumably stores the LoRA adapter
# weights rather than a merged full model — confirm before loading elsewhere.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Re-run inference on the test split after fine-tuning.
# `y_true` is (re)bound here so this cell does not depend on the baseline
# evaluation cells having been executed.
y_true = X_test["label"]
y_pred = predict(X_test, pipeline)
# The metrics are computed in the next cell. The former `evaluate(y_true, y_pred)`
# call was removed because no `evaluate` function is defined in this notebook,
# so it raised a NameError.

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Score the predictions: overall accuracy plus support-weighted precision / recall / F1
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_true, y_pred)

# One print call, one item per line
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

In [None]:
# Per-class report for the predictions made after fine-tuning.
print(classification_report(y_true, y_pred))

In [None]:
# Store each test sentence next to its true label and its post-training prediction
evaluation = pd.DataFrame(dict(text=X_test["text"], y_true=y_true, y_pred=y_pred))
evaluation.to_csv("test_predictions3.csv", index=False)

In [None]:
import pandas as pd


# Count how many rows of the training split carry each value of the 'label' column.
label_counts = X_train['label'].value_counts()
print(label_counts)
