### Using stratified data

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    LoraConfig,
)
from datasets import load_dataset, Dataset, DatasetDict
import evaluate
import torch
import pandas as pd 
import numpy as np
import random 
from random import seed
import time 


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin c:\Python310\lib\site-packages\bitsandbytes\libbitsandbytes_cpu.so
'NoneType' object has no attribute 'cadam32bit_grad_fp32'
CUDA SETUP: Loading binary c:\Python310\lib\site-packages\bitsandbytes\libbitsandbytes_cpu.so...
argument of type 'WindowsPath' is not iterable


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [2]:
model_name_or_path = "alger-ia/dziribert_sentiment"
task = "mrpc"
num_epochs = 20
lr = 1e-3
batch_size = 32

In [3]:
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [8]:
# Convert pandas DataFrame to Hugging Face dataset

df_train = pd.read_csv('dataset_stratified_train.csv') 
df_val = pd.read_csv('dataset_stratified_val.csv')

train_dataset = Dataset.from_pandas(df_train)
validation_dataset = Dataset.from_pandas(df_val)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset
})

In [9]:
metric = evaluate.load("glue", task) 

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return metric.compute(predictions=predictions, references=labels) 

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[:, 1]  # Get the probabilities of the positive class
    threshold = 0.5  # Define your threshold here
    predictions = (predictions > threshold).astype(int)  # Apply threshold
    return metric.compute(predictions=predictions, references=labels)


In [10]:
def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    outputs = tokenizer(examples["text"], truncation=True, max_length=None)
    return outputs

In [11]:
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

Map:   0%|          | 0/8341 [00:00<?, ? examples/s]

Map:   0%|          | 0/2780 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets["train"]

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8341
})

In [13]:
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

Adapting Query and Value gives the best performance, according to the paper (chrome-extension://efaidnbmnnnibpcajpcglclefindmkaj/https://arxiv.org/pdf/2106.09685.pdf)

In [15]:
#peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=20, encoder_hidden_size=128)
peft_config = LoraConfig(
    task_type="SEQ_CLS", 
    target_modules=["query", "value"],
    r=16, 
    lora_alpha=32, 
    lora_dropout=0.35,
    bias="none",
    )

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 594438 || all params: 125035782 || trainable%: 0.47541430980133353


In [17]:
output_dir = f'./dziribert-peft-lora-v3-tuning-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [19]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjcolanotoro[0m ([33mjcolano[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/2610 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.4299, 'learning_rate': 0.0008084291187739465, 'epoch': 0.96}


  0%|          | 0/87 [00:00<?, ?it/s]

{'eval_loss': 0.3707495927810669, 'eval_accuracy': 0.8172661870503597, 'eval_f1': 0.8128224023581428, 'eval_runtime': 139.8814, 'eval_samples_per_second': 19.874, 'eval_steps_per_second': 0.622, 'epoch': 1.0}
{'loss': 0.3273, 'learning_rate': 0.0006168582375478927, 'epoch': 1.92}


  0%|          | 0/87 [00:00<?, ?it/s]

{'eval_loss': 0.3924570083618164, 'eval_accuracy': 0.8359712230215828, 'eval_f1': 0.828828828828829, 'eval_runtime': 140.7992, 'eval_samples_per_second': 19.744, 'eval_steps_per_second': 0.618, 'epoch': 2.0}
{'loss': 0.25, 'learning_rate': 0.00042528735632183906, 'epoch': 2.87}


  0%|          | 0/87 [00:00<?, ?it/s]

{'eval_loss': 0.3767654597759247, 'eval_accuracy': 0.8089928057553957, 'eval_f1': 0.8100178890876565, 'eval_runtime': 139.3729, 'eval_samples_per_second': 19.946, 'eval_steps_per_second': 0.624, 'epoch': 3.0}
{'loss': 0.2067, 'learning_rate': 0.00023371647509578544, 'epoch': 3.83}


  0%|          | 0/87 [00:00<?, ?it/s]

{'eval_loss': 0.4279506802558899, 'eval_accuracy': 0.8237410071942446, 'eval_f1': 0.8191881918819188, 'eval_runtime': 147.8405, 'eval_samples_per_second': 18.804, 'eval_steps_per_second': 0.588, 'epoch': 4.0}
{'loss': 0.1696, 'learning_rate': 4.21455938697318e-05, 'epoch': 4.79}


  0%|          | 0/87 [00:00<?, ?it/s]

{'eval_loss': 0.5083689093589783, 'eval_accuracy': 0.839568345323741, 'eval_f1': 0.8283294842186297, 'eval_runtime': 141.4707, 'eval_samples_per_second': 19.651, 'eval_steps_per_second': 0.615, 'epoch': 5.0}
{'train_runtime': 4354.0169, 'train_samples_per_second': 9.579, 'train_steps_per_second': 0.599, 'train_loss': 0.27082695285022484, 'epoch': 5.0}


TrainOutput(global_step=2610, training_loss=0.27082695285022484, metrics={'train_runtime': 4354.0169, 'train_samples_per_second': 9.579, 'train_steps_per_second': 0.599, 'train_loss': 0.27082695285022484, 'epoch': 5.0})

In [20]:
model_path="./dziribert_peft_lora_finetune_v5"
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./dziribert_peft_lora_finetune_v5\\tokenizer_config.json',
 './dziribert_peft_lora_finetune_v5\\special_tokens_map.json',
 './dziribert_peft_lora_finetune_v5\\vocab.txt',
 './dziribert_peft_lora_finetune_v5\\added_tokens.json',
 './dziribert_peft_lora_finetune_v5\\tokenizer.json')

### Inference

#### Load the model

In [21]:

# Specify the path where the model and tokenizer were saved
model_path = "./dziribert_peft_lora_finetune_v5"


In [22]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = model_path
loaded_config = PeftConfig.from_pretrained(peft_model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(loaded_config.base_model_name_or_path)

loaded_tokenizer = AutoTokenizer.from_pretrained(loaded_config.base_model_name_or_path)
loaded_model = PeftModel.from_pretrained(inference_model, peft_model_id)

### Test with the data3 dataset, the last one annotated by Dihia

In [32]:
# # Convert the Excel file to CSV
# data_xlsx = pd.read_excel('./datasets_other/data3.xlsx')
# data_xlsx.to_csv('data_test.csv', index=False)

# # Load the new CSV file into a DataFrame
# df = pd.read_csv('data_test.csv')

# # Add a new column 'label2' with default value as None
# df['label2'] = None

In [23]:
#df = pd.read_csv('./datasets_annotated/dataset_16k_autolabel.csv')
#df = pd.read_csv('data4.csv')
df = pd.read_csv("./dataset_stratified_test.csv")

df['label2'] = None

In [24]:
from tqdm import tqdm

# Iterate over the DataFrame and update the 'label' column based on the classifier's result
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    try:
        inputs = loaded_tokenizer(row['text'], truncation=True, padding="longest", return_tensors="pt")

        with torch.no_grad():
            outputs = loaded_model(**inputs).logits
            #print(outputs)

        paraphrased_text = torch.softmax(outputs, dim=1).tolist()[0]

        new_label = paraphrased_text.index(max(paraphrased_text))

        df.at[index, 'label2'] = new_label

        # Add the probability of the positive class to your DataFrame
        df.at[index, 'probability_positive'] = paraphrased_text[1]  # Assuming that the positive class is labeled as 1

    except Exception as e:
        print(e)

    # if index >= 2500:
    #     break 

100%|██████████| 2781/2781 [01:26<00:00, 32.00it/s]


In [11]:
df.to_csv('./datasets_annotated/dataset_16k_autolabel2.csv')

In [25]:
import pandas as pd

# Assuming df is your DataFrame and it has columns 'label' and 'label2'
matches = df[df['label'] == df['label2']]

# Count matches for each value
value_0_matches = sum(matches['label'] == 0)
value_1_matches = sum(matches['label'] == 1)

# Count total 0's and 1's in 'label'
total_0 = sum(df['label'] == 0)
total_1 = sum(df['label'] == 1)

print(f"Value 0: {value_0_matches} matches out of {total_0}")
print(f"Value 1: {value_1_matches} matches out of {total_1}")



Value 0: 1472 matches out of 1560
Value 1: 897 matches out of 1221


In [26]:
from sklearn import metrics
import numpy as np

# Make sure your labels are in numerical format
df['label'] = df['label'].astype(np.int64)
df['label2'] = df['label2'].astype(np.int64)

# Define true and predicted response values
y_true = df['label']
y_pred = df['label2']

# Compute metrics
accuracy = metrics.accuracy_score(y_true, y_pred)
precision = metrics.precision_score(y_true, y_pred)
recall = metrics.recall_score(y_true, y_pred)
f1_score = metrics.f1_score(y_true, y_pred)
#auc_roc = metrics.roc_auc_score(y_true, y_pred)

# Compute confusion matrix
confusion_matrix = metrics.confusion_matrix(y_true, y_pred)

# Print metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1_score}')
#print(f'AUC-ROC: {auc_roc}')
print('Confusion Matrix:')
print(confusion_matrix)


Accuracy: 0.8518518518518519
Precision: 0.9106598984771573
Recall: 0.7346437346437347
F1 Score: 0.8132366273798731
Confusion Matrix:
[[1472   88]
 [ 324  897]]


In [27]:
from sklearn.metrics import roc_auc_score

y_true = df['label']
y_score = df['probability_positive']  # use the probabilities of the positive class

auc_roc = roc_auc_score(y_true, y_score)

print(f'AUC-ROC: {auc_roc}')


AUC-ROC: 0.9415648165648165


In [38]:
# Select rows where 'label' is 0 and 'label2' is 1
selected_rows = df[(df['label'] == 0) & (df['label2'] == 1)]

# Save selected rows to a csv file
selected_rows.to_csv('data4_false_positives.csv', index=False)

In [39]:
len(selected_rows)

148