In [22]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd

In [24]:
# Load the labeled conversations from a CSV located in the working directory.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which looks like
# a row index that was written into the CSV — confirm. Only 'text' and 'label'
# are selected from this frame further down.
dataset = pd.read_csv('dataset_labeled.csv')
# Bare last expression so the notebook renders the (truncated) frame.
dataset

Unnamed: 0,conversation_id,text,sentiment,label
0,000049c4530615e68b898b3e0306630d,53a66119381d887197c67ccfe3ef6670: hi 1c8edb8bf...,Positive,0
1,0000604306a283600b730276a2039471,a9b326df4e6da61c5b6f5e1058be83a2: b8810fee2f4a...,Negative,0
2,000133dbd971ffb8f723fc61ba977ca0,8f1d151f40bd785177dec682f5407c4e: hey 3b8f9119...,Positive,0
3,0001347c00d419eb537c0692e6e58eba,e2bd430b29412d9267886e187ba28075: say asl and ...,Positive,0
4,000161e288cf8dfc468fe86d6d4af2d4,b035925d950f4a032b68dd0844ff8413: h b035925d95...,Negative,0
...,...,...,...,...
222050,fffe4d1b08952afb8627a9b594f913c7,e5a96ed432ed5041be76d3fb1784fb95: do you want ...,Negative,0
222051,ffff2d0e314610b1df596482d806ada9,eccc65c89e622a83cfec5827c16391de: haiiiiiiiii....,Negative,0
222052,ffff38287b6013960b9e96e08f85526a,a9343d850a27be6ed37f176bc2ce589b: hi a9343d850...,Positive,0
222053,ffff74f40b58182a2521235b9db901d4,7bc167d759d9c56d43d1d46575433d35: hey 169b2106...,Positive,0


In [26]:
# Load the tokenizer and a 2-class sequence-classification model from one checkpoint
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Single name for the checkpoint so the tokenizer and the model cannot drift apart.
MODEL_CHECKPOINT = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# The classification head ('score.weight') is not part of the checkpoint and is
# newly initialized (see the warning printed below), so the model must be trained
# before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=2,
)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from datasets import Dataset

# Convert the pandas frame into a Hugging Face Dataset, keeping only the two
# columns selected here: the raw text and its integer label.
# NOTE(review): despite the name, `df` is a datasets.Dataset, not a DataFrame.
df = Dataset.from_pandas(dataset[['text', 'label']])

In [28]:
# Batching needs a padding token. If the tokenizer has none, register a dedicated
# '[PAD]' token (rather than reusing EOS as padding) and grow the model's embedding
# matrix so the new token id has a row.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Record the padding id on the model config in the same cell that may create the
# token, so the model never depends on a later cell having been run. Per the
# transformers docs, the sequence-classification head uses config.pad_token_id to
# find the last non-padding token of each row.
model.config.pad_token_id = tokenizer.pad_token_id

In [29]:
# Step 4: Define the tokenization function
def tokenize_function(examples, max_length=64):
    """Tokenize a batch of rows, truncating each text to ``max_length`` tokens.

    Padding is deliberately not applied here. The Trainer in this notebook is
    given a ``DataCollatorWithPadding``, which pads every mini-batch to the
    length of its longest member. Padding each row to ``max_length`` up front
    would store and process padding tokens that the collator makes unnecessary.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; ``examples['text']``
            is passed to the tokenizer.
        max_length: Maximum number of tokens kept per text (default 64).

    Returns:
        The tokenizer output for the batch (token ids and attention masks of
        varying length, at most ``max_length`` each).
    """
    return tokenizer(examples['text'], truncation=True, max_length=max_length)

# Step 5: Apply tokenization to all rows
tokenized_datasets = df.map(tokenize_function, batched=True, batch_size = 16)


Map:   0%|          | 0/222055 [00:00<?, ? examples/s]

In [32]:
# Keep the model config's padding id in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.pad_token_id

In [33]:
# Split the dataset into train and test sets (70/30 split)
train_test_split = tokenized_datasets.train_test_split(test_size=0.3)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [34]:
# Display the training split's summary (feature names and row count).
train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 155438
})

In [35]:
import torchvision.transforms

In [None]:
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
import evaluate
import numpy as np

# Load the four evaluation metrics in one pass; the targets on the left are
# unpacked in the same order as the metric names on the right.
accuracy_metric, precision_metric, recall_metric, f1_metric = [
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
]

# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Score a batch of predictions with accuracy, precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Precision, recall and F1 are computed with
        ``average="weighted"``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    common_kwargs = {"predictions": predicted_classes, "references": labels}

    # Accuracy takes no averaging mode; the remaining metrics share one.
    scores = {"accuracy": accuracy_metric.compute(**common_kwargs)["accuracy"]}
    averaged_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for key, metric in averaged_metrics:
        scores[key] = metric.compute(average="weighted", **common_kwargs)[key]
    return scores

# Fine-tuning hyperparameters, grouped by concern.
training_args = TrainingArguments(
    output_dir="output",
    # Optimisation schedule
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batching: 2 sequences per device step x 8 accumulation steps
    # = 16 sequences per optimizer update on each device.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=2,
    # Evaluation, logging and checkpointing
    eval_strategy="epoch",
    logging_steps=2000,
    save_strategy="no",
)

# Collator that pads each batch using the tokenizer's padding token
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Kick off fine-tuning; the bare call is the cell's last expression, so the
# notebook displays whatever train() returns.
trainer.train()


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss


In [None]:
# Run a final evaluation with the trainer (evaluate() is called with no
# arguments here) and print the returned metrics.
results = trainer.evaluate()
print("Evaluation results:", results)