## Load data

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import xml.etree.ElementTree as ET
import re
import os

In [3]:
# load dataset
dataset = pd.read_csv('dataset_labeled.csv')
dataset

Unnamed: 0,conversation_id,text,sentiment,label
0,000049c4530615e68b898b3e0306630d,53a66119381d887197c67ccfe3ef6670: hi 1c8edb8bf...,Positive,0
1,0000604306a283600b730276a2039471,a9b326df4e6da61c5b6f5e1058be83a2: b8810fee2f4a...,Negative,0
2,000133dbd971ffb8f723fc61ba977ca0,8f1d151f40bd785177dec682f5407c4e: hey 3b8f9119...,Positive,0
3,0001347c00d419eb537c0692e6e58eba,e2bd430b29412d9267886e187ba28075: say asl and ...,Positive,0
4,000161e288cf8dfc468fe86d6d4af2d4,b035925d950f4a032b68dd0844ff8413: h b035925d95...,Negative,0
...,...,...,...,...
222050,fffe4d1b08952afb8627a9b594f913c7,e5a96ed432ed5041be76d3fb1784fb95: do you want ...,Negative,0
222051,ffff2d0e314610b1df596482d806ada9,eccc65c89e622a83cfec5827c16391de: haiiiiiiiii....,Negative,0
222052,ffff38287b6013960b9e96e08f85526a,a9343d850a27be6ed37f176bc2ce589b: hi a9343d850...,Positive,0
222053,ffff74f40b58182a2521235b9db901d4,7bc167d759d9c56d43d1d46575433d35: hey 169b2106...,Positive,0


In [4]:
# clean up the text
def clean_text(text):
    if text is None:
        return ""
    # Remove HTML tags and encoded characters
    text = re.sub(r'&amp;lt;', '<', text)
    text = re.sub(r'&amp;gt;', '>', text)
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip().lower()

In [5]:
# parse the xml file
def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    data = []

    # Iterate through each conversation
    for conversation in root.findall('conversation'):
        conversation_id = conversation.get('id')
        
        # Iterate through each message in the conversation
        for message in conversation.findall('message'):
            line = message.get('line')
            author = message.find('author').text
            time = message.find('time').text
            text = message.find('text').text

            # Clean the text
            cleaned_text = clean_text(text)
            
            # Store the data in a dictionary
            data.append({
                'conversation_id': conversation_id,
                'line': line,
                'author': author,
                'time': time,
                'text': cleaned_text
            })
    
    # Convert to a DataFrame for easier manipulation
    df = pd.DataFrame(data)
    return df

In [6]:
# load the train file
xml_file = 'pan12-training/pan12-sexual-predator-identification-training-corpus-2012-05-01/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml'
train_df = parse_xml(xml_file)

In [7]:
# load the test file
xml_file = 'pan12-test/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml'
test_df = parse_xml(xml_file)

In [8]:
# make ids of train and test to lists
id_list_train = train_df['conversation_id'].tolist()
id_list_test = test_df['conversation_id'].tolist()

In [9]:
# split dataset into original train and test datasets
# train
df_train = dataset[dataset['conversation_id'].isin(id_list_train)]
df_test = dataset[dataset['conversation_id'].isin(id_list_test)]

In [10]:
df_train

Unnamed: 0,conversation_id,text,sentiment,label
1,0000604306a283600b730276a2039471,a9b326df4e6da61c5b6f5e1058be83a2: b8810fee2f4a...,Negative,0
3,0001347c00d419eb537c0692e6e58eba,e2bd430b29412d9267886e187ba28075: say asl and ...,Positive,0
6,000197b21283dc47810760e499d1f8ec,487862cd4ec27d841e2d2e80e8d91955: joint 5c7c53...,Negative,0
8,0002ee38ac5e78e7edbc4d4a556ec4b7,8150320816528784d7dfe286d781de4c: hey :) male ...,Negative,0
15,000483300677468215a9e3b38728209b,69b0d3dfe919a6b860a9fac82de52a7e: if you have ...,Negative,0
...,...,...,...,...
222047,fffdb82e3d5078c7828024ac4855bffd,83dfa3e8b377dcd111e121b9531b089a: hi. ^_^ df3a...,Positive,0
222048,fffdd9142e809a7f634fbcc13063146d,efe5f457e24831849d715590843593d8: hey asl? 5d4...,Negative,0
222049,fffde018f39dafd4c8ef4ebaaadbec97,0a39f78bcb297ab0ebe8a29c28bfed89: bugmail: [bu...,Negative,0
222052,ffff38287b6013960b9e96e08f85526a,a9343d850a27be6ed37f176bc2ce589b: hi a9343d850...,Positive,0


In [11]:
df_test

Unnamed: 0,conversation_id,text,sentiment,label
0,000049c4530615e68b898b3e0306630d,53a66119381d887197c67ccfe3ef6670: hi 1c8edb8bf...,Positive,0
2,000133dbd971ffb8f723fc61ba977ca0,8f1d151f40bd785177dec682f5407c4e: hey 3b8f9119...,Positive,0
4,000161e288cf8dfc468fe86d6d4af2d4,b035925d950f4a032b68dd0844ff8413: h b035925d95...,Negative,0
5,00018fac56b9cbb5e7c6f2024c92479d,b5ec8d28197edae90224d6853e64d332: hey 96b591d0...,Positive,0
7,0002de15312dc33d78b6e9e4b5f61f1f,a1a8f84c419e34a1a72625e2ef245516: hi a1a8f84c4...,Negative,0
...,...,...,...,...
222043,fffc6e5e47b5d6b59d446d948d875497,0a39f78bcb297ab0ebe8a29c28bfed89: bugmail: [bu...,Negative,0
222046,fffd9ce69d28d71b407c6ae24c3973e6,"72a4930439de82a74703e01923c4d974: hey, why can...",Negative,0
222050,fffe4d1b08952afb8627a9b594f913c7,e5a96ed432ed5041be76d3fb1784fb95: do you want ...,Negative,0
222051,ffff2d0e314610b1df596482d806ada9,eccc65c89e622a83cfec5827c16391de: haiiiiiiiii....,Negative,0


In [12]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-3.2-1B", num_labels=2)

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from datasets import Dataset

train_df= Dataset.from_pandas(df_train[['text', 'label']])
test_df= Dataset.from_pandas(df_train[['text', 'label']])

In [14]:
# Define and add the padding token if it's not already defined
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [15]:
# Step 4: Define the tokenization function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding = 'max_length', truncation=True, max_length = 64)

# Step 5: Apply tokenization to all rows
tokenized_dataset_train = train_df.map(tokenize_function, batched=True, batch_size = 16)
tokenized_dataset_test = test_df.map(tokenize_function, batched=True, batch_size = 16)

Map:   0%|          | 0/66927 [00:00<?, ? examples/s]

Map:   0%|          | 0/66927 [00:00<?, ? examples/s]

In [16]:
model.config.pad_token_id = tokenizer.pad_token_id

In [17]:
tokenized_dataset_train

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 66927
})

In [None]:
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
import evaluate
import numpy as np

# Load the required metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

# Define a function to compute multiple metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Get the predicted class

    # Calculate each metric individually
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }

training_args = TrainingArguments(
    output_dir='output',
    eval_strategy='epoch',
    learning_rate=2e-5,
    logging_steps=5000,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="no", 
)

# Define the collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
