## Load data

In [1]:
# import important libraries
import xml.etree.ElementTree as ET
import re
import pandas as pd
import os
import html

In [2]:
# clean up text
def clean_text(text):
    if text is None:
        return ""
    # remove HTML encodings
    text = html.unescape(text)
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    return text.strip().lower()

In [3]:
# parse xml file
def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    data = []

    # iterate through each conversation
    for conversation in root.findall('conversation'):
        conversation_id = conversation.get('id')
        
        # iterate through each message
        for message in conversation.findall('message'):
            line = message.get('line')
            author = message.find('author').text
            time = message.find('time').text
            text = message.find('text').text

            # clean the text
            cleaned_text = clean_text(text)
            
            # store as dictionary
            data.append({
                'conversation_id': conversation_id,
                'line': line,
                'author': author,
                'time': time,
                'text': cleaned_text
            })
    
    # convert to a DataFrame
    df = pd.DataFrame(data)
    return df

In [47]:
# load train file
xml_file = 'PAN12/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml'
train_df = parse_xml(xml_file)

In [48]:
# load test file
xml_file = 'PAN12/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml'
test_df = parse_xml(xml_file)

In [49]:
# function to load the identified groomers
def load_groomers(file_path):
    with open(file_path, 'r') as f:
        positive_authors = set([line.strip() for line in f])
    return positive_authors

In [50]:
# load training groomers
training_groomers = "PAN12/pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt"
train_groomers = load_groomers(training_groomers)
# load testing groomers
testing_groomers = "PAN12/pan12-sexual-predator-identification-groundtruth-problem1.txt"
test_groomers = load_groomers(testing_groomers)

In [51]:
grouped_train_df = (
    train_df.groupby(["conversation_id", "author"])
    .agg({
        "text": lambda x: " ".join(x),  # Combine texts into a single string
    })  # Combine texts into a string for each group
    .reset_index()  # Reset the index for a clean output
)
grouped_train_df

Unnamed: 0,conversation_id,author,text
0,0000604306a283600b730276a2039471,9fdcde97c1cb33fe4e9f6aab1d84bc76,e3fb62ebfa4f36acf5cbff6a6ed0f2e0: can i have y...
1,0000604306a283600b730276a2039471,a9b326df4e6da61c5b6f5e1058be83a2,b8810fee2f4a71f849f3f7409546d1d9 - do you have...
2,0000604306a283600b730276a2039471,b8810fee2f4a71f849f3f7409546d1d9,a9b326df4e6da61c5b6f5e1058be83a2: there are so...
3,0000604306a283600b730276a2039471,e3fb62ebfa4f36acf5cbff6a6ed0f2e0,"""sean fraser posted this on january 22, 2007 0..."
4,0001347c00d419eb537c0692e6e58eba,67952953f11f8800aa8296b1457d2c01,asl
...,...,...,...
152869,ffffe01fc5b03a8d6b8c929d595644d9,24340ef160b44f4e9d826263c6dd3188,"jamesd: i thought you needed ""mash"". karrot-x:..."
152870,ffffe01fc5b03a8d6b8c929d595644d9,7ef291c89ad915978b203d427919cfbb,"man where the hell is the eggs, bangers are he..."
152871,ffffe01fc5b03a8d6b8c929d595644d9,8a2ec3d80a45ba71f61da6da97613e11,ya
152872,ffffe01fc5b03a8d6b8c929d595644d9,8cd806e5c7f4f95937df692d3b8a3554,goops god damnit i hate cpan cpan is the bigge...


In [52]:
grouped_test_df = (
    test_df.groupby(["conversation_id", "author"])
    .agg({
        "text": lambda x: " ".join(x),  # Combine texts into a single string
    })  # Combine texts into a string for each group
    .reset_index()  # Reset the index for a clean output
)
grouped_test_df

Unnamed: 0,conversation_id,author,text
0,000049c4530615e68b898b3e0306630d,1c8edb8bfd4b3f9ec565192af6162909,hi fr?
1,000049c4530615e68b898b3e0306630d,53a66119381d887197c67ccfe3ef6670,hi
2,000133dbd971ffb8f723fc61ba977ca0,3b8f9119e773f37c4a2040c8e0c3d3da,heyy
3,000133dbd971ffb8f723fc61ba977ca0,8f1d151f40bd785177dec682f5407c4e,hey hej din fjant
4,000161e288cf8dfc468fe86d6d4af2d4,9804dc98b4ca58d3799c805cf476919a,heeeyy asl ?
...,...,...,...
355562,fffe4d1b08952afb8627a9b594f913c7,f8e350fc2fe58fa245fcd04eefb406c5,"hallo hi where are you from? no, i'm not a per..."
355563,ffff2d0e314610b1df596482d806ada9,3dc4d61ed4ad210bfbc90ce935a75b46,sure.. with what? nope sry idk a thing about g...
355564,ffff2d0e314610b1df596482d806ada9,eccc65c89e622a83cfec5827c16391de,haiiiiiiiii. can you help me? ): can you read ...
355565,ffff74f40b58182a2521235b9db901d4,169b210634b131ebcddc099eb64972a1,hi lookingfor girl? r u girl? what r u doing i...


In [53]:
# function to label grooming authors based on author IDs
def label_authors(df, positive_authors):
    df['label'] = df['author'].apply(lambda aid: 1 if aid in positive_authors else 0)
    return df

In [54]:
train_df = label_authors(grouped_train_df, train_groomers)

In [55]:
test_df = label_authors(grouped_test_df, test_groomers)

In [56]:
print('Total training conversations:', train_df['conversation_id'].nunique())

Total training conversations: 66927


In [57]:
train_df

Unnamed: 0,conversation_id,author,text,label
0,0000604306a283600b730276a2039471,9fdcde97c1cb33fe4e9f6aab1d84bc76,e3fb62ebfa4f36acf5cbff6a6ed0f2e0: can i have y...,0
1,0000604306a283600b730276a2039471,a9b326df4e6da61c5b6f5e1058be83a2,b8810fee2f4a71f849f3f7409546d1d9 - do you have...,0
2,0000604306a283600b730276a2039471,b8810fee2f4a71f849f3f7409546d1d9,a9b326df4e6da61c5b6f5e1058be83a2: there are so...,0
3,0000604306a283600b730276a2039471,e3fb62ebfa4f36acf5cbff6a6ed0f2e0,"""sean fraser posted this on january 22, 2007 0...",0
4,0001347c00d419eb537c0692e6e58eba,67952953f11f8800aa8296b1457d2c01,asl,0
...,...,...,...,...
152869,ffffe01fc5b03a8d6b8c929d595644d9,24340ef160b44f4e9d826263c6dd3188,"jamesd: i thought you needed ""mash"". karrot-x:...",0
152870,ffffe01fc5b03a8d6b8c929d595644d9,7ef291c89ad915978b203d427919cfbb,"man where the hell is the eggs, bangers are he...",0
152871,ffffe01fc5b03a8d6b8c929d595644d9,8a2ec3d80a45ba71f61da6da97613e11,ya,0
152872,ffffe01fc5b03a8d6b8c929d595644d9,8cd806e5c7f4f95937df692d3b8a3554,goops god damnit i hate cpan cpan is the bigge...,0


In [58]:
test_df

Unnamed: 0,conversation_id,author,text,label
0,000049c4530615e68b898b3e0306630d,1c8edb8bfd4b3f9ec565192af6162909,hi fr?,0
1,000049c4530615e68b898b3e0306630d,53a66119381d887197c67ccfe3ef6670,hi,0
2,000133dbd971ffb8f723fc61ba977ca0,3b8f9119e773f37c4a2040c8e0c3d3da,heyy,0
3,000133dbd971ffb8f723fc61ba977ca0,8f1d151f40bd785177dec682f5407c4e,hey hej din fjant,0
4,000161e288cf8dfc468fe86d6d4af2d4,9804dc98b4ca58d3799c805cf476919a,heeeyy asl ?,0
...,...,...,...,...
355562,fffe4d1b08952afb8627a9b594f913c7,f8e350fc2fe58fa245fcd04eefb406c5,"hallo hi where are you from? no, i'm not a per...",0
355563,ffff2d0e314610b1df596482d806ada9,3dc4d61ed4ad210bfbc90ce935a75b46,sure.. with what? nope sry idk a thing about g...,0
355564,ffff2d0e314610b1df596482d806ada9,eccc65c89e622a83cfec5827c16391de,haiiiiiiiii. can you help me? ): can you read ...,0
355565,ffff74f40b58182a2521235b9db901d4,169b210634b131ebcddc099eb64972a1,hi lookingfor girl? r u girl? what r u doing i...,0


## LLama 3.2 1B Implementation 

In [59]:
import huggingface_hub
print(huggingface_hub.__version__)

0.26.2


In [60]:
# login to huggingface
from huggingface_hub import login
login(token="YOUR_HUGGINGFACE_TOKEN")

In [61]:
# load model with huggingface
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token="YOUR_HUGGINGFACE_TOKEN")
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-3.2-1B", num_labels=2, token="YOUR_HUGGINGFACE_TOKEN")

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
import torch

# check if CUDA is available
cuda_available = torch.cuda.is_available()

print("CUDA Available:", cuda_available)

# print GPU name if CUDA available
if cuda_available:
    print("CUDA Device Name:", torch.cuda.get_device_name(0))

CUDA Available: True
CUDA Device Name: NVIDIA A100-SXM4-40GB


In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Model is loaded on device: {device}")

Model is loaded on device: cuda


In [64]:
from datasets import Dataset

train_df = Dataset.from_pandas(train_df[['text', 'label']])
test_df = Dataset.from_pandas(test_df[['text', 'label']])

In [65]:
# define and add the padding token if it's not already defined
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [66]:
# define tokenization function
def tokenize_function(row):
    return tokenizer(row['text'], padding = 'max_length', truncation=True, max_length = 64)

# apply tokenization to all rows
tokenized_train = train_df.map(tokenize_function, batched=True, batch_size = 16)
tokenized_test = test_df.map(tokenize_function, batched=True, batch_size = 16)

Map: 100%|██████████| 152874/152874 [00:22<00:00, 6682.82 examples/s]
Map: 100%|██████████| 355567/355567 [00:55<00:00, 6422.94 examples/s]


In [67]:
model.config.pad_token_id = tokenizer.pad_token_id

In [68]:
tokenized_train

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 152874
})

In [69]:
import torchvision.transforms
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
import evaluate
import numpy as np

# load metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

# define function to compute multiple metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Get the predicted class

    # calculate each metric individually
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }

training_args = TrainingArguments(
    output_dir='output',
    eval_strategy='epoch',
    learning_rate=5e-6,
    logging_steps=50,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="no",   
)

# define collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [70]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [71]:
# start training
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.025,0.02632,0.991667,0.99098,0.991667,0.989217
1,0.0134,0.026031,0.993666,0.993034,0.993666,0.993178
2,0.007,0.031126,0.994029,0.993489,0.994029,0.993616
3,0.0058,0.034399,0.994167,0.993792,0.994167,0.993929
4,0.0057,0.042772,0.993723,0.993138,0.993723,0.993295
5,0.0054,0.047408,0.993821,0.99319,0.993821,0.993287
6,0.0049,0.054122,0.994178,0.993728,0.994178,0.993865
7,0.0052,0.058449,0.994026,0.993447,0.994026,0.993537
8,0.0051,0.061291,0.994066,0.99348,0.994066,0.993526
9,0.0044,0.061797,0.994001,0.993411,0.994001,0.993493




TrainOutput(global_step=1490, training_loss=0.009828276752225504, metrics={'train_runtime': 17338.9909, 'train_samples_per_second': 88.168, 'train_steps_per_second': 0.086, 'total_flos': 5.701607659615027e+17, 'train_loss': 0.009828276752225504, 'epoch': 9.982426778242678})

In [72]:
results = trainer.evaluate()
print("Evaluation results:", results)



Evaluation results: {'eval_loss': 0.06179651990532875, 'eval_accuracy': 0.9940011305886092, 'eval_precision': 0.9934109866031126, 'eval_recall': 0.9940011305886092, 'eval_f1': 0.9934931022378706, 'eval_runtime': 788.3779, 'eval_samples_per_second': 451.011, 'eval_steps_per_second': 3.524, 'epoch': 9.982426778242678}
