In [1]:
import pandas as pd

In [2]:
# load dataset
dataset = pd.read_csv('data/dataset_eng.csv')
dataset

Unnamed: 0,conversation_id,text,sentiment,label
0,0000604306a283600b730276a2039471,a9b326df4e6da61c5b6f5e1058be83a2: b8810fee2f4a...,Negative,0
1,0001347c00d419eb537c0692e6e58eba,e2bd430b29412d9267886e187ba28075: say asl and ...,Positive,0
2,000197b21283dc47810760e499d1f8ec,487862cd4ec27d841e2d2e80e8d91955: joint 5c7c53...,Negative,0
3,0002de15312dc33d78b6e9e4b5f61f1f,a1a8f84c419e34a1a72625e2ef245516: hi a1a8f84c4...,Negative,0
4,0002ee38ac5e78e7edbc4d4a556ec4b7,8150320816528784d7dfe286d781de4c: hey :) male ...,Negative,0
...,...,...,...,...
160768,fffde018f39dafd4c8ef4ebaaadbec97,0a39f78bcb297ab0ebe8a29c28bfed89: bugmail: [bu...,Negative,0
160769,fffe4d1b08952afb8627a9b594f913c7,e5a96ed432ed5041be76d3fb1784fb95: do you want ...,Negative,0
160770,ffff2d0e314610b1df596482d806ada9,eccc65c89e622a83cfec5827c16391de: haiiiiiiiii....,Negative,0
160771,ffff74f40b58182a2521235b9db901d4,7bc167d759d9c56d43d1d46575433d35: hey 169b2106...,Positive,0


In [3]:
import huggingface_hub
print(huggingface_hub.__version__)


0.26.2


In [4]:
# login to huggingface
from huggingface_hub import login
login(token="YOUR_HUGGINGFACE_TOKEN")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# load model with huggingface
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token="YOUR_HUGGINGFACE_TOKEN")
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-3.2-1B", num_labels=2, token="YOUR_HUGGINGFACE_TOKEN")

2024-12-01 11:56:16.042861: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-01 11:56:16.161570: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import torch

# check if CUDA is available
cuda_available = torch.cuda.is_available()

print("CUDA Available:", cuda_available)

# print GPU name if CUDA available
if cuda_available:
    print("CUDA Device Name:", torch.cuda.get_device_name(0))


CUDA Available: True
CUDA Device Name: NVIDIA A100-SXM4-80GB


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Model is loaded on device: {device}")

Model is loaded on device: cuda


In [8]:
from datasets import Dataset

df = Dataset.from_pandas(dataset[['text', 'label']])

In [9]:
# define and add the padding token if it's not already defined
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [10]:
# define tokenization function
def tokenize_function(row):
    return tokenizer(row['text'], padding = 'max_length', truncation=True, max_length = 64)

# apply tokenization to all rows
tokenized_datasets = df.map(tokenize_function, batched=True, batch_size = 16)


Map: 100%|██████████| 160773/160773 [01:23<00:00, 1918.92 examples/s]


In [11]:
model.config.pad_token_id = tokenizer.pad_token_id

In [12]:
# split dataset into train and test sets (70/30 split)
train_test_split = tokenized_datasets.train_test_split(test_size=0.3)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

In [13]:
train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 112541
})

In [14]:
import torchvision.transforms

In [15]:
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
import evaluate
import numpy as np

# load metrics
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

# define function to compute multiple metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Get the predicted class

    # Calculate each metric individually
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"]
    }

training_args = TrainingArguments(
    output_dir='output',
    eval_strategy='epoch',
    learning_rate=1e-5,
    logging_steps=50,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="no",   
)

# define collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
# start training
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0384,0.025752,0.990463,0.990552,0.990463,0.990505
2,0.0142,0.024728,0.991748,0.992397,0.991748,0.991969
3,0.0048,0.022872,0.994755,0.994686,0.994755,0.994709
4,0.0033,0.024046,0.994444,0.994609,0.994444,0.994507
5,0.003,0.03061,0.995667,0.995613,0.995667,0.995627
6,0.0012,0.028672,0.995439,0.99537,0.995439,0.995377
7,0.0008,0.030624,0.995563,0.995501,0.995563,0.995482
8,0.0004,0.0334,0.995667,0.995606,0.995667,0.995616
9,0.0003,0.033441,0.995791,0.995736,0.995791,0.995748
10,0.0002,0.033403,0.99577,0.995722,0.99577,0.995737




TrainOutput(global_step=1100, training_loss=0.010690952529284087, metrics={'train_runtime': 8047.0253, 'train_samples_per_second': 139.854, 'train_steps_per_second': 0.137, 'total_flos': 4.205541026247475e+17, 'train_loss': 0.010690952529284087, 'epoch': 10.0})

In [18]:
results = trainer.evaluate()
print("Evaluation results:", results)



Evaluation results: {'eval_loss': 0.033403344452381134, 'eval_accuracy': 0.9957704428595123, 'eval_precision': 0.9957224211544815, 'eval_recall': 0.9957704428595123, 'eval_f1': 0.9957369656256296, 'eval_runtime': 106.9278, 'eval_samples_per_second': 451.071, 'eval_steps_per_second': 3.526, 'epoch': 10.0}


## Sentiments

In [17]:
# split into three datasets
# non_grooming
df_rest = dataset[dataset['label'] == 0]
df_grooming = dataset[dataset['label'] == 1]
# positive
df_positive = df_grooming[df_grooming['sentiment'] == 'Positive']
# negative 
df_negative = df_grooming[df_grooming['sentiment'] == 'Negative']

In [18]:
from sklearn.model_selection import train_test_split
# split into train and test
df_rest_train, df_rest_test = train_test_split(df_rest, test_size=0.3, random_state=42)

### Positive trained and tested

In [19]:
# split into train and test
df_pos_train, df_pos_test = train_test_split(df_positive, test_size=0.3, random_state=42)
# train data is non-grooming and positive sentiment
pp_train_df = pd.concat([df_pos_train, df_rest_train], axis=0)
# test data is non-grooming and positive sentiment
pp_test_df = pd.concat([df_pos_test, df_rest_test], axis=0)

In [20]:
pp_train_df

Unnamed: 0,conversation_id,text,sentiment,label
91219,913612b7cf3923fe8ac2c2ae48ade4c2,0d3e4cee17e1ffaa7d33d252a4175ed9: that's sound...,Positive,1
107610,ab81aca93db9de771f86dc69ec270605,1eb17bd9642e93fa84969b71bf387a1b: night miss ya,Positive,1
99407,9e1d96fcd5a9e85d74ff3735e6f2318e,"03957f443c7790f9642db14bbc59df11: katie, are y...",Positive,1
134356,d5ee98309092b769f853217f71631d22,ac07079f18fcab57692a57e092678052: hello a0d648...,Positive,1
18780,1d92861841513e08e089b841f54a823b,2e265f9b8ee76269872d56d5c6c0335b: hy u home,Positive,1
...,...,...,...,...
124274,c5f42576faf9dff95430cd77e88da27c,0bde687f1910bed528e5c889ad28ca14: hi 65bd761d6...,Negative,0
107498,ab4f6e28073fdfa79202d0b1e912795b,d15c7cf4f4fbea6f11f2e695c1578c94: hi 35953a67e...,Negative,0
136800,d9dc22a02b907953a8b3fd6237cf95f7,f0015e87cd8fbade78126b1df6bc0a02: butterflies ...,Negative,0
152309,f289457c6452c92a0c461d94d77e5313,07e276f7a0e8953c9b961084e9f5a1ab: horny? f5f70...,Negative,0


In [21]:
pp_test_df

Unnamed: 0,conversation_id,text,sentiment,label
24855,27362c1d0e039ce767ca2f75e7ce710a,8164381b4ae95713c7266cba00fec1df: hey,Positive,1
86239,8966df68adb0c7fdd807536335d6f3ab,62760245391c6d56088d814bea04baad: i'm sorry i ...,Positive,1
107365,ab155e3047e1dcae445d584c4f6d1746,a12332f18b35f3717dd7c9ac99b00fd6: i hope you'r...,Positive,1
86295,897da5d708f94b6cb0a9f6664e2d6c51,a12332f18b35f3717dd7c9ac99b00fd6: who were you...,Positive,1
87938,8c1c09f524d3f2b6aad54ffae846b63b,5a41bf6d7766977c25b0b6a97e4e1d58: hey baby i m...,Positive,1
...,...,...,...,...
118176,bc2ece1046c5ac6149de7b97883eee46,8acede54076d243a359aef6fe111b0a3: hornyyy 4758...,Negative,0
38657,3d381b2fd3048f70e17dd05c352965a4,3250f1be97c8672b54290ac7cb3f1cb6: morning,Positive,0
70535,705b01ca3beb2e12575d20c885783c34,b844a0a98f81c321afe1d38ae37f3c28: channel #htm...,Negative,0
67835,6bf3612abd388d4939edf3ca925ebeee,dcc25eefb98547160198114d030be166: hey :-) lets...,Positive,0


In [22]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token="YOUR_HUGGINGFACE_TOKEN")
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-3.2-1B", num_labels=2, token="YOUR_HUGGINGFACE_TOKEN")

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Model is loaded on device: {device}")

Model is loaded on device: cuda


In [24]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [25]:
model.config.pad_token_id = tokenizer.pad_token_id

In [26]:
pp_train = Dataset.from_pandas(pp_train_df[['text', 'label']])
pp_test = Dataset.from_pandas(pp_test_df[['text', 'label']])
pp_train_dataset = pp_train.map(tokenize_function, batched=True, batch_size = 16)
pp_test_dataset = pp_test.map(tokenize_function, batched=True, batch_size = 16)

Map: 100%|██████████| 110400/110400 [00:58<00:00, 1875.13 examples/s]
Map: 100%|██████████| 47315/47315 [00:23<00:00, 1993.03 examples/s]


In [27]:
pp_train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 110400
})

In [28]:
pp_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=pp_train_dataset,
    eval_dataset=pp_test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [29]:
torch.cuda.empty_cache()

In [30]:
# start training
pp_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.0245,0.017854,0.993892,0.993982,0.993892,0.993934
1,0.0105,0.016349,0.994928,0.995096,0.994928,0.994999
2,0.0036,0.024213,0.996153,0.996038,0.996153,0.996026
3,0.0017,0.020551,0.996724,0.996656,0.996724,0.996676
4,0.0007,0.02013,0.997083,0.997027,0.997083,0.997041
5,0.0004,0.026195,0.996978,0.996924,0.996978,0.996941
6,0.0003,0.027046,0.997083,0.997019,0.997083,0.997013
8,0.0001,0.026339,0.997062,0.996997,0.997062,0.996994
9,0.0,0.026563,0.997083,0.997019,0.997083,0.997015




TrainOutput(global_step=1070, training_loss=0.005282084218598467, metrics={'train_runtime': 7726.0925, 'train_samples_per_second': 142.892, 'train_steps_per_second': 0.138, 'total_flos': 4.092290631865467e+17, 'train_loss': 0.005282084218598467, 'epoch': 9.918887601390498})

In [31]:
pp_results = pp_trainer.evaluate()
print("Evaluation results:", pp_results)



Evaluation results: {'eval_loss': 0.026562949642539024, 'eval_accuracy': 0.9970833773644722, 'eval_precision': 0.9970188445471161, 'eval_recall': 0.9970833773644722, 'eval_f1': 0.9970146435101623, 'eval_runtime': 103.6678, 'eval_samples_per_second': 456.41, 'eval_steps_per_second': 3.569, 'epoch': 9.918887601390498}


### Negative tested and trained

In [32]:
# split into train and test
df_neg_train, df_neg_test = train_test_split(df_negative, test_size=0.3, random_state=42)
# train data is non-grooming and negative sentiment
nn_train_df = pd.concat([df_neg_train, df_rest_train], axis=0)
# test data is non-grooming and negative sentiment
nn_test_df = pd.concat([df_neg_test, df_rest_test], axis=0)

In [33]:
nn_train_df

Unnamed: 0,conversation_id,text,sentiment,label
109679,aed4fd22f4b397707c8c59aae7332515,609f7b8e566e8d514eecf112d3d3bc95: @};-,Negative,1
12843,14305d38b3240a6790870d71e0215e0b,013dab612d37dc4e2cce87da5239f537: i drive a 20...,Negative,1
159273,fd89b32b3f049619366e552446c602b2,b679fca2e3690b4d3c60815edf4e3ca5: u still up??...,Negative,1
159058,fd28dc97311f6ed9ddb0db9826354891,84fb828731f4e234c54c82158127e73e: yo e03aa9707...,Negative,1
156641,f9647f68d20ef8425f19cece8b31a7b7,dd665a4e326e85d39591a322920f73fb: hi r u there...,Negative,1
...,...,...,...,...
124274,c5f42576faf9dff95430cd77e88da27c,0bde687f1910bed528e5c889ad28ca14: hi 65bd761d6...,Negative,0
107498,ab4f6e28073fdfa79202d0b1e912795b,d15c7cf4f4fbea6f11f2e695c1578c94: hi 35953a67e...,Negative,0
136800,d9dc22a02b907953a8b3fd6237cf95f7,f0015e87cd8fbade78126b1df6bc0a02: butterflies ...,Negative,0
152309,f289457c6452c92a0c461d94d77e5313,07e276f7a0e8953c9b961084e9f5a1ab: horny? f5f70...,Negative,0


In [34]:
nn_test_df

Unnamed: 0,conversation_id,text,sentiment,label
93535,94d27f2475c47638194fc0c80ecdabca,2a1ac47332661b61d943d3a4e08dda5a: hey whats up...,Negative,1
11295,11c3faca1f15abd319ef6e7d88b164b7,a12332f18b35f3717dd7c9ac99b00fd6: i miss my ba...,Negative,1
73117,749864174d49b2e52b4dbd866f3ed4ed,d18fb2dc834414a71aace67bee91c432: u here?? d18...,Negative,1
77744,7c0004d2d9aa198bc0f920a2ed397d6b,fce23ce4bcc7bcdef65385dca0575523: you can't fo...,Negative,1
84589,86d6b8ff254ff3031bd759487b9967c6,e50b5df92f1b6d75079d353cbc06d40f: hey i was ta...,Negative,1
...,...,...,...,...
118176,bc2ece1046c5ac6149de7b97883eee46,8acede54076d243a359aef6fe111b0a3: hornyyy 4758...,Negative,0
38657,3d381b2fd3048f70e17dd05c352965a4,3250f1be97c8672b54290ac7cb3f1cb6: morning,Positive,0
70535,705b01ca3beb2e12575d20c885783c34,b844a0a98f81c321afe1d38ae37f3c28: channel #htm...,Negative,0
67835,6bf3612abd388d4939edf3ca925ebeee,dcc25eefb98547160198114d030be166: hey :-) lets...,Positive,0


In [35]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token="YOUR_HUGGINGFACE_TOKEN")
model = AutoModelForSequenceClassification.from_pretrained("meta-llama/Llama-3.2-1B", num_labels=2, token="YOUR_HUGGINGFACE_TOKEN")

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Model is loaded on device: {device}")

Model is loaded on device: cuda


In [37]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [38]:
model.config.pad_token_id = tokenizer.pad_token_id

In [39]:
nn_train = Dataset.from_pandas(nn_train_df[['text', 'label']])
nn_test = Dataset.from_pandas(nn_test_df[['text', 'label']])
nn_train_dataset = nn_train.map(tokenize_function, batched=True, batch_size = 16)
nn_test_dataset = nn_test.map(tokenize_function, batched=True, batch_size = 16)

Map: 100%|██████████| 110654/110654 [00:59<00:00, 1866.50 examples/s]
Map: 100%|██████████| 47424/47424 [00:24<00:00, 1935.52 examples/s]


In [40]:
nn_train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 110654
})

In [41]:
nn_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=nn_train_dataset,
    eval_dataset=nn_test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [42]:
# start training
nn_trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.0315,0.02646,0.99281,0.992693,0.99281,0.992088
1,0.0113,0.015968,0.995192,0.995035,0.995192,0.995004
2,0.0044,0.016771,0.995319,0.995414,0.995319,0.995361
3,0.0023,0.020719,0.995298,0.995337,0.995298,0.995316
4,0.0019,0.029805,0.996057,0.995992,0.996057,0.996017
5,0.0006,0.033194,0.996415,0.996337,0.996415,0.996356
6,0.0005,0.031902,0.996858,0.996795,0.996858,0.99678
8,0.0001,0.033661,0.996964,0.996903,0.996964,0.996897
9,0.0002,0.034178,0.996964,0.996904,0.996964,0.996892




TrainOutput(global_step=1080, training_loss=0.008645707776088751, metrics={'train_runtime': 7788.3862, 'train_samples_per_second': 142.076, 'train_steps_per_second': 0.139, 'total_flos': 4.130489190941983e+17, 'train_loss': 0.008645707776088751, 'epoch': 9.988439306358382})

In [43]:
nn_results = nn_trainer.evaluate()
print("Evaluation results:", nn_results)



Evaluation results: {'eval_loss': 0.03417796641588211, 'eval_accuracy': 0.9969635627530364, 'eval_precision': 0.99690418258551, 'eval_recall': 0.9969635627530364, 'eval_f1': 0.9968921413293417, 'eval_runtime': 103.9751, 'eval_samples_per_second': 456.109, 'eval_steps_per_second': 3.568, 'epoch': 9.988439306358382}
