In [18]:
import time
import optuna
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
# from bitermplus import BTM
from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH
from datasets import DatasetDict, Dataset

from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH

In [19]:
dataset = Dataset(full_data_path=CLEANED_DATASET_PATH,
                  from_scratch=False,
                  split_sizes=[10000, 4232, 4232])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset.pkl


In [45]:
import pandas as pd
import pyarrow as pa  # Importing the pyarrow library

# Create individual datasets
train_dataset = pd.DataFrame({"text": X_train, "label": Y_train})
val_dataset = pd.DataFrame({"text": X_val, "label": Y_val})
test_dataset = pd.DataFrame({"text": X_test, "label": Y_test})

# Convert pandas DataFrame to pyarrow.Table
train_arrow_table = pa.Table.from_pandas(train_dataset)
val_arrow_table = pa.Table.from_pandas(val_dataset)
test_arrow_table = pa.Table.from_pandas(test_dataset)

# Create DatasetDict
dataset_dict = DatasetDict({
    "train": Dataset(train_arrow_table),
    "val": Dataset(val_arrow_table),
    "test": Dataset(test_arrow_table),
})

# Print the DatasetDict
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 4232
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2277
    })
})


In [46]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [47]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [49]:
tokenized_imdb = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4232 [00:00<?, ? examples/s]

Map:   0%|          | 0/2277 [00:00<?, ? examples/s]

In [50]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [51]:
import evaluate

accuracy = evaluate.load("accuracy")

In [52]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [53]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# Define training arguments with optimizations
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,  # Increased batch size
    per_device_eval_batch_size=32,  # Increased batch size
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    dataloader_num_workers=4,  # More workers for data loading
    gradient_accumulation_steps=2,  # Accumulate gradients
    logging_dir='./logs',
    logging_steps=500,  # Log less frequently
    save_total_limit=2,  # Limit the total number of checkpoints
    report_to="none"  # Disable reporting to speed up
)

In [56]:
tr = tokenized_imdb["train"]
te = tokenized_imdb["test"]

In [57]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tr,
    eval_dataset=te,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

  0%|          | 0/312 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

{'eval_loss': 0.6012217998504639, 'eval_runtime': 130.0842, 'eval_samples_per_second': 17.504, 'eval_steps_per_second': 0.553, 'epoch': 1.0}


  0%|          | 0/72 [00:00<?, ?it/s]

{'eval_loss': 0.5595399737358093, 'eval_runtime': 125.8358, 'eval_samples_per_second': 18.095, 'eval_steps_per_second': 0.572, 'epoch': 1.99}
{'train_runtime': 3249.2925, 'train_samples_per_second': 6.155, 'train_steps_per_second': 0.096, 'train_loss': 0.6074763077955979, 'epoch': 1.99}


TrainOutput(global_step=312, training_loss=0.6074763077955979, metrics={'train_runtime': 3249.2925, 'train_samples_per_second': 6.155, 'train_steps_per_second': 0.096, 'total_flos': 315799044195840.0, 'train_loss': 0.6074763077955979, 'epoch': 1.9936102236421727})

In [58]:
# Evaluate the model
evaluation_results = trainer.evaluate()

# Print evaluation results
print(evaluation_results)

  0%|          | 0/72 [00:00<?, ?it/s]

{'eval_loss': 0.5595399737358093, 'eval_runtime': 124.6342, 'eval_samples_per_second': 18.269, 'eval_steps_per_second': 0.578, 'epoch': 1.9936102236421727}


In [6]:
from transformers import pipeline

text = "goodness jejomar binay super annoying mo pls stop like really oo laki hirap punyeta"
classifier = pipeline("text-classification", model="C:/Users/vince/Documents/Python/hate-speech-detection/my_awesome_model/checkpoint-312")
classifier(text)


[{'label': 'LABEL_1', 'score': 0.8670482039451599}]

In [26]:
import pandas as pd
from transformers import pipeline

# Define your text classification pipeline
classifier = pipeline("text-classification", model="C:/Users/vince/Documents/Python/hate-speech-detection/my_awesome_model/checkpoint-312")

# Use list comprehension to classify texts
results = [classifier(text)[0] for text in X_val]

# Create DataFrame directly from the results
df = pd.DataFrame(results)

# Print or use the DataFrame as needed
print(df)

        label     score
0     LABEL_0  0.730444
1     LABEL_1  0.839643
2     LABEL_0  0.944241
3     LABEL_0  0.909857
4     LABEL_0  0.760844
...       ...       ...
4227  LABEL_1  0.842102
4228  LABEL_1  0.775408
4229  LABEL_1  0.725372
4230  LABEL_1  0.857052
4231  LABEL_0  0.704660

[4232 rows x 2 columns]


In [34]:
y_pred = df.iloc[:,[0]]

In [35]:
y_pred = y_pred.applymap(lambda x: ''.join(filter(str.isdigit,x)))

In [40]:
y_pred_list = y_pred['label']
# Convert y_pred to integers
y_pred_int = y_pred_list.astype(int)

In [38]:
print(type(Y_val))

<class 'list'>


In [41]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate the metrics
accuracy = accuracy_score(Y_val, y_pred_int)
precision = precision_score(Y_val, y_pred_int, average='binary')
recall = recall_score(Y_val, y_pred_int, average='binary')
f1 = f1_score(Y_val, y_pred_int, average='binary')

# Print the results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.719281663516068
Precision: 0.6799343455067706
Recall: 0.8024213075060532
F1 Score: 0.7361172812083517
