In [None]:
import os

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import ClassLabel
from datasets import Dataset
from sklearn.utils import resample
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers.trainer_utils import get_last_checkpoint

# Label vocabulary: verdict text <-> integer class id
label2id = {'not the asshole': 0, 'asshole': 1}
id2label = dict(zip(label2id.values(), label2id))

# Load the raw verdicts and attach an integer 'labels' column derived from 'verdict'
# (verdicts missing from label2id map to NaN)
df = pd.read_csv('aita_combined_verdicts.csv').assign(
    labels=lambda frame: frame['verdict'].map(label2id)
)


In [None]:
# Share of each class in the 'labels' column, expressed in percent and
# re-indexed by human-readable class name
class_counts = df['labels'].value_counts()
class_percentages = (
    class_counts
    .div(class_counts.sum())
    .mul(100)
    .rename(index=id2label)
)

# Bar chart of the class balance, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=class_percentages.index,
    y=class_percentages.values,
    palette="viridis",
    ax=ax,
)
ax.set_xlabel('Class')
ax.set_ylabel('Percentage of Instances')
ax.set_title('Class Distribution in Dataset (Percentages)')

plt.show()


In [None]:
# oversampling
# Separate majority (label 0) and minority (label 1) class
df_majority = df[df['labels'] == 0]
df_minority = df[df['labels'] == 1]

# Oversample the minority class (with replacement) up to the majority size
df_minority_oversampled = resample(df_minority,
                                   replace=True,
                                   n_samples=len(df_majority),
                                   random_state=42)

# Combine the oversampled minority class with the majority class
# if you don't want to oversample, comment out this block
df_balanced = pd.concat([df_majority, df_minority_oversampled])

# Shuffle the dataset. DataFrame.sample returns a new frame, so the result
# must be assigned back; otherwise df_balanced stays ordered by class and
# keeps the duplicated index produced by concat/resample.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# NOTE(review): rows are duplicated here *before* the train/test split done
# later in the notebook, so copies of the same minority post can land in both
# splits and inflate the reported test accuracy. Consider splitting first and
# oversampling only the training portion.

# Quick look at the shuffled, balanced frame
df_balanced.head()

In [None]:
# tokenize data
# different BERT variants can be swapped in by changing model_name
model_name = 'prajjwal1/bert-medium'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    """Tokenize the 'body' field of a batch.

    Every sequence is truncated and padded to exactly 256 tokens.
    Returns the tokenizer's encoding for the batch.
    """
    return tokenizer(example['body'], padding='max_length', truncation=True, max_length=256)

# preserve_index=False keeps the pandas index from being materialised as an
# extra '__index_level_0__' column when the frame's index is not a default
# RangeIndex (as is the case after concat/resample).
dataset = Dataset.from_pandas(df_balanced[['body', 'labels']], preserve_index=False)

# Convert the 'labels' column to ClassLabel type (needed for stratified splitting)
class_label = ClassLabel(num_classes=len(label2id), names=list(label2id.keys()))
dataset = dataset.cast_column('labels', class_label)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
# ensure stratify_by_column='labels' for stratified splitting when using non oversampled data
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, stratify_by_column='labels', seed=10)
# Expose only the tensors the model consumes, as torch tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


In [None]:

# Load the classifier from the same checkpoint as the tokenizer (model_name)
# so the two cannot silently drift apart when the checkpoint is changed.
# NOTE(review): both dropout probabilities are 0.1 here, while the run
# directory / save name used for training mention "0.01" — confirm which
# value is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1
)

# Accuracy metric used by the Trainer at every evaluation
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Turn (logits, labels) into the accuracy metric dict.

    The predicted class is the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

# Run identifiers:
#   main_dir  - parent directory for this family of runs (ex. bert_tiny, bert_tiny_oversampled)
#   save_name - per-run name, e.g. "model_oversampled_no_dropout_bert_tiny_epochs_{num_epochs}"
main_dir = "bert_medium_0.01_dropout"
num_epochs = 15
save_name = f"model_oversampled_dropout_bert_medium_dropout_0.01_epochs_{num_epochs}"

# Checkpoint and evaluate once per epoch so models from different epochs
# can be compared afterwards.
training_args = TrainingArguments(
    output_dir=f"./{main_dir}/{save_name}",  # where checkpoints are written
    num_train_epochs=num_epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
)

# Wire the model, data splits and metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

# Resume from the newest checkpoint if a previous run left one behind;
# otherwise start from scratch. Passing resume_from_checkpoint=True
# unconditionally raises a ValueError when the output directory holds no
# checkpoint, which breaks a first run of this notebook.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

# Final evaluation on the held-out split once training has finished
eval_results = trainer.evaluate()

# Persist every metric as "name: value", one per line
with open(f"./{main_dir}/{save_name}_evals.txt", "w") as file:
    file.writelines(f"{key}: {value}\n" for key, value in eval_results.items())

# Report the headline accuracy
print(f"Test Accuracy: {eval_results['eval_accuracy']}")

# Store the final model weights next to the checkpoints
model.save_pretrained(f"./{main_dir}/{save_name}")
# save tokenizer - optional
# tokenizer.save_pretrained(f"./{main_dir}/{save_name}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
14,0.3244,0.643838,0.771997
15,0.3034,0.653574,0.776004


Test Accuracy: 0.7760044995957395


In [None]:
# Example text to predict
# Should be NTA
text_not_asshole = """AITAH for kicking out my gf's sister and her kids out of my flat after my gf gave her the keys 

People around me are saying I am an AH but I need the perspective of uninvolved people.
My long term gf has my apartment keys, as I have hers. Only unspoken until now but always respected rule was, if you need to go to the other place, just send a text "Hey going to your place". Doesn't matter if the other is at home or even responds. Just simply to tell the other you will be at their place.

I was supposed to be away 700km from home for 2 weeks for work related stuff, but 4 days in and our instructor got into an accident. Work tried to find another one, but no such luck on very short notice. They decided at like 10pm to get us the 1st flight home the next day at like 6am, pay us the overtime and the next day at home, then resume our normal work schedule.
So I get home the next day at like 9am, sent a text to my gf to tell her I am back.
Getting to my door, I am very confused hearing children screaming inside since none of the people who have my keys have low kids like that (my brother and my gf). I thought I got squatters or something. Opening the door and I see my gf's sister's kid running around after a shower, putting water everywhere. Plates of half finished ravioli on my living room ground. Their suitcases opened in the entrance.

I get inside and see the husband on my couch trying to hook up my PS2(that he must have digged out in my storage room). Getting into a verbal argument with him trying to understand why the fuck they are here. Said my gf told them they could get my flat for 2 weeks while I was gone (they wanted to visit the city for a bit, go to the beach). My gf sent me a text while i was arguing, telling me "oh ok, btw my sis fam' is at your flat".

I admit I blew up on him and the sis who left my bedroom in the meantime. Told them to leave immediately. They argued quite a bit, my gf called her sis, then sis put up the speaker so we could all hear, and she said I was embarrassing her, that she told them they could use my place for a while.

I threatened to call the police, also asked my brothers to come.
They left while cursing me to their children, telling that holidays are over because the mean little sister's boyfriend cast us out.
I have now all of my gf family on my back, and even some of my own family, saying i could have stayed with at my gf so the kids could have some vacations...

Also. They have read my doctor prescription papers(I put them in a specific order, and it was not the same), and obviously took some of my prescribed meds (opened a box of benzodiazepine...).

AITAH for making them leave? We pretty much stopped talking about anything else with my gf. I feel like i am being gaslighted. I would never invite people to her apartment like that, especially without telling her. It seems so disrespectful.
Am I going insane?
"""

# List of texts to predict
texts = [text_not_asshole]

# Run inference on whichever device the model currently lives on (CPU or GPU).
# Deriving it from the model avoids relying on an undefined `device` variable
# and means nothing has to be commented out on machines without CUDA.
device = model.device

# Make sure dropout is disabled for prediction
model.eval()

# Tokenize the texts as a batch. max_length matches the 256-token truncation
# used when tokenizing the training data, so the model sees inputs shaped like
# the ones it was fine-tuned on.
inputs = tokenizer(
    texts,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=256,
).to(device)

# Get model predictions without tracking gradients
with torch.no_grad():
    logits = model(**inputs).logits

# Highest-scoring class index per text
predicted_classes = logits.argmax(dim=-1).tolist()

# Map predicted class indices to their corresponding class names
predicted_labels = [id2label[pred] for pred in predicted_classes]

# Print a short preview of each text next to its prediction
for text, label in zip(texts, predicted_labels):
    print(f"Text: {text[:50]}...")
    print(f"Predicted class: {label}")