In [24]:
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification, AutoConfig
from datasets import load_dataset
from datasets import Dataset
import torch
from transformers import Trainer
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import train_test_split
from typing import Dict, List, Tuple, Any
import matplotlib.pyplot as plt
import seaborn as sns
import re





In [25]:
# Reload the local finetuning_eval_func module so that edits made to it are picked up
# without restarting the kernel, then import the helpers from the reloaded module.
import importlib

import finetuning_eval_func

importlib.reload(finetuning_eval_func)

from finetuning_eval_func import (
    conf_matrix,
    create_eval_summary_df,
    custom_collate_fn,
    evaluation_report,
    process_csv_social_bias,
    tokenize_function,
)



/Users/mariamielniczuk/anaconda3/envs/capstone_env/bin/python


### Example code to get the posts from the DataFrame and run inference with the fine-tuned model loaded from the checkpoint

### Let's test the best model, "t2835ru3": load the checkpoint and run inference on test_dataset


Load your dataset/list of flattened posts and comments. In the cell below I am using `test_df`, `label2id`, and `id2label` from the results of my function (we do not need `train_df` and `val_df` here).

In [None]:
# Path handed to process_csv_social_bias. It can be overridden with the
# CAPSTONE_DATA_DIR environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original local path is used unchanged.
DATA_DIR = os.environ.get("CAPSTONE_DATA_DIR", '/Users/mariamielniczuk/Documents/capstone')

# The helper returns the train/validation/test DataFrames together with the
# label2id and id2label lookups that are used later to decode predictions.
train_df, val_df, test_df, label2id, id2label = process_csv_social_bias(DATA_DIR)

Train size: 29790
Validation size: 3724
Test size: 3724


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bias_type'].fillna('Neutral', inplace=True)


In [13]:
# Report whether PyTorch can see a CUDA device, and which one.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
gpu_name = torch.cuda.get_device_name(0) if has_cuda else "No GPU"
print("GPU name:", gpu_name)

# NOTE(review): transformers has already been imported in the first cell by the time
# this flag is set -- confirm that setting it this late still has the intended effect.
os.environ["TRANSFORMERS_NO_TF"] = "1"

CUDA available: False
GPU name: No GPU


In [14]:
# tokenize_function comes from finetuning_eval_func and is mapped over each split in batches.
# Only these columns are kept on the tokenized datasets.
MODEL_INPUT_COLUMNS = ["input_ids", "attention_mask", "label"]


def _tokenize_and_prune(df):
    """Tokenize one split and drop every column not listed in MODEL_INPUT_COLUMNS.

    ``Dataset.remove_columns`` returns a new dataset instead of modifying the
    receiver, so the pruned dataset must be returned and bound by the caller.
    Reassigning a loop variable (as the previous version of this cell did) leaves
    the original datasets untouched.
    """
    tokenized = Dataset.from_pandas(df).map(tokenize_function, batched=True)
    extra_columns = [col for col in tokenized.column_names if col not in MODEL_INPUT_COLUMNS]
    return tokenized.remove_columns(extra_columns)


train_dataset = _tokenize_and_prune(train_df)
val_dataset   = _tokenize_and_prune(val_df)
test_dataset  = _tokenize_and_prune(test_df)



Map: 100%|██████████| 29790/29790 [00:06<00:00, 4575.25 examples/s]
Map: 100%|██████████| 3724/3724 [00:00<00:00, 3993.41 examples/s]
Map: 100%|██████████| 3724/3724 [00:00<00:00, 4971.76 examples/s]


In [15]:
# Collate function handed to the DataLoader
def custom_collate_fn(batch):
    """Turn a list of example dicts into a dict of long tensors.

    Args:
        batch: sequence of mappings, each providing 'input_ids' and
            'attention_mask', and optionally 'label'. Any other keys are ignored.

    Returns:
        dict with 'input_ids' and 'attention_mask' tensors, plus a 'labels'
        tensor when the first example carries a 'label' entry.
    """
    def _as_long_tensor(field):
        # One row per example, in batch order.
        return torch.tensor([example[field] for example in batch], dtype=torch.long)

    collated = {
        'input_ids': _as_long_tensor('input_ids'),
        'attention_mask': _as_long_tensor('attention_mask'),
    }

    # Only the first example is inspected to decide whether labels are present;
    # they are stored under 'labels' (plural), the key the model expects.
    if 'label' in batch[0]:
        collated['labels'] = _as_long_tensor('label')

    return collated



### Testing the best model from cross-entropy loss using the whole test dataset

The following cells can be uncommented if you want to run the inference yourself. Alternatively, you can access the inference results by loading the CSV with the results in the uncommented cell.

In [None]:

# Location of the saved checkpoint that is loaded for inference.
model_path = "model_t2835ru3/checkpoint-2236"

# local_files_only=True asks transformers to resolve the checkpoint from local files only.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    local_files_only=True,
)

In [None]:
# Batch the tokenized test split; no shuffling is requested, so batches follow dataset order.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=8,
    collate_fn=custom_collate_fn,
)

# Use the GPU when one is available, otherwise stay on the CPU; then switch to eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Accumulators filled batch by batch.
predictions, actual_labels, probabilities = [], [], []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in test_dataloader:
        # Every tensor in the batch has to live on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch dict is unpacked into keyword arguments of the forward call.
        logits = model(**batch).logits
        class_probs = torch.softmax(logits, dim=-1)
        class_ids = torch.argmax(logits, dim=-1)

        # Bring results back to the CPU and append them to the running lists.
        predictions.extend(class_ids.cpu().numpy())
        actual_labels.extend(batch['labels'].cpu().numpy())  # key is 'labels', as set by the collate fn
        probabilities.extend(class_probs.cpu().numpy())



In [None]:
# Translate numeric class ids back into bias-type names through id2label.
def _to_bias_types(class_ids):
    """Look up every class id in id2label and return the results as a list."""
    return [id2label[class_id] for class_id in class_ids]


predicted_bias_types = _to_bias_types(predictions)
actual_bias_types = _to_bias_types(actual_labels)



In [None]:
# results_df holds, per test post, the actual and predicted bias type plus one
# probability column for every bias label.
# The posts are taken straight from test_df: the DataLoader walks the dataset without
# shuffling and the dataset was built with from_pandas, so row order is expected to
# line up with the order of the predictions.

# One 'prob_<label>' column per class id, in id order.
prob_columns = [f'prob_{id2label[class_id]}' for class_id in range(len(id2label))]
prob_df = pd.DataFrame(probabilities, columns=prob_columns)

labels_df = pd.DataFrame({
    'post': test_df['post'].tolist(),  # original post text from test_df
    'actual_bias_type': actual_bias_types,
    'predicted_bias_type': predicted_bias_types,
})

# Place the probability columns to the right of the label columns.
results_df = pd.concat([labels_df, prob_df], axis=1)



In [None]:
# Preview the first five rows of the results table.
results_df.head(n=5)

In [1]:
# This is another custom function that creates a summary of the model performance; you will not be using it here.
# #eval_corss,report_cross=evaluation_report(results_df)

In [2]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
