In [None]:
from google.colab import drive
drive.mount('/content/drive')
save_directory = "/content/drive/My Drive"


# Install necessary libraries
!pip install -q datasets transformers
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import torch
from transformers import BertForSequenceClassification, AutoTokenizer, get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader
from datasets import Dataset
from sklearn.metrics import accuracy_score

# Load the dataset
# Annotated RedditBias comment files, one CSV per demographic dimension.
url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/orientation/reddit_comments_orientation_lgbtq_processed_phrase_annotated.csv"
gender_url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/gender/reddit_comments_gender_female_processed_phrase_annotated.csv"
religion1_url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/religion1/reddit_comments_religion1_jews_processed_phrase_annotated.csv"
religion2_url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/religion2/reddit_comments_religion2_muslims_processed_phrase_annotated.csv"
race_url = "https://raw.githubusercontent.com/nataliecclaire/RedditBias/master/data/race/reddit_comments_race_black_processed_phrase_annotated.csv"

# Load and combine datasets
# Only the orientation file is read with the default encoding; the remaining
# files are decoded as latin1.
orientation_data = pd.read_csv(url)
gender_data = pd.read_csv(gender_url, encoding="latin1")
religion1_data = pd.read_csv(religion1_url, encoding="latin1")
religion2_data = pd.read_csv(religion2_url, encoding="latin1")
race_data = pd.read_csv(race_url, encoding="latin1")

# Combine all datasets
all_data = pd.concat([orientation_data, gender_data, religion1_data, religion2_data, race_data], ignore_index=True)

# Data preprocessing
# `.copy()` makes each filtered frame independent of its parent so that the
# column assignments below never operate on a slice (which would raise
# SettingWithCopyWarning and leave it ambiguous which frame is modified).
all_data = all_data.dropna(subset=['bias_sent', 'comment']).copy()
# The free-text annotation '1 - context needed' is treated as a positive label.
all_data['bias_sent'] = all_data['bias_sent'].replace('1 - context needed', 1)
# Rows whose annotation is one of these non-label markers are discarded.
values_to_remove = [np.nan, 're-state', 'biased?', 'toxic-unrelated', 'fact?', 'question']
mask = all_data['bias_sent'].isin(values_to_remove) | all_data['bias_sent'].isna()
all_data = all_data[~mask].copy()

# Convert data types
all_data['comment'] = all_data['comment'].astype(str)
all_data['bias_sent'] = all_data['bias_sent'].astype(int)
# Collapse every label into the binary range {0, 1}.
all_data['bias_sent'] = all_data['bias_sent'].clip(0, 1)

# Prepare input and target
X = all_data['comment']
y = all_data['bias_sent']

# Tokenizer
# The tokenizer must come from the same checkpoint as the model that is
# fine-tuned below ("bert-base-uncased"). A cased tokenizer has a different
# vocabulary, so its token ids would index the wrong rows of the model's
# pretrained embedding table.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the "comment" field of a (batched) dataset example.

    Args:
        examples: Mapping with a "comment" entry holding the text(s) to encode,
            as passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the comments, padded to the tokenizer's
        maximum length and truncated so that no sequence exceeds it.
    """
    return tokenizer(examples["comment"], padding="max_length", truncation=True)

# K-Fold Cross-Validation
k_folds = 2
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
fold_accuracies = []

# Training and evaluation loop
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
num_epochs = 8
gradient_accumulation_steps = 4
max_grad_norm = 1.0

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n===== Fold {fold + 1}/{k_folds} =====")

    # Seed torch per fold so classifier-head initialisation, batch shuffling
    # and dropout are repeatable across runs.
    torch.manual_seed(42 + fold)

    # Split data into training and validation sets
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Convert to HuggingFace Dataset
    train_df = pd.DataFrame({'comment': X_train, 'label': y_train})
    val_df = pd.DataFrame({'comment': X_val, 'label': y_val})
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)

    # The model's forward pass expects the target under the key "labels".
    train_dataset = train_dataset.rename_column("label", "labels")
    val_dataset = val_dataset.rename_column("label", "labels")

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # Expose only the tensors the model consumes; every other column is hidden.
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
    val_dataloader = DataLoader(val_dataset, batch_size=8)

    # Initialize model
    # A fresh pretrained model per fold so no weights leak between folds.
    # NOTE: the tokenizer must be loaded from this same checkpoint.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model.to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    # The scheduler is stepped once per optimizer update, not once per batch,
    # so its horizon must be counted in updates. Counting batches instead would
    # stretch warmup over gradient_accumulation_steps times more of training
    # and the learning rate would never decay to zero.
    updates_per_epoch = -(-len(train_dataloader) // gradient_accumulation_steps)  # ceil division
    num_training_steps = updates_per_epoch * num_epochs
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=int(0.1 * num_training_steps),
        num_training_steps=num_training_steps,
    )

    # Training loop
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Scale so that the accumulated gradient is the mean over the group
            # of batches rather than their sum.
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()

            # Update after every full accumulation group, and also on the last
            # batch of the epoch so a trailing partial group is not dropped.
            if (step + 1) % gradient_accumulation_steps == 0 or (step + 1) == len(train_dataloader):
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            if step % 50 == 0:
                # Report the unscaled batch loss; the scaled value used for
                # backward() understates it by gradient_accumulation_steps.
                print(f"Step {step}/{len(train_dataloader)}, Loss: {outputs.loss.item():.4f}")

    # Validation loop
    model.eval()
    all_predictions, all_labels = [], []
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

    # Calculate accuracy
    fold_accuracy = accuracy_score(all_labels, all_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold + 1}: {fold_accuracy:.4f}")

# Final cross-validation accuracy
print("\n===== Cross-Validation Results =====")
print(f"Average Accuracy: {np.mean(fold_accuracies):.4f}")
# Only the model trained in the last fold is still in scope here, so that is
# the one written to disk. This also rebinds `save_directory` to a local path.
save_directory = f"./model_fold_{fold + 1}"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]


===== Fold 1/2 =====


Map:   0%|          | 0/5758 [00:00<?, ? examples/s]

Map:   0%|          | 0/5759 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8
Step 0/720, Loss: 0.1805
Step 50/720, Loss: 0.1803
Step 100/720, Loss: 0.1786
Step 150/720, Loss: 0.1811
Step 200/720, Loss: 0.1668
Step 250/720, Loss: 0.1793
Step 300/720, Loss: 0.1794
Step 350/720, Loss: 0.1649
Step 400/720, Loss: 0.1725
Step 450/720, Loss: 0.1904
Step 500/720, Loss: 0.1517
Step 550/720, Loss: 0.1786
Step 600/720, Loss: 0.1853
Step 650/720, Loss: 0.1601
Step 700/720, Loss: 0.2004
Epoch 2/8
Step 0/720, Loss: 0.1715
Step 50/720, Loss: 0.1516
Step 100/720, Loss: 0.1715
Step 150/720, Loss: 0.1757
Step 200/720, Loss: 0.1899
Step 250/720, Loss: 0.1374
Step 300/720, Loss: 0.1388
Step 350/720, Loss: 0.1291
Step 400/720, Loss: 0.0842
Step 450/720, Loss: 0.2242
Step 500/720, Loss: 0.1485
Step 550/720, Loss: 0.1133
Step 600/720, Loss: 0.1091
Step 650/720, Loss: 0.1527
Step 700/720, Loss: 0.1226
Epoch 3/8
Step 0/720, Loss: 0.1470
Step 50/720, Loss: 0.1006
Step 100/720, Loss: 0.1295
Step 150/720, Loss: 0.1842
Step 200/720, Loss: 0.1888
Step 250/720, Loss: 0.1302
Step 30

Map:   0%|          | 0/5759 [00:00<?, ? examples/s]

Map:   0%|          | 0/5758 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8
Step 0/720, Loss: 0.2035
Step 50/720, Loss: 0.1292
Step 100/720, Loss: 0.1776
Step 150/720, Loss: 0.1623
Step 200/720, Loss: 0.1715
Step 250/720, Loss: 0.1954
Step 300/720, Loss: 0.1678
Step 350/720, Loss: 0.1490
Step 400/720, Loss: 0.1767
Step 450/720, Loss: 0.1600
Step 500/720, Loss: 0.1690
Step 550/720, Loss: 0.1648
Step 600/720, Loss: 0.1671
Step 650/720, Loss: 0.1765
Step 700/720, Loss: 0.1799
Epoch 2/8
Step 0/720, Loss: 0.2000
Step 50/720, Loss: 0.1657
Step 100/720, Loss: 0.1783
Step 150/720, Loss: 0.1859
Step 200/720, Loss: 0.1590
Step 250/720, Loss: 0.1990
Step 300/720, Loss: 0.1257
Step 350/720, Loss: 0.1264
Step 400/720, Loss: 0.1206
Step 450/720, Loss: 0.1319
Step 500/720, Loss: 0.0927
Step 550/720, Loss: 0.1667
Step 600/720, Loss: 0.1714
Step 650/720, Loss: 0.1653
Step 700/720, Loss: 0.1627
Epoch 3/8
Step 0/720, Loss: 0.1376
Step 50/720, Loss: 0.2134
Step 100/720, Loss: 0.1401
Step 150/720, Loss: 0.0639
Step 200/720, Loss: 0.1169
Step 250/720, Loss: 0.2269
Step 30

('./model_fold_2/tokenizer_config.json',
 './model_fold_2/special_tokens_map.json',
 './model_fold_2/vocab.txt',
 './model_fold_2/added_tokens.json',
 './model_fold_2/tokenizer.json')

In [None]:
# Persist the model and tokenizer to the root of the mounted Google Drive.
# `model` is whatever the training loop left bound, i.e. the model of the last
# fold; the files are written directly into this directory (no subfolder).
save_directory = "/content/drive/My Drive"
model.save_pretrained(save_directory)
# Kept as the final expression so the cell displays the written tokenizer files.
tokenizer.save_pretrained(save_directory)

('/content/drive/My Drive/tokenizer_config.json',
 '/content/drive/My Drive/special_tokens_map.json',
 '/content/drive/My Drive/vocab.txt',
 '/content/drive/My Drive/added_tokens.json',
 '/content/drive/My Drive/tokenizer.json')

In [None]:
def evaluate_example(inputs):
    """Classify a single text with the fine-tuned model and print the verdict.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        inputs: The text to classify. It must encode to a single sequence,
            because the predicted class is extracted with `.item()`.

    Prints the raw logits followed by "Prediction: Biased" when class 1 has the
    highest score and "Prediction: Not Biased" otherwise. Returns nothing.
    """
    # truncation=True caps the sequence at the tokenizer's maximum length;
    # without it an over-long text produces more positions than the model
    # supports and the forward pass fails.
    encoded = tokenizer(inputs, return_tensors="pt", truncation=True)
    encoded = {key: value.to(device) for key, value in encoded.items()}

    # Disable dropout so repeated calls give the same answer.
    model.eval()

    with torch.no_grad():
        outputs = model(**encoded)

    logits = outputs.logits
    print(logits)
    probabilities = torch.nn.functional.softmax(logits, dim=1)

    prediction = torch.argmax(probabilities, dim=-1).item()
    if prediction == 1:
        print("Prediction: Biased")
    else:
        print("Prediction: Not Biased")
