<a href="https://colab.research.google.com/github/khushishelat/GenderBiasSportsJournalism/blob/Khushi/InitialAttemptBERTClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Read the cleaned questions table from the mounted Drive folder.
# NOTE(review): this absolute path only resolves when Google Drive is mounted
# at /content/drive — confirm the mount happens before this cell runs.
questions_csv_path = '/content/drive/MyDrive/CIS 5300 NLP Project/Project Notebook/Data/tennis_data/questions_df_cleaned.csv'
questions_df_cleaned = pd.read_csv(questions_csv_path)

In [None]:
# Encode the 'gender' column as integer class labels: 'M' -> 0, 'F' -> 1.
# The mapping also sends 0 -> 0 and 1 -> 1 so that re-running this cell on an
# already-encoded column is a no-op; Series.map yields NaN for every value
# that is missing from the dict, which would otherwise wipe the labels.
gender_to_label = {'M': 0, 'F': 1, 0: 0, 1: 1}
encoded_gender = questions_df_cleaned['gender'].map(gender_to_label)

# Fail loudly on values outside the mapping instead of silently passing NaN
# labels on to training.
if encoded_gender.isna().any():
    unexpected_values = questions_df_cleaned.loc[encoded_gender.isna(), 'gender'].unique()
    raise ValueError(f"Unexpected values in 'gender' column: {unexpected_values!r}")

questions_df_cleaned['gender'] = encoded_gender.astype('int64')

# Verify the changes
print(questions_df_cleaned['gender'].value_counts())

0    43973
1    37960
Name: gender, dtype: int64


In [None]:
pip install torch torchvision torchaudio




In [None]:
pip install transformers --upgrade




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.metrics import accuracy_score

# --- Tokenizer setup ---
print("Tokenization...")
max_len = 128  # maximum sequence length handed to the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# --- Encoding helper (defined below) ---
print("Tokenizing and formatting data...")
def tokenize_data(texts, labels, tokenizer, max_len):
    """Encode texts and labels as fixed-length tensors for BERT.

    All texts are encoded in a single batched tokenizer call, truncated and
    padded to exactly ``max_len`` tokens.

    Args:
        texts: Iterable of strings to encode.
        labels: Sequence of integer class labels, one per text.
        tokenizer: Callable tokenizer following the Hugging Face
            ``tokenizer(batch_of_texts, ...)`` interface; it must return a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Sequence length every text is truncated/padded to.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two come
        from the tokenizer with one row per text; ``labels`` is a 1-D
        ``torch.long`` tensor. For empty ``texts`` the first two are empty
        ``(0, max_len)`` tensors.
    """
    texts = list(texts)
    labels = torch.tensor(labels, dtype=torch.long)

    # Nothing to encode: return correctly shaped empty tensors instead of
    # asking the tokenizer to build a batch out of zero rows.
    if not texts:
        empty_ids = torch.empty((0, max_len), dtype=torch.long)
        return empty_ids, empty_ids.clone(), labels

    # One batched call replaces the per-text encode_plus loop, and
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    encoding = tokenizer(
        texts,
        max_length=max_len,
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    return encoding['input_ids'], encoding['attention_mask'], labels

# Carve the dataset into train / validation / test partitions.
print("Splitting the data...")
all_texts = questions_df_cleaned['questions'].values
all_labels = questions_df_cleaned['gender'].values

# First hold out 20% of all rows as the test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

# Then hold out 10% of the remaining rows as the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

# Encode each split; the label arrays are replaced by their tensor versions.
print("Tokenizing training data...")
train_encoded = tokenize_data(train_texts, train_labels, tokenizer, max_len)
train_input_ids, train_attention_masks, train_labels = train_encoded

print("Tokenizing validation data...")
val_encoded = tokenize_data(val_texts, val_labels, tokenizer, max_len)
val_input_ids, val_attention_masks, val_labels = val_encoded

print("Tokenizing test data...")
test_encoded = tokenize_data(test_texts, test_labels, tokenizer, max_len)
test_input_ids, test_attention_masks, test_labels = test_encoded

# Wrap each split in a TensorDataset, then in a DataLoader.
batch_size = 32

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Only the training batches are shuffled; validation and test keep their order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize and fine-tune BERT model
print("Initializing and fine-tuning BERT model...")

# Seed torch's global RNG so the freshly initialised classification head and
# the shuffling of training batches are repeatable across runs.
torch.manual_seed(42)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Fine-tune the model
epochs = 3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated (and, to my knowledge, absent from recent releases), so it is
# avoided here.
# eps and weight_decay are set explicitly to what transformers' AdamW used as
# defaults, since torch.optim.AdamW defaults differ — TODO confirm against
# the transformers version previously in use.
# NOTE(review): the `AdamW` name imported from transformers at the top of
# this cell is no longer used and can be dropped from that import.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}...")

    # ---- Training pass: one optimizer step per batch ----
    model.train()
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # ---- Validation pass: collect predictions without tracking gradients ----
    model.eval()
    val_preds, val_true = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # Predicted class = index of the largest logit in each row.
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(val_true, val_preds)
    print(f'Epoch {epoch + 1}/{epochs}, Validation Accuracy: {val_accuracy}')

# Final evaluation: accuracy of the fine-tuned model on the held-out test split.
print("Evaluating on the test set...")
model.eval()
test_preds, test_true = [], []

with torch.no_grad():
    for batch in test_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # Predicted class = index of the largest logit in each row.
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_true.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(test_true, test_preds)
print(f'Test Accuracy: {test_accuracy}')


Tokenization...
Tokenizing and formatting data...
Splitting the data...
Tokenizing training data...




In [None]:
# Persist the fine-tuned model and its tokenizer to Drive.
# NOTE(review): save_pretrained is given a target location for the model that
# ends in '.csv'; it presumably creates a directory with that name rather than
# a CSV file — confirm, and check what loads from this path before renaming.
model_save_path = '/content/drive/MyDrive/CIS 5300 NLP Project/BERTModelClassifierQuestions.csv'
tokenizer_save_path = '/content/drive/MyDrive/CIS 5300 NLP Project/BERTTokenizerClassifierQuestions'

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)