In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read and written from this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Use the absolute path under the '/content/drive' mount point. A relative path
# only resolves while the working directory is still /content, so re-running
# this cell after it has already changed directory would raise FileNotFoundError.
os.chdir('/content/drive/MyDrive/CIS 5300 - Final Project/Milestone 2/Data')

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def _read_split(csv_path):
    """Load one split's CSV and discard its ``content_category`` column."""
    return pd.read_csv(csv_path).drop(columns=['content_category'])


train_df = _read_split('train_data_with_features.csv')
dev_df = _read_split('dev_data_with_features.csv')
test_df = _read_split('test_data_with_features.csv')

In [None]:
!pip install transformers accelerate



In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer

# Tokenizer matching the 'bert-base-uncased' checkpoint that is fine-tuned below;
# its files are fetched from the Hugging Face Hub when not already cached.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one dataframe row per requested item.

    The ``text`` column supplies the model input and the ``generated`` column
    supplies the integer class label.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        """Keep references to the text/label columns and the tokenization settings.

        Args:
            dataframe: Frame containing a ``text`` and a ``generated`` column.
            tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
                returns a mapping with ``input_ids`` and ``attention_mask`` tensors.
            max_len: Length every sequence is padded or truncated to.

        Raises:
            KeyError: If ``dataframe`` lacks one of the required columns.
        """
        self.data = dataframe['text']
        self.labels = dataframe['generated']
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (examples) in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at positional ``index``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        raw_text = self.data.iloc[index]
        # A missing cell is read by pandas as NaN/None; str() on it would feed
        # the literal word "nan"/"None" to the model, so use empty text instead.
        text = '' if pd.isna(raw_text) else str(raw_text)
        label = self.labels.iloc[index]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns tensors with a leading batch dimension of 1;
        # flatten so the data loader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Wrap each split in a TextDataset (default max_len of 128); rows are tokenized
# lazily when an item is requested, not here.
train_dataset = TextDataset(train_df, tokenizer)
val_dataset = TextDataset(dev_df, tokenizer)
test_dataset = TextDataset(test_df, tokenizer)

# Report the split sizes as a quick sanity check on the loaded data.
print("Datasets created successfully.")
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Datasets created successfully.
Train dataset size: 389788
Validation dataset size: 48723
Test dataset size: 48724


In [None]:
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, logging
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Raise the transformers log level to INFO so model-loading, training and
# checkpoint messages appear in the cell output.
logging.set_verbosity_info()

def compute_metrics(pred):
    """Score a prediction object for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with accuracy plus weighted- and macro-averaged F1, precision
        and recall.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Each result is a (precision, recall, f1, support) tuple.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    macro = precision_recall_fscore_support(y_true, y_pred, average='macro')

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_weighted': weighted[2],
        'precision_weighted': weighted[0],
        'recall_weighted': weighted[1],
        'f1_macro': macro[2],
        'precision_macro': macro[0],
        'recall_macro': macro[1]
    }

# Pretrained BERT encoder with a sequence-classification head for two classes.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Training configuration: a single epoch, evaluation and a checkpoint at the end
# of each epoch, training-loss logging every 500 steps, and 500 warmup steps.
# Relative paths ('./results', './logs') resolve against the current working directory.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    eval_strategy='epoch',
    save_strategy='epoch',
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    report_to="none"
)

# Build the Trainer: trains on the train split, evaluates on the dev split, and
# scores evaluation/prediction runs with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/model.safetensors
A pretrained model of type `BertForSequenceClassification` contains parameters that have been renamed internally (a few are listed below but more are present in the model):
* `cls.predictions.transform.LayerNorm.beta` -> `cls.predictions.transform.LayerNorm.bias`
* `cls.predictions.transform.LayerNorm.gamma` -> `cls.predictions.transform.LayerNorm.weight`
If you are using a model from the Hub, consider submitting a PR to adjust these weights and help future users.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.se

In [None]:
# Fine-tune the model using the settings in training_args.
print("Starting training...")
trainer.train()

# Called without arguments, evaluate() uses the eval_dataset given to the Trainer (the dev split).
print("Evaluating model on validation set...")
eval_results = trainer.evaluate()
print("Validation Results:", eval_results)

# predict() returns the raw per-class scores together with metrics computed on the test split.
print("Predicting on test set...")
test_output = trainer.predict(test_dataset)

test_metrics = test_output.metrics
# Pick the highest-scoring class for each test example.
test_predictions = test_output.predictions.argmax(axis=1)

print("\nTest Metrics:")
for key, value in test_metrics.items():
    print(f"{key}: {value:.4f}")

print("\nFirst 10 Test Predictions:", test_predictions[:10])

# Save the test text, true label and predicted label side by side.
# The CSV is written to the current working directory.
import pandas as pd
submission_df = pd.DataFrame({
    'text': test_df['text'],
    'generated': test_df['generated'],
    'prediction': test_predictions
})
submission_df.to_csv('test_predictions.csv', index=False)
print("Predictions saved to 'test_predictions.csv'")

Starting training...


***** Running training *****
  Num examples = 389,788
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 24,362
  Number of trainable parameters = 109,483,778


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,Precision Weighted,Recall Weighted,F1 Macro,Precision Macro,Recall Macro
1,0.0052,0.005648,0.998851,0.998851,0.998851,0.998851,0.99877,0.99868,0.99886



***** Running Evaluation *****
  Num examples = 48723
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-24362
Configuration saved in ./results/checkpoint-24362/config.json
Model weights saved in ./results/checkpoint-24362/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)



***** Running Evaluation *****
  Num examples = 48723
  Batch size = 64


Evaluating model on validation set...



***** Running Prediction *****
  Num examples = 48724
  Batch size = 64


Validation Results: {'eval_loss': 0.00564814917743206, 'eval_accuracy': 0.9988506454857049, 'eval_f1_weighted': 0.9988507489219283, 'eval_precision_weighted': 0.9988510828262367, 'eval_recall_weighted': 0.9988506454857049, 'eval_f1_macro': 0.9987699846257545, 'eval_precision_macro': 0.9986803423492301, 'eval_recall_macro': 0.9988598897471994, 'eval_runtime': 409.4849, 'eval_samples_per_second': 118.986, 'eval_steps_per_second': 1.861, 'epoch': 1.0}
Predicting on test set...

Test Metrics:
test_loss: 0.0055
test_accuracy: 0.9988
test_f1_weighted: 0.9988
test_precision_weighted: 0.9988
test_recall_weighted: 0.9988
test_f1_macro: 0.9987
test_precision_macro: 0.9986
test_recall_macro: 0.9988
test_runtime: 407.2573
test_samples_per_second: 119.6390
test_steps_per_second: 1.8710

First 10 Test Predictions: [0 0 0 0 0 0 1 0 1 1]
Predictions saved to 'test_predictions.csv'


# Fine-Tuning Methodology

This model was fine-tuned using the **Hugging Face Transformers `Trainer` API**. The specific process involved:

1.  **Model Architecture**:
    *   **Base Model**: `bert-base-uncased` (110M parameters).
    *   **Head**: A classification layer was added on top of the pre-trained BERT encoder to output binary predictions (generated vs. human).

2.  **Data Preparation**:
    *   Input text was tokenized using `BertTokenizer` with a maximum sequence length of **128 tokens**; longer inputs are truncated and shorter inputs are padded to that length.
    *   Data was wrapped in a custom `TextDataset` class that tokenizes each example on access and converts it to tensors; batching is handled by the `Trainer`.

3.  **Training Configuration**:
    *   **Epochs**: 1
    *   **Batch Size**: 16 (Training), 64 (Evaluation)
    *   **Optimizer**: AdamW (default) with `weight_decay=0.01`
    *   **Learning Rate Schedule**: Linear decay with **500 warmup steps**
    *   **Precision**: Mixed precision (FP16) is enabled whenever a CUDA GPU is available (`fp16=torch.cuda.is_available()`), to optimize training speed and memory usage.