<a href="https://colab.research.google.com/github/jeffreylowzg/LLM_homework6/blob/jeffrey-commits/data_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install -U "huggingface_hub[cli]"
!pip install torch transformers numpy tqdm datasets



Download the dataset and save a 5% random sample to disk

In [26]:
from datasets import load_dataset
import pandas as pd
import os

# Tunable settings for the sampling step
SAMPLE_FRACTION = 0.05  # share of the training split to keep
SAMPLE_SEED = 42  # fixed seed so the same rows are drawn on every run
SAMPLE_CSV_PATH = "data/sample_5_percent.csv"

# Load the training split of the dataset from Hugging Face
dataset = load_dataset("dmitva/human_ai_generated_text", split="train")

# Number of rows in the sample (int() truncates any fractional row)
sample_size = int(SAMPLE_FRACTION * len(dataset))

# Shuffle deterministically, then keep the first `sample_size` rows
sampled_dataset = dataset.shuffle(seed=SAMPLE_SEED).select(range(sample_size))

# Convert to a pandas DataFrame for easier handling. Dataset.to_pandas() is the
# conversion provided by `datasets`; pd.DataFrame(dataset) would instead iterate
# the dataset one Python dict per row.
df = sampled_dataset.to_pandas()

# Ensure the output directory exists before writing
os.makedirs(os.path.dirname(SAMPLE_CSV_PATH), exist_ok=True)

# Save to a CSV file (without the DataFrame index column)
df.to_csv(SAMPLE_CSV_PATH, index=False)

print(f"{SAMPLE_FRACTION:.0%} of the dataset has been saved to '{SAMPLE_CSV_PATH}'")


5% of the dataset has been saved to 'data/sample_5_percent.csv'


Read the saved sample and turn each row into two labeled records: label 0 for the human-written text and label 1 for the AI-generated text

In [37]:
import json

# Read the sampled CSV file
df = pd.read_csv("data/sample_5_percent.csv")

# pandas reads empty CSV cells back as NaN. A NaN "text" would be serialized as the
# bare token NaN, which is not valid JSON, so pairs with a missing side are skipped.
df_pairs = df.dropna(subset=["human_text", "ai_text"])
dropped_rows = len(df) - len(df_pairs)
if dropped_rows:
    print(f"Skipped {dropped_rows} row(s) with missing human or AI text.")

# A missing prompt does not invalidate the pair; store it as None so it is
# written as JSON null instead of NaN.
instructions_column = [
    None if pd.isna(value) else value for value in df_pairs["instructions"]
]

# Build two records per source row: the human text (label 0) followed by the AI
# text (label 1), both carrying the shared prompt. Zipping the columns avoids
# materializing a Series per row as iterrows() does.
data = []
for human_text, ai_text, instructions in zip(
    df_pairs["human_text"], df_pairs["ai_text"], instructions_column
):
    data.append({"text": human_text, "instructions": instructions, "label": 0})
    data.append({"text": ai_text, "instructions": instructions, "label": 1})

# Save the processed data as JSON Lines (one JSON object per line)
outfile = "data/sample_5_percent.jsonl"
with open(outfile, "w", encoding="utf-8") as f:
    for d in data:
        # allow_nan=False makes any remaining NaN/Infinity raise instead of
        # silently producing a non-standard JSON file.
        json.dump(d, f, allow_nan=False)
        f.write("\n")

print(f"The dataset has been saved to {outfile} with the specified format.")

The dataset has been saved to data/sample_5_percent.jsonl with the specified format.


Check that every line of the JSONL file is valid JSON

In [38]:
import json

data_path = "data/sample_5_percent.jsonl"


def _reject_constant(token):
    """Fail on NaN / Infinity / -Infinity.

    Python's json module accepts these tokens by default, but they are not part
    of the JSON standard, so a file containing them is reported as invalid here.
    """
    raise ValueError(f"non-standard JSON constant {token!r}")


# Parse the file line by line, reporting every line that is not strict JSON
invalid_lines = 0
total_lines = 0
with open(data_path, 'r') as f:
    for i, line in enumerate(f, 1):
        total_lines = i
        try:
            json.loads(line, parse_constant=_reject_constant)
        except ValueError as e:
            # json.JSONDecodeError subclasses ValueError, so this covers both
            # ordinary syntax errors and the constants rejected above.
            invalid_lines += 1
            print(f"JSON decode error on line {i}: {e}")

# Always print a summary so a clean file is distinguishable from a check that never ran
print(f"Checked {total_lines} line(s) in {data_path}: {invalid_lines} invalid.")


In [None]:
!mkdir -p models/pythia-160m
!huggingface-cli download EleutherAI/pythia-160m --local-dir ./models/pythia-160m
!cd ../..

In [45]:
!wandb login 7077b7416aa6d8dd6e87ab0b9150b82abed30bd1

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Local directory holding the downloaded base checkpoint
model_path = "./models/pythia-160m"

# Tokenizer and a sequence-classification model with two output classes
# (0 = human, 1 = AI). The classification head is not part of the base
# checkpoint, so it starts from fresh weights and is learned during fine-tuning.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
)

# Fixed-length batches need a pad token. When the tokenizer does not define one,
# register "[PAD]" and resize the embedding matrix to the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token="[PAD]"))
    model.resize_token_embeddings(len(tokenizer))

# Tell the model which token id is padding
model.config.pad_token_id = tokenizer.pad_token_id

# Load the labeled JSONL records produced by the data-preparation cells
data_path = "data/sample_5_percent.jsonl"
dataset = load_dataset("json", data_files=data_path)

# Batch preprocessing: tokenization plus label mapping
def preprocess_function(examples):
    """Tokenize a batch of records and attach their classification labels.

    Args:
        examples: A batch with a "text" entry (the inputs to tokenize) and a
            "label" entry (the matching class ids).

    Returns:
        The tokenizer output for the batch, truncated and padded to 128 tokens,
        with the labels copied under the "labels" key.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    # Copy the class ids under the "labels" key alongside the tokenizer output
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize the dataset (batched=True passes many records per call to preprocess_function)
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Hold out 20% of the records for evaluation. The split is seeded so the same
# records end up in train/eval on every run; without a seed the partition changes
# each time and validation losses from different runs are not comparable.
split_datasets = tokenized_dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./models/pythia-160m-finetuned-classifier",
    overwrite_output_dir=True,
    num_train_epochs=3,
    # 8 samples per step x 4 accumulation steps = 32 samples per optimizer update (per device)
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    # Saving and evaluating on the same schedule lets each checkpoint be scored,
    # which load_best_model_at_end relies on.
    save_strategy="epoch",
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",
    save_total_limit=2,  # keep at most two checkpoints on disk
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,  # restore the best checkpoint when training ends
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the final model and its tokenizer side by side so the directory can be
# loaded on its own later
model.save_pretrained("./models/pythia-160m-finetuned-classifier")
tokenizer.save_pretrained("./models/pythia-160m-finetuned-classifier")

print("Model fine-tuning completed and saved to './models/pythia-160m-finetuned-classifier'")


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at ./models/pythia-160m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
