<a href="https://colab.research.google.com/github/jeffreylowzg/LLM_homework6/blob/jeffrey-commits/data_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U "huggingface_hub[cli]"
!pip install torch transformers[torch] numpy tqdm datasets peft accelerate

Download dataset and saves 5%

In [None]:
from datasets import load_dataset
import pandas as pd
import os

# Load the dataset from Hugging Face
dataset = load_dataset("dmitva/human_ai_generated_text", split="train")

# Calculate 5% of the dataset size
sample_size = int(0.05 * len(dataset))

# Sample 5% of the data
sampled_dataset = dataset.shuffle(seed=42).select(range(sample_size))

# Convert to pandas DataFrame for easier handling
df = pd.DataFrame(sampled_dataset)

# Ensure the 'data' directory exists
os.makedirs("data", exist_ok=True)

# Save to a CSV file in the 'data' folder
df.to_csv("data/sample_5_percent.csv", index=False)

print("5% of the dataset has been saved to 'data/sample_5_percent.csv'")


Read saved data and split into labels 0 (for human) and 1 (for ai)

In [None]:
import json

# Read the sampled CSV file
df = pd.read_csv("data/sample_5_percent.csv")

# Initialize an empty list to hold the new records
data = []

# Process each row to create two entries: one for human text, one for AI text
for _, row in df.iterrows():
    # Append the human text with label 0
    data.append({
        "text": row["human_text"],
        "instructions": row["instructions"],
        "label": 0
    })

    # Append the AI text with label 1
    data.append({
        "text": row["ai_text"],
        "instructions": row["instructions"],
        "label": 1
    })

# Save the processed data to a JSON file
outfile = "data/sample_5_percent.jsonl"
with open(outfile, "w") as f:
    for d in data:
        json.dump(d, f)
        f.write("\n")

print(f"The dataset has been saved to {outfile} with the specified format.")

split dataset into train and test

In [None]:
import json
from sklearn.model_selection import train_test_split

# Paths
original_data_path = "data/sample_5_percent.jsonl"
train_data_path = "data/train.jsonl"
test_data_path = "data/test.jsonl"

# Function to split JSONL file
def split_jsonl_file(input_path, train_path, test_path, test_size=0.2):
    with open(input_path, "r") as f:
        lines = [json.loads(line) for line in f]
    
    train_lines, test_lines = train_test_split(lines, test_size=test_size, random_state=42)
    
    # Save split datasets
    with open(train_path, "w") as train_file:
        for line in train_lines:
            train_file.write(json.dumps(line) + "\n")
    
    with open(test_path, "w") as test_file:
        for line in test_lines:
            test_file.write(json.dumps(line) + "\n")

if __name__ == "__main__":
    # Perform the split
    split_jsonl_file(original_data_path, train_data_path, test_data_path)
    print(f"Data split completed. Train: {train_data_path}, Test: {test_data_path}")


In [None]:
!mkdir -p models/pythia-160m
!huggingface-cli download EleutherAI/pythia-160m --local-dir ./models/pythia-160m
!cd ../..

In [None]:
!wandb login 7077b7416aa6d8dd6e87ab0b9150b82abed30bd1

train + evaluate (freeze 6 layers + lora r(16) alpha(32))

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score
import numpy as np

# Paths for train and test data
train_data_path = "data/train.jsonl"
test_data_path = "data/test.jsonl"

# Specify the local directory where the model was downloaded
model_path = "./models/pythia-160m"

# Load the tokenizer and model for sequence classification
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)  # Binary classification

# Add padding token if it doesn't exist and set it as the pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Resize model embeddings to match the new pad token

# Explicitly set pad_token_id in model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# LoRA Configuration
lora_config = LoraConfig(
    task_type="SEQ_CLS",   # Sequence classification
    inference_mode=False,
    r=16,                  # LoRA rank
    lora_alpha=32,         # Scaling factor
    lora_dropout=0.1       # Regularization
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

# Freeze the first few layers of GPT-NeoX
num_layers_to_freeze = 6  # Adjust based on model depth and dataset size

# For GPT-NeoX, transformer layers are in model.base_model.gpt_neox.layers
for layer in model.base_model.gpt_neox.layers[:num_layers_to_freeze]:
    for param in layer.parameters():
        param.requires_grad = False

# Always ensure the classification head and LoRA layers are trainable
model.print_trainable_parameters()  # Check trainable parameters

# Load the split datasets
train_dataset = load_dataset("json", data_files=train_data_path)["train"]
test_dataset = load_dataset("json", data_files=test_data_path)["train"]

# Preprocessing function for tokenization and label mapping
def preprocess_function(examples):
    # Tokenize the text
    inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = examples["label"]  # Use label for classification
    return inputs

# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Define a function to compute accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Take the highest probability class
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./models/pythia-160m-finetuned-classifier-lora",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    save_strategy="epoch",     # Save the model at the end of each epoch
    evaluation_strategy="epoch",
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    learning_rate=1e-4,        # Adjusted for PEFT
    fp16=True,                 # Enable mixed precision training if supported
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the final model
model.save_pretrained("./models/pythia-160m-finetuned-classifier-lora")
tokenizer.save_pretrained("./models/pythia-160m-finetuned-classifier-lora")

print("Model fine-tuning completed and saved to './models/pythia-160m-finetuned-classifier-lora'")

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")


Evaluation on untrained model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score
import numpy as np

# Specify the paths for the train and test datasets
train_data_path = "data/train.jsonl"
test_data_path = "data/test.jsonl"

# Specify the local directory where the model was downloaded
model_path = "./models/pythia-160m"

# Load the tokenizer and model for sequence classification
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)  # Binary classification

# Add padding token if it doesn't exist and set it as the pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))  # Resize model embeddings to match the new pad token

# Explicitly set pad_token_id in model configuration
model.config.pad_token_id = tokenizer.pad_token_id

# LoRA Configuration
lora_config = LoraConfig(
    task_type="SEQ_CLS",   # Sequence classification
    inference_mode=False,
    r=16,                  # LoRA rank
    lora_alpha=32,         # Scaling factor
    lora_dropout=0.1       # Regularization
)

# Wrap the model with LoRA
model = get_peft_model(model, lora_config)

# Load the train and test datasets
train_dataset = load_dataset("json", data_files=train_data_path)["train"]
test_dataset = load_dataset("json", data_files=test_data_path)["train"]

# Preprocessing function for tokenization and label mapping
def preprocess_function(examples):
    # Tokenize the text
    inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = examples["label"]  # Use label for classification
    return inputs

# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Define a function to compute accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Take the highest probability class
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

# Set up evaluation arguments
evaluation_args = TrainingArguments(
    output_dir="./models/pythia-160m-eval",
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    fp16=True,  # Enable mixed precision evaluation if supported
)

# Initialize the Trainer for evaluation only
trainer = Trainer(
    model=model,
    args=evaluation_args,
    train_dataset=tokenized_train_dataset,  # Optional: If you're training as well
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Evaluate the untrained model
eval_results = trainer.evaluate()
print(f"Evaluation Results (Untrained Model): {eval_results}")


printing generated outputs before classification head.

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
import numpy as np
from datasets import load_dataset
from peft import PeftModel

# Path to the fine-tuned model and test data
model_path = "./models/pythia-160m-finetuned-classifier-lora"
base_model_path = "./models/pythia-160m"  # Base pre-trained model path
test_data_path = "data/test.jsonl"

# Load the tokenizer from the fine-tuned directory
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Add padding token if not already defined
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_path,
    num_labels=2
)

# Resize the base model's embedding layer to match the tokenizer
base_model.resize_token_embeddings(len(tokenizer))

# Set the padding token ID in the model configuration
base_model.config.pad_token_id = tokenizer.pad_token_id

# Load the LoRA adapters into the resized base model
model = PeftModel.from_pretrained(base_model, model_path)

# Ensure the model is in evaluation mode
model.eval()

# Load the test dataset
test_dataset = load_dataset("json", data_files=test_data_path)["train"]

# Extract the text prompts from the dataset
test_prompts = test_dataset["text"][:10]  # Select only the first 10 inputs

# Tokenize the test prompts
inputs = tokenizer(
    test_prompts,
    padding=True,  # Enable padding
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Move tensors to the appropriate device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Pass the inputs through the model to get hidden states
with torch.no_grad():
    outputs = model.base_model(**inputs, output_hidden_states=True)
    # Extract the last hidden state (before the classification head)
    hidden_states = outputs.hidden_states[-1]  # Last layer's hidden states
    pooled_embeddings = hidden_states[:, 0, :]  # CLS token's embedding for each prompt

# Convert embeddings to a numpy array for saving
pooled_embeddings_np = pooled_embeddings.cpu().numpy()

# Save the embeddings and corresponding prompts to a JSON file
# output_data = {
#     "prompts": test_prompts,
#     "embeddings": pooled_embeddings_np.tolist()
# }

# output_file = "data/test_prompt_embeddings.json"
# with open(output_file, "w") as f:
#     json.dump(output_data, f, indent=4)
# print(f"Embeddings saved to {output_file}")

# Compute softmax values for pooled_embeddings_np[0]
softmax_values = torch.nn.functional.softmax(torch.tensor(pooled_embeddings_np[0]), dim=0)

# Print the softmax values
# print("Softmax values:", softmax_values.numpy())

# Compute and print the sum of the softmax values
softmax_sum = torch.sum(softmax_values)
print("Sum of softmax values:", softmax_sum.item())


Some weights of GPTNeoXForSequenceClassification were not initialized from the model checkpoint at ./models/pythia-160m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Softmax values: [3.24826234e-12 8.69767464e-05 5.82471184e-05 1.36960717e-03
 6.49074002e-13 9.44287924e-04 1.30623123e-02 6.04943954e-04
 4.03423473e-05 3.62395163e-04 5.42565038e-12 4.51100804e-03
 1.34562977e-12 1.21491175e-04 9.80132791e-14 2.46027320e-13
 2.13456856e-13 8.52983916e-12 1.23555888e-03 3.29034461e-13
 1.56991147e-02 3.37813445e-03 1.15255965e-03 2.02863463e-04
 1.21862123e-12 2.27833545e-04 2.07327027e-03 1.49468845e-03
 8.29757216e-13 1.96254742e-03 9.23577013e-12 9.46272357e-13
 1.79499617e-13 1.82690980e-11 2.33802630e-11 7.62412499e-04
 1.71059815e-04 2.30500824e-03 1.37877589e-12 2.41770037e-12
 1.06422039e-13 2.66644051e-11 7.44767692e-13 7.40908550e-12
 1.93687348e-14 8.14892986e-10 4.94206195e-12 1.99630018e-02
 1.11116329e-04 4.06794106e-06 3.84543309e-12 5.95168706e-12
 6.15305007e-13 1.25535546e-04 4.17864387e-04 1.28217031e-13
 7.72837794e-13 9.72644925e-13 4.00763005e-03 2.13686770e-04
 1.84353814e-02 2.01296946e-03 5.75011945e-04 1.24378374e-03
 4.69073

Temp workspace for generative output

In [None]:
import json
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import LoraConfig, get_peft_model
from transformers import DataCollatorForSeq2Seq

# Step 1: Load and Process Dataset
# Load the CSV file and convert it into a JSONL format with the required structure
df = pd.read_csv("data/sample_5_percent.csv")

# Initialize an empty list to hold the new records
data = []

# Process each row to create two entries: one for human text, one for AI text
for _, row in df.iterrows():
    # Append the human text with label 0
    if pd.notna(row["human_text"]):
        data.append({
            "text": row["human_text"],
            "instructions": row["instructions"],
            "label": 0
        })

    # Append the AI text with label 1
    if pd.notna(row["ai_text"]):
        data.append({
            "text": row["ai_text"],
            "instructions": row["instructions"],
            "label": 1
        })

# Save the processed data to a JSONL file
outfile = "data/sample_5_percent.jsonl"
with open(outfile, "w") as f:
    for d in data:
        json.dump(d, f)
        f.write("\n")

print(f"The dataset has been saved to {outfile} with the specified format.")

In [22]:
from datasets import Dataset  
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from transformers import PreTrainedTokenizerBase
from typing import Any, Dict, List
import torch
import json

# Step 1: Load the processed JSONL dataset into a Hugging Face Dataset
data_file = "data/sample_5_percent_labelled_generative.jsonl"
with open(data_file, "r") as f:
    json_data = [json.loads(line) for line in f]

dataset = Dataset.from_list(json_data)

# Step 2: Preprocess Data
def preprocess_function(examples):
    # Combine instructions and text for input
    inputs = [f"Instruction: {instruction}\nText: {text}" for instruction, text in zip(examples["instructions"], examples["text"])]
    # Generate natural language labels for output
    targets = ["This is AI-generated." if label == 1 else "This is human-written." for label in examples["label"]]
    return {"input_text": inputs, "target_text": targets}

processed_data = dataset.map(preprocess_function, batched=True)

# Step 3: Tokenize the Data
model_path = "./models/pythia-160m"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Check if pad_token is defined; if not, set it
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token

def tokenize_function(examples):
    inputs = examples["input_text"]
    targets = examples["target_text"]
    eos_token = tokenizer.eos_token if tokenizer.eos_token else tokenizer.sep_token

    # Concatenate inputs and targets with the EOS token
    full_texts = [input_text + eos_token + target_text + eos_token for input_text, target_text in zip(inputs, targets)]

    # Tokenize the concatenated texts without padding
    tokenized = tokenizer(
        full_texts,
        max_length=512,
        truncation=True,
        padding=False,  # Let the data collator handle padding
    )

    # Create labels: Copy of input_ids
    input_ids = tokenized["input_ids"]
    labels = []
    for ids in input_ids:
        labels.append(ids.copy())

    # Mask labels corresponding to the input tokens
    for i, ids in enumerate(input_ids):
        # Find the position of the first EOS token (end of input)
        try:
            eos_position = ids.index(tokenizer.eos_token_id)
            # Mask input tokens
            labels[i][:eos_position + 1] = [-100] * (eos_position + 1)
        except ValueError:
            # If EOS token not found, mask the entire sequence
            labels[i] = [-100] * len(ids)

    tokenized["labels"] = labels
    return tokenized

# Tokenize the dataset with batched=True
tokenized_data = processed_data.map(tokenize_function, batched=True, remove_columns=processed_data.column_names)

# Verify tokenized data
print("Sample tokenized data:", tokenized_data[0])

# Split the dataset into training and validation sets
train_test_split = tokenized_data.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Step 4: Load the Model with LoRA
# Load base model
model = AutoModelForCausalLM.from_pretrained(model_path)

# LoRA Configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=16,                  # LoRA rank
    lora_alpha=32,         # Scaling factor
    lora_dropout=0.1       # Regularization
)

# Wrap the model with LoRA
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,   
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    fp16=True,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

# Define Custom Data Collator
class DataCollatorForCausalLM:
    def __init__(self, tokenizer: PreTrainedTokenizerBase, padding: bool = True):
        self.tokenizer = tokenizer
        self.padding = padding
        self.label_pad_token_id = -100  # Token ID to use for label padding

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Separate the labels from the features
        labels = [feature.pop('labels') for feature in features]  # Remove 'labels' from features

        # Pad the inputs (input_ids and attention_mask)
        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            return_tensors='pt',
        )

        # Pad the labels manually to match the length of input_ids
        max_length = batch['input_ids'].size(1)
        padded_labels = []
        for label in labels:
            # Truncate or pad the label
            label = label[:max_length]
            padding_length = max_length - len(label)
            label += [self.label_pad_token_id] * padding_length
            padded_labels.append(label)

        batch['labels'] = torch.tensor(padded_labels, dtype=torch.long)

        return batch

data_collator = DataCollatorForCausalLM(
    tokenizer=tokenizer,
    padding=True,
)

# Step 6: Create and Train the Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Verify the data collator
sample_features = [tokenized_data[i] for i in range(2)]
collated_batch = data_collator(sample_features)
print("Input IDs shape:", collated_batch['input_ids'].shape)
print("Labels shape:", collated_batch['labels'].shape)

# Train the model
trainer.train()

# Save the LoRA-adapted model
peft_model.save_pretrained("./peft_lora_model")
tokenizer.save_pretrained("./peft_lora_model")

print("Training complete. Model saved at './peft_lora_model'.")


Map: 100%|██████████| 100000/100000 [00:00<00:00, 123206.82 examples/s]
Map: 100%|██████████| 100000/100000 [00:22<00:00, 4418.85 examples/s]


Sample tokenized data: {'input_ids': [10548, 2705, 27, 17526, 27, 17526, 27, 10639, 271, 16555, 326, 33826, 253, 5373, 273, 3192, 5971, 275, 247, 20874, 347, 10066, 281, 3192, 731, 3909, 15, 187, 4312, 27, 3808, 3484, 1379, 3909, 5971, 597, 1158, 310, 1175, 281, 1379, 625, 5971, 685, 275, 20874, 15, 187, 187, 6436, 13, 672, 597, 878, 281, 1614, 253, 789, 285, 597, 403, 10960, 275, 1895, 597, 476, 417, 1089, 9732, 281, 1361, 731, 13, 390, 253, 2813, 556, 7154, 13, 594, 3484, 597, 476, 417, 1614, 253, 789, 15, 187, 187, 8430, 690, 952, 1158, 3484, 651, 5649, 432, 1146, 2104, 281, 8041, 5971, 432, 1728, 13, 352, 310, 2686, 2032, 326, 3484, 651, 417, 5649, 432, 1146, 2104, 281, 8041, 5971, 432, 1728, 984, 627, 403, 642, 9732, 323, 1361, 731, 285, 690, 3484, 597, 513, 417, 452, 8573, 387, 1728, 15, 187, 187, 3039, 253, 3484, 1089, 247, 1895, 597, 588, 878, 9732, 323, 1361, 731, 281, 1614, 253, 789, 1078, 253, 20639, 15, 187, 187, 6436, 13, 604, 627, 403, 642, 9732, 323, 7729, 731, 13, 840, 

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Input IDs shape: torch.Size([2, 504])
Labels shape: torch.Size([2, 504])


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 