In [14]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://download.pytorch.org/whl/nightly/cpu
Collecting torchvision
  Downloading https://download.pytorch.org/whl/nightly/cpu/torchvision-0.20.0.dev20241126-cp311-cp311-macosx_11_0_arm64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.5.0.dev20241126-cp311-cp311-macosx_11_0_arm64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m52.0 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading https://download.pytorch.org/whl/nightly/cpu/torch-2.6.0.dev20241126-cp311-none-macosx_11_0_arm64.whl (66.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.0/66.0 MB[0m [31m76.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pillow!=8.3.*,>=5.3.0 (from torchvision)
  Downloading https://download.pytorch.org

In [None]:
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

# Step 1: Prepare the Dataset
# Load the JSONL dataset
data_files = {"train": "training_formatted.jsonl"}  # Replace with your JSONL file
dataset = load_dataset("json", data_files=data_files)

# Split into train and validation datasets
split_datasets = dataset["train"].train_test_split(test_size=0.1)
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]

# Step 2: Define Label Mapping
label_mapping = {
    "hate speech": 0,
    "spam": 1,
    "explicit material": 2,
    "misinformation": 3
}

# Map string labels to integers
def map_labels(example):
    example["label"] = label_mapping[example["messages"][2]["content"]]
    return example

# Apply label mapping
train_dataset = train_dataset.map(map_labels)
val_dataset = val_dataset.map(map_labels)

# Step 3: Tokenize the Dataset
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define tokenization function
def tokenize_function(example):
    return tokenizer(
        example["messages"][1]["content"],  # User input (prompt)
        truncation=True,
        padding="max_length",
        max_length=512
    )

# Tokenize the datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# Rename label column for Hugging Face models
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_val = tokenized_val.rename_column("label", "labels")

# Set dataset format to PyTorch
tokenized_train.set_format("torch")
tokenized_val.set_format("torch")

# Step 4: Define the Model
# Load a pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_mapping))

# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to="none"
)

# Define data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 6: Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Step 7: Train the Model
trainer.train()

# Step 8: Evaluate the Model
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

# Step 9: Save the Model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Step 10: Inference
from transformers import pipeline

# Load the fine-tuned model for inference
classifier = pipeline("text-classification", model="./fine_tuned_model", tokenizer="./fine_tuned_model")

# Test the classifier
test_input = "Glad my work doesn't cater for people with disabilities."
prediction = classifier(test_input)
print("Prediction:", prediction)


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 661 examples [00:00, 143352.38 examples/s]
Map: 100%|██████████| 594/594 [00:00<00:00, 25762.53 examples/s]
Map: 100%|██████████| 67/67 [00:00<00:00, 11955.18 examples/s]
Map:   0%|          | 0/594 [00:00<?, ? examples/s]


TypeError: list indices must be integers or slices, not str

In [1]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

# Step 1: Load the JSON Dataset
# Replace 'input.json' with your JSON file path
input_file = "prompt_with_completion.json"  # Your file path
with open(input_file, "r") as f:
    data = json.load(f)

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(data)

# Split into train and validation datasets
split_datasets = dataset.train_test_split(test_size=0.1)
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]

# Step 2: Define Label Mapping
label_mapping = {
    "hate speech": 0,
    "spam": 1,
    "explicit material": 2,
    "misinformation": 3
}

# Map string labels to integers
def map_labels(example):
    example["label"] = label_mapping[example["completion"]]
    return example

# Apply label mapping
train_dataset = train_dataset.map(map_labels)
val_dataset = val_dataset.map(map_labels)

# Step 3: Tokenize the Dataset
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define tokenization function
def tokenize_function(example):
    return tokenizer(
        example["prompt"],  # Tokenize the prompt
        truncation=True,
        padding="max_length",
        max_length=512
    )

# Tokenize the datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# Rename label column for Hugging Face models
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_val = tokenized_val.rename_column("label", "labels")

# Set dataset format to PyTorch
tokenized_train.set_format("torch")
tokenized_val.set_format("torch")

# Step 4: Define the Model
# Load a pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_mapping))

# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True
)

# Define data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 6: Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Step 7: Train the Model
trainer.train()

# Step 8: Evaluate the Model
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

# Step 9: Save the Model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Step 10: Inference
from transformers import pipeline

# Load the fine-tuned model for inference
classifier = pipeline("text-classification", model="./fine_tuned_model", tokenizer="./fine_tuned_model")

# Test the classifier
test_input = "Glad my work doesn't cater for people with disabilities."
prediction = classifier(test_input)
print("Prediction:", prediction)


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 504/504 [00:00<00:00, 28927.03 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 15318.47 examples/s]
Map: 100%|██████████| 504/504 [00:00<00:00, 9726.55 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 6997.26 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
 10%|█         | 10/96 [02:35<20:57, 14.63s/it]

{'loss': 0.8114, 'grad_norm': nan, 'learning_rate': 1.7916666666666667e-05, 'epoch': 0.31}


 21%|██        | 20/96 [05:05<19:04, 15.06s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.5833333333333333e-05, 'epoch': 0.62}


 31%|███▏      | 30/96 [07:41<16:56, 15.41s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.375e-05, 'epoch': 0.94}


                                               
 33%|███▎      | 32/96 [08:12<15:33, 14.59s/it]

{'eval_loss': nan, 'eval_runtime': 3.1175, 'eval_samples_per_second': 18.284, 'eval_steps_per_second': 1.283, 'epoch': 1.0}


 42%|████▏     | 40/96 [10:36<17:10, 18.41s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.1666666666666668e-05, 'epoch': 1.25}


 52%|█████▏    | 50/96 [13:56<16:06, 21.01s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 9.583333333333335e-06, 'epoch': 1.56}


 62%|██████▎   | 60/96 [17:39<11:41, 19.47s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 7.500000000000001e-06, 'epoch': 1.88}


                                               
 67%|██████▋   | 64/96 [18:51<09:08, 17.13s/it]

{'eval_loss': nan, 'eval_runtime': 2.7458, 'eval_samples_per_second': 20.759, 'eval_steps_per_second': 1.457, 'epoch': 2.0}


 73%|███████▎  | 70/96 [20:43<08:22, 19.31s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5.416666666666667e-06, 'epoch': 2.19}


 83%|████████▎ | 80/96 [24:10<05:23, 20.20s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}


 94%|█████████▍| 90/96 [27:29<01:58, 19.74s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.25e-06, 'epoch': 2.81}


                                               
100%|██████████| 96/96 [29:34<00:00, 18.39s/it]

{'eval_loss': nan, 'eval_runtime': 2.7214, 'eval_samples_per_second': 20.945, 'eval_steps_per_second': 1.47, 'epoch': 3.0}


100%|██████████| 96/96 [29:37<00:00, 18.52s/it]


{'train_runtime': 1777.6174, 'train_samples_per_second': 0.851, 'train_steps_per_second': 0.054, 'train_loss': 0.08452354868253072, 'epoch': 3.0}


100%|██████████| 4/4 [00:01<00:00,  2.07it/s]


Evaluation Results: {'eval_loss': nan, 'eval_runtime': 2.711, 'eval_samples_per_second': 21.026, 'eval_steps_per_second': 1.475, 'epoch': 3.0}


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Prediction: [{'label': 'LABEL_0', 'score': nan}]


In [10]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.metrics import classification_report

# Step 1: Load the Test Data
# Replace 'test.json' with the path to your test file
test_file = "test.json"
with open(test_file, "r") as f:
    test_data = json.load(f)

# Convert to Hugging Face Dataset
test_dataset = Dataset.from_list(test_data)

# Step 2: Load the Fine-Tuned Model and Tokenizer
model_path = "./fine_tuned_model"  # Path to your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Step 3: Tokenize the Test Data
# Define label mapping used during training
label_mapping = {
    "hate speech": 0,
    "spam": 1,
    "explicit material": 2,
    "misinformation": 3
}
# Reverse mapping for predictions
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Tokenize the test dataset
def tokenize_function(example):
    return tokenizer(
        example["prompt"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenized_test = test_dataset.map(tokenize_function, batched=True)
tokenized_test.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Step 4: Run Predictions
# Initialize the pipeline for classification
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=False)

# Make predictions
predictions = []
true_labels = []

for entry in test_data:
    prompt = entry["prompt"]
    true_label = entry["completion"]
    result = classifier(prompt)[0]  # Get the top prediction
    predicted_label = reverse_label_mapping[int(result["label"][-1])]
    
    predictions.append(predicted_label)
    true_labels.append(true_label)

# Step 5: Evaluate Results
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=label_mapping.keys()))


Map: 100%|██████████| 100/100 [00:00<00:00, 7980.33 examples/s]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



Classification Report:
                   precision    recall  f1-score   support

      hate speech       0.00      0.00      0.00        13
             spam       0.46      1.00      0.63        46
explicit material       0.00      0.00      0.00        13
   misinformation       0.00      0.00      0.00        28

         accuracy                           0.46       100
        macro avg       0.12      0.25      0.16       100
     weighted avg       0.21      0.46      0.29       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
import json
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import pipeline
from sklearn.metrics import classification_report

# Step 1: Check Device (MPS or CPU)
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using GPU: Metal Performance Shaders (MPS)")
else:
    device = torch.device("cpu")
    print("Using CPU")

# Step 2: Load the JSON Dataset
# Replace 'input.json' with your JSON file path
input_file = "prompt_with_completion.json"  # Your file path
with open(input_file, "r") as f:
    data = json.load(f)

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(data)

# Split into train and validation datasets
split_datasets = dataset.train_test_split(test_size=0.1)
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]

# Step 3: Define Label Mapping
label_mapping = {
    "hate speech": 0,
    "spam": 1,
    "explicit material": 2,
    "misinformation": 3
}

# Map string labels to integers
def map_labels(example):
    example["label"] = label_mapping[example["completion"]]
    return example

# Apply label mapping
train_dataset = train_dataset.map(map_labels)
val_dataset = val_dataset.map(map_labels)

# Step 4: Tokenize the Dataset
# Load a larger tokenizer
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Define tokenization function
def tokenize_function(example):
    return tokenizer(
        example["prompt"],  # Tokenize the prompt
        truncation=True,
        padding="max_length",
        max_length=512
    )

# Tokenize the datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# Rename label column for Hugging Face models
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_val = tokenized_val.rename_column("label", "labels")

# Set dataset format to PyTorch
tokenized_train.set_format("torch")
tokenized_val.set_format("torch")

# Step 5: Define the Model
# Load a larger pre-trained model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=len(label_mapping)).to(device)

# Step 6: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results_roberta_large",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust batch size for MPS
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_roberta_large",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    #fp16=False,  # Mixed precision not supported on MPS
    dataloader_num_workers=2  # Reduce worker count to avoid overload
)

# Define data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 7: Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Step 8: Train the Model
trainer.train()

# Step 9: Evaluate the Model
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

# Step 10: Save the Model
model.save_pretrained("./fine_tuned_roberta_large")
tokenizer.save_pretrained("./fine_tuned_roberta_large")

# Step 11: Inference
# Load the fine-tuned model for inference
classifier = pipeline(
    "text-classification",
    model="./fine_tuned_roberta_large",
    tokenizer="./fine_tuned_roberta_large",
    device=0 if torch.backends.mps.is_available() else -1
)

# Test the classifier
test_input = "Glad my work doesn't cater for people with disabilities."
prediction = classifier(test_input)
print("Prediction:", prediction)

# Step 12: Additional Evaluation with Test Data
test_file = "test.json"  # Replace with your test file path
with open(test_file, "r") as f:
    test_data = json.load(f)

test_dataset = Dataset.from_list(test_data)

true_labels = []
predictions = []

for entry in test_data:
    prompt = entry["prompt"]
    true_label = entry["completion"]
    pred = classifier(prompt)[0]  # Get the top prediction
    predictions.append(pred["label"])
    true_labels.append(true_label)

# Print classification report
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=label_mapping.keys()))


Using GPU: Metal Performance Shaders (MPS)


Map: 100%|██████████| 504/504 [00:00<00:00, 26389.81 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 14522.86 examples/s]
Map: 100%|██████████| 504/504 [00:00<00:00, 11973.48 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 8419.63 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


ValueError: fp16 mixed precision requires a GPU (not 'mps').

Map:   0%|          | 0/504 [00:00<?, ? examples/s]

Map: 100%|██████████| 504/504 [00:00<00:00, 29327.95 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 13383.08 examples/s]
Map: 100%|██████████| 504/504 [00:00<00:00, 24199.29 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 14268.04 examples/s]
  trainer = Trainer(


ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [21]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Step 1: Load the Dataset
input_file = "prompt_with_completion.json"  # Replace with your JSON file path
with open(input_file, "r") as f:
    data = json.load(f)

# Convert to Hugging Face Dataset
dataset = Dataset.from_list(data)

# Split into train and validation datasets
split_datasets = dataset.train_test_split(test_size=0.1)
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]

# Step 2: Define Label Mapping
label_mapping = {
    "hate speech": 0,
    "spam": 1,
    "explicit material": 2,
    "misinformation": 3
}

# Map labels to integers
def map_labels(example):
    example["label"] = label_mapping[example["completion"]]
    return example

# Apply label mapping
train_dataset = train_dataset.map(map_labels)
val_dataset = val_dataset.map(map_labels)

# Step 3: Tokenize the Dataset
# Load GPT-2 Large tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

# Add a padding token if not already present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Define tokenization function
def tokenize_function(example):
    inputs = f"Prompt: {example['prompt']} Completion: {example['completion']}"
    return tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=504
    )

# Tokenize the training dataset
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

# Tokenize the validation dataset
tokenized_val = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names
)

# Step 4: Load the GPT-2 Model
model = GPT2LMHeadModel.from_pretrained("gpt2-large")

# Resize token embeddings to accommodate the new padding token
model.resize_token_embeddings(len(tokenizer))

# Ensure the model runs on CPU
model.to("cpu")

# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results_gpt2_large",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_gpt2_large",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=False,  # Ensure mixed precision is disabled
    dataloader_num_workers=1,  # Reduce workers for CPU compatibility
    report_to="none"  # Disable reporting to external tools like W&B
)

# Define a data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Step 6: Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Step 7: Train the Model
trainer.train()

# Step 8: Save the Model
model.save_pretrained("./fine_tuned_gpt2_large")
tokenizer.save_pretrained("./fine_tuned_gpt2_large")

# Step 9: Inference on CPU
from transformers import pipeline

classifier = pipeline(
    "text-generation",
    model="./fine_tuned_gpt2_large",
    tokenizer="./fine_tuned_gpt2_large",
    device=-1  # Force CPU for inference
)

# Test the classifier
test_input = "Prompt: Glad my work doesn't cater for people with disabilities. Completion:"
prediction = classifier(test_input, max_length=100, num_return_sequences=1)
print("Prediction:", prediction[0]["generated_text"])


Map: 100%|██████████| 504/504 [00:00<00:00, 25328.95 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 16092.85 examples/s]
Map: 100%|██████████| 504/504 [00:00<00:00, 22391.89 examples/s]
Map: 100%|██████████| 57/57 [00:00<00:00, 16020.59 examples/s]
  trainer = Trainer(


ValueError: fp16 mixed precision requires a GPU (not 'mps').