In [1]:
!pip install transformers datasets
!pip install accelerate



In [2]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, load_metric

# Load dataset
dataset = load_dataset("glue", "mrpc")
metric = load_metric("glue", "mrpc", trust_remote_code=True)

# Load model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize inputs
def tokenize_function(examples):
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",  # Pad to the maximum length
        truncation=True,
        max_length=128  # Adjust this based on data
    )


tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["sentence1", "sentence2", "idx"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  metric = load_metric("glue", "mrpc", trust_remote_code=True)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
tokenized_dataset["train"].features

{'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [4]:
for example in dataset["train"].select(range(5)):
    print(example)

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'sentence1': "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .", 'sentence2': "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .", 'label': 0, 'idx': 1}
{'sentence1': 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .', 'sentence2': "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .", 'label': 1, 'idx': 2}
{'sentence1': 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .', 'sentence2': 'Tab shares jumped 20 cents , or 4.6 % , to set a rec

In [5]:

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
)

# Trainer
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Train
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.424708,0.828431,0.882943
2,0.539500,0.547247,0.835784,0.88225
3,0.294300,0.801185,0.843137,0.891525


TrainOutput(global_step=1377, training_loss=0.33282347701993126, metrics={'train_runtime': 301.2975, 'train_samples_per_second': 36.522, 'train_steps_per_second': 4.57, 'total_flos': 723818513295360.0, 'train_loss': 0.33282347701993126, 'epoch': 3.0})

In [7]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, load_metric

# Load dataset
dataset = load_dataset("squad_v2", split="train[:1%]")  # Use a small subset for demonstration
metric = load_metric("squad_v2")

# Load model and tokenizer
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize inputs
def tokenize_function(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=384,
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Add start and end positions to the tokenized dataset
def add_token_positions(example):
    example["start_positions"] = example["answers"]["answer_start"][0]
    example["end_positions"] = example["answers"]["answer_start"][0] + len(example["answers"]["text"][0])
    return example

tokenized_dataset = tokenized_dataset.map(add_token_positions)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train
trainer.train()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1303 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,5.724277
2,No log,4.988701
3,No log,4.410675


TrainOutput(global_step=489, training_loss=5.504108452358129, metrics={'train_runtime': 200.1333, 'train_samples_per_second': 19.532, 'train_steps_per_second': 2.443, 'total_flos': 383042226018816.0, 'train_loss': 5.504108452358129, 'epoch': 3.0})

## Alternative Configuration Showing Different Logging Approaches

`training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    logging_strategy="steps",  # Log every few steps
    logging_steps=50,  # Log every 50 steps
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    report_to="all",  # Report to all supported platforms
)`

**Additional Notes:**
For 1,000 training instances and a batch size of 100, one epoch contains 10 steps (1,000 examples / 100 = 10 batches).
Logging by steps: Setting logging_strategy to "steps" and logging_steps to 5 would log after every 5 steps, so you'd log twice per epoch with a batch size of 100 and 10 steps.
Logging by epochs: Setting logging_strategy to "epoch" would log once at the end of each epoch, or after 10 steps.

# GPT-2 Fine-Tune

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# This could be anything, just a placeholder to show the data structure
data = {"text": ["Hello, how are you?", "I am fine, thank you.", "What about you?"]}
dataset = Dataset.from_dict(data)

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize inputs
def tokenize_function(examples):
    encoding = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=32,
    )
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Train
trainer.train()



Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=6, training_loss=3.799689610799154, metrics={'train_runtime': 0.4983, 'train_samples_per_second': 18.06, 'train_steps_per_second': 12.04, 'total_flos': 146976768000.0, 'train_loss': 3.799689610799154, 'epoch': 3.0})

# GPT 3.5 (Not Open Source, Rates Apply)

In [None]:
!pip install openai

In [None]:
import openai

# 1. Set your OpenAI API key
openai.api_key = 'your-api-key'

# 2. Upload the file
file_response = openai.File.create(
    file=open("dataset.jsonl", "rb"),
    purpose="fine-tune"
)
file_id = file_response['id']

# 3. Create a fine-tuning job
fine_tune_response = openai.FineTuningJob.create(
    training_file=file_id,
    model="gpt-3.5-turbo"
)

# 4. Monitor the fine-tuning job
job_id = fine_tune_response['id']
status = fine_tune_response['status']

while status not in ["succeeded", "failed"]:
    job_response = openai.FineTuningJob.retrieve(job_id)
    status = job_response['status']
    print(f"Job status: {status}")
    if status == "succeeded":
        fine_tuned_model = job_response['fine_tuned_model']
        print(f"Fine-tuned model ID: {fine_tuned_model}")
    elif status == "failed":
        print("Fine-tuning job failed.")
    else:
        time.sleep(30)

# 5. Use the fine-tuned model
if status == "succeeded":
    chat_response = openai.ChatCompletion.create(
        model=fine_tuned_model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is Dr. Joel Kowalewski's position?"}
        ]
    )
    print(chat_response['choices'][0]['message']['content'])
