# Fine-tuning a model

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset

from transformers import (
  Trainer,TrainingArguments
)

from transformers import pipeline

from transformers import AutoModelForCausalLM, AutoProcessor

from PIL import Image

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## Preparing a dataset

In [None]:
# Checkpoint shared by the classifier and its tokenizer
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the English Wikipedia train split and keep only its first three rows
dataset = load_dataset(
    "wikimedia/wikipedia", "20231101.en", split="train"
).select(range(3))

# Load the tokenizer and the model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


def _tokenize_row(row):
    """Tokenize the ``text`` field of one row, truncating at 512 tokens."""
    return tokenizer(row["text"], padding=True, max_length=512, truncation=True)


# Apply the tokenizer row by row, keeping the result in memory
dataset = dataset.map(_tokenize_row, keep_in_memory=True)

## Building the trainer

In [None]:
# Single source of truth for the checkpoint used by tokenizer and model
model_name = "google-bert/bert-base-cased"

# Load the Yelp review dataset (train and test splits are used below)
dataset = load_dataset("yelp_review_full")

tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize a batch of reviews, padding/truncating to the model max length."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Tiny, reproducibly shuffled subsets so the demo trains quickly
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(10))

# Classification head with five output labels
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)

training_args = TrainingArguments(output_dir=".results")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

trainer.train()

# Write the final weights/config and the tokenizer to the top level of the
# output directory. Without this, the directory is not guaranteed to contain
# a model that `from_pretrained` / `pipeline(model=".results")` can load
# (intermediate checkpoints, if any, go to `checkpoint-N` subfolders).
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

## Using the fine-tuned model

In [None]:
text_example = "I am a HUGE fan of romantic comedies."

# Create the classifier from the model stored in the local ".results" directory
classifier = pipeline(task="sentiment-analysis", model=".results")

# Classify the text. The input is passed positionally: the pipeline's
# documented first parameter is `inputs`, and `text=` is not a keyword it
# defines, so the keyword form can fail with a missing-argument TypeError.
results = classifier(text_example)

print(results)

# Text generation

## Generating text from a text prompt

In [None]:
# Set model name
model_name = "gpt2"

# Get the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = "Wear sunglasses when its sunny because"

# Tokenize the input; calling the tokenizer (rather than `encode`) also
# returns the attention mask that `generate` expects
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"]

# Generate the text output. The attention mask and pad token are passed
# explicitly: GPT-2 defines no pad token, so without them `generate` has to
# guess both and emits warnings.
output = model.generate(
    input_ids,
    attention_mask=inputs["attention_mask"],
    pad_token_id=tokenizer.eos_token_id,
    num_return_sequences=1,
)

# Decode the output
generated_text = tokenizer.decode(output[0])

print(generated_text)

## Generating a caption from an image

In [None]:
image = Image.open("images/profile.jpeg")

# Get the processor and model
processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

# Process the image into the pixel tensor the model consumes
pixels = processor(images=image, return_tensors="pt").pixel_values

# Generate the caption token ids
output = model.generate(pixel_values=pixels)

# Decode the output, dropping tokenizer special tokens so that only the
# caption text itself is printed
caption = processor.batch_decode(output, skip_special_tokens=True)

print(caption[0])

# Embeddings

## Generate embeddings for a sentence

In [2]:
sentence = "Programmers, do you put your comments (before|after) the related code?"

# Instantiate both embedding models up front
embedder1 = SentenceTransformer("all-MiniLM-L6-v2")
embedder2 = SentenceTransformer("sentence-transformers/paraphrase-albert-small-v2")

# Encode the same one-sentence batch with each model
embedding1 = embedder1.encode([sentence])
embedding2 = embedder2.encode([sentence])

# Report whether the two models produced arrays of identical shape
shapes_match = embedding1.shape == embedding2.shape
print(shapes_match)



False
