# Computer vision

## Image classification

In [None]:
from datasets import load_dataset

# Fetch the Flickr30k dataset and keep a single image from its 'test' split;
# the cells below reuse `image` as their running example.
dataset = load_dataset("nlphuji/flickr30k")
test_split = dataset['test']
image = test_split[134]["image"]

In [None]:
import matplotlib.pyplot as plt

# Render the example image selected above.
plt.imshow(image) 

In [None]:
from transformers import pipeline

# Image-classification pipeline backed by the MobileNetV2 checkpoint.
pipe = pipeline(
    task="image-classification",
    model="google/mobilenet_v2_1.0_224",
)
pred = pipe(image)

# Report the label of the first returned entry
# (presumably the highest-scoring class — confirm against the pipeline docs).
first_entry = pred[0]
print("Predicted class:", first_entry['label'])

## Object detection

In [None]:
# Object-detection pipeline using the DETR ResNet-50 checkpoint at its
# "no_timm" revision.
pipe = pipeline(
    task="object-detection",
    model="facebook/detr-resnet-50",
    revision="no_timm",
)
# threshold=0.95 is handed to the pipeline (presumably a minimum score for
# returned detections — confirm against the pipeline docs).
outputs = pipe(image, threshold=0.95)

# One summary line per detection: label, score and box corners.
for obj in outputs:
    box = obj['box']
    print(f"Detected {obj['label']} with confidence {obj['score']:.2f} at ({box['xmin']}, {box['ymin']}) to ({box['xmax']}, {box['ymax']})")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Overlay every detected bounding box on top of the image, one colour per box.
ax = plt.gca()
colors = ['r', 'g', 'b', 'y', 'm', 'c', 'k']
plt.imshow(image)
for n, obj in enumerate(outputs):
    box = obj['box']
    rect = patches.Rectangle(
        (box['xmin'], box['ymin']),       # top-left corner
        box['xmax'] - box['xmin'],        # width
        box['ymax'] - box['ymin'],        # height
        linewidth=1,
        # Cycle through the palette so that more detections than colours
        # reuse colours instead of raising IndexError.
        edgecolor=colors[n % len(colors)],
        facecolor='none',
    )
    ax.add_patch(rect)
plt.show()

## Image background removal

In [None]:
# Segmentation pipeline using the RMBG-1.4 checkpoint.
# NOTE: trust_remote_code=True allows code shipped in the model repository to
# run locally; only enable it for repositories you trust.
pipe = pipeline(
    task="image-segmentation",
    model="briaai/RMBG-1.4",
    trust_remote_code=True,
)
outputs = pipe(image)

# Display whatever the pipeline returned for the example image.
plt.imshow(outputs)
plt.show()

# Fine-tuning computer vision models

## CV fine-tuning: dataset prep

In [None]:
from datasets import load_dataset

# Load the classification dataset, keep its 'train' split, and carve a
# reproducible 80/20 train/test partition out of it.
dataset = load_dataset("ideepankarsharma2003/Midjourney_v6_Classification_small_shuffled")
dataset = dataset['train']
data_splits = dataset.train_test_split(
    test_size=0.2,
    seed=42,
)

In [None]:
from transformers import AutoImageProcessor

checkpoint = "google/mobilenet_v2_1.0_224"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

from torchvision.transforms import Compose, Normalize, ToTensor

# Normalise with the mean/std stored on the checkpoint's image processor.
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
transform = Compose([ToTensor(), normalize])


def transforms(examples):
    """Convert a batch of images into normalised tensors.

    Each entry of ``examples["image"]`` is converted to RGB and passed through
    ``transform``; the results are stored under ``"pixel_values"`` and the
    original ``"image"`` entry is removed. The mutated batch is returned.
    """
    examples["pixel_values"] = [transform(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples


# Attach the transform to the train/test splits (not the unsplit dataset):
# later cells index the result with dataset["train"] and dataset["test"].
dataset = data_splits.with_transform(transforms)

In [None]:
import matplotlib.pyplot as plt

# Take the first transformed training example and move its leading axis to the
# end before plotting (presumably channels-first -> channels-last — confirm).
first_pixels = dataset["train"][0]["pixel_values"]
plt.imshow(first_pixels.permute(1, 2, 0))
plt.show()

## CV fine-tuning: model classes

In [None]:
# Class names declared on the "label" feature of the training split.
labels = data_splits["train"].features["label"].names

# Two lookup tables between class names and their positions; the positions
# are stored as strings in both directions.
id2label = {str(position): name for position, name in enumerate(labels)}
label2id = {name: str(position) for position, name in enumerate(labels)}

In [None]:
from transformers import AutoModelForImageClassification 

# Settings for the classification head: one output per dataset label plus the
# name/id lookup tables. ignore_mismatched_sizes=True is forwarded as well
# (presumably so a head of a different size can be loaded — confirm).
head_options = dict(
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
model = AutoModelForImageClassification.from_pretrained(checkpoint, **head_options)

## CV fine-tuning: trainer configuration

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for fine-tuning the image classifier.
training_args = TrainingArguments(
    output_dir="dataset_finetune",
    learning_rate=6e-5,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    # The dataset builds "pixel_values" on the fly from its "image" column.
    # By default the Trainer drops columns the model's forward() does not
    # accept, which would remove "image" before the transform can read it.
    remove_unused_columns=False,
    push_to_hub=False,
)

In [None]:
from transformers import Trainer, DefaultDataCollator

import numpy as np


def compute_metrics(eval_pred):
    """Return classification accuracy for a Trainer evaluation/prediction run.

    ``eval_pred`` carries the model logits in ``predictions`` and the reference
    class ids in ``label_ids``. The Trainer prefixes the returned key, so a
    ``predict`` call exposes it as ``"test_accuracy"``.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


data_collator = DefaultDataCollator()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=image_processor,
    data_collator=data_collator,
    # Without this, metrics contain no accuracy entry and the later
    # predictions.metrics["test_accuracy"] lookup raises KeyError.
    compute_metrics=compute_metrics,
)

In [None]:
# Run the model on the test split first (this happens before trainer.train(),
# presumably to record a pre-fine-tuning baseline — confirm the order is intended).
# NOTE(review): "test_accuracy" is only present in `metrics` when the Trainer
# was built with a compute_metrics function returning an "accuracy" entry — confirm.
predictions = trainer.predict(dataset["test"])
predictions.metrics["test_accuracy"]

# Launch fine-tuning with the configured Trainer.
trainer.train()

# Speech recognition and audio generation

## Automatic speech recognition

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# The processor and the model are both loaded from the same Whisper checkpoint.
whisper_checkpoint = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(whisper_checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)

In [None]:
from datasets import load_dataset, Audio

# Load the VCTK training split and cast its audio column to 16 kHz.
dataset = load_dataset("CSTR-Edinburgh/vctk")["train"].cast_column(
    "audio", Audio(sampling_rate=16_000)
)

# Transcribe the first recording: featurise, generate token ids, decode to text.
sample = dataset[0]["audio"]
input_preprocessed = processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
)
predicted_ids = model.generate(input_preprocessed.input_features)
transcription = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=True,
)
print(transcription)

## Creating speech embeddings

In [None]:
import torch
from speechbrain.inference.speaker import EncoderClassifier

# Speaker encoder used to turn a waveform into a speaker embedding.
speaker_model = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb")

# Inference only, so no autograd graph is needed (mirrors prepare_dataset).
with torch.no_grad():
    speaker_embeddings = speaker_model.encode_batch(torch.tensor(dataset[0]["audio"]["array"]))
    speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)

# encode_batch keeps extra singleton axes; collapse them and re-add only the
# batch axis so the result is 2-D (batch, embedding_dim), the layout
# generate_speech expects. A bare unsqueeze(0) would yield a 4-D tensor.
speaker_embeddings = speaker_embeddings.squeeze().unsqueeze(0)

## Audio denoising

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan

# Speech-to-speech SpeechT5: processor and model share one checkpoint, while
# the HiFi-GAN vocoder comes from its own checkpoint.
vc_checkpoint = "microsoft/speecht5_vc"
processor = SpeechT5Processor.from_pretrained(vc_checkpoint)
model = SpeechT5ForSpeechToSpeech.from_pretrained(vc_checkpoint)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [None]:
# Read the first recording once; the processor needs the raw waveform array,
# not the whole audio dict (which also carries the path and sampling rate).
sample = dataset[0]["audio"]
inputs = processor(
    audio=sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
)
# Re-synthesise the utterance conditioned on the speaker embedding; the
# vocoder is passed so generate_speech can use it to produce the output audio.
speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)

# Fine-tuning text to speech models

## Fine-tuning a text to speech model

In [None]:
from datasets import load_dataset

# Italian ("it") configuration of VoxPopuli, training split only.
# NOTE: trust_remote_code=True lets the dataset repository's loading script
# run locally; only enable it for repositories you trust.
dataset = load_dataset(
    "facebook/voxpopuli",
    "it",
    split="train",
    trust_remote_code=True,
)
print(dataset.features)

# Speaker encoder; its files are stored under the given savedir.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)

In [None]:
from transformers import SpeechT5Processor

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")


def prepare_dataset(example):
    """Turn one dataset row into SpeechT5 text-to-speech training features.

    The row's "normalized_text" is passed to the processor as text and its
    audio array as the target. The first element of the returned "labels" is
    kept, and a normalised speaker embedding computed from the same audio is
    stored under "speaker_embeddings". Returns the processor output with those
    two entries set.
    """
    audio = example["audio"]
    waveform = audio["array"]

    features = processor(
        text=example["normalized_text"],
        audio_target=waveform,
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )
    # Keep only the first element of the labels entry.
    features["labels"] = features["labels"][0]

    # Embedding extraction is inference only, so gradients are disabled.
    with torch.no_grad():
        xvector = speaker_model.encode_batch(torch.tensor(waveform))
        xvector = torch.nn.functional.normalize(xvector, dim=2)
        features["speaker_embeddings"] = xvector.squeeze().cpu().numpy()
    return features


dataset = dataset.map(prepare_dataset)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyper-parameters for fine-tuning the text-to-speech model.
# A data collator is an argument of the trainer (Seq2SeqTrainer(data_collator=...)),
# not a field of the training arguments, so it must not be passed here:
# doing so raises TypeError for an unexpected keyword.
training_args = Seq2SeqTrainingArguments(
    # Explicit checkpoint/log directory, as in the image-classification setup.
    output_dir="speecht5_finetune",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-5,
    warmup_steps=500,
    label_names=["labels"],
)

In [None]:
from transformers import SpeechT5ForTextToSpeech, Seq2SeqTrainer

model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# `dataset` was loaded with split="train", so it is a single Dataset with no
# "train"/"test" keys. Create the partition here and leave `dataset` itself
# untouched, because later code still indexes it by row.
tts_splits = dataset.train_test_split(test_size=0.1, seed=42)

# NOTE(review): no speech-specific padding collator is visible in this file;
# variable-length batches will likely need one passed via data_collator= — confirm.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=tts_splits["train"],
    eval_dataset=tts_splits["test"],
    # Same keyword as the image-classification Trainer above; `tokenizer=` is
    # the deprecated spelling.
    processing_class=processor,
)

trainer.train()

## Generating new speech

In [None]:
# Sentence to synthesise.
text = "se sono italiano posso cantare l'opera lirica"

# Reuse the speaker embedding stored on row 5 and add a leading batch axis.
reference_row = dataset[5]
speaker_embedding = torch.tensor(reference_row["speaker_embeddings"]).unsqueeze(0)

# Tokenise the text and synthesise audio with the model and vocoder.
inputs = processor(text=text, return_tensors="pt")
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embedding,
    vocoder=vocoder,
)

# make_spectrogram is not defined in the visible cells; it is expected to be
# provided elsewhere.
make_spectrogram(speech)