# Processing and classifying images

In [None]:
from transformers import image_transforms
from transformers import pipeline
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from datasets import Audio
from datasets import Dataset, load_dataset
import librosa

## Processing image data

In [None]:
# Open the fashion photo with PIL
image_path = "images/fashion.jpeg"
original_image = Image.open(image_path)

# Turn the PIL image into a numpy array
image_array = np.array(original_image)

# Draw the array and display the figure
imgplot = plt.imshow(image_array)
plt.show()

In [None]:
# Take a 200x200 crop around the center of the image array
crop_size = (200, 200)
cropped_image = image_transforms.center_crop(image=image_array, size=crop_size)

# Draw the cropped array and display the figure
imgplot = plt.imshow(cropped_image)
plt.show()

## Creating an image classifier

In [None]:
# Convert the cropped numpy array back into a PIL image
cropped_pil_image = Image.fromarray(cropped_image)
# Print the resulting PIL image object
print(cropped_pil_image)

In [None]:
# Build the image-classification pipeline for the chosen checkpoint
image_classifier = pipeline(
    task="image-classification",
    model="abhishek/autotrain_fashion_mnist_vit_base",
)

# Run the classifier on the cropped PIL image
results = image_classifier(cropped_pil_image)

# Show the label of the first returned prediction
first_prediction = results[0]
print(first_prediction["label"])

## What about the original image?

In [None]:
# Run the same classifier on the uncropped image for comparison
results = image_classifier(original_image)

# Show the label of the first returned prediction
first_prediction = results[0]
print(first_prediction["label"])

# Question answering and multi-modal tasks

## Document question answering

In [None]:
# Build the document question-answering pipeline
dqa = pipeline(
    task="document-question-answering",
    model="naver-clova-ix/donut-base-finetuned-docvqa",
)

# Path of the document image and the question to ask about it
image = "images/document.jpg"
question = "Which meeting is this document about?"

# Query the pipeline and show what it returns
results = dqa(image=image, question=question)
print(results)

## Visual question answering

In [None]:
# Path of the photo and the question to ask about it
image = "images/fashion.jpeg"
question = "What is the model wearing in this image?"

# Build the visual question-answering pipeline
vqa = pipeline(
    task="visual-question-answering",
    model="dandelin/vilt-b32-finetuned-vqa",
)

# Query the pipeline and show what it returns
results = vqa(image=image, question=question)
print(results)

# Audio classification

## Resampling audio files

In [None]:
# Load the first seven training examples of the common_language dataset
dataset = load_dataset("common_language")
dataset = dataset["train"]
dataset = dataset.select([0, 1, 2, 3, 4, 5, 6])

# Save the old sampling rate before the column is re-cast
old_sampling_rate = dataset[1]['audio']['sampling_rate']

# Resample the audio files.
# cast_column returns a new dataset, so the result must be bound back to
# `dataset`; the column to cast is "audio", the one the sampling rate is
# read from here. Audio is then decoded at 16 kHz whenever a row is accessed.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

# Compare the old and new sampling rates
print("Old sampling rate:", old_sampling_rate)
print("New sampling rate:", dataset[1]['audio']['sampling_rate'])

## Filtering out audio files

In [None]:
# Build a small dataset from three local files and decode the column as audio
audio_paths = ["audios/audio1.mp3", "audios/audio2.mp3", "audios/audio3.mp3"]
dataset = Dataset.from_dict({"path": audio_paths}).cast_column("path", Audio())

# Measure the duration of every file in the dataset
old_durations_list = [
    librosa.get_duration(path=row["path"]) for row in dataset["path"]
]

# Attach the durations as a new column
dataset = dataset.add_column("duration", old_durations_list)

# Keep only the rows whose duration is below 60.0
filtered_dataset = dataset.filter(
    lambda d: d < 60.0,
    input_columns=["duration"],
    keep_in_memory=True,
)

# Durations that survived the filter
new_durations_list = filtered_dataset["duration"]

# Compare the mean duration before and after filtering
print("Old duration:", np.mean(old_durations_list))
print("New duration:", np.mean(new_durations_list))

## Classifying audio files

In [None]:
# Load the first seven training examples of the common_language dataset
dataset = load_dataset("common_language")
dataset = dataset["train"]
dataset = dataset.select([0, 1, 2, 3, 4, 5, 6])

# Create the pipeline
classifier = pipeline(task="audio-classification", model="facebook/mms-lid-126")

# A raw array carries no sampling-rate information, so the pipeline treats it
# as already being at the feature extractor's rate. Resample the audio column
# to that rate first so the model is not fed audio at the wrong rate.
dataset = dataset.cast_column(
    "audio", Audio(sampling_rate=classifier.feature_extractor.sampling_rate)
)

# Extract the sample (fetch the row once so the audio is decoded a single time)
sample = dataset[0]
audio = sample['audio']['array']
sentence = sample["sentence"]

# Predict the language
prediction = classifier(audio)

print(f"Predicted language is '{prediction[0]['label'].upper()}' for the sentence '{sentence}'")