!!!! Numerous issues on M1

# Loading the API

In [None]:
from huggingface_hub import HfApi 
import tqdm as notebook_tqdm

# Client for the Hugging Face Hub API.
api = HfApi()

# The Hub hosts a very large number of models; cap the listing so this cell
# returns quickly instead of paging through the whole catalogue.
list(api.list_models(limit=10))

# Filtering Models

In [None]:
# Describe the filter once, then unpack it into the Hub query.
model_query = {
    "task": "image-classification",
    "library": "pytorch",
    "trained_dataset": "imagenet",
}

api = HfApi()
models = api.list_models(**model_query)

# Turn the returned iterable into a list so it can be indexed.
modelList = [model_info for model_info in models]

print(modelList[0].modelId)

# Downloading a Model

In [None]:
from transformers import AutoModel

modelId = "distilbert-base-uncased-finetuned-sst-2-english"

# Local folder that mirrors the Hub id of the checkpoint.
save_dir = f"models/{modelId}"

# Load the checkpoint through the generic Auto class and write it to disk.
model = AutoModel.from_pretrained(modelId)
model.save_pretrained(save_directory=save_dir)

# Working with Datasets

Datasets can be consulted [here](https://huggingface.co/datasets). Each dataset has a dataset card, detailing itself.

The *datasets* library lets you access, download, modify, use and share all of these datasets straight away. 

These datasets are usually quite big. Check them before downloading and mind the disk space.





In [None]:
from datasets import load_dataset_builder

# Create a builder for the dataset and print two fields of its info object.
data_builder = load_dataset_builder("derenrich/wikidata-en-descriptions-small")
builder_info = data_builder.info

for field_label, field_value in (
    ('Description: ', builder_info.description),
    ('Features: ', builder_info.features),
):
    print(field_label, field_value)

In [None]:
from datasets import load_dataset 

# No split is requested here, so whatever load_dataset returns by default is kept.
data = load_dataset(path='imdb')


In [None]:
# Same dataset, this time asking only for the 'train' split.
data = load_dataset(path='imdb', split='train')

In [None]:
# Configuration parameter 
#data=load_dataset('wikipedia', '20231101.en')

These datasets are stored in Apache Arrow format, which allows efficient processing of large data and faster querying.



# Pipelines 

Autoclasses are general classes for using Models, tokenizers, Configurations, Processors, Feature extractors. They are flexible and direct.

## AutoModels 

They are Auto classes to directly download a model 

AutoModel class for each type of task

In [None]:
from transformers import AutoModelForSequenceClassification 

# Checkpoint id on the Hub for the sequence-classification model.
sst2_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModelForSequenceClassification.from_pretrained(sst2_checkpoint)



## AutoTokenizers

Prepare text input data.

Recommended to use the tokenizer paired with the model. 



In [None]:
from transformers import AutoTokenizer 

# Load the tokenizer from the same checkpoint id that the model cells use.
sst2_checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(sst2_checkpoint)

## Pipeline Module 

Contains all task-specific steps 

Best for quickly performing tasks 

### Task Pipelines 

The module contains a task-specific pipeline for each task. These task pipelines leverage the Auto classes: they download the model and the relevant pre-processing components.



In [None]:
from transformers import pipeline

from transformers import (
    SummarizationPipeline, 
    TextClassificationPipeline, 
    AudioClassificationPipeline,
    ImageSegmentationPipeline, 
    QuestionAnsweringPipeline
)

### Creating a pipeline 

When creating a pipeline you can specify a task, a model, or both. If one of them is missing, Hugging Face fills it in with a default value.

In [None]:
# Task only: the library chooses a default model for the task.
# (The variable was previously misspelled `my_pipelne`.)
my_pipeline = pipeline(task='text-classification')

# Model only: the task is left for the library to work out.
my_pipeline = pipeline(model='distilbert-base-uncased-finetuned-sst-2-english')


In [None]:
# Task and model given together. The variable was previously misspelled
# `my_pipelne`, so `my_pipeline` was never updated by this cell.
my_pipeline = pipeline(task='text-classification',
                       model='distilbert-base-uncased-finetuned-sst-2-english')


In [None]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# When an already-instantiated model is passed, the pipeline factory cannot
# look up the task or the matching tokenizer from a Hub id, so both are
# supplied explicitly.
my_pipeline = pipeline(task='text-classification', model=model, tokenizer=tokenizer)

In [None]:
from transformers import pipeline 

# Give both the task and the checkpoint so nothing is left to defaults.
text_clf_config = dict(
    task='text-classification',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)
my_pipeline = pipeline(**text_clf_config)

In [None]:
# NOTE(review): `input` shadows the Python builtin of the same name. Later
# cells read this variable, so the name is kept here.
input = 'Hi, welcome to this course'

# Run the pipeline on the sample text; the result is the cell's output.
my_pipeline(input)

In [None]:
from transformers import pipeline

# Text to classify; later cells read this same variable.
input = 'This course is pretty good, I guess.'

# One pipeline built from the task name, one built from the checkpoint name.
task_pipeline = pipeline(task='sentiment-analysis')
model_pipeline = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")

# Run both on the same text.
task_output = task_pipeline(input)
model_output = model_pipeline(input)

# Pull out the predicted label of the first (only) result of each pipeline.
task_label = task_output[0]['label']
model_label = model_output[0]['label']

print(f"Sentiment from task_pipeline: {task_label}; Sentiment from model_pipeline: {model_label}")


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Model and tokenizer come from the same checkpoint.
sst2_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(sst2_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(sst2_checkpoint)

# Assemble the pipeline from the objects loaded above.
sentimentAnalysis = pipeline(task="sentiment-analysis", model=model, tokenizer=tokenizer)

# Classify the text held in `input` (assigned in an earlier cell).
output = sentimentAnalysis(input)
predicted_label = output[0]['label']

print(f"Sentiment using AutoClasses: {predicted_label}")

In [None]:
# First model: DistilBERT checkpoint.
distil_pipeline = pipeline(task="sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
distil_output = distil_pipeline(input)

# Second model: BERT checkpoint, run on the same text.
bert_pipeline = pipeline(model="kwang123/bert-sentiment-analysis", task="sentiment-analysis")
bert_output = bert_pipeline(input)

# Report the label predicted by each model, BERT first.
for model_label, model_result in (("Bert", bert_output), ("Distil", distil_output)):
    print(f"{model_label} Output: {model_result[0]['label']}")

# Tokenizers

In [None]:
from transformers import AutoTokenizer 

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Apply only the tokenizer's normalization step to a sample string.
normalizer = tokenizer.backend_tokenizer.normalizer
normalized_text = normalizer.normalize_str('HI, whát are You doing  ?')

print(normalized_text)

In [None]:
from transformers import GPT2Tokenizer, DistilBertTokenizer

# Load both tokenizers up front.
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
distil_tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

# Split the same text (`input`, assigned in an earlier cell) with each one.
gpt_tokens = gpt_tokenizer.tokenize(input)
distil_tokens = distil_tokenizer.tokenize(text=input)

# Print the two token lists one after the other for comparison.
for tokenizer_label, token_list in (("GPT", gpt_tokens), ("DistilBERT", distil_tokens)):
    print(f"{tokenizer_label} tokenizer: {token_list}")

# Text Classification 

Text classifiers assign one of a set of predefined categories to a text. Ambiguity, sarcasm, irony and multiple languages are some of the challenges of this discipline.

## Sentiment analysis

Categorizes in positive or negative depending on the emotional charge of the text. 

## Question Natural Language Inference (QNLI) 

Determines if the text (premise) contains sufficient information to answer a question. Categories are: 
- Entailment (yes, the text contains that info)
- Not entailment (no)
- Neutral (no relationship between the question and the text)

## Topic modeling

Assigns labels to texts depending on the subject they talk about.

## Grammatical correctness 

Categories are acceptable or not acceptable 



In [None]:
from transformers import pipeline 

# The task is given but no model is, so the pipeline falls back to the task's
# default checkpoint (presumably a sentiment-analysis model — confirm against
# the message logged when the pipeline is created).
classifier = pipeline(task='text-classification')

print(classifier('I dont like this notebook'))

In [None]:
from transformers import pipeline 

# Text-classification pipeline backed by a grammar-checking checkpoint.
classifier = pipeline(
    task='text-classification',
    model='abdulmatinomotoso/English_Grammar_Checker',
)

string_0 = 'Me doesnt like these notebook'
string_1 = 'I dont like this notebook'

# Classify each sentence and print it next to its prediction.
for candidate in (string_0, string_1):
    print(candidate, classifier(candidate))

In [None]:
from transformers import pipeline 

classifier = pipeline(task='text-classification',
                      model='cross-encoder/qnli-electra-base'
                     )

# A cross-encoder scores a (question, sentence) pair. Passing the two as
# `text` / `text_pair` lets the tokenizer encode them as a real pair instead
# of one comma-joined string.
question = 'Where is Seattle located?'
string_0 = 'Seattle is located in Washington state.'
string_1 = 'I like turtles.'

for premise in (string_0, string_1):
    print(question, premise, classifier({'text': question, 'text_pair': premise}))

## Zero shot classification 

Thanks to transfer learning, some models can categorize text into labels they have never seen, without any task-specific training. This is very useful when we don't have the resources or time to train a new model. 



In [None]:
# Zero-shot classifier: the candidate labels are supplied at call time.
classifier = pipeline(
    task='zero-shot-classification',
    model='facebook/bart-large-mnli',
)

text = 'Wikipedia earlier this month released its list of the 25 most viewed...'
candidate_labels = ['politics', 'science', 'technology']

output = classifier(text, candidate_labels)

# Show the first entry of the returned labels and of the returned scores.
top_label = output['labels'][0]
top_score = output['scores'][0]
print(top_label)
print(top_score)

# Summarization

Can be extractive (reusing pieces) or abstractive (generating new text)


In [None]:
text='''Data science is an interdisciplinary field that combines various domains such as statistics, computer science, machine learning, and domain expertise to extract meaningful insights from data. As the world becomes increasingly data-driven, organizations are leveraging data science to inform decision-making, optimize operations, and gain competitive advantages. This rapidly evolving field has become a cornerstone for businesses in diverse industries, including finance, healthcare, marketing, technology, and more.

Foundations of Data Science

At its core, data science involves the application of scientific methods to data. This typically involves five key stages:

	1.	Data Collection: Gathering data is the starting point of any data science project. Data can come from a variety of sources, such as databases, IoT sensors, websites, social media platforms, and internal systems. This data can be structured (like data in databases) or unstructured (such as text, images, and videos).
	2.	Data Cleaning and Preprocessing: Once data is collected, it often needs cleaning. Real-world data is often noisy, incomplete, or inconsistent. Techniques such as handling missing values, correcting data types, removing duplicates, and normalization are used to prepare the data for analysis. This step is crucial because the quality of the data can directly impact the quality of the insights derived from it.
	3.	Exploratory Data Analysis (EDA): After cleaning, the next step is to understand the data through exploratory data analysis. Visualization tools and statistical methods are used to summarize key patterns, relationships, and outliers in the dataset. EDA helps to uncover hidden insights and trends, which can inform further modeling decisions.
	4.	Modeling: Once a deep understanding of the data is achieved, machine learning or statistical models are applied to make predictions, classifications, or decisions. Depending on the problem, different techniques such as regression, classification, clustering, or neural networks may be used. Model selection, training, validation, and tuning are critical steps that require expertise to achieve high performance.
	5.	Deployment and Monitoring: A data science project does not end with building a model. The final step involves deploying the model in a real-world environment where it can generate value for the business. This might involve integrating the model into applications or systems. Continuous monitoring is required to ensure that the model’s performance remains accurate over time, and regular updates may be needed as new data becomes available.
'''


# Checkpoint name for the summarizer. Note that `model` is a plain string
# here; the following cell reads it again.
model = 'sshleifer/distilbart-cnn-12-6'
summarizer = pipeline(task='summarization', model=model)

# Summarize the long `text` defined at the top of this cell.
summary_text = summarizer(text)
first_summary = summary_text[0]

print(first_summary['summary_text'])

In [None]:
# Rebuild the summarizer with explicit bounds on the summary length.
# `model` and `text` come from the previous cell.
summarizer = pipeline(task='summarization', model=model, min_length=10, max_length=50) # lengths are counted in tokens, not words

summary_text = summarizer(text)

print(summary_text[0]['summary_text'])

# Images

Preprocessing images is an important step since: 
- Models may have expectations about the size, colors... of the image
- It maintains consistency
- It can target specific parts of the images

## Cropping 

Removing unwanted parts 

## Resizing  

Change the size of the image. May impact the resolution 


In [None]:
from transformers import image_transforms 

from PIL import Image

# Open the sample picture from the local images folder.
original_image = Image.open(fp='images/dragon-ball-z.jpeg')

In [None]:
import numpy as np 

# Convert the PIL image into a NumPy array for the transforms used below.
image_array = np.array(original_image)

In [None]:
# Display the array as the cell output.
image_array

In [None]:
# Size passed to center_crop.
crop_size = (50, 50)

# Crop around the centre of the array built from the image.
cropped_image = image_transforms.center_crop(image_array, crop_size)

cropped_image

## Image Classification

In [None]:
from transformers import pipeline

# Image-classification pipeline backed by a ViT checkpoint.
classifier = pipeline(
    'image-classification',
    model='google/vit-base-patch16-224',
)

In [None]:
# Classify the local image; the prediction list is the cell output.
classifier('images/free-photo-of-a-boat-is-docked-in-front-of-a-row-of-buildings.jpeg')

In [None]:
# Same image, this time passing top_k=2 to the call.
classifier('images/free-photo-of-a-boat-is-docked-in-front-of-a-row-of-buildings.jpeg', top_k=2)

# Question answering and multi-modal tasks

Answering the questions about content of a document 

The document is a text-based image. 

Question is specific to the document 

Answer can be direct quote or paraphrased response 

Visual question answering is similar, but uses an image or a video as input.

The pipeline requires the document and the question. 

Use cases are information retrieval in customer support, legal compliance...

Multi modal tasks involve more than one type of data.


In [None]:
from transformers import pipeline 

# Document question answering: an image of a document plus a question about it.
dqa = pipeline(
    'document-question-answering',
    model='naver-clova-ix/donut-base-finetuned-docvqa',
)

document_image = 'images/hq720.jpg'
question_text = 'What is this memo about'

results = dqa(image=document_image, question=question_text)

In [None]:
# Display what the document-QA pipeline returned.
results

In [None]:
from transformers import pipeline 

# Visual question answering: a picture plus a free-form question about it.
vqa = pipeline(
    'visual-question-answering',
    model='dandelin/vilt-b32-finetuned-vqa',
)

boat_image_path = 'images/free-photo-of-a-boat-is-docked-in-front-of-a-row-of-buildings.jpeg'
boat_question = 'whats the color of the boat'

results = vqa(image=boat_image_path, question=boat_question)

print(results)

# Audio Classification

Sampling is a very important step that allows to standardize the signals. All the observations in our dataset have to have the same sampling rate.

In [None]:
from datasets import Audio

# songs = songs.cast_column('audio', Audio(sampling_rate=16_000))



The **librosa** library contains tools to get the duration of audio files among others.

In [None]:
import librosa

# Audio Classification

The process of assigning one or more labels to audio clips based on their content. Examples: language identification, environmental sounds, speaker identification...



In [None]:
from pydub import AudioSegment

# Convert .m4a to .wav
def convert_m4a_to_wav(input_file, output_file):
    """Read the M4A file at ``input_file`` and export it to ``output_file`` as WAV."""
    AudioSegment.from_file(input_file, format="m4a").export(output_file, format="wav")

# Source recording and the WAV file the later audio cells read.
input_m4a = "audios/Sin título.m4a"
output_wav = "audios/output_file.wav"

convert_m4a_to_wav(input_file=input_m4a, output_file=output_wav)

In [None]:
# Audio-classification pipeline backed by a wav2vec2 keyword-spotting checkpoint.
classifier = pipeline(
    'audio-classification',
    model='superb/wav2vec2-base-superb-ks',
)

# Classify the WAV file produced by the conversion cell.
results = classifier(output_wav)

print(results)

In [None]:
# The "feature-extraction" pipeline treats its input as text, so handing it
# the path of a .wav file does not produce audio features. Load the waveform
# and run the checkpoint's feature extractor and encoder directly instead.
import librosa
from transformers import AutoFeatureExtractor, AutoModel

audio_checkpoint = "facebook/wav2vec2-base-960h"

# Resample to 16 kHz while loading, the rate used elsewhere in this notebook.
waveform, sampling_rate = librosa.load(output_wav, sr=16_000)

feature_extractor = AutoFeatureExtractor.from_pretrained(audio_checkpoint)
audio_encoder = AutoModel.from_pretrained(audio_checkpoint)

# Turn the raw waveform into model inputs, then take the encoder's hidden states.
model_inputs = feature_extractor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
features = audio_encoder(**model_inputs).last_hidden_state
print(features)

# Automatic Speech Recognition

'facebook/wav2vec2-base-960h' and 'openai/whisper-base' are two good models for ASR. 



In [None]:
# Speech-to-text pipeline backed by the wav2vec2 checkpoint.
asr_checkpoint = 'facebook/wav2vec2-base-960h'
transcriber = pipeline('automatic-speech-recognition', model=asr_checkpoint)

# Transcribe the converted WAV file; the result is the cell output.
transcriber(output_wav)

A common way to evaluate ASR systems is the Word Error Rate (WER), which is based on the Levenshtein distance.

In [None]:
from evaluate import load 

# Load the 'wer' metric through the evaluate library.
wer = load(path='wer')

# Fine Tuning 

Adjusting a pretrained model for a specific task or dataset. A pretrained model is an algorithm developed on extensive data to perform a task. 

The two main reasons for fine tuning are: 
- Learn new task or domain
- Reduced time and computation

How to fine tune a model? 
- Identify the model: 
- Prepare the data
- Build trainer
- Train the model
  


Setup the dataset object!!!

In [None]:
# Import modules
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from datasets import load_dataset

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The original cell mapped over a `dataset` that no earlier cell creates, so
# it failed on a fresh kernel. Load one explicitly: the IMDB training split
# used earlier in this notebook, read here through its "text" column.
raw_dataset = load_dataset("imdb", split="train")

# Use tokenizer on text; the result goes to a new name so that re-running the
# cell always starts from the raw data.
dataset = raw_dataset.map(lambda row: tokenizer(row["text"], padding=True, max_length=512, truncation=True), keep_in_memory=True)

In [None]:
from transformers import pipeline

# Create the classifier
# NOTE(review): "./fine_tuned_model" has to exist on disk already; no cell in
# this notebook trains or saves a model to that folder.
classifier = pipeline(task="sentiment-analysis", model="./fine_tuned_model")

# Sample input; the original cell read `text_example` without defining it.
text_example = "I really enjoyed working through this notebook."

# Classify the text. It is passed positionally because the pipeline's input
# parameter is not named `text`.
results = classifier(text_example)

print(results)

# Text Generation 



In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint used for both the tokenizer and the causal language model.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

prompt = "Start scratching your head because"

# Encode the prompt as a tensor of token ids.
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Ask for a single generated sequence.
output = model.generate(inputs=input_ids, num_return_sequences=1)

# Decode the first returned sequence back into text.
first_sequence = output[0]
generated_text = tokenizer.decode(first_sequence)

print(generated_text)

In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image

original_image = Image.open('images/free-photo-of-a-boat-is-docked-in-front-of-a-row-of-buildings.jpeg')

# Processor and model come from the same captioning checkpoint.
git_checkpoint = "microsoft/git-base-coco"
processor = AutoProcessor.from_pretrained(git_checkpoint)
model = AutoModelForCausalLM.from_pretrained(git_checkpoint)

# Run the image through the processor and keep its pixel values.
encoded_image = processor(images=original_image, return_tensors="pt")
pixels = encoded_image.pixel_values

# Generate the caption ids from the pixel values.
output = model.generate(pixel_values=pixels)

# Decode the generated ids and print the first caption.
caption = processor.batch_decode(output)

print(caption[0])

# Embeddings 

Embeddings are numerical representations of data such as text. Each embedding has n dimensions.

Embeddings are useful in recommender systems, search and fraud detection, for instance. 

Benefits: 
- semantic understanding
- embeddings can be used as features
- improved generalization

Challenges: 
- Huge amount of data to create them
- Hard to interpret
- Inherit bias from the training data



In [None]:
from sentence_transformers import SentenceTransformer

model_name = 'all-MiniLM-L6-v2'

# The constructor has no `model` keyword; the checkpoint name is its first
# positional argument.
embedder = SentenceTransformer(model_name)

In [None]:
# The class is `SentenceTransformer` (singular); the plural name cannot be imported.
from sentence_transformers import SentenceTransformer
SentenceTransformer("sentence-transformers/paraphrase-albert-small-v2")

In [None]:
sentence = 'What are embeddings?'

# encode receives a list, here holding the single sentence.
sentence_batch = [sentence]
embedding = embedder.encode(sentence_batch)

print(embedding)

In [None]:
from sentence_transformers import SentenceTransformer

# Create the first embedding model.
# (`SentenceTransformers`, plural, does not exist and raised a NameError.)
embedder1 = SentenceTransformer("all-MiniLM-L6-v2")

# Embed the sentence assigned in the previous cell
embedding1 = embedder1.encode([sentence])

# Create and use second embedding model
embedder2 = SentenceTransformer("sentence-transformers/paraphrase-albert-small-v2")
embedding2 = embedder2.encode([sentence])

# Compare the shapes
print(embedding1.shape == embedding2.shape)

# Semantic Search

Type of search technology that tries to improve accuracy by understanding the intent and context behind the query.

It tries to find content matching the meaning, and not word matches.

Cosine similarity can be applied on embeddings to perform searches.

When searching with cosine similarity, we embed the question and look for the documents whose embeddings are closest to it.



In [None]:
# The class is `SentenceTransformer` (singular); the plural name cannot be imported.
from sentence_transformers import SentenceTransformer

embedder1 = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
from sentence_transformers import util 


In [None]:
# NOTE(review): no cell above this one assigns `query_embedding` or
# `document_embeddings`, so on a fresh kernel this call raises NameError.
# `query_embedding` is first assigned in the following cell;
# `document_embeddings` is not assigned anywhere in the visible notebook.
hits = util.semantic_search(query_embedding, document_embeddings, top_k=2)

In [None]:
# Small illustrative corpus. The original cell read `sentences` and
# `sentence_embeddings` without any cell defining them.
sentences = [
    "A desktop application for reading e-books on macOS.",
    "An e-book reader app for Android phones.",
    "A recipe for a quick weeknight pasta.",
    "A training plan for a first marathon.",
]
sentence_embeddings = embedder.encode(sentences)

query = "I need a desktop book reader for Mac"

# Generate embeddings
query_embedding = embedder.encode([query])[0]

# Compare embeddings
hits = util.semantic_search(query_embedding, sentence_embeddings, top_k=2)

# Print the top results: hits[0] holds the matches for the single query
for hit in hits[0]:
    print(sentences[hit["corpus_id"]], "(Score: {:.4f})".format(hit["score"]))