In [None]:
!pip install transformers

In [None]:
!pip install accelerate -U

In [None]:
!pip install transformers[torch] -U

In [None]:
!pip install pyarrow --upgrade

# Using Transformer pipeline

In [1]:
from transformers import pipeline, set_seed

In [4]:
# Sentiment Analysis

classifier = pipeline("sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9598049521446228}]

In [5]:
classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)

[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [6]:
# Text Generation

generator = pipeline("text-generation")
generator("In this course, we will teach you how to")

No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to make two different type of "Biblical Texts" from the Book of Isaiah and the Isaiah Bible. The Bible and the Bible of Isaiah also provide a common Bible format for both Bible Study and Bible'}]

In [7]:
generator = pipeline("text-generation", model="distilgpt2")
generator(
    "In this course, we will teach you how to",
    max_length=30,
    num_return_sequences=2,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to use the most effective tools like the tools.'},
 {'generated_text': 'In this course, we will teach you how to learn your skills and skills for using an Inverse.\n\nInverse is a way of giving'}]

In [8]:
unmasker = pipeline("fill-mask")
unmasker("This course will teach you all about <mask> models.", top_k=2)

No model was supplied, defaulted to distilbert/distilroberta-base and revision ec58a5b (https://huggingface.co/distilbert/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.19619785249233246,
  'token': 30412,
  'token_str': ' mathematical',
  'sequence': 'This course will teach you all about mathematical models.'},
 {'score': 0.0405273512005806,
  'token': 38163,
  'token_str': ' computational',
  'sequence': 'This course will teach you all about computational models.'}]

In [9]:
# NER
ner = pipeline("ner", grouped_entities=True)
ner("My name is Sylvain and I work at Hugging Face in Brooklyn.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]



[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

Some of the currently available pipelines are:

* feature-extraction (get the vector representation of a text)
* fill-mask
* ner (named entity recognition)
* question-answering
* sentiment-analysis
* summarization
* text-generation
* translation
* zero-shot-classification

# Zero-shot

In [13]:
classifier = pipeline("text-classification", model="JimminDev/Depressive-detector")

In [14]:
text = "The market was very volatile today due to the unexpected interest rate hike."

candidate_labels = ['depressive']

classifier(text, candidate_labels, multi_label=False)

TypeError: TextClassificationPipeline.__call__() takes 2 positional arguments but 3 were given

In [None]:
text = "I don't like the food."

candidate_labels = ['satisfied', 'unsatisfied','neutral']
hypothesis_template = "The text show the notion of {} sentiment."

classifier(text, candidate_labels, multi_label=True, hypothesis_template=hypothesis_template)

# Few-shot using prompt

In [5]:
from transformers import pipeline, set_seed

In [None]:
def classify_new_text(new_text):
    generator = pipeline('text-generation', model='distilgpt2')
    set_seed(42)

    prompt_template = """The following are examples of text classification:
    Text: "The market was very volatile today due to the unexpected interest rate hike."
    Category: Economy

    Text: "The new fitness program includes routines that improve cardiovascular health."
    Category: Health

    Text: "A groundbreaking discovery in renewable energy has been announced."
    Category: Science

    Text: "The local sports team won their game last night in a surprising upset."
    Category: Sports

    Text: "{}"
    Category:"""

    prompt = prompt_template.format(new_text)

    response = generator(prompt, max_length=200, num_return_sequences=1, temperature=0.7)
    generated_text = response[0]['generated_text']

    # Attempt to extract the category from the generated text
    try:
        # Splitting the generated text to find the category part
        category_part = generated_text.split("Category:")[1].strip()
        # Assuming the category is the first word/phrase followed by any newline or extra text
        predicted_category = category_part.split('\n')[0].strip()
        print(f'Text: "{new_text}"\nPredicted Category: {predicted_category}')
    except IndexError:
        # If the expected format isn't found
        print("Failed to extract the category. Please check the generated text format.")

# Example usage
new_text = "The stock market is reaching new heights."
classify_new_text(new_text)


# Few-shot using Setfit

Compared to other few-shot learning methods, SetFit has several unique features:

* 🗣 No prompts or verbalizers: Current techniques for few-shot fine-tuning require handcrafted prompts or verbalizers to convert examples into a format suitable for the underlying language model. SetFit dispenses with prompts altogether by generating rich embeddings directly from text examples.
* 🏎 Fast to train: SetFit doesn't require large-scale models like T0 or GPT-3 to achieve high accuracy. As a result, it is typically an order of magnitude (or more) faster to train and run inference with.
* 🌎 Multilingual support: SetFit can be used with any Sentence Transformer on the Hub, which means you can classify text in multiple languages by simply fine-tuning a multilingual checkpoint.

In [None]:
!pip install setfit

In [1]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer

let's download a text classification dataset from the Hugging Face Hub. We'll use the SentEval-CR dataset, which is a dataset of customer reviews:

In [7]:
from datasets import load_dataset
dataset = load_dataset("carblacac/twitter-sentiment-analysis")

ValueError: Invalid pattern: '**' can only be an entire path component

In [None]:
#To simulate a real-world scenario with just a few labeled examples, we'll sample 8 examples per class from the training set:
train_ds = dataset["train"].shuffle(seed=42).select(range(8 * 2))
test_ds = dataset["test"]

In [None]:
train_ds[0]

In [None]:
# To check the unique labels in the dataset
set(train_ds['label_text'])

In [None]:
# Load SetFit model from Hub
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=5, # Number of text pairs to generate for contrastive learning
    num_epochs=1 # Number of epochs to use for contrastive learning
)

# Train and evaluate!
trainer.train()

In [None]:
# See the cuuracy
metrics = trainer.evaluate()


In [None]:
metrics

In [None]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("./fine_tuned_model")

In [None]:
# Download from Hub
model = SetFitModel.from_pretrained("./fine_tuned_model")

In [None]:
# Run inference
preds = model.predict(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
print(preds)