In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import pipeline

model: https://huggingface.co/ProsusAI/finbert?text=Stocks+rallied+and+the+British+pound+gained.

In [None]:
# Text-classification pipeline backed by the ProsusAI/finbert checkpoint.
stock_sentiment = pipeline("text-classification", model = "ProsusAI/finbert")

In [None]:
stock_sentiment("Stocks rallied and the British pound gained.")

[{'label': 'positive', 'score': 0.898361325263977}]

model: https://huggingface.co/dslim/bert-base-NER-uncased

In [None]:
# Named-entity-recognition pipeline backed by the dslim/bert-base-NER-uncased checkpoint.
ner = pipeline("ner", model = "dslim/bert-base-NER-uncased")

In [None]:
ner("My name is Wolfgang and I live in Berlin")

[{'entity': 'B-PER',
  'score': 0.993952,
  'index': 4,
  'word': 'wolfgang',
  'start': 11,
  'end': 19},
 {'entity': 'B-LOC',
  'score': 0.997895,
  'index': 9,
  'word': 'berlin',
  'start': 34,
  'end': 40}]

model: https://huggingface.co/deepset/roberta-base-squad2

In [None]:
# Question-answering pipeline backed by the deepset/roberta-base-squad2 checkpoint.
qa = pipeline("question-answering", model = "deepset/roberta-base-squad2")

In [None]:
context = "My name is Sarah and I live in London"

In [None]:
qa(question = "What is my name?", context = context)

{'score': 0.5483798980712891, 'start': 11, 'end': 16, 'answer': 'Sarah'}

In [None]:
qa(question = "Where do I live?", context = context)

{'score': 0.7772290706634521, 'start': 31, 'end': 37, 'answer': 'London'}

model: https://huggingface.co/facebook/bart-large-cnn

In [None]:
# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarization = pipeline("summarization", model = "facebook/bart-large-cnn")

In [None]:
long_text = "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."

In [None]:
summarization(long_text)

[{'summary_text': 'The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world.'}]

In [None]:
# Text-generation pipeline backed by the gpt2 checkpoint.
generator = pipeline("text-generation", model = "gpt2")

In [None]:
generator("Long live the rovolution our next meeting will be at")



[{'generated_text': 'Long live the rovolution our next meeting will be at the Dnipro Hotel located at 2335 S. N Lakewood Highway. This meeting is open to the general public of Downtown Dnipro; all are welcome. We need'}]

In [None]:
# Name the checkpoint explicitly, like the other pipelines in this notebook,
# instead of relying on the library's task default (which can change between releases).
image_classifier = pipeline("image-classification", model = "google/vit-base-patch16-224")

Downloading (…)lve/main/config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/346M [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [None]:
!wget https://raw.githubusercontent.com/maxogden/cats/master/cat_photos/3ba670686e7111e181bd12313817987b_7.png

--2023-01-25 20:12:10--  https://raw.githubusercontent.com/maxogden/cats/master/cat_photos/3ba670686e7111e181bd12313817987b_7.png
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 592539 (579K) [image/png]
Saving to: ‘3ba670686e7111e181bd12313817987b_7.png’


2023-01-25 20:12:11 (87.4 MB/s) - ‘3ba670686e7111e181bd12313817987b_7.png’ saved [592539/592539]



In [None]:
image_classifier("3ba670686e7111e181bd12313817987b_7.png")

[{'score': 0.32977744936943054, 'label': 'Persian cat'},
 {'score': 0.14789025485515594, 'label': 'tabby, tabby cat'},
 {'score': 0.14650829136371613, 'label': 'Egyptian cat'},
 {'score': 0.12757571041584015, 'label': 'tiger cat'},
 {'score': 0.047103818506002426,
  'label': 'dishwasher, dish washer, dishwashing machine'}]

In [None]:
from transformers import AutoTokenizer
from transformers import pipeline

In [None]:
# pipeline() takes a task name as its first argument, so handing it a checkpoint id
# raises KeyError. Load the checkpoint's tokenizer directly instead.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

KeyError: ignored

In [None]:
text = "siała baba mak"

In [None]:
encoded_text = tokenizer(text)

In [None]:
encoded_text

In [None]:
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)

In [None]:
tokens

In [None]:
from datasets import load_dataset

In [None]:
# Load the "emotion" dataset; later cells read its "train", "validation" and "test"
# splits and the "text" / "label" columns.
emotions_dataset = load_dataset("emotion")

In [None]:
emotions_dataset

In [None]:
emotions_dataset

In [None]:
emotions_dataset['train']['text'][:5]

In [None]:
emotions_dataset['train']['label'][:5]

In [None]:
from transformers import DistilBertTokenizer

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') 

In [None]:
def tokenize(batch):
  """Run the notebook-level `tokenizer` over the "text" field of `batch`.

  Every sequence is padded to, and truncated at, 26 tokens.
  """
  encode_options = dict(padding='max_length', truncation=True, max_length=26)
  return tokenizer(batch["text"], **encode_options)

In [None]:
print(tokenize(emotions_dataset["train"][:2]))

In [None]:
# batch_size only takes effect together with batched=True; with batch_size=None each
# split is then handed to `tokenize` as a single batch instead of one example at a time.
emotions_encoded = emotions_dataset.map(tokenize, batched=True, batch_size=None)

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

In [None]:
# The checkpoint has to match the tokenizer used to encode the data
# (distilbert-base-uncased); bert-base-cased has a different vocabulary, so the
# token ids produced above would not line up with its embedding table.
# Fall back to CPU when no GPU is present rather than failing on .to("cuda").
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=6
).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
  """Compute accuracy and weighted F1 for a classification evaluation.

  Args:
    pred: object exposing `label_ids` (true class ids) and `predictions`
      (per-class scores; the predicted class is the argmax over the last axis).

  Returns:
    dict with the keys "accuracy" and "f1".
  """
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # The classifier is created with six labels, and f1_score's default
  # average="binary" raises on multiclass targets, so pick an average explicitly.
  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)
  return {"accuracy": acc, "f1": f1}

In [None]:
from transformers import TrainingArguments

In [None]:
# Hyper-parameters for fine-tuning the emotion classifier.
batch_size = 64
# Number of full batches in the training split, used as the logging interval;
# presumably chosen so the loss is logged about once per epoch — TODO confirm.
logging_steps = len(emotions_dataset["train"]) // batch_size
training_args = TrainingArguments(
    output_dir = "classification_model",
    num_train_epochs=2,
    learning_rate=2e-5,
    # Same batch size for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    log_level="error")

In [None]:
from transformers import Trainer

In [None]:
# Trainer for the emotion classifier, fed with the tokenized dataset.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = emotions_encoded["train"],
    # NOTE(review): evaluation uses the "test" split while the later
    # trainer.predict(...) call uses "validation" — confirm this is intended.
    eval_dataset = emotions_encoded["test"],
    compute_metrics = compute_metrics,
)

In [None]:
trainer.train()

In [None]:
preds_output = trainer.predict(emotions_encoded["validation"])

In [None]:
preds_output.metrics

In [None]:
import numpy as np

In [None]:
# Predicted class id per example: argmax over axis 1 of the prediction scores.
y_preds = np.argmax(preds_output.predictions, axis=1)
# True label ids, read from the un-tokenized validation split.
y_valid = np.array(emotions_dataset["validation"]["label"])

# Class names of the "label" feature; presumably ordered by label id — TODO confirm.
labels = emotions_dataset["train"].features["label"].names

In [None]:
labels

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_preds, y_true, labels):
  """Show the confusion matrix of `y_preds` against `y_true` on a 6x6 figure.

  The matrix is computed with normalize="true", and `labels` supplies the
  tick labels. Nothing is returned; the figure is displayed via plt.show().
  """
  normalized = confusion_matrix(y_true, y_preds, normalize="true")
  _, axis = plt.subplots(figsize=(6, 6))
  ConfusionMatrixDisplay(
      confusion_matrix=normalized,
      display_labels=labels,
  ).plot(cmap="Blues", values_format=".2f", ax=axis, colorbar=False)
  plt.title("Normalized confusion matrix")
  plt.show()

In [None]:
plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
from transformers import TextClassificationPipeline

In [None]:
emotions_pipeline = TextClassificationPipeline(model = model.cpu(), tokenizer = tokenizer)

In [None]:
emotions_pipeline("I am so angry")

In [None]:
from datasets import load_dataset

In [None]:
# Load the "PAN-X.en" configuration of the "xtreme" dataset; later cells read its
# "tokens" and "ner_tags" columns.
ner_dataset = load_dataset("xtreme", name="PAN-X.en")

In [None]:
ner_dataset

In [None]:
tags = ner_dataset["train"].features["ner_tags"].feature

In [None]:
tags

In [None]:
# Two-way lookup between integer tag ids and tag names.
index2tag = dict(enumerate(tags.names))
tag2index = {name: position for position, name in index2tag.items()}

In [None]:
index2tag

In [None]:
ner_dataset['train']['tokens'][2]

In [None]:
ner_dataset['train']['ner_tags'][2]

In [None]:
[index2tag[t] for t in ner_dataset['train']['ner_tags'][2]]

In [None]:
from transformers import AutoTokenizer
# pipeline() expects a task name, not a checkpoint id; load the tokenizer itself so that
# the later calls (is_split_into_words, word_ids, convert_ids_to_tokens) are available.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [None]:
encoded_text = tokenizer(ner_dataset['train']['tokens'][2], is_split_into_words=True)

In [None]:
encoded_text

In [None]:
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)

In [None]:
tokens

In [None]:
encoded_text.word_ids(batch_index = 0)

In [None]:

def tokenize_and_align_labels(examples):
  """Tokenize pre-split words and attach one label id per produced token.

  Args:
    examples: batch with "tokens" (one word list per sentence) and "ner_tags"
      (one tag-id list per sentence, indexed by word position).

  Returns:
    The tokenizer output with an added "labels" entry. A token receives its
    word's tag only when it is the first token of that word; tokens whose word
    id is None, and continuation tokens of a word, receive -100.
  """
  encodings = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

  def _labels_for(word_ids, word_tags):
    # Walk the tokens, remembering the word the previous token belonged to.
    aligned = []
    last_word = None
    for current_word in word_ids:
      starts_word = current_word is not None and current_word != last_word
      aligned.append(word_tags[current_word] if starts_word else -100)
      last_word = current_word
    return aligned

  encodings["labels"] = [
      _labels_for(encodings.word_ids(batch_index=row), word_tags)
      for row, word_tags in enumerate(examples["ner_tags"])
  ]
  return encodings


In [None]:
# Tokenize every split in batches and drop the raw columns that were consumed.
# NOTE(review): this rebinds `ner_dataset`, so re-running the cell would hand
# tokenize_and_align_labels examples without the "tokens" column it reads.
ner_dataset = ner_dataset.map(tokenize_and_align_labels, batched=True,
remove_columns=['langs', 'ner_tags', 'tokens'])

In [None]:
import numpy as np

In [None]:
def align_predictions(predictions, label_ids):
  """Turn raw scores and label ids into per-sentence lists of tag names.

  Args:
    predictions: array indexed as [sentence, token, class]; the predicted
      class is the argmax over axis 2.
    label_ids: 2-D array of true tag ids; positions equal to -100 are skipped.

  Returns:
    (preds_list, labels_list): for each sentence, the predicted and the true
    tag names (looked up in `index2tag`) at the positions that were kept.
  """
  predicted_ids = np.argmax(predictions, axis=2)
  n_sentences, n_tokens = predicted_ids.shape
  all_true, all_pred = [], []
  for row in range(n_sentences):
    # Positions that carry a real label for this sentence.
    kept = [col for col in range(n_tokens) if label_ids[row, col] != -100]
    all_true.append([index2tag[label_ids[row][col]] for col in kept])
    all_pred.append([index2tag[predicted_ids[row][col]] for col in kept])
  return all_pred, all_true

In [None]:
!pip install seqeval

In [None]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
  """Compute the entity-level F1 score for a token-classification evaluation.

  Args:
    eval_pred: object exposing `predictions` and `label_ids`, both forwarded
      to `align_predictions`.

  Returns:
    dict with the single key "f1".
  """
  y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
  # seqeval's signature is f1_score(y_true, y_pred): pass the references first.
  return {"f1": f1_score(y_true, y_pred)}

In [None]:
from transformers import AutoModelForTokenClassification
import torch


In [None]:
# Token-classification head on top of xlm-roberta-base, with one output per tag
# and the id<->name mappings stored in the model config.
# Use the GPU when one is available and fall back to CPU otherwise, instead of
# failing on a hard-coded "cuda" device.
ner_model = AutoModelForTokenClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index
     ).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for fine-tuning the NER model. Note that `batch_size`,
# `logging_steps` and `training_args` are rebound here, replacing the values
# used for the emotion classifier.
num_epochs = 3
batch_size = 8
# Number of full batches in the training split, used as the logging interval.
logging_steps = len(ner_dataset["train"]) // batch_size
training_args = TrainingArguments(
    output_dir="ner_model",
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    # Very large interval; presumably meant to avoid intermediate checkpoints — TODO confirm.
    save_steps=1e6,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    )


In [None]:
from transformers import DataCollatorForTokenClassification 

data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from transformers import Trainer

In [None]:
# Trainer for the NER model.
trainer = Trainer(
    # `model` is the emotion classifier from earlier in the notebook; the
    # token-classification model built for this task is `ner_model`.
    model = ner_model,
    args = training_args,
    # tokenize_and_align_labels does not pad, so batches need the collator
    # created above to pad inputs and labels to a common length.
    data_collator = data_collator,
    train_dataset = ner_dataset["train"],
    eval_dataset = ner_dataset["test"],
    compute_metrics = compute_metrics,
)

In [None]:
trainer.train()

In [None]:
from transformers import TokenClassificationPipeline

In [None]:
ner_pipeline = TokenClassificationPipeline(model= ner_model.cpu(), tokenizer = tokenizer)

In [None]:
ner_pipeline("His name is Robert Paulson")