## FineTuning with IMDB Dataset

In [1]:
! pip install transformers[sentencepiece]
! pip install datasets



In [2]:
from transformers import pipeline
from datasets import load_dataset

In [3]:
imdb = load_dataset("imdb")

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer


checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
def tokenize_imdb_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = imdb.map(tokenize_imdb_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-6c89106cc0230425.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-e44dbee1caaf8fcd.arrow


In [8]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

In [9]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

In [43]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
training_args = TrainingArguments("test_trainer")

trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=375, training_loss=0.1939275919596354, metrics={'train_runtime': 275.5582, 'train_samples_per_second': 10.887, 'train_steps_per_second': 1.361, 'total_flos': 397402195968000.0, 'train_loss': 0.1939275919596354, 'epoch': 3.0})

In [44]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.evaluate()

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'eval_accuracy': 0.863,
 'eval_loss': 0.7737125158309937,
 'eval_runtime': 32.9696,
 'eval_samples_per_second': 30.331,
 'eval_steps_per_second': 3.791}

### Pure Pytorch

In [19]:
from torch.utils.data import DataLoader
from datasets import load_metric
from transformers import AdamW
from transformers import get_scheduler

In [20]:
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

optimizer = AdamW(model.parameters(), lr=5e-5)


num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)



In [22]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [23]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/375 [00:00<?, ?it/s]

In [24]:
metric = load_metric("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.873}

In [27]:
pt_save_directory = "./pt_save_pretrained"

tokenizer.save_pretrained(pt_save_directory)

model.save_pretrained(pt_save_directory)

In [29]:
! ls pt_save_pretrained

config.json	   special_tokens_map.json  tokenizer.json
pytorch_model.bin  tokenizer_config.json    vocab.txt


In [33]:
for batch in eval_dataloader:
  print(f"batches : {batch}")
  break

batches : {'labels': tensor([1, 1, 0, 1, 0, 1, 1, 0]), 'input_ids': tensor([[ 101, 1026, 7987,  ...,    0,    0,    0],
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 2023, 3185,  ...,    0,    0,    0],
        ...,
        [ 101, 2044, 1037,  ...,    0,    0,    0],
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 2123, 1005,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [35]:
model.to(torch.device("cpu"))

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [36]:
pt_outputs = model(**batch)

In [37]:
print(pt_outputs)

SequenceClassifierOutput(loss=tensor(0.0818, grad_fn=<NllLossBackward0>), logits=tensor([[-2.7224e+00,  2.8038e+00],
        [-2.4453e+00,  2.5370e+00],
        [ 2.2058e-01,  3.1478e-03],
        [-1.5988e+00,  1.6608e+00],
        [ 3.1647e+00, -2.6685e+00],
        [-3.7235e+00,  4.0082e+00],
        [-2.1883e+00,  2.2664e+00],
        [ 3.7447e+00, -3.1668e+00]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [38]:
from torch import nn

pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)

tensor([[3.9649e-03, 9.9604e-01],
        [6.8112e-03, 9.9319e-01],
        [5.5414e-01, 4.4586e-01],
        [3.6983e-02, 9.6302e-01],
        [9.9708e-01, 2.9202e-03],
        [4.3850e-04, 9.9956e-01],
        [1.1490e-02, 9.8851e-01],
        [9.9900e-01, 9.9526e-04]], grad_fn=<SoftmaxBackward0>)


In [42]:
batch['labels']

tensor([1, 1, 0, 1, 0, 1, 1, 0])

In [46]:
import torch
from torch import tensor

pt_outputs = model(**batch, labels=tensor([1, 1, 0, 1, 0, 1, 1, 0]))
print(pt_outputs)

TypeError: ignored

##### Older expts

In [23]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding=True)


tokenized_datasets = imdb.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [30]:
t1 = 'It was great to see some of my favorite stars of 30 years ago including John Ritter, Ben Gazarra and Audrey Hepburn. They looked quite wonderful. But that was it. They were not given any characters or good lines to work with. I neither understood or cared what the characters were doing.<br /><br />Some of the smaller female roles were fine, Patty Henson and Colleen Camp were quite competent and confident in their small sidekick parts. They showed some talent and it is sad they didn\'t go on to star in more and better films. Sadly, I didn\'t think Dorothy Stratten got a chance to act in this her only important film role.<br /><br />The film appears to have some fans, and I was very open-minded when I started watching it. I am a big Peter Bogdanovich fan and I enjoyed his last movie, "Cat\'s Meow" and all his early ones from "Targets" to "Nickleodeon". So, it really surprised me that I was barely able to keep awake watching this one.<br /><br />It is ironic that this movie is about a detective agency where the detectives and clients get romantically involved with each other. Five years later, Bogdanovich\'s ex-girlfriend, Cybil Shepherd had a hit television series called "Moonlighting" stealing the story idea from Bogdanovich. Of course, there was a great difference in that the series relied on tons of witty dialogue, while this tries to make do with slapstick and a few screwball lines.<br /><br />Bottom line: It ain\'t no "Paper Moon" and only a very pale version of "What\'s Up, Doc".'
paraphrase = tokenizer(t1, return_tensors="pt")

paraphrase_classification_logits = model(**paraphrase).logits

In [31]:
paraphrase_classification_logits

tensor([[-0.2434,  0.3627]], grad_fn=<AddmmBackward0>)

In [32]:
 torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]

[0.35295718908309937, 0.6470427513122559]

In [None]:
imdb['train'][10]["label"]

0

In [None]:
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [None]:
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['text', 'labels', 'input_ids', 'attention_mask']

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=8
)

In [None]:
for batch in train_dataloader:
    break

TypeError: ignored

## Blurr (experiments)

In [None]:
! pip install git+https://github.com/ohmeow/blurr.git@dev-2.0.0 -Uqq

[?25l[K     |█▊                              | 10 kB 33.0 MB/s eta 0:00:01[K     |███▌                            | 20 kB 40.4 MB/s eta 0:00:01[K     |█████▏                          | 30 kB 40.8 MB/s eta 0:00:01[K     |███████                         | 40 kB 43.3 MB/s eta 0:00:01[K     |████████▋                       | 51 kB 27.7 MB/s eta 0:00:01[K     |██████████▍                     | 61 kB 29.7 MB/s eta 0:00:01[K     |████████████                    | 71 kB 22.9 MB/s eta 0:00:01[K     |█████████████▉                  | 81 kB 24.4 MB/s eta 0:00:01[K     |███████████████▋                | 92 kB 25.5 MB/s eta 0:00:01[K     |█████████████████▎              | 102 kB 23.0 MB/s eta 0:00:01[K     |███████████████████             | 112 kB 23.0 MB/s eta 0:00:01[K     |████████████████████▊           | 122 kB 23.0 MB/s eta 0:00:01[K     |██████████████████████▌         | 133 kB 23.0 MB/s eta 0:00:01[K     |████████████████████████▏       | 143 kB 23.0 MB/s eta 0:

In [None]:
import torch
from transformers import *
from fastai.text.all import *

from blurr.data.all import *
from blurr.modeling.all import *

In [None]:
path = untar_data(URLs.IMDB_SAMPLE)

model_path = Path('models')
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
n_labels = len(imdb_df['label'].unique())

In [None]:
n_labels

2

In [None]:
imdb_df['label'].unique()

array(['negative', 'positive'], dtype=object)

In [None]:
model_cls = AutoModelForSequenceClassification

pretrained_model_name = "bert-base-uncased"

config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = n_labels

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls, config=config)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

In [None]:
blocks = (TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), CategoryBlock)

In [None]:
dblock = DataBlock(blocks=blocks,  get_x=ColReader('text'), get_y=ColReader('label'), splitter=ColSplitter())

dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
dls.show_batch(dataloaders=dls)

Unnamed: 0,text,target
0,"raising victor vargas : a review < br / > < br / > you know, raising victor vargas is like sticking your hands into a big, steaming bowl of oatmeal. it's warm and gooey, but you're not sure if it feels right. try as i might, no matter how warm and gooey raising victor vargas became i was always aware that something didn't quite feel right. victor vargas suffers from a certain overconfidence on the director's part. apparently, the director thought that the ethnic backdrop of a latino family on the lower east side, and an idyllic storyline would make the film critic proof. he was right, but it didn't fool me. raising victor vargas is the story about a seventeen - year old boy called, you guessed it, victor vargas ( victor rasuk ) who lives his teenage years chasing more skirt than the rolling stones could do in all the years they've toured. the movie starts off in ` ugly fat'donna's bedroom where victor is sure to seduce her, but a cry from outside disrupts his plans when his best - friend harold ( kevin rivera ) comes - a - looking for him. caught in the attempt by harold and his sister, victor vargas runs off for damage control. yet even with the embarrassing implication that he's been boffing the homeliest girl in the neighborhood, nothing dissuades young victor from going off on the hunt for more fresh meat. on a hot, new york city day they make way to the local public swimming pool where victor's eyes catch a glimpse of the lovely young nymph judy ( judy marte ), who's not just pretty, but a strong and independent too. the relationship that develops between victor and judy becomes the focus of the film. the story also focuses on victor's family that is comprised of his grandmother or abuelita ( altagracia guzman ), his brother nino ( also played by real life brother to victor, silvestre rasuk ) and his sister vicky ( krystal rodriguez ). the action follows victor between scenes with judy and scenes with his family. victor tries to cope with being an oversexed pimp - daddy, his feelings for judy and his grandmother's conservative catholic upbringing. < br / > < br / > the problems that arise from raising victor vargas are a few, but glaring errors. throughout the film you get to know certain characters like vicky, nino, grandma, judy and even",negative
1,"the shop around the corner is one of the sweetest and most feel - good romantic comedies ever made. there's just no getting around that, and it's hard to actually put one's feeling for this film into words. it's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. in fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. it's easy to think that such a love story, as beautiful as any other ever told, * could * happen to you... a feeling you don't often get from other romantic comedies, however sweet and heart - warming they may be. < br / > < br / > alfred kralik ( james stewart ) and clara novak ( margaret sullavan ) don't have the most auspicious of first meetings when she arrives in the shop ( matuschek & co. ) he's been working in for the past nine years, asking for a job. they clash from the very beginning, mostly over a cigarette box that plays music when it's opened - - he thinks it's a ludicrous idea ; she makes one big sell of it and gets hired. their bickering takes them through the next six months, even as they both ( unconsciously, of course! ) fall in love with each other when they share their souls and minds in letters passed through po box 237. this would be a pretty thin plotline to base an entire film on, except that the shop around the corner is expertly fleshed - out with a brilliant supporting cast made up of entirely engaging characters, from the fatherly but lonely hugo matuschek ( frank morgan ) himself, who learns that his shop really is his home ; pirovitch ( felix bressart ), kralik's sidekick and friend who always skitters out of the room when faced with the possibility of being asked for his honest opinion ; smarmy pimp - du - jour vadas ( joseph schildkraut ) who ultimately gets his comeuppance from a gloriously righteous kralik ; and ambitious errand boy pepi katona ( william tracy ) who wants nothing more than to be promoted to the position of clerk for matuschek & co. the unpretentious love story between '",positive
2,"the blob starts with one of the most bizarre theme songs ever, sung by an uncredited burt bacharach of all people! you really have to hear it to believe it, the blob may be worth watching just for this song alone & my user comment summary is just a little taste of the classy lyrics... after this unnerving opening credits sequence the blob introduces us, the viewer that is, to steve andrews ( steve mcqueen as steven mcqueen ) & his girlfriend jane martin ( aneta corsaut ) who are parked on their own somewhere & witness what looks like a meteorite falling to earth in nearby woods. an old man ( olin howland as olin howlin ) who lives in a cabin also sees it & goes to investigate, he finds a crater & a strange football sized rock which splits open when he unwisely pokes it with a stick. laying in the centre of the meteorite is a strange jelly like substance which sticks to the stick, if you know what i mean! it then slides up the stick & attaches itself to the old man's hand. meanwhile steve & jane are quietly driving along minding their own business when the old man runs out in front of steve's car, steve being a decent kinda guy decides to take the old man to dr. t. hallan ( alden'stephen'chase as steven chase ) at the local surgery. dr. hallan says he doesn't know what the substance on the old man's hand is but it's getting bigger & asks steve to go back where he found him & see if he can find out what happened. steve agrees but doesn't come up with anything & upon returning to dr. hallan's surgery he witnesses the blob devouring him. the town's police, lieutenant dave ( earl rowe ) & the teenage hating sergeant jim bert ( john benson ) unsurprisingly don't believe a word of it & end up suspecting steve & his mates al ( anthony franke ), tony ( robert fields ) & someone called'mooch'miller ( james bonnet ) of playing an elaborate practical joke on the police department. however as the blob continues to eat it's way through the town steve sets about finding proof of it's existence & convincing the police about the threat it posses not just to their town but the entire world! < br / > < br / > directed irvin s. yeaw",negative
3,"the year 2005 saw no fewer than 3 filmed productions of h. g. wells'great novel, "" war of the worlds "". this is perhaps the least well - known and very probably the best of them. no other version of wotw has ever attempted not only to present the story very much as wells wrote it, but also to create the atmosphere of the time in which it was supposed to take place : the last year of the 19th century, 1900 using wells'original setting, in and near woking, england. < br / > < br / > imdb seems unfriendly to what they regard as "" spoilers "". that might apply with some films, where the ending might actually be a surprise, but with regard to one of the most famous novels in the world, it seems positively silly. i have no sympathy for people who have neglected to read one of the seminal works in english literature, so let's get right to the chase. the aliens are destroyed through catching an earth disease, against which they have no immunity. if that's a spoiler, so be it ; after a book and 3 other films ( including the 1953 classic ), you ought to know how this ends. < br / > < br / > this film, which follows wells'plot in the main, is also very cleverly presented in a way that might put many viewers off due to their ignorance of late 19th / early 20th century photography. although filmed in a widescreen aspect, the film goes to some lengths to give an impression of contemporaneity. the general coloration of skin and clothes display a sepia tint often found in old photographs ( rather than black ). colors are often reminiscent of hand - tinting. at other times, colors are washed out. these variations are typical of early films, which didn't use standardized celluloid stock and therefore presented a good many changes in print quality, even going from black / white to sepia / white to blue / white to reddish / white and so on as you'll see on occasion here. the special effects are deliberately retrograde, of a sort seen even as late as the 1920s and yet the martians and their machines are very much as wells described them and have a more nearly realistic "" feel "". some of effects are really awkward such as the destruction of big ben. the acting is often more in the style of that period than ours. some aspects of victorian dress may appear odd, particularly the use of pomade",positive


In [None]:
model = BaseModelWrapper(hf_model)

learn = Learner(dls, 
                model,
                opt_func=partial(Adam, decouple_wd=True),
                loss_func=CrossEntropyLossFlat(),
                metrics=[accuracy],
                cbs=[BaseModelCallback],
                splitter=blurr_splitter)

learn.freeze()

learn.fit_one_cycle(3, lr_max=1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,0.569163,0.353877,0.87,00:35
1,0.365226,0.308661,0.9,00:35


In [None]:
learn.show_results(learner=learn, max_n=2)

In [5]:
classifier = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/638M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
classifier("I am very angry at Abhilash for the way he is handling my leaves. Let's be professional like things people say in cleancode")

[{'label': '2 stars', 'score': 0.42168861627578735}]

In [8]:
classifier("I am happy")

[{'label': '5 stars', 'score': 0.5958592891693115}]

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [10]:
inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
inputs

{'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
pt_batch = tokenizer(

    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],

    padding=True,

    truncation=True,

    max_length=512,

    return_tensors="pt",

)

In [12]:
pt_batch

{'input_ids': tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996,   100,
         19081,  3075,  1012,   102],
        [  101,  2057,  3246,  2017,  2123,  1005,  1056,  5223,  2009,  1012,
           102,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [16]:
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)
pt_model.save_pretrained(pt_save_directory)

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

classes = ["not paraphrase", "is paraphrase"]

sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

# The tokenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to
# the sequence, as well as compute the attention masks.
paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

paraphrase_classification_logits = model(**paraphrase).logits
not_paraphrase_classification_logits = model(**not_paraphrase).logits

paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]

# Should be paraphrase
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")

# Should not be paraphrase
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

not paraphrase: 10%
is paraphrase: 90%
not paraphrase: 94%
is paraphrase: 6%
