In [None]:
!pip install datasets==2.1.0 transformers==4.28.0 evaluate --quiet

In [None]:
from datasets import load_dataset
dataset = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# Use nltk to remove stopwords and punctuation, lowercase the text, and lemmatize
import nltk
import re
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources used by the lemmatizer and the stop-word filter.
for corpus_name in ('wordnet', 'omw-1.4', 'stopwords'):
  nltk.download(corpus_name)
#!unzip /usr/share/nltk_data/corpora/stopwords.zip -d /usr/share/nltk_data/corpora/

lemmatizer = WordNetLemmatizer()

# Start from NLTK's English stop-word list, then take the negation/contrast
# words back out so the stop-word filter leaves them in the text
# (presumably because they carry sentiment).
stopwords = stopwords.words('english')
for kept_word in ('no', 'not', 'nor', 'but', 'against'):
  stopwords.remove(kept_word)

def clean(text, contraction_mapping):
  """Normalize raw text into lowercase, single-space-separated words.

  Steps, in order:
    1. replace HTML line breaks (`<br>`, `<br />`) with a space,
    2. remove URLs (anything starting with `http` up to the next whitespace),
    3. remove @mentions and '#' symbols,
    4. expand contractions found in `contraction_mapping`,
    5. replace every remaining non-word character with a space,
    6. lowercase and collapse runs of whitespace.

  URL and tag removal must run before step 5: once punctuation is stripped,
  a URL is broken into separate words and can no longer be matched.

  Args:
    text: raw input string.
    contraction_mapping: dict mapping a contraction (e.g. "don't") to its
      expansion (e.g. "do not"). Lookup is per whitespace-separated token,
      first with the token as written and then with its lowercase form, so
      a contraction with punctuation attached (e.g. "don't,") is not expanded.

  Returns:
    The cleaned string; empty if nothing but whitespace/punctuation remains.
  """
  text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE) # remove html line breaks
  text = re.sub(r'http\S+', ' ', text) # remove url/links while they are still intact
  text = re.sub(r'@[A-Za-z0-9]+', '', text) # remove user mentions
  text = re.sub('#', '', text) # remove hashtag symbols

  expanded = []
  for word in text.split():
    if word in contraction_mapping:
      expanded.append(contraction_mapping[word])
    else:
      # fall back to the lowercase form so "Don't" / "DON'T" are expanded too
      expanded.append(contraction_mapping.get(word.lower(), word))
  text = ' '.join(expanded)

  text = re.sub(r'\W', ' ', text) # remove special characters
  text = text.lower() # convert to lowercase

  return " ".join(text.split()) # split/join collapses repeated white space

def remove_stop_words_and_lemmatize(text):
  """Drop stop words from `text` and lemmatize what is left.

  The text is split on whitespace. Tokens found in the module-level
  `stopwords` collection are discarded, and each remaining token is passed
  through `lemmatizer.lemmatize` with the "v" (verb) part-of-speech tag.

  Returns:
    The lemmatized tokens joined by single spaces.
  """
  kept_tokens = [token for token in text.split() if token not in stopwords]
  lemmas = [lemmatizer.lemmatize(token, "v") for token in kept_tokens]
  return " ".join(lemmas).strip()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Contraction -> expansion lookup used during preprocessing. Defined once at
# module level so it is not rebuilt for every example.
CONTRACTION_MAPPING = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                       "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                       "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                       "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                       "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                       "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                       "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                       "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                       "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                       "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                       "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                       "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                       "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                       "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                       "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                       "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                       "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                       "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                       "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                       "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                       "you're": "you are", "you've": "you have"}

def preprocess_features(example):
  """Clean one dataset example's "text" field; intended for `Dataset.map`.

  Runs `clean` (with `CONTRACTION_MAPPING`) followed by
  `remove_stop_words_and_lemmatize` on `example["text"]`.

  Args:
    example: mapping with a "text" entry holding the raw string.

  Returns:
    The same `example` object with "text" replaced by the preprocessed string.
  """
  cleaned = clean(example["text"], CONTRACTION_MAPPING)
  example["text"] = remove_stop_words_and_lemmatize(cleaned)
  return example

In [None]:
# generate training, validation and testing splits

ds = dataset['train'].shuffle(seed=42)
# Seed the split too: without it every run yields a different train/validation
# partition, so the metrics are not reproducible.
splits = ds.train_test_split(test_size=0.15, seed=42)
train_ds = splits['train'].map(preprocess_features)
val_ds = splits['test'].map(preprocess_features)

# Fixed 1500-review subset of the official test split (preprocessed in a later cell).
test_ds = dataset['test'].shuffle(seed=42).select(range(1500))



  0%|          | 0/21250 [00:00<?, ?ex/s]

  0%|          | 0/3750 [00:00<?, ?ex/s]



In [None]:
# import tokenizer of pretrained bert uncased model

model_name = "bert-base-uncased"

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
def preprocess_function(examples):
   """Tokenize the "text" column of a batch, with truncation enabled."""
   batch_text = examples["text"]
   return tokenizer(batch_text, truncation=True)
 
train_ds = train_ds.map(preprocess_function, batched=True)
val_ds = val_ds.map(preprocess_function, batched=True)

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Build the label-name <-> class-id lookups from the dataset's label feature.
labels = dataset["train"].features["label"].names
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for idx, name in enumerate(labels)}

In [None]:
# setup an evaluation strategy for pretrained model to use

import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    `eval_pred` unpacks into (model outputs, reference labels). The outputs
    are reduced to class ids with an argmax over axis 1 and scored with the
    module-level `accuracy` metric.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
import os
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # No metric_for_best_model is given, so the "best" checkpoint reloaded at
    # the end is the one with the lowest evaluation loss, not highest accuracy.
    load_best_model_at_end=True,
    push_to_hub=False,
    # Replaces the deprecated WANDB_DISABLED environment variable: turns off
    # all logging integrations (W&B, TensorBoard, ...).
    report_to="none",
)

# Trainer wiring: dynamic padding via the collator, accuracy via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2829,0.23468,0.906667
2,0.1731,0.315259,0.900533
3,0.1085,0.342766,0.916


TrainOutput(global_step=3987, training_loss=0.20144629209359127, metrics={'train_runtime': 4752.823, 'train_samples_per_second': 13.413, 'train_steps_per_second': 0.839, 'total_flos': 1.308368953322724e+16, 'train_loss': 0.20144629209359127, 'epoch': 3.0})

In [None]:
# create a function to infer class from input
import torch
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
def inferinputs(text):
    """Predict the class label for a single input string.

    Args:
        text: one (already preprocessed) string to classify.

    Returns:
        The label name from `model.config.id2label` for the highest-scoring class.
    """
    # truncation=True caps the input at the tokenizer's maximum length, so
    # long reviews no longer overflow the model's maximum sequence length.
    inputs = tokenizer(text, truncation=True, return_tensors="pt").to(device)
    model.eval()  # make sure dropout is off for inference
    with torch.no_grad():
        logits = model(**inputs).logits
    # argmax over the class dimension; .item() requires exactly one input row
    predicted_class_id = logits.argmax(dim=-1).item()
    return model.config.id2label[predicted_class_id]

In [None]:
test_ds = test_ds.map(preprocess_features)

  0%|          | 0/1500 [00:00<?, ?ex/s]

In [None]:
print(inferinputs(test_ds[0]['text']))

pos


'pos'

In [None]:
from sklearn.metrics import classification_report

# True labels as label names, so they are comparable with the predictions.
labels = [model.config.id2label[label_id] for label_id in test_ds['label']]
text = test_ds['text']
predictions = []

# Batched inference. The tokenizer truncates each review to the model's
# maximum length in *tokens*. Cutting the string to 512 characters instead
# discards most of a long review (NOTE(review): likely costs accuracy — confirm).
EVAL_BATCH_SIZE = 16
model.eval()  # make sure dropout is off for inference
for start in range(0, len(text), EVAL_BATCH_SIZE):
    batch = tokenizer(
        text[start:start + EVAL_BATCH_SIZE],
        truncation=True,
        padding=True,  # pad to the longest sequence in this batch
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        logits = model(**batch).logits
    predictions.extend(
        model.config.id2label[class_id] for class_id in logits.argmax(dim=-1).tolist()
    )

In [None]:
print(classification_report(labels, predictions))

              precision    recall  f1-score   support

         neg       0.88      0.87      0.87       762
         pos       0.87      0.87      0.87       738

    accuracy                           0.87      1500
   macro avg       0.87      0.87      0.87      1500
weighted avg       0.87      0.87      0.87      1500



In [None]:
id2label

{0: 'neg', 1: 'pos'}

In [None]:
# Slice [0:5] so the output really contains five examples, as the message says.
print(f"first five test sentences (preprocessed): {test_ds[0:5]['text']}")
print(f"their true labels {labels[0:5]}")
print(f"predictions {predictions[0:5]}")

first five test sentences (preprocessed): ['br br unsuspectedly rent thousand acres think entertain king lear story course michelle pfeiffer could go wrong br br quickly however realize story thousand things besides acres start cry could not stop long movie end thank jane laura jocelyn bring us wonderfully subtle compassionate movie thank cast involve portray character depth gentleness br br recognize angry sister runaway sister sister denial recognize abusive husband father oh oh father superbly play also recognize movie eye opener relief chance face truth finally something truly hope thousand acres effect others br br since not understand cover say film sisters fight land not fight watch second time able see one not live similar story one would easily miss overwhelm undercurrent dread fear deep bond sisters run exactly reason people general often overlook truth neighbor instance br br but yet another reason movie perfect br br not give rat ass pardon french extend king lear story fol

In [None]:
!pip install huggingface_hub --quiet

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model.push_to_hub("hazardous/bert_base_uncased_finetuned_imdb")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hazardous/bert_base_uncased_finetuned_imdb/commit/5d69f46f23b4dfd4b58bf8c874edfe3c97e0e1a4', commit_message='Upload BertForSequenceClassification', commit_description='', oid='5d69f46f23b4dfd4b58bf8c874edfe3c97e0e1a4', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub("hazardous/bert_base_uncased_finetuned_imdb")

CommitInfo(commit_url='https://huggingface.co/hazardous/bert_base_uncased_finetuned_imdb/commit/bd235afe1c12dd058314a4e824e9aee5074277ca', commit_message='Upload tokenizer', commit_description='', oid='bd235afe1c12dd058314a4e824e9aee5074277ca', pr_url=None, pr_revision=None, pr_num=None)