In [None]:
%%capture
# Install the Hugging Face libraries used by this notebook; `%%capture` hides the pip output.
# NOTE(review): `datasets` is pinned to 1.0.2 but `transformers` is unpinned, so a fresh
# run may pull a different transformers release than the one this notebook was written against.
!pip install datasets==1.0.2
!pip install transformers

import datasets
import transformers

In [None]:
from transformers import BertTokenizerFast

# BERT's tokenizer defines no BOS/EOS tokens of its own, so [CLS] and [SEP] are
# registered under those names; the model config below reads bos/eos token ids.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

In [None]:
# Load the first 60,000 Apparel reviews.
# NOTE(review): the next cell rebinds `train_data` to the Grocery subset, so when the
# notebook is run top to bottom this download is discarded — confirm whether it is still needed.
train_data = datasets.load_dataset("amazon_us_reviews","Apparel_v1_00",split="train[:60000]")

In [None]:
# Carve train / validation / test subsets out of the Grocery "train" split using
# three consecutive, non-overlapping row ranges.
train_data, val_data, test_data = (
    datasets.load_dataset("amazon_us_reviews", "Grocery_v1_00", split=row_range)
    for row_range in ("train[:60000]", "train[60000:63000]", "train[63000:65000]")
)

In [None]:
import pandas as pd
from IPython.display import display, HTML
from datasets import ClassLabel

# Render three raw training rows as an HTML table, showing ClassLabel columns
# by their label names rather than their integer codes.
df = pd.DataFrame(train_data[5000:5003])
for column, typ in train_data.features.items():
    if not isinstance(typ, ClassLabel):
        continue
    df[column] = df[column].transform(lambda i, names=typ.names: names[i])
display(HTML(df.to_html()))

Unnamed: 0,customer_id,helpful_votes,marketplace,product_category,product_id,product_parent,product_title,review_body,review_date,review_headline,review_id,star_rating,total_votes,verified_purchase,vine
0,36290331,0,US,Apparel,B011K3PEWO,183050004,Women Swing Dresses Retro Hepburn Dress Classy Size XL F-4,very well constructed,2015-08-05,Elegant,R31XYT5049JIMC,5,2,Y,N
1,7478727,12,US,Apparel,B011K2MKXQ,948944557,Leno Fashion Mens Pullover Fleece Hoodies,"I ordered this and it came a few weeks before the estimated delivery date. I bought this for my brother who normally wears a size medium in mens. After reading the reviews I ordered an XL and it fits, but it's a bit snug. The arms are tight. I probably could have ordered next size up and it would have fit much better. But he likes it so overall just get a few sizes up and you're good.",2015-08-11,fast shipping.,R27KMSEBXHA5RR,4,13,Y,N
2,30609491,2,US,Apparel,B011K0WPZ6,422607286,Pullover Tee Top in Plaid Pattern for Women Large,I love this shirt !!! It fit very well and I was worried it wouldn't since it's in Asian sizes instead of USA. It arrived earlier than I had expected. Great product and purchase😃👍🏻,2015-08-28,Super nice shirt!!!,RPREUFG7K9L4A,5,3,Y,N


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Custom dataset that reads review rows from a dataframe and tokenizes them on
# demand, so a DataLoader can feed them to the encoder-decoder model for
# fine-tuning and prediction.

class CustomDataset(Dataset):
    """Map-style dataset yielding one tokenized (review body -> headline) pair per row.

    Args:
        dataframe: pandas DataFrame with ``review_body`` and ``review_headline`` columns.
        tokenizer: tokenizer exposing ``batch_encode_plus`` and ``pad_token_id``.
        source_len: fixed token length of the encoder input (the review body).
        summ_len: fixed token length of the decoder target (the review headline).
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        # `ctext` is the text to summarise (encoded to `source_len`);
        # `text` is the short target (encoded to `summ_len`).
        # The two columns were previously assigned the other way round, which
        # padded headlines to `source_len` and cut bodies down to `summ_len`.
        self.text = self.data.review_headline
        self.ctext = self.data.review_body

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize row ``index`` (positional) and return the model inputs.

        Returns:
            dict of 1-D ``torch.long`` tensors: ``input_ids`` and ``attention_mask``
            (length ``source_len``), ``decoder_input_ids``, ``decoder_attention_mask``
            and ``labels`` (length ``summ_len``). Padding positions in ``labels``
            are set to -100.
        """
        # Positional lookup: a DataLoader asks for 0..len-1 regardless of how
        # the dataframe happens to be indexed.
        ctext = ' '.join(str(self.ctext.iloc[index]).split())
        text = ' '.join(str(self.text.iloc[index]).split())

        # Explicit padding + truncation guarantees every item has the same
        # length, which the default DataLoader collation requires.
        inputs = self.tokenizer.batch_encode_plus(
            [ctext], max_length=self.source_len, padding="max_length",
            truncation=True, return_tensors='pt')
        outputs = self.tokenizer.batch_encode_plus(
            [text], max_length=self.summ_len, padding="max_length",
            truncation=True, return_tensors='pt')

        # Drop only the leading batch dimension of size 1.
        source_ids = inputs['input_ids'].squeeze(0)
        source_mask = inputs['attention_mask'].squeeze(0)
        target_ids = outputs['input_ids'].squeeze(0)
        target_mask = outputs['attention_mask'].squeeze(0)

        # Tensors have no .copy(); clone so that masking the labels does not
        # alter decoder_input_ids. PAD positions become -100, the same masking
        # process_data_to_model_inputs applies to its labels.
        labels = target_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': source_ids.to(dtype=torch.long),
            'attention_mask': source_mask.to(dtype=torch.long),
            'decoder_input_ids': target_ids.to(dtype=torch.long),
            'decoder_attention_mask': target_mask.to(dtype=torch.long),
            'labels': labels.to(dtype=torch.long)
        }

In [None]:
# Wrap the preview frame `df` (built above from three training rows) with
# source_len=30 and summ_len=5.
# NOTE(review): this covers only those three rows, not the full training split.
training_set = CustomDataset(df, tokenizer, 30, 5)

In [None]:
# Keyword arguments forwarded to the training DataLoader.
train_params = dict(batch_size=64, shuffle=True, num_workers=0)

In [None]:
# Batch `training_set` using the options collected in `train_params`.
training_loader = DataLoader(training_set, **train_params)

In [None]:
# Batch size shared by the dataset map call and the trainer, plus the fixed
# token lengths used when tokenizing review bodies (encoder) and headlines (decoder).
batch_size = 64  # change to 16 for full training
encoder_max_length = 16
decoder_max_length = 6

def process_data_to_model_inputs(batch):
    """Tokenize a batch of reviews into encoder inputs, decoder inputs and labels.

    Reads ``batch["review_body"]`` for the encoder side and
    ``batch["review_headline"]`` for the decoder side, padding/truncating each
    to the module-level ``encoder_max_length`` / ``decoder_max_length``.

    Returns:
        The same ``batch`` with ``input_ids``, ``attention_mask``,
        ``decoder_input_ids``, ``decoder_attention_mask`` and ``labels`` added.
    """
    shared_kwargs = {"padding": "max_length", "truncation": True}
    encoded_bodies = tokenizer(batch["review_body"], max_length=encoder_max_length, **shared_kwargs)
    encoded_headlines = tokenizer(batch["review_headline"], max_length=decoder_max_length, **shared_kwargs)

    batch["input_ids"] = encoded_bodies.input_ids
    batch["attention_mask"] = encoded_bodies.attention_mask
    batch["decoder_input_ids"] = encoded_headlines.input_ids
    batch["decoder_attention_mask"] = encoded_headlines.attention_mask

    # Labels start out equal to the decoder input ids (the original author notes
    # that the model shifts them itself); PAD positions are replaced with -100
    # so that they are ignored.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in encoded_headlines.input_ids
    ]

    return batch

# only use 32 training examples for notebook - DELETE LINE FOR FULL TRAINING
#train_data = train_data.select(range(32))

# Tokenize the training split in batches of `batch_size` and drop the two raw text columns.
# NOTE(review): `train_data` is rebound to its own mapped result, so this cell cannot be
# re-run without reloading the dataset first ("review_body"/"review_headline" are gone).
train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["review_body", "review_headline"]
)
# Expose only the five model columns, as torch tensors, when rows are read.
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
# only use 16 validation examples for notebook - DELETE LINE FOR FULL TRAINING
#val_data = val_data.select(range(16))

# NOTE(review): validation preprocessing is disabled below, so `val_data` still holds
# raw text and is not passed to the trainer further down.
# val_data = val_data.map(
#     process_data_to_model_inputs, 
#     batched=True, 
#     batch_size=batch_size, 
#     remove_columns=["review_body", "review_headline"]
# )
# val_data.set_format(
#     type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
# )

HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))




In [None]:
# Display the dataset repr to check which columns exist after tokenization.
train_data

Dataset(features: {'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'customer_id': Value(dtype='string', id=None), 'decoder_attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'decoder_input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'helpful_votes': Value(dtype='int32', id=None), 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'marketplace': Value(dtype='string', id=None), 'product_category': Value(dtype='string', id=None), 'product_id': Value(dtype='string', id=None), 'product_parent': Value(dtype='string', id=None), 'product_title': Value(dtype='string', id=None), 'review_date': Value(dtype='string', id=None), 'review_id': Value(dtype='string', id=None), 'star_rating': Value(dtype='int32', id=None), 'total_votes': Value(dtype='int32', id=None), 'verified_purchase': Clas

In [None]:
from transformers import EncoderDecoderModel

# Build the encoder-decoder model with "bert-base-uncased" as both the encoder
# and the decoder checkpoint.
bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

In [None]:

# All settings below are written onto the model's own config object.
cfg = bert2bert.config

# Special tokens: the ids come from the tokenizer configured earlier
# (bos/eos are the tokenizer's [CLS]/[SEP]).
cfg.decoder_start_token_id = tokenizer.bos_token_id
cfg.eos_token_id = tokenizer.eos_token_id
cfg.pad_token_id = tokenizer.pad_token_id

# Generation defaults (beam search); the vocabulary size is copied from the decoder.
cfg.vocab_size = cfg.decoder.vocab_size
cfg.max_length = 16
cfg.min_length = 2
cfg.no_repeat_ngram_size = 2
cfg.early_stopping = True
cfg.length_penalty = 2.0
cfg.num_beams = 4

In [None]:
import transformers.training_args

In [None]:
%%capture
# Replace any local copy of seq2seq_trainer.py with the one from the transformers repo.
# NOTE(review): the URL points at the `master` branch, so the downloaded file is not
# pinned to a release and may change or disappear — consider pinning a tag or commit.
!rm seq2seq_trainer.py
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/seq2seq/seq2seq_trainer.py

# Extra packages installed for the trainer script and for the ROUGE metric used below.
!pip install git-python==1.0.3
!pip install sacrebleu==1.4.12
!pip install rouge_score

from seq2seq_trainer import Seq2SeqTrainer
from transformers import TrainingArguments
from dataclasses import dataclass, field
from typing import Optional

In [None]:

@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """``TrainingArguments`` extended with sequence-to-sequence options.

    Each field carries its own description in ``metadata["help"]``.
    """

    label_smoothing: Optional[float] = field(
        default=0.0,
        metadata={"help": "The label smoothing epsilon to apply (if not zero)."},
    )
    # NOTE(review): the help text below spells "SortishSamler"; kept as-is.
    sortish_sampler: bool = field(
        default=False,
        metadata={"help": "Whether to SortishSamler or not."},
    )
    predict_with_generate: bool = field(
        default=False,
        metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."},
    )
    adafactor: bool = field(
        default=False,
        metadata={"help": "whether to use adafactor"},
    )
    encoder_layerdrop: Optional[float] = field(
        default=None,
        metadata={"help": "Encoder layer dropout probability. Goes into model.config."},
    )
    decoder_layerdrop: Optional[float] = field(
        default=None,
        metadata={"help": "Decoder layer dropout probability. Goes into model.config."},
    )
    dropout: Optional[float] = field(
        default=None,
        metadata={"help": "Dropout probability. Goes into model.config."},
    )
    attention_dropout: Optional[float] = field(
        default=None,
        metadata={"help": "Attention dropout probability. Goes into model.config."},
    )
    lr_scheduler: Optional[str] = field(
        default="linear",
        metadata={"help": "Which lr scheduler to use."},
    )

In [None]:
# load rouge for validation
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Decode predictions and labels, then report ROUGE-2 precision/recall/F-measure.

    Args:
        pred: object exposing ``label_ids`` and ``predictions`` token-id arrays.
            ``label_ids`` is modified in place: -100 entries are replaced by the
            tokenizer's pad id so that they can be decoded.

    Returns:
        dict with ``rouge2_precision``, ``rouge2_recall`` and ``rouge2_fmeasure``,
        each rounded to four decimals.
    """
    reference_ids = pred.label_ids
    predicted_ids = pred.predictions

    # Special tokens (including padding) are stripped while decoding.
    decoded_predictions = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id
    decoded_references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    all_scores = rouge.compute(
        predictions=decoded_predictions,
        references=decoded_references,
        rouge_types=["rouge2"],
    )
    rouge2_mid = all_scores["rouge2"].mid

    return {
        f"rouge2_{part}": round(getattr(rouge2_mid, part), 4)
        for part in ("precision", "recall", "fmeasure")
    }

In [None]:

# set training arguments - these params are not really tuned, feel free to change
# The step counts below are the short demo values; the trailing comments give the
# values suggested for a full training run.
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    #evaluate_during_training=True,
    do_train=True,
    do_eval=True,
    logging_steps=2,  # set to 1000 for full training
    save_steps=16,  # set to 500 for full training
    eval_steps=20,  # set to 8000 for full training
    warmup_steps=1,  # set to 2000 for full training
    max_steps=1000, # delete for full training
    overwrite_output_dir=True,
    save_total_limit=1,
    fp16=True, 
)
# instantiate trainer
# NOTE(review): `eval_dataset` is commented out, so although `do_eval=True`, `eval_steps`
# and `compute_metrics` are set, there is presumably no evaluation during this run — confirm.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    #eval_dataset=val_data,
)
# Checkpoints are written under output_dir ("./") every `save_steps` steps.
trainer.train()

  return torch.tensor(x, **format_kwargs)


Step,Training Loss
2,11.456554
4,11.542711
6,10.418022
8,8.32341
10,7.280854
12,6.462896
14,5.796972
16,5.42946
18,5.110919
20,4.783245


TrainOutput(global_step=1000, training_loss=2.974914128780365)

In [None]:
# Print the GPU status table (model, memory in use) for the current runtime.
!nvidia-smi

Tue Dec  1 16:48:05 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0    60W / 149W |   9411MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# NOTE(review): `pred_str` is only assigned in a later cell (the test-set generation cell),
# so this cell fails on a fresh top-to-bottom run. Also, `[:-10]` shows everything except
# the last ten items — if "the first ten" was intended, that would be `[:10]`.
pred_str[:-10]

['very tasty!',
 'five stars',
 'five stars',
 'three stars',
 'great flavor and flavor',
 'five stars']

In [None]:
# NOTE(review): `label_str` is only assigned in a later cell (the test-set generation cell),
# so this cell fails on a fresh top-to-bottom run. `[:-10]` shows everything except the
# last ten items — if "the first ten" was intended, that would be `[:10]`.
label_str[:-10]

['Classic and classy hot sauce',
 'Five Stars',
 'Five Stars',
 'One Star',
 'Peppered with Praise',
 'Great little hard candies!']

In [None]:
import datasets
from transformers import BertTokenizer, EncoderDecoderModel

# Rebinds `tokenizer` to a plain BertTokenizer, replacing the BertTokenizerFast instance
# (with bos/eos tokens assigned) that the cells above used.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Checkpoint directory name is hard-coded; with save_steps=16 and max_steps=1000 above,
# step 992 is the last multiple of 16 — update it if those settings change.
model = EncoderDecoderModel.from_pretrained("./checkpoint-992")
# Requires a CUDA device to be available.
model.to("cuda")
batch_size = 64

# map data correctly
def generate_summary(batch):
    """Generate a headline for every review in ``batch``.

    Uses the module-level ``tokenizer`` and ``model``. Inputs are moved to
    whatever device the model currently lives on, so the function does not
    depend on a hard-coded device name.

    Args:
        batch: mapping with a ``"review_body"`` list of strings.

    Returns:
        The same ``batch`` with a ``"pred"`` list of decoded strings added.
    """
    # Every review is padded or truncated to exactly 32 tokens.
    # NOTE(review): training tokenized reviews to encoder_max_length=16; confirm
    # that 32 is the intended length at inference time.
    inputs = tokenizer(batch["review_body"], padding="max_length", truncation=True, max_length=32, return_tensors="pt")
    device = model.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # Special tokens are stripped from the decoded text.
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch

# Generate a headline for every test review; the raw "review_body" column is dropped
# from the result while "review_headline" is kept as the reference.
results = test_data.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["review_body"])

pred_str = results["pred"]
label_str = results["review_headline"]

# `rouge` is the metric object loaded in the compute_metrics cell above.
rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Score(precision=0.4375, recall=0.4375, fmeasure=0.4375)


In [None]:
# Score all three ROUGE variants in a single metric call instead of re-running it per
# variant, and label the third line as ROUGE-L (it was printed as "ROUGE F SCORE").
rouge_scores = rouge.compute(
    predictions=pred_str,
    references=label_str,
    rouge_types=["rouge1", "rouge2", "rougeL"],
)
print("ROUGE 1 SCORE: ", rouge_scores["rouge1"].mid)
print("ROUGE 2 SCORE: ", rouge_scores["rouge2"].mid)
print("ROUGE L SCORE: ", rouge_scores["rougeL"].mid)

ROUGE 1 SCORE:  Score(precision=0.484375, recall=0.4618055555555555, fmeasure=0.4616071428571428)
ROUGE 2 SCORE:  Score(precision=0.4375, recall=0.4375, fmeasure=0.4375)
ROUGE F SCORE:  Score(precision=0.484375, recall=0.4618055555555555, fmeasure=0.4616071428571428)


In [None]:
# Copy the trained checkpoint directory to Google Drive.
# NOTE(review): both paths are hard-coded absolute paths, and the destination presumably
# requires Drive to be mounted first — no mount step is visible in this notebook.
!cp -r /content/checkpoint-992 "/content/drive/MyDrive/IIITD/ML PROJECT"