In [None]:
%%capture
# Environment setup: `datasets` is pinned to 1.0.2, while `transformers` is installed
# without a version pin, so the transformers release depends on when this cell is run.
!pip install datasets==1.0.2
!pip install transformers

import datasets
import transformers

In [None]:
import pandas as pd

# Columns of the reviews CSV that are dropped before modelling; the displayed
# head shows that `Summary` and `Text` are what remains.
unused_columns = [
    'Id', 'ProductId', 'UserId', 'ProfileName',
    'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time',
]
df = pd.read_csv("/content/drive/MyDrive/ML Project/Reviews.csv").drop(columns=unused_columns)
print("Before", len(df))

# Discard every row that has a missing value in any remaining column.
df = df.dropna()
print("Data size:", len(df))
df.head()

Before 568454
Data size: 568427


Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
# Share of summaries that contain at most 8 whitespace-separated words.
cnt = sum(1 for summary in df['Summary'] if len(summary.split()) <= 8)
print(cnt / len(df['Summary']))

0.9340601343708163


In [None]:
from datasets import Dataset

# Positional row ranges of the cleaned frame used for each split. Rows
# 555000-555999 are not assigned to any split.
train_data = Dataset.from_pandas(df.iloc[:550000])
val_data = Dataset.from_pandas(df.iloc[550000:555000])
test_data = Dataset.from_pandas(df.iloc[556000:557000])

In [None]:
from transformers import AutoTokenizer, DistilBertTokenizer, DistilBertModel

# The tokenizer has to share its vocabulary with the model it feeds. The
# encoder-decoder in this notebook is initialised from "xlm-roberta-base", so the
# tokenizer is loaded from that same checkpoint. A DistilBERT WordPiece tokenizer
# would produce token ids (and special-token ids) that index unrelated rows of
# the XLM-R embedding matrix.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Expose the CLS/SEP tokens under the BOS/EOS names that the model configuration
# cell reads (`tokenizer.bos_token_id` / `tokenizer.eos_token_id`).
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Number of rows passed to each `Dataset.map` call; the same name is later reused
# as the per-device train/eval batch size in the training arguments.
batch_size=256  # change to 16 for full training
# Reviews ("Text") are padded/truncated to this many tokens.
encoder_max_length=200
# Summaries ("Summary") are padded/truncated to this many tokens.
decoder_max_length=8

def process_data_to_model_inputs(batch):
  """Turn a batch of raw reviews/summaries into encoder-decoder features.

  Reads `batch["Text"]` and `batch["Summary"]`, tokenizes both with the
  module-level `tokenizer` (fixed-length padding, truncation to
  `encoder_max_length` / `decoder_max_length`) and adds `input_ids`,
  `attention_mask`, `decoder_input_ids`, `decoder_attention_mask` and
  `labels` to the batch, which is returned.
  """
  pad_id = tokenizer.pad_token_id

  encoded_text = tokenizer(batch["Text"], padding="max_length", truncation=True, max_length=encoder_max_length)
  encoded_summary = tokenizer(batch["Summary"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = encoded_text.input_ids
  batch["attention_mask"] = encoded_text.attention_mask
  batch["decoder_input_ids"] = encoded_summary.input_ids
  batch["decoder_attention_mask"] = encoded_summary.attention_mask

  # Labels mirror the decoder ids (per the original note, the model shifts them
  # itself); padding positions become -100 so they are ignored by the loss.
  masked_labels = []
  for summary_ids in encoded_summary.input_ids:
    masked_labels.append([-100 if token_id == pad_id else token_id for token_id in summary_ids])
  batch["labels"] = masked_labels

  return batch

def _to_model_features(split):
    """Tokenize one split and expose its model columns as torch tensors."""
    split = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["Text", "Summary"],
    )
    split.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )
    return split

# only use 32 training examples for notebook - DELETE LINE FOR FULL TRAINING
#train_data = train_data.select(range(32))
train_data = _to_model_features(train_data)

# only use 16 training examples for notebook - DELETE LINE FOR FULL TRAINING
#val_data = val_data.select(range(16))
val_data = _to_model_features(val_data)

HBox(children=(FloatProgress(value=0.0, max=2149.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [None]:
from transformers import EncoderDecoderModel

# Encoder and decoder both start from the same pretrained checkpoint, and
# `tie_encoder_decoder=True` requests that their weights be shared.
pretrained_checkpoint = "xlm-roberta-base"
roberta_shared = EncoderDecoderModel.from_encoder_decoder_pretrained(
    pretrained_checkpoint,
    pretrained_checkpoint,
    tie_encoder_decoder=True,
)

In [None]:
# set special tokens
roberta_shared.config.decoder_start_token_id = tokenizer.bos_token_id
roberta_shared.config.eos_token_id = tokenizer.eos_token_id
# Set the padding id explicitly. When it is left unset the trainer logs
# "The `config.pad_token_id` is `None`" and falls back to the EOS id for padding.
roberta_shared.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
# set decoding params
roberta_shared.config.max_length = 40
roberta_shared.config.early_stopping = True
roberta_shared.config.no_repeat_ngram_size = 3
roberta_shared.config.length_penalty = 2.0
roberta_shared.config.num_beams = 4
# Keep the top-level vocabulary size in sync with the encoder's.
roberta_shared.config.vocab_size = roberta_shared.config.encoder.vocab_size

In [None]:
%%capture
# Replace any previously downloaded copy of the example trainer script with a
# fresh download from the `master` branch of huggingface/transformers.
# NOTE(review): `master` is a moving ref, so this URL can change or disappear
# over time; consider pinning to a tag/commit -- confirm which revision this
# notebook was developed against.
!rm seq2seq_trainer.py
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/seq2seq/seq2seq_trainer.py

# Extra packages installed alongside the trainer script (presumably its
# dependencies and the ROUGE backend -- verify).
!pip install git-python==1.0.3
!pip install sacrebleu==1.4.12
!pip install rouge_score



In [None]:
from seq2seq_trainer import Seq2SeqTrainer
from transformers import TrainingArguments
from dataclasses import dataclass, field
from typing import Optional

In [None]:
@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """`TrainingArguments` extended with the sequence-to-sequence options declared below.

    Every field carries its own description in `metadata["help"]`. Field order
    and defaults are part of the generated `__init__` signature.
    """

    label_smoothing: Optional[float] = field(
        default=0.0, metadata={"help": "The label smoothing epsilon to apply (if not zero)."}
    )
    sortish_sampler: bool = field(default=False, metadata={"help": "Whether to SortishSampler or not."})
    predict_with_generate: bool = field(
        default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
    )
    adafactor: bool = field(default=False, metadata={"help": "whether to use adafactor"})
    encoder_layerdrop: Optional[float] = field(
        default=None, metadata={"help": "Encoder layer dropout probability. Goes into model.config."}
    )
    decoder_layerdrop: Optional[float] = field(
        default=None, metadata={"help": "Decoder layer dropout probability. Goes into model.config."}
    )
    dropout: Optional[float] = field(default=None, metadata={"help": "Dropout probability. Goes into model.config."})
    attention_dropout: Optional[float] = field(
        default=None, metadata={"help": "Attention dropout probability. Goes into model.config."}
    )
    lr_scheduler: Optional[str] = field(
        default="linear", metadata={"help": "Which lr scheduler to use."}
    )

In [None]:

# Load the ROUGE metric once; `compute_metrics` and the later scoring cells all
# call `rouge.compute` on this object.
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute aggregated ROUGE-2 for a batch of generated summaries.

    Args:
        pred: object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 at ignored positions).

    Returns:
        dict with `rouge2_precision`, `rouge2_recall` and `rouge2_fmeasure`,
        each taken from the `mid` aggregate and rounded to 4 decimals.
    """
    import numpy as np

    pred_ids = pred.predictions
    # Swap the -100 loss-ignore marker for the pad id so the ids can be decoded.
    # This builds a new array, so the caller's `pred.label_ids` stays unmodified.
    labels_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [None]:

# set training arguments - these params are not really tuned, feel free to change
training_options = dict(
    output_dir="./",
    overwrite_output_dir=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    #evaluate_during_training=True,
    do_train=True,
    do_eval=True,
    logging_steps=2,  # set to 1000 for full training
    save_steps=16,  # set to 500 for full training
    eval_steps=500,  # set to 8000 for full training
    warmup_steps=500,  # set to 2000 for full training
    #max_steps=1500, # delete for full training
    save_total_limit=1,
    fp16=True,
)
training_args = Seq2SeqTrainingArguments(**training_options)

# Wire the model, the tokenized splits and the ROUGE callback into the trainer,
# then start fine-tuning.
trainer = Seq2SeqTrainer(
    model=roberta_shared,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)
trainer.train()

The `config.pad_token_id` is `None`. Using `config.eos_token_id` = 2 for padding..
  return torch.tensor(x, **format_kwargs)


Step,Training Loss
2,11.335747
4,11.623138
6,11.60352
8,11.606628
10,11.41567
12,11.425371
14,11.280243
16,10.871471
18,10.871493
20,10.698437


Buffered data was truncated after reaching the output size limit.

In [None]:
import datasets
from transformers import BertTokenizer, EncoderDecoderModel

# Inference must tokenize exactly as training did, so this cell reuses the
# notebook's existing `tokenizer` (run the tokenizer cell first in a fresh
# session) instead of loading a different one. The previous code referenced
# `RobertaTokenizerFast`, which was never imported and raised a NameError.
model = EncoderDecoderModel.from_pretrained("./checkpoint-6432")
model.to("cuda")
batch_size = 1024

# map data correctly
def generate_summary(batch):
    """Generate a summary for every review in `batch` and store it under "pred".

    Args:
        batch: mapping with a "Text" entry holding a list of review strings.

    Returns:
        The same batch with an added "pred" list of decoded summaries.
    """
    # NOTE(review): inputs are truncated to 40 tokens here, whereas training
    # tokenized reviews with `encoder_max_length` (200) -- confirm this shorter
    # limit is intended.
    inputs = tokenizer(batch["Text"], padding="max_length", truncation=True, max_length=40, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # special tokens are stripped while decoding
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch

# Score the held-out `test_data` split created with the other splits. Selecting
# rows from `train_data` would report metrics on examples the model was trained
# on, and after preprocessing `train_data` no longer has "Text"/"Summary" columns.
#test_data = test_data.select(range(100))

results = test_data.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["Text"])

pred_str = results["pred"]
label_str = results["Summary"]



HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




In [None]:
# Aggregated ROUGE-2 (mid value) of the generated summaries against the references.
rouge2_result = rouge.compute(
    predictions=pred_str,
    references=label_str,
    rouge_types=["rouge2"],
)
rouge_output = rouge2_result["rouge2"].mid

print(rouge_output)

Score(precision=0.08559583333333305, recall=0.06251614105071196, fmeasure=0.06678968426995044)


In [None]:
# Print the mid aggregate for ROUGE-1, ROUGE-2 and ROUGE-L. The third line
# reports the "rougeL" metric, so it is labelled "L" (it was mislabelled "F").
for score_label, rouge_type in (
    ("ROUGE 1 SCORE: ", "rouge1"),
    ("ROUGE 2 SCORE: ", "rouge2"),
    ("ROUGE L SCORE: ", "rougeL"),
):
    print(score_label, rouge.compute(predictions=pred_str, references=label_str, rouge_types=[rouge_type])[rouge_type].mid)

ROUGE 1 SCORE:  Score(precision=0.21340416666666873, recall=0.1636389503537622, fmeasure=0.17342903585939215)
ROUGE 2 SCORE:  Score(precision=0.08559583333333305, recall=0.06251614105071196, fmeasure=0.06678968426995044)
ROUGE F SCORE:  Score(precision=0.2109208333333353, recall=0.1622950767068766, fmeasure=0.1717394407783805)


In [None]:
# Print the mid aggregate for ROUGE-1, ROUGE-2 and ROUGE-L. The third line
# reports the "rougeL" metric, so it is labelled "L" (it was mislabelled "F").
for score_label, rouge_type in (
    ("ROUGE 1 SCORE: ", "rouge1"),
    ("ROUGE 2 SCORE: ", "rouge2"),
    ("ROUGE L SCORE: ", "rougeL"),
):
    print(score_label, rouge.compute(predictions=pred_str, references=label_str, rouge_types=[rouge_type])[rouge_type].mid)

ROUGE 1 SCORE:  Score(precision=0.2598333333333333, recall=0.20612388167388174, fmeasure=0.21629323980113466)
ROUGE 2 SCORE:  Score(precision=0.12000000000000002, recall=0.08766197691197694, fmeasure=0.09472258297258293)
ROUGE F SCORE:  Score(precision=0.25799999999999995, recall=0.20527016594516606, fmeasure=0.21531045971572313)


In [None]:
#1000
# (presumably the number of examples scored in this run -- TODO confirm)
# Print the mid aggregate for ROUGE-1, ROUGE-2 and ROUGE-L. The third line
# reports the "rougeL" metric, so it is labelled "L" (it was mislabelled "F").
for score_label, rouge_type in (
    ("ROUGE 1 SCORE: ", "rouge1"),
    ("ROUGE 2 SCORE: ", "rouge2"),
    ("ROUGE L SCORE: ", "rougeL"),
):
    print(score_label, rouge.compute(predictions=pred_str, references=label_str, rouge_types=[rouge_type])[rouge_type].mid)

ROUGE 1 SCORE:  Score(precision=0.28300000000000025, recall=0.22678713596345187, fmeasure=0.23644438174055074)
ROUGE 2 SCORE:  Score(precision=0.13516666666666663, recall=0.1035207070707071, fmeasure=0.10974152791652791)
ROUGE F SCORE:  Score(precision=0.2805000000000001, recall=0.22537975080475076, fmeasure=0.23482952341346325)


In [None]:
# Show the first ten generated summaries.
pred_str[:10]

['Good quality dog food',
 'Misleading Description',
 'Fantastic!',
 'This is the best',
 'Great Taffy',
 'Great Taffy',
 'Great Taffy',
 'Great Taffy',
 'My cats love it',
 'Healthy dog food']

In [None]:
# Show the first ten reference summaries for comparison with the predictions.
label_str[:10]

['Good Quality Dog Food',
 'Not as Advertised',
 '"Delight" says it all',
 'Cough Medicine',
 'Great taffy',
 'Nice Taffy',
 'Great!  Just as good as the expensive brands!',
 'Wonderful, tasty taffy',
 'Yay Barley',
 'Healthy Dog Food']

In [None]:
# Copy the checkpoint-6432 directory into the mounted Drive folder.
# NOTE(review): the destination ("IIITD/ML PROJECT") is a different folder from
# the one the reviews CSV is read from ("ML Project") -- confirm this is intended.
!cp -r /content/checkpoint-6432 "/content/drive/MyDrive/IIITD/ML PROJECT"