In [None]:
#Install and import the required packages.

# ! pip install datasets transformers rouge-score nltk
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import transformers
import tensorflow_datasets as tfds
import torch

In [None]:
from datasets import load_dataset

# Load the first 10% of the 'reddit' training split through `datasets.load_dataset`
# (imported above); the slice is selected by the 'train[0%:10%]' split expression.
dataset = load_dataset('reddit',split='train[0%:10%]')

#use pre-downloaded dataset.
# dataset = load_dataset('json', data_files='C:\\Users\\mishr\\tensorflow_datasets\\downloads\\extracted\\ZIP.zeno.org_reco_1043_file_corp-webwaD4xDdMcxTTyexQ3VBTA8U2Bi2HA31NynA1uJs2k4o.zipdownload=1\\corpus-webis-tldr-17.json',split='train[0%:10%]')


In [None]:
# Show the dataset object's repr as the cell output to inspect what was loaded.
dataset

In [None]:
# Keep only posts whose 'summary_len' lies in the inclusive range [10, 280].

dataset_needed = dataset.filter(lambda post: 10 <= post['summary_len'] <= 280)

In [None]:
# Further restrict to posts whose 'content_len' lies in the inclusive range [80, 560].

dataset_needed = dataset_needed.filter(lambda post: 80 <= post['content_len'] <= 560)

In [None]:
# Drop the metadata columns listed below; only the columns not named here
# (the content and summary text used later) remain.

updated_dataset = dataset_needed.remove_columns(
    [
        'author',
        'body',
        'normalizedBody',
        'content_len',
        'summary_len',
        'id',
        'subreddit',
        'subreddit_id',
        'title',
    ]
)

In [None]:
# Name of the pretrained checkpoint; reused below to load both the tokenizer and the model.
model_checkpoint = "t5-small"

In [None]:
from datasets import load_metric


# ROUGE metric object; compute_metrics() later calls metric.compute(...) on decoded text.
# NOTE(review): `load_metric` may be unavailable in newer `datasets` releases — confirm the pinned version.
metric = load_metric("rouge")

In [None]:
from transformers import AutoTokenizer
    
# Tokenizer matching `model_checkpoint`; used for preprocessing, decoding and inference below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# The listed T5 checkpoints get the task prefix "summarize: " prepended to every input;
# any other checkpoint gets an empty prefix.
# Fixed: the list previously contained the misspelling "t5-larg", so "t5-large"
# silently fell through to the empty prefix.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [None]:
# Token budgets used when tokenizing: inputs are truncated to `max_input_length`
# tokens and target summaries to `max_target_length` tokens.
max_input_length = 1024
max_target_length = 280

#Tokenizes the given text input
def preprocess_function(examples):
    """Tokenize a batch of posts and their reference summaries.

    Args:
        examples: a batch mapping with a "content" list (source texts) and a
            "summary" list (target texts), as passed by `Dataset.map(batched=True)`.

    Returns:
        The tokenizer output for the prefixed inputs, with an added "labels"
        entry holding the tokenized summaries. Padding positions in the labels
        are set to -100 so that they are ignored by the training loss.
    """
    inputs = [prefix + doc for doc in examples["content"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True, padding=True)

    # Because the labels are padded here (padding=True), the pad ids must be
    # replaced with -100; otherwise the loss is also computed on padding tokens.
    # compute_metrics()/decode_labels() already map -100 back to the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

In [None]:
# Apply preprocess_function to the dataset in batches, adding the tokenized fields it returns.
tokenized_datasets = updated_dataset.map(preprocess_function, batched=True)

In [None]:
# Split data into train (90%) and test/eval (10%).
# A fixed seed keeps the partition identical across kernel restarts, so
# evaluation numbers from different runs are comparable.
split_tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

In [None]:
# Show the train/test split object's repr as the cell output.
split_tokenized_datasets

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Training configuration for the Seq2SeqTrainer.
batch_size = 4
# Strip any "<org>/" namespace from the checkpoint id to build the output directory name.
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-reddit_small",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # Generate full summaries during evaluation so ROUGE can be computed on text.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes this
    # cell fail on CPU-only machines, so it is turned on only when a GPU is present.
    fp16=torch.cuda.is_available(),
    # push_to_hub=True,
)

In [None]:
# Collator passed to the trainer below; it assembles individual examples into batches
# and is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import nltk
import numpy as np

#Computes the Rouge scores
def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_pred: a `(predictions, labels)` pair of token-id arrays.

    Returns:
        A dict with one entry per ROUGE key (mid F-measure scaled by 100) plus
        "gen_len", the mean number of non-pad tokens per prediction; every
        value is rounded to 4 decimals.

    Note:
        `nltk.sent_tokenize` needs NLTK's sentence tokenizer data to be
        installed — NOTE(review): confirm it is downloaded in this environment.
    """
    predictions, labels = eval_pred
    # Some model outputs come back as a tuple whose first item holds the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded.
    # Map it back to the pad token in both predictions and labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Keep the mid F-measure of each ROUGE variant, expressed as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Restrict what the split datasets return to the three listed columns,
# formatted as torch tensors (type='torch').

columns_to_return = ['input_ids', 'labels', 'attention_mask']
split_tokenized_datasets.set_format(type='torch', columns=columns_to_return)


In [None]:
# Wire the model, training arguments, train/eval splits, collator, tokenizer and
# the ROUGE callback into a single Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=split_tokenized_datasets['train'],
    eval_dataset=split_tokenized_datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Start training the model with the configuration given to the trainer above.
trainer.train()

In [None]:
# Run prediction on the test split; the result's `.predictions` and `.label_ids`
# are decoded to text further below.
predictions = trainer.predict(split_tokenized_datasets["test"])

In [None]:
import nltk
import numpy as np

#Decodes the tokenized text
def decode_labels(predictions, labels):
    """Decode predicted and reference token ids into newline-separated sentences.

    Args:
        predictions: array of generated token ids (or a tuple whose first
            element is that array).
        labels: array of reference token ids, where -100 marks padding.

    Returns:
        `(decoded_preds, decoded_labels)`: two lists of strings, each string
        having one sentence per line.
    """
    # Some model outputs come back as a tuple whose first item holds the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding sentinel that cannot be decoded; map it to the pad token
    # in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return decoded_preds, decoded_labels


In [None]:
# d_preds holds the decoded predicted summaries, d_labels the decoded reference
# summaries, both produced from the trainer's prediction output.

d_preds, d_labels = decode_labels(predictions.predictions, predictions.label_ids)

In [None]:
#To test the model on custom input. Use input_tweet to provide text to be summarized.

input_tweet = "Dejan Kulusevski has created at least 1 shooting opportunity from inside the box for his teammates in 6 consecutive Premier League games now. Not a single shot for Dejan Kulusevski today, but his streak of creating at least 1 shooting chance from inside the penalty area for his teammates increases to 5 PL games now. Dejan Kulusevski received the most number of passes amongst Spurs' front 3. Even if you ignore the high quality chance he created for Sonny, if the player playing alongside Kane and Son, passes them the ball 30 of the times and only has 3/45 unsuccessful passes, his job is done."

# Tokenize exactly as during training: same task prefix and the same input-length
# cap. `max_length` only takes effect together with `truncation=True`.
inputs = tokenizer(
    prefix + input_tweet,
    max_length=max_input_length,
    truncation=True,
    return_tensors="pt",
)

# Generate Summary
# Move the ids to whichever device the model lives on instead of assuming CUDA.
summary_ids = model.generate(inputs["input_ids"].to(model.device))
tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

In [1]:
# CSV files of tweets, named <sentiment>_<club>.csv, that are summarized and
# keyword-checked below to compare the summaries generated by the 2 models.
# NOTE(review): there is no 'negative_arsenal.csv' entry — confirm that is intentional.

all_files = [
    'neutral_arsenal.csv',
    'negative_aston_vila.csv',
    'negative_chelsea.csv',
    'neutral_aston_vila.csv',
    'neutral_chelsea.csv',
    'positive_arsenal.csv',
    'positive_aston_vila.csv',
    'positive_chelsea.csv',
    'neutral_everton.csv',
    'neutral_leeds.csv',
    'neutral_leicester.csv',
    'positive_everton.csv',
    'positive_leeds.csv',
    'positive_leicester.csv',
    'negative_leicester.csv',
    'negative_leeds.csv',
    'negative_everton.csv',
]

In [4]:
#Finds if the keywords for the respective tweets are present in the summarized tweet or not

import re
from pathlib import Path

import pandas as pd

# One entry per CSV file: fraction of rows whose summary contains at least one keyword.
all_values = []
for files in all_files:
    # Build the path from parts so it works on every OS, not only with Windows separators.
    path = Path('..') / '..' / 'data' / files

    df = pd.read_csv(path)

    summary = []

    for _, row in df.iterrows():
        # Column 1 (by position) holds the text to summarize. `.iloc` makes the
        # positional lookup explicit instead of relying on integer-key fallback.
        # NOTE(review): assumes column 1 is the tweet text — confirm against the CSV schema.
        inputs = tokenizer(
            prefix + row.iloc[1],
            max_length=max_input_length,
            truncation=True,  # without this, max_length does not shorten long inputs
            return_tensors="pt",
        )

        # Generate Summary
        # Put the ids on the model's device so this works on both CPU and GPU.
        summary_ids = model.generate(inputs["input_ids"].to(model.device), min_length=10, max_length=50)
        summary.append(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])

    df['summary'] = summary

    count = 0
    for _, row in df.iterrows():
        # Column 2 (by position) holds the keywords; every non-letter becomes a space,
        # then the string is split on double spaces.
        # NOTE(review): presumably a stringified keyword list — verify the format.
        res = re.sub(r'[^a-zA-Z]', ' ', row.iloc[2])
        for each in res.split("  "):
            # Strip leftovers from odd-length space runs: a token of only whitespace
            # would otherwise "match" any summary that contains a space.
            each = each.strip()
            if each and each in row['summary']:
                count += 1
                break

    print("len", len(df))
    print("count",count)

    # Guard against an empty CSV, which would otherwise raise ZeroDivisionError.
    all_values.append(count / len(df) if len(df) else 0.0)
