In [None]:
# Mount Google Drive at /content/drive; the data and model paths used in the
# cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install transformers
!pip install evaluate
!pip install py-rouge

In [None]:
import pandas as pd
import nltk
import re
import string
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')                # Used for tokenization
nltk.download('punkt_tab')            # Tokenizer data in the format recent NLTK releases look up for word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import csv

def load_training_data_from_csv(csv_file):
    """Read article/summary pairs from a CSV file.

    Args:
        csv_file: Path to a CSV file whose header row contains at least the
            columns ``article`` and ``highlights``.

    Returns:
        A tuple ``(epoch_sources, epoch_targets)`` of two equally long lists
        of strings: the ``article`` values and the ``highlights`` values, in
        file order.

    Raises:
        KeyError: If the file has no ``article`` or ``highlights`` column.
    """
    epoch_sources = []
    epoch_targets = []

    # Decode explicitly instead of relying on the platform default encoding;
    # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which would
    # otherwise become part of the first column name.
    # newline='' lets the csv module handle line breaks inside quoted fields.
    with open(csv_file, 'r', newline='', encoding='utf-8-sig') as csvfile:
        for row in csv.DictReader(csvfile):
            epoch_sources.append(row['article'])      # 'article' column is the source text
            epoch_targets.append(row['highlights'])   # 'highlights' column is the target summary

    return epoch_sources, epoch_targets

# Read the training pairs from the CSV on the mounted Drive:
# epoch_sources receives the 'article' column, epoch_targets the 'highlights' column.
csv_file = '/content/drive/MyDrive/NLP/Data/train.csv'
epoch_sources, epoch_targets = load_training_data_from_csv(csv_file)


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
# transformers.AdamW is deprecated and no longer shipped in recent releases;
# PyTorch's own implementation is the supported replacement.
from torch.optim import AdamW

# Model and tokenizer are loaded from the same checkpoint so their vocabularies match.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Define optimizer
# eps and weight_decay are pinned to the defaults transformers.AdamW used, so the
# update rule stays the same (torch.optim.AdamW defaults to eps=1e-8, weight_decay=1e-2).
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

def transform(source, target):
    """Tokenize a source text and its target summary into PyTorch tensors.

    Both inputs are passed through the module-level ``tokenizer`` with padding,
    truncation and special tokens enabled. The source is capped at 1024 tokens
    and the target at 160 tokens.

    Returns:
        ``(source_encodings, target_encodings)`` exactly as produced by the
        tokenizer.
    """
    shared_options = dict(padding=True, truncation=True, return_tensors='pt', add_special_tokens=True)
    encoded_source = tokenizer(source, max_length=1024, **shared_options)
    encoded_target = tokenizer(target, max_length=160, **shared_options)
    return encoded_source, encoded_target

def train(source_encodings, target_encodings):
    """Run one optimisation step on a tokenized source/target pair or batch.

    Relies on the module-level ``model`` and ``optimizer``.

    Args:
        source_encodings: Tokenizer output for the source text(s); must
            provide ``input_ids`` and ``attention_mask``.
        target_encodings: Tokenizer output for the target text(s); must
            provide ``input_ids``. If it also carries ``attention_mask``,
            padded positions are excluded from the loss.

    Returns:
        The loss of this step as a Python float.
    """
    # from_pretrained() returns the model in evaluation mode (dropout disabled),
    # so training mode has to be switched on explicitly. The call is idempotent.
    model.train()
    optimizer.zero_grad()

    input_ids = source_encodings['input_ids']
    attention_mask = source_encodings['attention_mask']

    labels = target_encodings['input_ids']
    target_mask = target_encodings.get('attention_mask')
    if target_mask is not None:
        # -100 is the label value the loss ignores. Without it, padding tokens
        # in a batched target would be trained on as if they were real output.
        # masked_fill returns a new tensor, so the caller's encodings stay intact.
        labels = labels.masked_fill(target_mask == 0, -100)

    loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
    loss.backward()
    optimizer.step()

    # Returned so callers can log progress; callers that ignore it are unaffected.
    return loss.item()

# Training loop
NUM_EPOCHS = 50  # number of full passes over the training pairs

for epoch in range(NUM_EPOCHS):
    # epoch_sources / epoch_targets hold one string per example, so every
    # "batch" here is a single article/summary pair.
    for source_batch, target_batch in zip(epoch_sources, epoch_targets):
        source_encodings, target_encodings = transform(source_batch, target_batch)
        train(source_encodings, target_encodings)

    # Print the current epoch
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} complete.")

print("Training complete.")


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
# Save the model and the tokenizer into the same Drive directory.
model.save_pretrained("/content/drive/MyDrive/NLP/Model")
tokenizer.save_pretrained("/content/drive/MyDrive/NLP/Model")

('/content/drive/MyDrive/NLP/Model/tokenizer_config.json',
 '/content/drive/MyDrive/NLP/Model/special_tokens_map.json',
 '/content/drive/MyDrive/NLP/Model/vocab.json',
 '/content/drive/MyDrive/NLP/Model/merges.txt',
 '/content/drive/MyDrive/NLP/Model/added_tokens.json')

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the model and tokenizer from the Drive directory; this rebinds the
# module-level `model` and `tokenizer` names.
model_name = '/content/drive/MyDrive/NLP/Model'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

In [None]:
# Load the evaluation table from Drive into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/NLP/Data/summarized-data.csv')
# Preview the first rows
data.head()

Unnamed: 0,id,article,highlights,Summary
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin..."
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...


In [None]:
output_text = []
for summary_text in data['Summary']:
    # Tokenize the input text. Truncate to the same 1024-token cap used when
    # encoding training sources, so an unusually long row cannot exceed the
    # input length the model was trained with.
    input_ids = tokenizer.encode(summary_text, return_tensors='pt', truncation=True, max_length=1024)
    # Generate summary (beam search over 4 beams, at most 100 tokens)
    output_ids = model.generate(input_ids, max_length=100, num_beams=4, early_stopping=True)
    # Decode the generated summary
    output_text.append(tokenizer.decode(output_ids[0], skip_special_tokens=True))

# One generated string per row, in row order.
data['Transformed_summary'] = output_text
data.head()

Unnamed: 0,id,article,highlights,Summary,Transformed_summary
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...,U.S consumer advisory group says minimum space...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin...","Rahul Kumar, 17, ran towards animals shouting ..."
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...,Nottingham Forest close to extending Dougie Fr...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...,Neto joined Firoentina Brazilian outfit Atleti...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...,comes amid continuing speculation transition w...


In [None]:
# dropping id and article column
# `data` is reassigned, so on a second run of this cell the columns are already
# gone; errors="ignore" keeps the cell re-runnable instead of raising KeyError.
data = data.drop(columns=["id", "article"], errors="ignore")
data.head()

Unnamed: 0,highlights,Summary,Transformed_summary
0,Experts question if packed out planes are put...,U.S consumer advisory group says minimum space...,U.S consumer advisory group says minimum space...
1,Drunk teenage boy climbed into lion enclosure ...,"Rahul Kumar , 17 , ran towards animals shoutin...","Rahul Kumar, 17, ran towards animals shouting ..."
2,Nottingham Forest are close to extending Dougi...,Forest boss took former manager Stuart Pearce ...,Nottingham Forest close to extending Dougie Fr...
3,Fiorentina goalkeeper Neto has been linked wit...,Neto joined Firoentina Brazilian outfit Atleti...,Neto joined Firoentina Brazilian outfit Atleti...
4,"Tell-all interview with the reality TV star, 6...",comes amid continuing speculation transition w...,comes amid continuing speculation transition w...


In [None]:
# Write the table to a CSV in the current working directory, without the index column.
# NOTE(review): unlike the other paths in this notebook, this one is not under
# the mounted Drive — confirm that is intended.
data.to_csv('Transformed-data.csv', index = False)

In [None]:
#calculating the rouge score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

def _rouge_f1(overlap, generated_total, reference_total):
    """Turn an overlap count and the two totals into an F1 score (0.0 when undefined)."""
    if overlap == 0 or generated_total == 0 or reference_total == 0:
        return 0.0
    precision = overlap / generated_total
    recall = overlap / reference_total
    return 2 * precision * recall / (precision + recall)


def _rouge_n_f1(gen_tokens, ref_tokens, n):
    """ROUGE-N F1: clipped n-gram overlap between generated and reference tokens."""
    from collections import Counter  # local import keeps this cell self-contained

    gen_ngrams = Counter(tuple(gen_tokens[i:i + n]) for i in range(len(gen_tokens) - n + 1))
    ref_ngrams = Counter(tuple(ref_tokens[i:i + n]) for i in range(len(ref_tokens) - n + 1))
    # Counter intersection keeps the minimum count per n-gram, i.e. clipped matches.
    overlap = sum((gen_ngrams & ref_ngrams).values())
    return _rouge_f1(overlap, sum(gen_ngrams.values()), sum(ref_ngrams.values()))


def _lcs_length(first, second):
    """Length of the longest common subsequence of two token lists."""
    previous_row = [0] * (len(second) + 1)
    for left in first:
        current_row = [0]
        for column, right in enumerate(second, start=1):
            if left == right:
                current_row.append(previous_row[column - 1] + 1)
            else:
                current_row.append(max(previous_row[column], current_row[column - 1]))
        previous_row = current_row
    return previous_row[-1]


def calculate_rouge_scores(generated_summaries, reference_summaries, tokenize=None):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F1 over paired summaries.

    The previous implementation called ``sentence_bleu`` with different n-gram
    weights, so the reported numbers were BLEU variants (and "ROUGE-L" was
    bigram precision). This version computes the ROUGE definitions: clipped
    n-gram overlap for ROUGE-1/2 and longest common subsequence for ROUGE-L,
    each reported as the F1 of precision and recall.

    Args:
        generated_summaries: Iterable of generated summary strings.
        reference_summaries: Iterable of reference summary strings, paired
            with ``generated_summaries`` by position.
        tokenize: Optional callable mapping a string to a sequence of tokens.
            Defaults to NLTK's ``word_tokenize``, as before.

    Returns:
        A tuple ``(avg_rouge_1, avg_rouge_2, avg_rouge_l)`` of floats in
        ``[0, 1]``. Returns ``(0.0, 0.0, 0.0)`` when there are no pairs
        (previously a ZeroDivisionError).
    """
    if tokenize is None:
        tokenize = word_tokenize

    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []

    for generated, reference in zip(generated_summaries, reference_summaries):
        # ROUGE matching is conventionally case-insensitive, so tokens are lower-cased.
        gen_tokens = [token.lower() for token in tokenize(generated)]
        ref_tokens = [token.lower() for token in tokenize(reference)]

        rouge_1_scores.append(_rouge_n_f1(gen_tokens, ref_tokens, 1))
        rouge_2_scores.append(_rouge_n_f1(gen_tokens, ref_tokens, 2))
        # LCS is taken over the whole summary, not sentence by sentence.
        rouge_l_scores.append(
            _rouge_f1(_lcs_length(gen_tokens, ref_tokens), len(gen_tokens), len(ref_tokens))
        )

    if not rouge_1_scores:
        return 0.0, 0.0, 0.0

    pair_count = len(rouge_1_scores)
    avg_rouge_1 = sum(rouge_1_scores) / pair_count
    avg_rouge_2 = sum(rouge_2_scores) / pair_count
    avg_rouge_l = sum(rouge_l_scores) / pair_count

    return avg_rouge_1, avg_rouge_2, avg_rouge_l

# Score the model output column against the reference highlights column
generated_summaries = data['Transformed_summary']
reference_summaries = data['highlights']

rouge_1, rouge_2, rouge_l = calculate_rouge_scores(generated_summaries, reference_summaries)

# Print one labelled line per averaged score
for score_label, score_value in (
    ("ROUGE-1 Score:", rouge_1),
    ("ROUGE-2 Score:", rouge_2),
    ("ROUGE-L Score:", rouge_l),
):
    print(score_label, score_value)

ROUGE-1 Score: 0.4397386089700881
ROUGE-2 Score: 0.32880611126502907
ROUGE-L Score: 0.2482158724242829


In [None]:
from rouge import Rouge

def calculate_rouge_scores(generated_summaries, reference_summaries):
    """Score generated summaries against references with ``Rouge``.

    Builds a default ``Rouge`` scorer and returns the result of its
    ``get_scores`` call with ``avg=True``, passing the generated summaries as
    the first argument and the references as the second.
    """
    scorer = Rouge()
    return scorer.get_scores(generated_summaries, reference_summaries, avg=True)

# Score the model output column against the reference highlights column
generated_summaries = data['Transformed_summary']
reference_summaries = data['highlights']

rouge_scores = calculate_rouge_scores(generated_summaries, reference_summaries)

# Print the 'f' entry of each metric in the returned mapping
print("ROUGE Scores:")
for metric_label, metric_key in (
    ("ROUGE-1 F1:", 'rouge-1'),
    ("ROUGE-2 F1:", 'rouge-2'),
    ("ROUGE-L F1:", 'rouge-l'),
):
    print(metric_label, rouge_scores[metric_key]['f'])