# In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, BartTokenizer, BartForConditionalGeneration
from datasets import Dataset
from rouge_score import rouge_scorer, scoring
from bert_score import score as bert_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# ---- Data loading and cleaning ----
df = pd.read_csv("D:\\Downloads\\sem 6\\nlp\\Project\\EnglishNews_train_v2\\english_train.csv")

# Flatten newlines to spaces, then decode the HTML-escaped ampersand.
df['Article'] = (
    df['Article']
    .replace('\n', ' ', regex=True)
    .str.replace('&amp;', '&')
)

# 75/25 train/validation split; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.25, random_state=42)

# Wrap both pandas frames as `datasets.Dataset` objects for the DataLoaders.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

# ---- Model, tokenizer and optimisation settings ----
pretrained_name = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(pretrained_name)
model = BartForConditionalGeneration.from_pretrained(pretrained_name)

num_epochs, batch_size = 3, 5
# NOTE(review): this is the AdamW imported from `transformers`; it may be
# deprecated upstream in favour of `torch.optim.AdamW` — confirm against the
# installed version before upgrading.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

# ---- Fine-tuning loop ----
# Collects the mean training loss of every epoch, in epoch order.
train_losses = []
print("training started.")

# The loader does not depend on the epoch, so build it once. With
# shuffle=True it still draws a fresh permutation each time it is iterated.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    print(f"epoch {epoch} running.")
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # Pad only to the longest sequence in the batch (still truncated to
        # the model limits) instead of always padding to the maximum length.
        inputs = tokenizer(
            batch['Article'],
            padding=True,
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        ).to(device)
        labels = tokenizer(
            batch['Summary'],
            padding=True,
            truncation=True,
            max_length=150,
            return_tensors="pt",
        )['input_ids'].to(device)
        # Padding positions must not contribute to the loss: -100 is the
        # index the model's cross-entropy loss ignores.
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        total_loss += loss.item()
        optimizer.step()
        optimizer.zero_grad()

    # Mean loss over the batches of this epoch.
    avg_loss = total_loss / len(train_dataloader)
    train_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.4f}")

# ---- Training-loss curve ----
# One point per epoch; the x axis uses 1-based epoch numbers.
epoch_axis = range(1, num_epochs+1)
plt.plot(epoch_axis, train_losses, label='Training Loss')

# Titles and axis captions.
plt.title('Training Loss vs Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss')

plt.legend()
plt.show()

# ---- Generate summaries for the validation split ----
model.eval()
generated_summaries = []  # decoded model outputs, in validation order
target_summaries = []     # reference summaries, aligned with the list above
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
for batch in val_dataloader:
    # Only the articles are tokenised: the reference summaries are kept as
    # raw text for scoring, so no label tensors are needed here.
    inputs = tokenizer(
        batch['Article'],
        padding=True,
        truncation=True,
        max_length=1024,
        return_tensors="pt",
    ).to(device)

    # Pass the attention mask explicitly so padded positions are ignored
    # by the encoder rather than relying on it being inferred.
    generated_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=150,
        num_beams=4,
        early_stopping=True,
    )
    generated_summaries.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
    target_summaries.extend(batch['Summary'])

# ---- Validation metrics ----
# Bind the scorer instance to its own name: rebinding `rouge_scorer` would
# overwrite the imported module and break a second run of this code.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
# RougeScorer.score takes (target, prediction), in that order.
rouge_scores = [
    scorer.score(target_summary, gen_summary)['rougeL'].fmeasure
    for gen_summary, target_summary in zip(generated_summaries, target_summaries)
]
avg_rouge_l_f1 = sum(rouge_scores) / len(rouge_scores)

# bert_score is called with (candidates, references); element 2 of the
# result is the one reported as F1 below.
bert_scores = bert_score(generated_summaries, target_summaries, lang='en', model_type='bert-base-uncased')
avg_bert_score = bert_scores[2].mean().item()

# NOTE(review): the "(n=2)" in the label has no counterpart in the scorer
# configuration above (only 'rougeL' is requested); label left unchanged.
print("Validation ROUGE-L (n=2) F1:", avg_rouge_l_f1)
print("Validation BERTScore F1:", avg_bert_score)

# ---- Persist the fine-tuned model and the generated summaries ----
# NOTE(review): only the model is saved here; the tokenizer is not written
# alongside it.
checkpoint_dir = "D:\\Downloads\\sem 6\\nlp\\Project\\model_checkpoint"
model.save_pretrained(checkpoint_dir)

# Pair every validation article with the summary generated for it and
# write the table out without the index column.
summary_columns = {
    'Article': val_df['Article'],
    'Generated_Summary': generated_summaries,
}
val_generated_df = pd.DataFrame(summary_columns)
val_generated_df.to_csv(
    "D:\\Downloads\\sem 6\\nlp\\Project\\validation_generated_summary.csv",
    index=False,
)