**Prepare data**

In [None]:
cd /content/drive/MyDrive/NLP/data

/content/drive/MyDrive/NLP/data


In [None]:
!unzip /content/drive/MyDrive/NLP/data/validation.csv.zip

Archive:  /content/drive/MyDrive/NLP/data/validation.csv.zip
  inflating: validation.csv          


In [None]:
import pandas as pd
# Load the full training split into memory for a quick inspection.
data = pd.read_csv("/content/drive/MyDrive/NLP/data/train.csv")

In [None]:
# Preview the first rows to sanity-check the id / article / highlights columns.
data.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [None]:
# Row count, dtypes and non-null counts per column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287113 entries, 0 to 287112
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          287113 non-null  object
 1   article     287113 non-null  object
 2   highlights  287113 non-null  object
dtypes: object(3)
memory usage: 6.6+ MB


## **Prepare data capped at 1024 tokens per article after tokenizing with the GPT-2 tokenizer**

In [None]:
# Switch to the project root; checkpoints saved with relative paths later land here.
%cd /content/drive/MyDrive/NLP

/content/drive/MyDrive/NLP


In [None]:
!pip install transformers rouge

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/7.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m5.9/7.4 MB[0m [31m86.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.4/7.4 MB[0m [31m96.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.4 MB/s

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2Model, GPT2PreTrainedModel
# from torch.optim import Adam
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from rouge import Rouge

In [None]:
class SummarizationDataset(Dataset):
    """Articles tokenized up front and paired with their reference summaries.

    Each item is a dict with:
      * ``input_ids``      - article token ids, right-padded to ``max_length``
      * ``attention_mask`` - 1 for real tokens, 0 for padding
      * ``summary``        - the raw ``highlights`` string of the same row

    Args:
        data_path: CSV file with ``article`` and ``highlights`` columns.
        tokenizer: object exposing ``encode(text, add_special_tokens=...)`` and
            ``pad_token_id``.
        max_length: number of token ids every item is truncated/padded to.
        max_truncated: stop reading the CSV once this many articles have needed
            truncation (the row that reaches the cap is still kept). ``None``
            reads the whole file. Defaults to 1500, the value previously
            hard-coded for the training split.
    """

    def __init__(self, data_path, tokenizer, max_length, max_truncated=1500):
        self.data = pd.read_csv(data_path)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_truncated = max_truncated

        # Tokenize once here so __getitem__ only has to pad.
        self.filtered_data = self.filter_data()

    def filter_data(self):
        """Tokenize rows in file order and return ``[(input_ids, highlights), ...]``.

        Over-long articles are truncated to ``max_length`` (not dropped). Reading
        stops early once ``max_truncated`` articles have been truncated.
        """
        filtered_data = []
        truncated_count = 0
        for _, row in self.data.iterrows():
            input_ids = self.tokenizer.encode(row['article'], add_special_tokens=True)

            # Truncate the input sequence if it exceeds the maximum length
            if len(input_ids) > self.max_length:
                input_ids = input_ids[:self.max_length]
                truncated_count += 1

            filtered_data.append((input_ids, row['highlights']))
            if self.max_truncated is not None and truncated_count >= self.max_truncated:
                break

        return filtered_data

    def __len__(self):
        return len(self.filtered_data)

    def __getitem__(self, index):
        input_ids, summary = self.filtered_data[index]

        # Build new lists instead of `+=` so the cached token list keeps its true
        # length; otherwise the first access would pad it in place and later
        # accesses could no longer tell real tokens from padding.
        num_tokens = len(input_ids)
        padding_length = self.max_length - num_tokens
        padded_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length

        # Only real tokens are attended to; padding positions are masked out.
        attention_mask = [1] * num_tokens + [0] * padding_length

        return {
            'input_ids': torch.tensor(padded_ids),
            'attention_mask': torch.tensor(attention_mask),
            'summary': summary
        }


In [None]:
# Custom model class for extractive summarization
class SummarizationModel(GPT2PreTrainedModel):
    """GPT-2 backbone followed by a linear head that emits one scalar per sequence.

    The scalar is trained as a regression target (the training loop uses the
    token length of the reference summary).

    NOTE: checkpoints trained with the earlier first-token pooling keep loading
    (parameter names are unchanged), but their linear head was fitted on a
    different input and should be retrained.
    """

    # The backbone is stored as `self.gpt2`, not under the `transformer` attribute
    # that GPT2PreTrainedModel assumes. from_pretrained() uses this prefix to map
    # the un-prefixed keys of the stock 'gpt2' checkpoint onto the backbone; with
    # the inherited prefix every pretrained weight is reported as "not used" and
    # the backbone stays randomly initialized.
    base_model_prefix = "gpt2"

    def __init__(self, config):
        super().__init__(config)
        self.gpt2 = GPT2Model(config)
        self.linear = torch.nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return a ``(batch,)`` tensor with one predicted value per sequence.

        Args:
            input_ids: ``(batch, seq_len)`` token ids, padded on the right.
            attention_mask: ``(batch, seq_len)`` with 1 for real tokens and 0 for
                padding, or ``None`` to treat every position as real.
        """
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs[0]  # (batch, seq_len, hidden)

        # GPT-2 attends left-to-right, so position 0 only ever sees the first
        # token. Pool the last real token instead, which has seen the whole
        # article. With right padding its index is (number of real tokens - 1).
        if attention_mask is None:
            last_index = torch.full(
                (hidden_states.size(0),),
                hidden_states.size(1) - 1,
                dtype=torch.long,
                device=hidden_states.device,
            )
        else:
            # clamp keeps an all-padding row from indexing position -1
            last_index = attention_mask.long().sum(dim=1).clamp(min=1) - 1
            last_index = last_index.to(hidden_states.device)

        batch_index = torch.arange(hidden_states.size(0), device=hidden_states.device)
        pooled_output = hidden_states[batch_index, last_index]
        logits = self.linear(pooled_output).squeeze(-1)
        return logits


In [None]:

# Function to train the model
def train(model, train_loader, optimizer, device, summary_tokenizer=None):
    """Run one training epoch and return the mean batch loss.

    The regression target for each sample is the number of tokens in its
    reference summary; the model output is fitted to it with MSE.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` and
            returning one value per sample.
        train_loader: iterable of batches with ``input_ids``, ``attention_mask``
            and ``summary`` entries; must support ``len()``.
        optimizer: optimizer over ``model``'s parameters.
        device: device the batch tensors are moved to.
        summary_tokenizer: tokenizer used to measure summary length. Defaults to
            the notebook-level ``tokenizer`` so existing calls keep working.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    text_tokenizer = summary_tokenizer if summary_tokenizer is not None else tokenizer
    criterion = torch.nn.MSELoss()  # built once instead of once per batch

    model.train()
    train_loss = 0
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty; nothing to train on")

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        summaries = batch['summary']

        # Build all targets on the host and move them in one transfer, rather
        # than writing single elements into a device tensor.
        labels = torch.tensor(
            [len(text_tokenizer.encode(summary, add_special_tokens=False)) for summary in summaries],
            dtype=torch.float,
            device=device,
        )

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= num_batches

    return train_loss

In [None]:

# Example usage
# Shared configuration for the run: data path, tokenizer, sequence length, batch size, device.
# val_data_path  = '/content/drive/MyDrive/NLP/data/validation.csv'
train_data_path = '/content/drive/MyDrive/NLP/data/train.csv'
# test_data_path = '/content/drive/MyDrive/NLP/data/test.csv'
# `tokenizer` is read as a notebook-level global by train() and evaluate().
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# GPT-2 ships without a pad token, so the end-of-text token doubles as padding.
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos_token
# 1024 is the model's maximum sequence length (see the tokenizer warning in this cell's output).
max_length = 1024
batch_size = 2   # Reduce the batch size
# NOTE: reassigned to 6 in the training cell before the loop runs.
epochs = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare the datasets
# Constructing the dataset tokenizes the articles eagerly, so this line is slow.
train_dataset = SummarizationDataset(train_data_path, tokenizer, max_length)
# val_dataset = SummarizationDataset(val_data_path, tokenizer, max_length)
# test_dataset = SummarizationDataset(test_data_path, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)





Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1063 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Validation split extracted from validation.csv.zip at the top of the notebook.
val_data_path  = '/content/drive/MyDrive/NLP/data/validation.csv'
class SummarizationDataset(Dataset):
    """Articles tokenized up front and paired with their reference summaries.

    This definition replaces any earlier ``SummarizationDataset`` in the
    notebook; datasets built after this cell use the smaller default cap.

    Each item is a dict with:
      * ``input_ids``      - article token ids, right-padded to ``max_length``
      * ``attention_mask`` - 1 for real tokens, 0 for padding
      * ``summary``        - the raw ``highlights`` string of the same row

    Args:
        data_path: CSV file with ``article`` and ``highlights`` columns.
        tokenizer: object exposing ``encode(text, add_special_tokens=...)`` and
            ``pad_token_id``.
        max_length: number of token ids every item is truncated/padded to.
        max_truncated: stop reading the CSV once this many articles have needed
            truncation (the row that reaches the cap is still kept). ``None``
            reads the whole file. Defaults to 150, the value previously
            hard-coded for the validation split.
    """

    def __init__(self, data_path, tokenizer, max_length, max_truncated=150):
        self.data = pd.read_csv(data_path)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_truncated = max_truncated

        # Tokenize once here so __getitem__ only has to pad.
        self.filtered_data = self.filter_data()

    def filter_data(self):
        """Tokenize rows in file order and return ``[(input_ids, highlights), ...]``.

        Over-long articles are truncated to ``max_length`` (not dropped). Reading
        stops early once ``max_truncated`` articles have been truncated.
        """
        filtered_data = []
        truncated_count = 0
        for _, row in self.data.iterrows():
            input_ids = self.tokenizer.encode(row['article'], add_special_tokens=True)

            # Truncate the input sequence if it exceeds the maximum length
            if len(input_ids) > self.max_length:
                input_ids = input_ids[:self.max_length]
                truncated_count += 1

            filtered_data.append((input_ids, row['highlights']))
            if self.max_truncated is not None and truncated_count >= self.max_truncated:
                break

        return filtered_data

    def __len__(self):
        return len(self.filtered_data)

    def __getitem__(self, index):
        input_ids, summary = self.filtered_data[index]

        # Build new lists instead of `+=` so the cached token list keeps its true
        # length; otherwise the first access would pad it in place and later
        # accesses could no longer tell real tokens from padding.
        num_tokens = len(input_ids)
        padding_length = self.max_length - num_tokens
        padded_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length

        # Only real tokens are attended to; padding positions are masked out.
        attention_mask = [1] * num_tokens + [0] * padding_length

        return {
            'input_ids': torch.tensor(padded_ids),
            'attention_mask': torch.tensor(attention_mask),
            'summary': summary
        }
# Build the validation set with the class defined in this cell; order is kept fixed
# (shuffle=False) so evaluation results are comparable between runs.
val_dataset = SummarizationDataset(val_data_path, tokenizer, max_length)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
!pip install rouge-score

In [None]:
from rouge_score import rouge_scorer

def evaluate(model, data_loader, device, summary_tokenizer=None):
    """Score lead-k summaries against the references with ROUGE.

    For every sample the model predicts a length ``k``; the first ``k`` article
    tokens are decoded and compared with the reference summary.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` and
            returning one value per sample.
        data_loader: iterable of batches with ``input_ids``, ``attention_mask``
            and ``summary`` entries.
        device: device the batch tensors are moved to.
        summary_tokenizer: tokenizer used for decoding. Defaults to the
            notebook-level ``tokenizer`` so existing calls keep working.

    Returns:
        ``(avg_rouge_1, avg_rouge_2, avg_rouge_l)`` - mean F-measures.

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    text_tokenizer = summary_tokenizer if summary_tokenizer is not None else tokenizer

    model.eval()
    # One scorer for the whole run; it holds no per-sample state.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_l_scores = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            summaries = batch['summary']

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Number of attended (non-padding) tokens per sample.
            valid_lengths = attention_mask.sum(dim=1).tolist()

            for i, output in enumerate(outputs):
                summary_length = int(torch.round(output).item())
                # Clamp the regression output: a negative value would slice from
                # the end of the sequence, and a value past the real tokens would
                # pull padding into the prediction.
                summary_length = max(0, min(summary_length, int(valid_lengths[i])))
                # skip_special_tokens drops end-of-text padding that may still be
                # present when the mask does not mark padded positions.
                predicted_summary = text_tokenizer.decode(
                    input_ids[i, :summary_length].tolist(),
                    skip_special_tokens=True,
                )
                target_summary = summaries[i]
                scores = scorer.score(target_summary, predicted_summary)
                rouge_1_scores.append(scores['rouge1'].fmeasure)
                rouge_2_scores.append(scores['rouge2'].fmeasure)
                rouge_l_scores.append(scores['rougeL'].fmeasure)

    if not rouge_1_scores:
        raise ValueError("data_loader produced no samples to evaluate")

    avg_rouge_1 = sum(rouge_1_scores) / len(rouge_1_scores)
    avg_rouge_2 = sum(rouge_2_scores) / len(rouge_2_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    return avg_rouge_1, avg_rouge_2, avg_rouge_l


In [None]:
# Initialize the model
from tqdm import tqdm
import transformers
model = SummarizationModel.from_pretrained('gpt2')
model.to(device)
transformers.logging.set_verbosity_error()
# Initialize the optimizer
# optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()),lr=2e-5)
# The loop below uses an exclusive upper bound, so the last epoch run is epochs - 1.
epochs = 6
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

resume_epoch = 1  # Epoch to resume training from
# A checkpoint named "..._epoch_{k}.pt" is written at the END of epoch k, so resuming
# *from* epoch N needs the weights saved after epoch N - 1. Loading epoch N's file
# would repeat that epoch on top of its own result.
resume_path = f'model_GPT2_checkpoint_epoch_{resume_epoch - 1}.pt'

if resume_epoch > 1:
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
    # Only model weights are restored; the optimizer state starts fresh.
    model.load_state_dict(torch.load(resume_path, map_location=device))


# Training loop
for epoch in range(resume_epoch, epochs):

    train_loss = train(model, train_loader, optimizer, device)
    print(f'Epoch: {epoch}/{epochs}, Train Loss: {train_loss:.4f}')

    # Save model after each epoch
    save_path = f'model_GPT2_checkpoint_epoch_{epoch}.pt'
    torch.save(model.state_dict(), save_path)
    # if epoch >= 3:
    # # Evaluate the model on the validation set
    #     rouge_score = evaluate(model, val_loader, device)
    #     print(f'Epoch: {epoch}/{epochs}, Rouge Score: {rouge_score:.4f}')

    #     # Check if the Rouge score has improved
    #     if rouge_score > best_rouge_score:
    #         best_rouge_score = rouge_score
    #         best_model_path = f'best_model_checkpoint.pt'
    #         torch.save(model.state_dict(), best_model_path)

    #     # Stop training if no improvement in Rouge score after epoch 3
    #     if epoch > 3 and rouge_score <= best_rouge_score:
    #         print("Training paused. Resume training later.")
    #         break


Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of the model checkpoint at gpt2 were not used when initializing SummarizationModel: ['h.2.mlp.c_fc.weight', 'h.0.ln_2.bias', 'h.4.attn.c_proj.bias', 'h.0.mlp.c_proj.weight', 'h.2.ln_2.bias', 'h.5.mlp.c_fc.bias', 'h.7.ln_1.bias', 'h.3.mlp.c_fc.weight', 'h.7.mlp.c_fc.weight', 'h.4.ln_1.bias', 'h.8.ln_2.bias', 'h.2.ln_2.weight', 'h.5.ln_2.bias', 'h.6.ln_1.weight', 'h.6.mlp.c_proj.bias', 'h.9.ln_2.weight', 'h.9.attn.c_proj.weight', 'h.5.attn.c_proj.bias', 'h.9.attn.c_proj.bias', 'h.6.mlp.c_proj.weight', 'h.11.ln_1.bias', 'h.7.ln_2.weight', 'h.0.attn.c_proj.weight', 'h.3.attn.c_proj.weight', 'h.2.mlp.c_proj.weight', 'h.1.mlp.c_proj.weight', 'h.10.attn.c_attn.weight', 'h.2.attn.c_attn.bias', 'h.3.ln_1.weight', 'h.0.ln_1.weight', 'h.3.ln_2.bias', 'h.9.ln_1.bias', 'h.3.attn.bias', 'h.4.mlp.c_fc.weight', 'h.8.mlp.c_fc.weight', 'h.6.attn.c_attn.weight', 'h.2.attn.c_attn.weight', 'h.9.mlp.c_proj.bias', 'h.3.attn.c_attn.weight', 'h.0.attn.bias', 'h.4.ln_1.weight', 'h.1.ln_1.bias', 'h.

### **Test**

In [None]:
# Rebuild the tokenizer and the test loader so this section can run on its own.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos_token
# The assignment had no value (a syntax error); 1024 matches the length used for
# training and the model's maximum sequence length.
max_length = 1024
batch_size = 4  # Reduce the batch size
test_data_path = '/content/drive/MyDrive/NLP/data/test.csv'
# NOTE: uses whichever SummarizationDataset definition ran last, including its cap
# on how many truncated articles are read.
test_dataset = SummarizationDataset(test_data_path, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1039 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Load the raw test split separately just to inspect its size and columns.
test= pd.read_csv("/content/drive/MyDrive/NLP/data/test.csv")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11490 entries, 0 to 11489
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          11490 non-null  object
 1   article     11490 non-null  object
 2   highlights  11490 non-null  object
dtypes: object(3)
memory usage: 269.4+ KB


In [None]:
# Load the best model checkpoint for testing
model = SummarizationModel.from_pretrained('gpt2')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# NOTE(review): the training cell saves 'model_GPT2_checkpoint_epoch_{epoch}.pt';
# confirm this file name points at the checkpoint you intend to evaluate.
# map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
model.load_state_dict(
    torch.load("/content/drive/MyDrive/NLP/model_checkpoint_epoch_3.pt", map_location=device)
)


Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of the model checkpoint at gpt2 were not used when initializing SummarizationModel: ['h.1.attn.bias', 'h.6.attn.bias', 'h.3.attn.c_attn.bias', 'h.1.mlp.c_proj.bias', 'h.4.ln_2.bias', 'h.6.ln_2.bias', 'h.8.ln_1.weight', 'h.10.mlp.c_fc.weight', 'h.3.attn.bias', 'h.8.ln_1.bias', 'h.4.mlp.c_fc.bias', 'h.11.mlp.c_proj.bias', 'h.2.ln_1.weight', 'h.1.attn.c_attn.bias', 'h.7.ln_2.weight', 'h.0.ln_2.bias', 'h.7.ln_1.weight', 'h.4.attn.c_proj.bias', 'h.8.mlp.c_proj.weight', 'h.7.attn.c_proj.bias', 'h.5.ln_1.bias', 'h.7.ln_1.bias', 'h.3.mlp.c_proj.bias', 'h.7.mlp.c_fc.bias', 'h.11.ln_2.bias', 'h.5.ln_2.bias', 'h.7.attn.c_attn.weight', 'h.6.mlp.c_fc.bias', 'h.2.attn.c_attn.weight', 'h.11.attn.c_attn.bias', 'h.3.mlp.c_proj.weight', 'h.4.attn.c_proj.weight', 'h.3.mlp.c_fc.bias', 'h.1.mlp.c_fc.weight', 'h.4.ln_1.bias', 'h.8.mlp.c_fc.weight', 'h.9.attn.c_attn.bias', 'wpe.weight', 'h.10.ln_2.bias', 'h.9.mlp.c_fc.weight', 'h.8.attn.c_attn.bias', 'h.7.mlp.c_proj.bias', 'h.6.attn.c_attn.weigh

<All keys matched successfully>

In [None]:
# Score the restored checkpoint on the test split; the result is the tuple
# (ROUGE-1, ROUGE-2, ROUGE-L) of average F-measures.
rouge = evaluate(model=model, data_loader=test_loader, device=device)
print(rouge)