<a href="https://colab.research.google.com/github/kairamilanifitria/NLP-Projects/blob/main/Project%202%20Text%20Summarization/Fix/3_Modelling_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets transformers torch accelerate

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00

In [3]:
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, DataCollatorForSeq2Seq, EncoderDecoderModel
from transformers import pipeline

In [4]:
# Load the dataset
train_df = pd.read_csv('/content/drive/MyDrive/Bootcamp AI/Dataset/Task2 : Text Summarization/fix_dataset/30k/train_df.csv')
dev_df = pd.read_csv('/content/drive/MyDrive/Bootcamp AI/Dataset/Task2 : Text Summarization/fix_dataset/30k/dev_df.csv')

# Basic overview
print(train_df.shape)
print(train_df.info())
print(train_df.isnull().sum())

print(dev_df.shape)
print(dev_df.info())
print(dev_df.isnull().sum())


(29842, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29842 entries, 0 to 29841
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Unnamed: 0                  29842 non-null  int64 
 1   original_text               29842 non-null  object
 2   abstractive_summary         29842 non-null  object
 3   extractive_summary          29842 non-null  object
 4   original_text_length        29842 non-null  int64 
 5   abstractive_summary_length  29842 non-null  int64 
 6   extractive_summary_length   29842 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 1.6+ MB
None
Unnamed: 0                    0
original_text                 0
abstractive_summary           0
extractive_summary            0
original_text_length          0
abstractive_summary_length    0
extractive_summary_length     0
dtype: int64
(3000, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
D

In [18]:
train_df.iloc[1]

Unnamed: 0,1
original_text,liputan6 com jakarta karyawan pt angkasa pura ...
abstractive_summary,karyawan pt angkasa pura i berencana mogok ker...
extractive_summary,liputan6 com jakarta karyawan pt angkasa pura ...


In [19]:
dev_df.iloc[1]

Unnamed: 0,1
original_text,liputan6 com jakarta peruntungan ahmad taufik ...
abstractive_summary,presiden menyempatkan diri untuk bertemu denga...
extractive_summary,seperti tak mau ketinggalan presiden abdurrahm...


In [20]:
# Check if GPU is available and set device accordingly
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')


Using device: cuda


In [21]:
# Load pre-trained BERT model for text summarization (abstractive and extractive)
model_name = "cahya/bert2bert-indonesian-summarization"
tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token
model = EncoderDecoderModel.from_pretrained(model_name).to(device)



In [22]:
# Convert pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

In [23]:
def preprocess_function(examples):
    # Use a separator token to distinguish between extractive and abstractive summaries
    separator_token = " <|extractive|> "

    # Convert list summaries to strings if necessary
    extractive_summary = [
        " ".join(summary) if isinstance(summary, list) else summary
        for summary in examples["extractive_summary"]
    ]
    abstractive_summary = [
        " ".join(summary) if isinstance(summary, list) else summary
        for summary in examples["abstractive_summary"]
    ]

    # Combine both summaries into one string with a separator
    combined_summary = [
        ex_sum + separator_token + ab_sum
        for ex_sum, ab_sum in zip(extractive_summary, abstractive_summary)
    ]

    # Tokenize the text input
    model_inputs = tokenizer(
        examples["original_text"], max_length=512, truncation=True, padding="max_length"
    )

    # Tokenize the combined summary (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            combined_summary, max_length=256, truncation=True, padding="max_length"
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [24]:
# Apply preprocessing to the train and dev datasets
tokenized_train_datasets = train_dataset.map(preprocess_function, batched=True)
tokenized_dev_datasets = dev_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/29842 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [25]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    push_to_hub=False,
    fp16=True,
    save_steps=500,           # Save the model every 500 steps
    eval_steps=500,           # Evaluate every 500 steps
    logging_dir="./logs",     # Directory for logs
    logging_steps=100,        # Log every 100 steps
)


# Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,  # Use tokenized train dataset
    eval_dataset=tokenized_dev_datasets,     # Use tokenized dev dataset for validation
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [26]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.2176,0.306791
2,0.167,0.306361
3,0.1335,0.311002


Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
Non-default generation

TrainOutput(global_step=11193, training_loss=0.18909799561662008, metrics={'train_runtime': 8186.1014, 'train_samples_per_second': 10.936, 'train_steps_per_second': 1.367, 'total_flos': 5.49204982848553e+16, 'train_loss': 0.18909799561662008, 'epoch': 3.0})

In [27]:
# prompt: save the model that has already trained

trainer.save_model("./trained_model")

Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


In [28]:
import os

# Define the path in Google Drive where you want to save the model
output_dir = '/content/drive/MyDrive/Bootcamp AI/Dataset/Task2 : Text Summarization/BERT2BERT'

# Create the directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the trained model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

Non-default generation parameters: {'max_length': 40, 'min_length': 20, 'early_stopping': True, 'num_beams': 10, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


Model saved to /content/drive/MyDrive/Bootcamp AI/Dataset/Task2 : Text Summarization/BERT2BERT
