# I. Project Team Members

| Prepared by | Email | Prepared for |
| :-: | :-: | :-: |
| **Hardefa Rogonondo** | hardefarogonondo@gmail.com | **Research Paper Summarization Engine** |

# II. Notebook Target Definition

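This notebook fine-tunes pretrained abstractive summarization models (T5-small and PEGASUS-XSum) on the processed research-paper `source`/`target` pairs and evaluates the generated summaries with ROUGE on the train, validation, and test splits.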

# III. Notebook Setup

## III.A. Import Libraries

In [1]:
from datasets import load_metric
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup, PegasusForConditionalGeneration, PegasusTokenizer, T5ForConditionalGeneration, T5Tokenizer
import numpy as np
import os
import pandas as pd
import pickle
import random
import torch

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    # Only query the device name when CUDA is actually available.
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

CUDA available: True
CUDA device name: NVIDIA GeForce GTX 1660 SUPER


## III.B. Import Data

In [3]:
# Load the preprocessed train / test / validation splits from pickled DataFrames.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
train_df = pd.read_pickle('../../data/processed/train_df_processed.pkl')
test_df= pd.read_pickle('../../data/processed/test_df_processed.pkl')
validation_df = pd.read_pickle('../../data/processed/validation_df_processed.pkl')

In [4]:
train_df.head()

Unnamed: 0,source,target
0,Due to the success of deep learning to solving...,We provide necessary and sufficient analytical...
1,The backpropagation (BP) algorithm is often th...,"Biologically plausible learning algorithms, pa..."
2,"We introduce the 2-simplicial Transformer, an ...",We introduce the 2-simplicial Transformer and ...
3,"We present Tensor-Train RNN (TT-RNN), a novel ...",Accurate forecasting over very long time horiz...
4,Recent efforts on combining deep models with p...,We propose a variational message-passing algor...


In [5]:
test_df.head()

Unnamed: 0,source,target
0,Incremental class learning involves sequential...,"FearNet is a memory efficient neural-network, ..."
1,Multi-view learning can provide self-supervisi...,Multi-view learning improves unsupervised sent...
2,We show how discrete objects can be learnt in ...,We show how discrete objects can be learnt in ...
3,Most recent gains in visual recognition have o...,A large-scale dataset for training attention m...
4,"In recent years, deep neural networks have dem...",We proposed a time-efficient defense method ag...


In [6]:
validation_df.head()

Unnamed: 0,source,target
0,Mixed precision training (MPT) is becoming a p...,We devise adaptive loss scaling to improve mix...
1,"Many real-world problems, e.g. object detectio...",We present a novel approach for learning to pr...
2,Foveation is an important part of human vision...,We compare object recognition performance on i...
3,We explore the concept of co-design in the con...,We develop methods to train deep neural models...
4,Batch Normalization (BatchNorm) has shown to b...,Investigation of how BatchNorm causes adversar...


# IV. Models Training and Evaluation

## IV.A. Data Shape Inspection

In [7]:
train_df.shape, test_df.shape, validation_df.shape

((1991, 2), (618, 2), (618, 2))

## IV.B. Data Information Inspection

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1991 entries, 0 to 1991
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  1991 non-null   object
 1   target  1991 non-null   object
dtypes: object(2)
memory usage: 46.7+ KB


In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  618 non-null    object
 1   target  618 non-null    object
dtypes: object(2)
memory usage: 9.8+ KB


In [10]:
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 618 entries, 0 to 618
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   source  618 non-null    object
 1   target  618 non-null    object
dtypes: object(2)
memory usage: 14.5+ KB


## IV.C. Models Training

### IV.C.1. Random Seed Initialization

In [11]:
def set_seed(seed_value=777):
    """Seed every random number generator used in this notebook.

    Applies ``seed_value`` to Python's ``random`` module, NumPy, and PyTorch
    (CPU generator and all CUDA devices), exports it as ``PYTHONHASHSEED``,
    and sets the cuDNN ``deterministic`` / ``benchmark`` flags.

    Args:
        seed_value: Integer seed shared by all generators. Defaults to 777.
    """
    # Same seed, same order: Python -> NumPy -> torch (CPU) -> torch (CUDA).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed_value)

    # NOTE(review): the running interpreter fixes its hash seed at startup, so
    # this assignment is only visible to code/processes that read the variable.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # Ask cuDNN for deterministic algorithms and disable its autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [12]:
set_seed(777)

### IV.C.2. Tokenizer Initialization

In [13]:
class SummarizationDataset(Dataset):
    """Pre-tokenized (document, summary) pairs for sequence-to-sequence training.

    Every pair is tokenized once, at construction time, with truncation and
    ``padding='max_length'``; the resulting tensors are kept in memory and
    served by index.

    Padding positions in ``labels`` keep the ids produced by the tokenizer;
    they are not replaced here.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            truncation=True, padding='max_length', return_tensors='pt')`` whose
            result exposes ``input_ids`` and ``attention_mask``.
        text_list: Source documents.
        summary_list: Reference summaries, aligned one-to-one with ``text_list``.
        max_length: Padded/truncated length of the source encodings.
        max_target_length: Padded/truncated length of the summary encodings.
            ``None`` (default) reuses ``max_length``, which matches the
            previous behaviour.

    Raises:
        ValueError: If ``text_list`` and ``summary_list`` differ in length.
    """

    def __init__(self, tokenizer, text_list, summary_list, max_length=512, max_target_length=None):
        texts = list(text_list)
        summaries = list(summary_list)
        # zip() would silently drop the unmatched tail; fail loudly instead so
        # documents and summaries cannot get out of alignment unnoticed.
        if len(texts) != len(summaries):
            raise ValueError(
                "text_list and summary_list must have the same length "
                f"(got {len(texts)} and {len(summaries)})."
            )
        if max_target_length is None:
            max_target_length = max_length

        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for text, summary in zip(texts, summaries):
            encodings = tokenizer(
                text,
                max_length=max_length,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            )
            target_encodings = tokenizer(
                summary,
                max_length=max_target_length,
                truncation=True,
                padding='max_length',
                return_tensors='pt'
            )
            self.input_ids.append(encodings.input_ids)
            self.attn_masks.append(encodings.attention_mask)
            self.labels.append(target_encodings.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of flattened tensors."""
        return {
            "input_ids": self.input_ids[idx].flatten(),
            "attention_mask": self.attn_masks[idx].flatten(),
            "labels": self.labels[idx].flatten()
        }

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)


def prepare_data(tokenizer, dfs, batch_size=8):
    """Tokenize every split and wrap it in a ``DataLoader``.

    Args:
        tokenizer: Tokenizer forwarded to ``SummarizationDataset``.
        dfs: Mapping of split name to a DataFrame with ``"source"`` and
            ``"target"`` columns.
        batch_size: Batch size used for every loader.

    Returns:
        Dict keyed ``"<split>_loader"`` holding one ``DataLoader`` per split.
        Only the split named ``"train"`` is shuffled.
    """
    # Phase 1: tokenize all splits.
    tokenized_splits = {}
    for split_name, frame in dfs.items():
        documents = frame["source"].tolist()
        summaries = frame["target"].tolist()
        tokenized_splits[split_name] = SummarizationDataset(tokenizer, documents, summaries)

    # Phase 2: wrap each tokenized split in a loader.
    loaders = {}
    for split_name, split_dataset in tokenized_splits.items():
        is_train_split = split_name == "train"
        loaders[f"{split_name}_loader"] = DataLoader(
            split_dataset,
            batch_size=batch_size,
            shuffle=is_train_split,
        )
    return loaders

In [14]:
# Load the pretrained tokenizers for the two checkpoints used in this notebook.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small', legacy=False)
pegasus_tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
# Map split names to their DataFrames; prepare_data() derives the loader keys
# ("train_loader", "test_loader", "validation_loader") from these names.
dataframes = {
    "train": train_df,
    "test": test_df,
    "validation": validation_df
}

In [16]:
# Tokenize all splits once per tokenizer and build the corresponding loaders.
# NOTE(review): PEGASUS uses batch_size=2 vs 8 for T5, presumably to fit GPU memory — confirm.
t5_loaders = prepare_data(t5_tokenizer, dataframes, batch_size=8)
pegasus_loaders = prepare_data(pegasus_tokenizer, dataframes, batch_size=2)

### IV.C.3. Load Pre-Trained Models

In [17]:
def initialize_model_and_optimizer(model_name, device, learning_rate, total_steps):
    """Load a pretrained seq2seq model and build its optimizer and LR schedule.

    Args:
        model_name: Checkpoint identifier. Names starting with ``"t5"`` load a
            ``T5ForConditionalGeneration``; names starting with
            ``"google/pegasus"`` load a ``PegasusForConditionalGeneration``.
        device: Device the model is moved to.
        learning_rate: Learning rate passed to ``AdamW``.
        total_steps: Total number of training steps for the linear schedule.

    Returns:
        Tuple ``(model, optimizer, scheduler)``.

    Raises:
        ValueError: If ``model_name`` matches neither supported prefix.
    """
    # Pick the architecture from the checkpoint prefix before loading anything.
    if model_name.startswith("t5"):
        model_class = T5ForConditionalGeneration
    elif model_name.startswith("google/pegasus"):
        model_class = PegasusForConditionalGeneration
    else:
        raise ValueError("Unsupported model. Please use 't5-small' or 'google/pegasus-xsum'.")

    pretrained = model_class.from_pretrained(model_name)
    model = pretrained.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # Linear decay over total_steps with no warmup phase.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )
    return model, optimizer, scheduler


def train_model(model, optimizer, scheduler, device, data_loader, epochs=3):
    """Fine-tune ``model`` on ``data_loader`` for ``epochs`` passes.

    For every batch the function runs a forward pass with labels, backpropagates
    the returned loss, and steps the optimizer followed by the scheduler. The
    average training loss is printed at the end of each epoch.

    Label positions equal to ``model.config.pad_token_id`` are replaced with
    ``-100`` before the forward pass so padding does not contribute to the loss
    (Hugging Face seq2seq models ignore ``-100`` labels). The batch tensors
    themselves are left untouched.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` and returning an object with a ``loss`` attribute.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the model and batch tensors are moved to.
        data_loader: Sized iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        epochs: Number of passes over ``data_loader``.

    Raises:
        ValueError: If ``data_loader`` contains no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        # Previously this surfaced as a ZeroDivisionError after the first epoch.
        raise ValueError("data_loader is empty; there is nothing to train on.")

    model.to(device)
    # Models without a config / pad id simply skip the label masking below.
    pad_token_id = getattr(getattr(model, "config", None), "pad_token_id", None)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(data_loader, desc=f"Epoch {epoch+1}", leave=True)
        for batch in progress_bar:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            if pad_token_id is not None:
                # Out-of-place so the dataset's stored label tensors keep their pad ids.
                labels = labels.masked_fill(labels == pad_token_id, -100)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            # Read the scalar once; it is reused for the running total and the bar.
            batch_loss = loss.item()
            total_loss += batch_loss
            progress_bar.set_postfix({"loss": batch_loss})
        average_train_loss = total_loss / num_batches
        print(f"Epoch {epoch+1} | Average Training Loss: {average_train_loss}")

In [18]:
epochs = 3

### IV.C.4. T5 Small Model Training

In [19]:
# t5_model_name = 't5-small'
# t5_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# t5_learning_rate = 5e-5
# t5_total_steps = len(t5_loaders["train_loader"]) * epochs
# t5_batch_size = 8
# t5_model, t5_optimizer, t5_scheduler = initialize_model_and_optimizer(t5_model_name, t5_device, t5_learning_rate, t5_total_steps)

In [20]:
# train_model(t5_model, t5_optimizer, t5_scheduler, t5_device, t5_loaders["train_loader"], epochs=epochs)

### IV.C.5. PEGASUS Model Training

In [21]:
# PEGASUS fine-tuning configuration: device, checkpoint and optimizer settings.
pegasus_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pegasus_model_name = 'google/pegasus-xsum'
pegasus_learning_rate = 5e-5
# NOTE(review): pegasus_batch_size is not referenced elsewhere in the visible notebook;
# pegasus_loaders was already built with batch_size=2 in an earlier cell.
pegasus_batch_size = 2
# Scheduler horizon: batches per epoch times the number of epochs.
pegasus_total_steps = len(pegasus_loaders["train_loader"]) * epochs
pegasus_model, pegasus_optimizer, pegasus_scheduler = initialize_model_and_optimizer(pegasus_model_name, pegasus_device, pegasus_learning_rate, pegasus_total_steps)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
train_model(pegasus_model, pegasus_optimizer, pegasus_scheduler, pegasus_device, pegasus_loaders["train_loader"], epochs=epochs)

Epoch 1:   0%|          | 0/498 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 752.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 19.79 GiB is allocated by PyTorch, and 232.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## IV.D. Models Selection

### IV.D.1. Baseline Model Performance Review

In [None]:
def evaluate_summarization_model(model, model_name, tokenizer, data_loaders, device):
    """Generate summaries for every split and score them with ROUGE.

    For each entry of ``data_loaders`` the model generates a summary per input,
    predictions and reference labels are decoded with ``tokenizer``, and the
    mid F-measure of every ROUGE metric is reported as a percentage.

    Args:
        model: Model exposing ``generate(input_ids=..., attention_mask=...)``.
        model_name: Label stored in the ``"model"`` column of the result.
        tokenizer: Tokenizer providing ``batch_decode``.
        data_loaders: Mapping of split name to an iterable of dict batches with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        device: Device the model and the input tensors are moved to.

    Returns:
        ``pandas.DataFrame`` with one row per split: one column per ROUGE
        metric plus ``"model"`` and ``"dataset"``.
    """
    # Keep model and inputs on the same device, as train_model() does.
    model.to(device)
    model.eval()
    pad_token_id = getattr(tokenizer, "pad_token_id", None)

    results = []
    for key, data_loader in data_loaders.items():
        # A fresh metric per split keeps scores independent and avoids the
        # extra reload that used to happen after the last split.
        # NOTE(review): `datasets.load_metric` is deprecated upstream in favour
        # of the separate `evaluate` package — consider migrating.
        rouge = load_metric("rouge")
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            with torch.no_grad():
                outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
            label_ids = batch["labels"]
            if pad_token_id is not None:
                # Ignore-index entries (negative ids such as -100) are not real
                # tokens; map them to padding so they decode to nothing.
                label_ids = label_ids.masked_fill(label_ids < 0, pad_token_id)
            pred_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            true_summaries = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
            rouge.add_batch(predictions=pred_summaries, references=true_summaries)
        final_scores = rouge.compute()
        result = {metric: value.mid.fmeasure * 100 for metric, value in final_scores.items()}
        result["model"] = model_name
        result["dataset"] = key
        results.append(result)
    return pd.DataFrame(results)

In [None]:
# Evaluate the fine-tuned T5 model on every split.
# NOTE: `t5_model` only exists once the (currently commented-out) T5 training
# cells in section IV.C.4 have been re-enabled and run.
data_loaders = {
    "train": t5_loaders["train_loader"],
    "validation": t5_loaders["validation_loader"],
    "test": t5_loaders["test_loader"]
}

# The bare name `device` is not defined anywhere in the visible notebook, so
# resolve the evaluation device explicitly.
t5_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
df_t5 = evaluate_summarization_model(t5_model, "T5", t5_tokenizer, data_loaders, t5_device)

In [None]:
# Evaluate the fine-tuned PEGASUS model on every split.
data_loaders = {
    "train": pegasus_loaders["train_loader"],
    "validation": pegasus_loaders["validation_loader"],
    "test": pegasus_loaders["test_loader"]
}

# Keep the PEGASUS scores in their own frame with their own label (they used to
# overwrite `df_t5` and be tagged "T5"), and use the device defined for PEGASUS
# rather than the undefined bare name `device`.
df_pegasus = evaluate_summarization_model(pegasus_model, "PEGASUS", pegasus_tokenizer, data_loaders, pegasus_device)

### IV.D.2. Export Baseline Best Model

In [None]:
# Serialize the selected baseline model to disk with pickle.
# NOTE(review): `baseline_best_model` is not assigned anywhere in the visible
# notebook — TODO define it (e.g. from the evaluation results) before this cell can run.
with open('../../models/baseline_best_model.pkl', 'wb') as file:
    pickle.dump(baseline_best_model, file)

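## IV.F. Hyperparameters Tuning

> **Note:** The cells in this section have not been executed and appear to be carried over from a scikit-learn classification template. They reference names (`GridSearchCV`, `LogisticRegression`, `hyperopt`, `X_train`, `y_train`, `models_list`, ...) that are never imported or defined in the visible notebook, so they must be adapted to the summarization models before they can run.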

### IV.F.1. Hyperparameters List

#### IV.F.1.A. Grid Search

In [None]:
log_reg_hyperparams = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-4, 4, 20),
    'solver': ['liblinear']
}

In [None]:
log_reg_grid_search = GridSearchCV(
    LogisticRegression(random_state=777),
    log_reg_hyperparams,
    n_jobs=-1,
    verbose=420,
    scoring='f1_macro'
)

In [None]:
log_reg_grid_search.fit(X_train, y_train)

In [None]:
best_estimator_from_grid = log_reg_grid_search.best_estimator_

In [None]:
models_list["fine-tuned"] = [{"model_name": "GridSearchBest-LogisticRegression",
                              "model_object": best_estimator_from_grid, "model_uid": ""}]

#### IV.F.1.B. Bayesian Search

In [None]:
log_reg_space = {
    'penalty': hyperopt.hp.choice('penalty', ['l1', 'l2']),
    'C': hyperopt.hp.loguniform('C', np.log(1e-4), np.log(1e4)),
    'solver': 'liblinear'
}

In [None]:
def objective(params):
    """Objective passed to ``fmin``: negated mean 5-fold macro-F1.

    Args:
        params: Keyword arguments forwarded to ``LogisticRegression``.

    Returns:
        Dict with ``'loss'`` (the negated mean cross-validation score) and
        ``'status'`` set to ``STATUS_OK``.
    """
    # NOTE(review): LogisticRegression, cross_val_score, STATUS_OK, X_train and
    # y_train are not imported/defined in the visible notebook — confirm.
    estimator = LogisticRegression(**params, random_state=777)
    fold_scores = cross_val_score(estimator, X_train, y_train, cv=5, scoring='f1_macro')
    mean_score = fold_scores.mean()
    # The score is negated so that a higher F1 yields a lower loss.
    return {'loss': -mean_score, 'status': STATUS_OK}

In [None]:
trials = Trials()
best = fmin(fn=objective,
            space=log_reg_space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)
best_params = space_eval(log_reg_space, best)

In [None]:
print("The best parameters are: ", best_params)

In [None]:
optimal_log_reg = LogisticRegression(**best_params, random_state=777)

In [None]:
models_list["fine-tuned"].append({"model_name": "BayesOpt-LogisticRegression",
                                  "model_object": optimal_log_reg, "model_uid": ""})

### IV.F.2. Best Model Hyperparameter Retraining

In [None]:
training_log, models_list_tuned = model_training_and_evaluation(
    models_list["fine-tuned"],
    "tuned_model",
    X_train,
    y_train,
    X_test,
    y_test,
    "tuned",
    '../../models/logs/training_log.json'
)

In [None]:
models_list

### IV.F.3. Hyperparameter-tuned Model Performance Review

In [None]:
all_training_logs_df_tuned = training_log_to_df_converter(training_log)
all_training_logs_df_tuned

#### IV.F.3.A. Grid Searched Model Performance Review

In [None]:
models_dict_tuned = {"fine-tuned": models_list_tuned}
tuned_best_model = tuned_model_finder(
    models_dict_tuned["fine-tuned"], "GridSearchBest")
tuned_best_model

In [None]:
metrics_df = get_metrics_dataframe(
    tuned_best_model, X_train, y_train, X_test, y_test)
metrics_df

In [None]:
display_confusion_matrix(
    tuned_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_train_vs_test_error(
    tuned_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_roc_curve(tuned_best_model, X_train,
               y_train, X_test, y_test)

In [None]:
plot_model_learning_curve(tuned_best_model, X_train, y_train)

#### IV.F.3.B. Bayesian Searched Model Performance Review

In [None]:
models_dict_tuned = {"fine-tuned": models_list_tuned}
tuned_best_model = tuned_model_finder(
    models_dict_tuned["fine-tuned"], "BayesOpt")
tuned_best_model

In [None]:
metrics_df = get_metrics_dataframe(
    tuned_best_model, X_train, y_train, X_test, y_test)
metrics_df

In [None]:
display_confusion_matrix(
    tuned_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_train_vs_test_error(
    tuned_best_model, X_train, y_train, X_test, y_test)

In [None]:
plot_roc_curve(tuned_best_model, X_train,
               y_train, X_test, y_test)

In [None]:
plot_model_learning_curve(tuned_best_model, X_train, y_train)

### IV.F.4. Export Hyperparameter-tuned Best Model

In [None]:
with open('../../models/tuned_best_model.pkl', 'wb') as file:
    pickle.dump(tuned_best_model, file)