In [1]:
import pandas

# Load the combined training/validation examples from the JSON Lines file
# (lines=True: one JSON record per line of the file).
data = pandas.read_json(
    path_or_buf="../data/training/training_validation_set.jsonl",
    lines=True,
)
# Preview ten randomly drawn rows as the cell output
data.sample(n=10)

Unnamed: 0,input,output
229,I rebuilt my unit's Electronic Records Managem...,- Rebuilt unit's Electronic Records Management...
309,This individual led the development of six upd...,- Led development for six PEPP/AIMWTS updates;...
59,I developed 7 analysis tools that raised comma...,- Dev'd 7 analysis tools; raised cmd awareness...
222,As a well-rounded Noncommissioned Officer (NCO...,- Well-rounded NCO; will produce exceptional r...
118,"Serving as a member of the alert photo team, I...",- Member of alert photo team; responded to 5 c...
111,By completing a 40-hour Faculty Development co...,- Comp 40 hr Fac Dev crse; enhanced tech mat/w...
284,"I revamped the HVAC (Heating, Ventilation, and...","- Revamped HVAC system; built and installed 6,..."
56,"I excelled in the EPME/ALS 6-week, 240-hour co...",- Excelled EPME/ALS 6 wk/240 hrs crse; rcv'd c...
51,I led the Small Package Initial Communications...,- Led Small Pkg Initial Comm Element 'Rodeo' t...
10,I successfully drained and purged 7 excess veh...,- Drained/purged 7 excess vehicles/$1.3M; DLA-...


In [2]:
# Prepend T5's summarize task keyword to inputs.
# Only rows that do not already start with the keyword are modified, so
# re-running this cell does not stack "summarize: summarize: ..." on the inputs.
# na=False treats missing inputs as "not prefixed"; they stay missing after the
# concatenation, exactly as with a plain string + column addition.
needs_prefix = ~data["input"].str.startswith("summarize: ", na=False)
data.loc[needs_prefix, "input"] = "summarize: " + data.loc[needs_prefix, "input"]
# Show random sample of 10 data sets
data.sample(10)

Unnamed: 0,input,output
26,summarize: I effectively managed the vehicle r...,- Managed 57 volunteers vehicle requirements; ...
216,summarize: I was handpicked for the Fighter Wi...,- Handpicked for FW Civic Leader event; prepar...
205,summarize: I repaired damaged Weapons System S...,- Repaired damaged WSS training munitions; rep...
315,"summarize: By writing, testing, and applying a...",- Wrote/tested/applied new PM print program; a...
253,"summarize: As a Total Force champion, I certif...",- Total Force champion; certified 20 AFRC/ANG ...
155,summarize: Taking the lead as the quarterback ...,- Quarterbacked cooling duct TCTO; designed dr...
187,"summarize: As a custodian account manager, I e...",- Managed 22 custody accounts; outlined provis...
321,summarize: I coordinated fire support for 180 ...,- Coordinated fire support to 180 test mission...
176,"summarize: With a strong focus on safety, I vo...",- Safety conscious; volunteered as 'Top III' d...
186,summarize: I completed training in pylon remov...,- Completed training in pylon removal for TCTO...


In [2]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import os
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rich.table import Column, Table
from rich import box
from rich.console import Console

# Module-level rich console used for the training/validation messages.
# record=True keeps what is printed so T5Trainer can export it with console.save_text(...).
console = Console(record=True)

# Render a data frame preview as a rich table
def display_df(df):
    """Print the first two columns of ``df`` as an ASCII table titled "Sample Data".

    Every row of ``df`` becomes one table row: the value in column 0 goes under
    ``source_text`` and the value in column 1 under ``target_text``. The table
    is printed through a freshly created ``Console``, not the module-level one.
    """
    sample_table = Table(
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    for record in df.values.tolist():
        source_cell, target_cell = record[0], record[1]
        sample_table.add_row(source_cell, target_cell)

    Console().print(sample_table)


# Shared rich table holding the training progress. train() appends one
# (epoch, step, loss) row every tenth step and prints the whole table.
training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

In [6]:
# Setting up the device for GPU usage, if available
from torch import cuda
# 'cuda' when PyTorch reports an available CUDA device, otherwise 'cpu'.
# T5Trainer reads this module-level name when it moves the model and batches.
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
# Creating a custom dataset for reading the dataset and loading it into the dataloader
# to pass it to the neural network for fine tuning the model
class YourDataSetClass(Dataset):
  """Tokenizes (source, target) text pairs from a data frame for fine-tuning.

  Args:
    dataframe: data frame holding the two text columns.
    tokenizer: tokenizer exposing ``batch_encode_plus`` (e.g. ``T5Tokenizer``).
    source_len: length the source text is truncated/padded to.
    target_len: length the target text is truncated/padded to.
    source_text: name of the column with the model input text.
    target_text: name of the column with the expected output text.
  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    """Number of (source, target) pairs."""
    return len(self.target_text)

  def _encode(self, text, max_length):
    """Tokenize one string, truncated and padded to exactly ``max_length``."""
    # padding="max_length" already selects fixed-length padding; the deprecated
    # pad_to_max_length flag was redundant and only produced a warning.
    return self.tokenizer.batch_encode_plus(
        [text],
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors='pt',
    )

  def __getitem__(self, index):
    """Return the tokenized pair at position ``index`` as a dict of long tensors.

    Keys: ``source_ids``, ``source_mask``, ``target_ids`` and ``target_ids_y``
    (the last two hold the same target token ids).
    """
    # Positional access: a plain [index] is a label lookup on the column and
    # raises KeyError when the frame was sliced/sampled without reset_index().
    source_text = str(self.source_text.iloc[index])
    target_text = str(self.target_text.iloc[index])

    # Collapse every run of whitespace (including newlines) to a single space
    source_text = ' '.join(source_text.split())
    target_text = ' '.join(target_text.split())

    source = self._encode(source_text, self.source_len)
    target = self._encode(target_text, self.summ_len)

    # Drop only the leading batch axis added by batch_encode_plus
    source_ids = source['input_ids'].squeeze(0)
    source_mask = source['attention_mask'].squeeze(0)
    target_ids = target['input_ids'].squeeze(0)

    return {
        'source_ids': source_ids.to(dtype=torch.long),
        'source_mask': source_mask.to(dtype=torch.long),
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

NameError: name 'Dataset' is not defined

In [4]:
# Function to be called for training with the parameters passed from main function
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one training epoch over ``loader``.

    Args:
        epoch: epoch number, used only for the progress table.
        tokenizer: tokenizer whose ``pad_token_id`` marks padding in the targets.
        model: seq2seq model called with ``input_ids``, ``attention_mask`` and
            ``labels``; the first element of its output is used as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of batches with the keys ``source_ids``,
            ``source_mask`` and ``target_ids``.
        optimizer: optimizer stepped once per batch.

    Every tenth step a row is appended to the module-level ``training_logger``
    table, which is then printed through the module-level ``console``.
    """
    model.train()
    for step, batch in enumerate(loader):
        target_ids = batch["target_ids"].to(device, dtype=torch.long)
        # Pass the complete target as labels and let the model derive the decoder
        # inputs from them. Slicing the target by hand (y[:, :-1] / y[:, 1:]) made
        # the first target token act as the decoder start token, so it was never
        # predicted and training did not match how generate() starts decoding.
        # clone() keeps the batch tensor itself untouched.
        labels = target_ids.clone()
        # -100 is the index the loss ignores, so padding does not contribute
        labels[labels == tokenizer.pad_token_id] = -100
        ids = batch["source_ids"].to(device, dtype=torch.long)
        mask = batch["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            labels=labels,
        )
        loss = outputs[0]

        if step % 10 == 0:
            # Log the scalar value rather than the tensor repr (grad_fn/device noise)
            training_logger.add_row(str(epoch), str(step), f"{loss.item():.4f}")
            console.print(training_logger)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [3]:
# Function to evaluate model for predictions
def validate(tokenizer, model, device, loader):
    """Generate text for every batch in ``loader`` and collect it with the references.

    Args:
        tokenizer: tokenizer used to decode generated and reference token ids.
        model: model exposing ``eval()`` and ``generate(...)``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with the keys ``source_ids``,
            ``source_mask`` and ``target_ids``.

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings with one entry
        per example, in loader order.
    """
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for step, batch in enumerate(loader):
            y = batch["target_ids"].to(device, dtype=torch.long)
            ids = batch["source_ids"].to(device, dtype=torch.long)
            mask = batch["source_mask"].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            # Keep the per-batch results under their own names: reusing
            # `predictions` here would discard everything collected so far and
            # then append the current batch to itself.
            batch_predictions = [
                tokenizer.decode(
                    g, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                for g in generated_ids
            ]
            batch_targets = [
                tokenizer.decode(
                    t, skip_special_tokens=True, clean_up_tokenization_spaces=True
                )
                for t in y
            ]
            if step % 10 == 0:
                console.print(f"Completed {step}")

            predictions.extend(batch_predictions)
            actuals.extend(batch_targets)
    return predictions, actuals

In [2]:
# T5 training main function
def T5Trainer(
    dataframe, source_text, target_text, model_params, output_dir="./outputs/"
):
    """Fine-tune a T5 model on ``dataframe`` and write model, predictions and logs.

    Args:
        dataframe: data frame containing at least the two text columns.
        source_text: name of the input-text column.
        target_text: name of the target-text column.
        model_params: dict with the keys ``MODEL``, ``SEED``,
            ``TRAIN_BATCH_SIZE``, ``VALID_BATCH_SIZE``, ``TRAIN_EPOCHS``,
            ``VAL_EPOCHS``, ``LEARNING_RATE``, ``MAX_SOURCE_TEXT_LENGTH`` and
            ``MAX_TARGET_TEXT_LENGTH``.
        output_dir: directory that receives ``model_files/``,
            ``predictions.csv`` and ``logs.txt``.

    Relies on the module-level ``device`` and ``console`` objects and on
    ``display_df``, ``YourDataSetClass``, ``train`` and ``validate``.
    """
    model_path = os.path.join(output_dir, "model_files")
    predictions_path = os.path.join(output_dir, "predictions.csv")
    logs_path = os.path.join(output_dir, "logs.txt")

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # Tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"], model_max_length=512)

    # Load the pretrained checkpoint named by model_params["MODEL"] and move it
    # to the selected device
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)
    console.log("[Data]: Reading data...\n")

    # Keep only the two text columns and show a short preview
    dataframe = dataframe[[source_text, target_text]]
    display_df(dataframe.head(2))

    # Creation of Dataset and Dataloader
    # 80% of the data will be used for training and the rest for validation
    train_size = 0.8
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    val_dataset = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    console.print(f"FULL Dataset: {dataframe.shape}")
    console.print(f"TRAIN Dataset: {train_dataset.shape}")
    console.print(f"VALIDATION Dataset: {val_dataset.shape}\n")

    # Creating the Training and Validation dataset for further creation of data loader
    training_set = YourDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = YourDataSetClass(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of data loaders
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Data loaders for the training and validation stages
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    console.log("[Initiating Fine Tuning]...\n")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    console.log("[Saving Model]...\n")
    # Saving the model and tokenizer side by side after training
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    # Evaluating validation dataset; each pass rewrites predictions.csv
    console.log("[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        final_df.to_csv(predictions_path)

    console.log("[Validation Completed.]\n")
    console.print(f"[Model] Model saved @ {model_path}\n")
    console.print(
        f"[Validation] Generation on Validation data saved @ {predictions_path}\n"
    )
    console.print(f"[Logs] Logs saved @ {logs_path}\n")

    # Export the recorded console output last, so the closing status messages
    # above are part of logs.txt as well.
    console.save_text(logs_path)

In [8]:
# Hyper-parameters passed to T5Trainer
model_params = dict(
    MODEL="t5-large",            # model_type: t5-base/t5-large
    TRAIN_BATCH_SIZE=8,          # training batch size
    VALID_BATCH_SIZE=8,          # validation batch size
    TRAIN_EPOCHS=3,              # number of training epochs
    VAL_EPOCHS=1,                # number of validation epochs
    LEARNING_RATE=1e-4,          # learning rate
    MAX_SOURCE_TEXT_LENGTH=512,  # max length of source text
    MAX_TARGET_TEXT_LENGTH=50,   # max length of target text
    SEED=42,                     # set seed for reproducibility
)

In [None]:
# Ask where the model files, predictions and logs should be written
output_directory = input("Insert the directory for the model")

# Fine-tune T5 on the first 500 rows of the data set with the parameters above
T5Trainer(
    dataframe=data[:500],
    source_text="input",
    target_text="output",
    model_params=model_params,
    output_dir=output_directory,
)

In [4]:
# Load the fine-tuned T5 model from a local directory
model = T5ForConditionalGeneration.from_pretrained('../models/t5-large')

# Load the tokenizer
# NOTE(review): the tokenizer comes from the 't5-base' hub id while the model is
# loaded from a local t5-large directory. T5Trainer saves its tokenizer next to
# the model files; confirm whether that saved copy should be loaded here instead.
tokenizer = T5Tokenizer.from_pretrained('t5-base', model_max_length=512)

# Preprocess input: read the statement and its reference summary interactively,
# then add the same "summarize: " task prefix that was prepended to the training inputs
input_text = input("Insert a statement to summarize")
input_expected_summary = input("Insert the expected statement summary")
prefix = "summarize: "
input_text = prefix + input_text

# Tokenize to PyTorch tensors, truncating the input to at most 512 tokens
inputs = tokenizer.encode_plus(input_text, return_tensors='pt', truncation=True, max_length=512)

# Generate summary
summary_ids = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    # Adjust the max length according to your desired summary length
    max_length=150,
    # Adjust the number of beams for beam search
    num_beams=20,
    early_stopping=True
)

# Decode the first returned sequence, dropping special tokens
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the reference and the generated summary for a manual comparison
print("> Expected Summary: ", input_expected_summary)
print("> Generated Summary: ", summary)


> Expected Summary:  - Co-authored 64 pg SABER Guide; used to train CONS personnel--120 mnhrs saved/identified as ORI Wing strength
> Generated Summary:  co-author of 64-page SABER Guide used for training CONS personnel. guide saved 120 manpower hours and recognized as strength during ORI
