In [1]:
import torch
import time
import numpy as np
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer

# 1. Data

In [2]:
import os 

data_folder = "./data/"

opensource_data = os.path.join(data_folder, "open_source_dataset.json")
opensource_data_df = pd.read_json(opensource_data).T.reset_index()

airbus_data = os.path.join(data_folder, "airbus_helicopters_train_set.json")
airbus_data_df = pd.read_json(airbus_data).T.reset_index()

In [3]:
airbus_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413 entries, 0 to 412
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   index              413 non-null    object
 1   original_text      413 non-null    object
 2   reference_summary  413 non-null    object
 3   uid                413 non-null    object
dtypes: object(4)
memory usage: 13.0+ KB


In [4]:
airbus_data_df.head(5)

Unnamed: 0,index,original_text,reference_summary,uid
0,train_sum01,These general Standard Conditions of Sale appl...,These terms and conditions apply as soon as th...,train_sum01
1,train_sum010,Each Party represents to the other as at the d...,Each Party represents that the other is not a ...,train_sum010
2,train_sum0100,"All living, travelling and accommodation expen...","Expenses relating to the travelling, living an...",train_sum0100
3,train_sum0101,"Unless otherwise specified in the Contract, th...","Unless otherwise specified in the Contract, th...",train_sum0101
4,train_sum0102,Reasonable insurance coverage of risks arising...,Reasonable insurance coverage of risks arising...,train_sum0102


In [5]:
opensource_data_df.head(5)

Unnamed: 0,index,doc,id,original_text,reference_summary,title,uid,case_code,case_text,note,title_code,title_text,urls,tldr_code,tldr_text
0,legalsum01,Pokemon GO Terms of Service,5786730a6cca83a54c0035b7,welcome to the pokémon go video game services ...,hi.,,legalsum01,,,,,,,,
1,legalsum02,Pokemon GO Terms of Service,57866df76cca83a54c0035a1,by using our services you are agreeing to thes...,by playing this game you agree to these terms....,Agreement To Terms,legalsum02,,,,,,,,
2,legalsum03,Pokemon GO Terms of Service,5786730a6cca83a54c0035b6,if you want to use certain features of the ser...,you have to use google pokemon trainer club or...,Eligibility and Account Registration,legalsum03,,,,,,,,
3,legalsum04,Pokemon GO Terms of Service,57866df76cca83a54c0035a0,during game play please be aware of your surro...,don t die or hurt others and if you do it s no...,Safe Play,legalsum04,,,,,,,,
4,legalsum05,Pokemon GO Terms of Service,57866df76cca83a54c00359f,subject to your compliance with these terms ni...,don t copy modify resell distribute or reverse...,Rights in App,legalsum05,,,,,,,,


# 2. Model

In [6]:
model_name = "lidiya/bart-base-samsum"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 139420416
all model parameters: 139420416
percentage of trainable model parameters: 100.00%


In [8]:
index = 200

text = airbus_data_df.loc[0, 'original_text']
summary = airbus_data_df.loc[0, 'reference_summary']

prompts = f"""
Summarize the following legal text.

{text}

Summary:
"""

inputs = tokenizer(prompts, return_tensors='pt')
output = tokenizer.decode(
    model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompts}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Summarize the following legal text.

These general Standard Conditions of Sale apply to any sale of Products and/or Services sold by the Seller to its Customer(s), excluding brokerage or other distributor activities. The purchase of the Products and/or Services by a Customer is considered to be performed within the framework of its professional activities.

Summary:

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
These terms and conditions apply as soon as the Seller sells Products or Services to the Customer (excluding sales made for brokerage or distributor activities).

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
The Standard Conditions of Sale apply to any sale of Products and/or Services sold by the Seller t

# 3. Fine-tuning 

In [9]:
def tokenize_function(data_df : pd.DataFrame):
    data_df['input_ids'], data_df['labels'] = None, None

    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompts = [start_prompt + text + end_prompt for text in data_df["original_text"]]

    for idx, prompt in enumerate(prompts): 
        input_ids =  tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
        labels_ids = tokenizer(data_df.loc[idx, "reference_summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
        data_df.at[idx, 'input_ids'] = input_ids
        data_df.at[idx, 'labels'] = labels_ids
    
    return data_df

tokenized_airbus_data_df = tokenize_function(airbus_data_df)
tokenized_airbus_data_df.head(5)

Unnamed: 0,index,original_text,reference_summary,uid,input_ids,labels
0,train_sum01,These general Standard Conditions of Sale appl...,These terms and conditions apply as soon as th...,train_sum01,"[[tensor(0), tensor(38182), tensor(3916), tens...","[[tensor(0), tensor(4528), tensor(1110), tenso..."
1,train_sum010,Each Party represents to the other as at the d...,Each Party represents that the other is not a ...,train_sum010,"[[tensor(0), tensor(38182), tensor(3916), tens...","[[tensor(0), tensor(20319), tensor(1643), tens..."
2,train_sum0100,"All living, travelling and accommodation expen...","Expenses relating to the travelling, living an...",train_sum0100,"[[tensor(0), tensor(38182), tensor(3916), tens...","[[tensor(0), tensor(39891), tensor(21526), ten..."
3,train_sum0101,"Unless otherwise specified in the Contract, th...","Unless otherwise specified in the Contract, th...",train_sum0101,"[[tensor(0), tensor(38182), tensor(3916), tens...","[[tensor(0), tensor(33757), tensor(3680), tens..."
4,train_sum0102,Reasonable insurance coverage of risks arising...,Reasonable insurance coverage of risks arising...,train_sum0102,"[[tensor(0), tensor(38182), tensor(3916), tens...","[[tensor(0), tensor(9064), tensor(44640), tens..."


In [10]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

tokenized_airbus_data_df_clean = tokenized_airbus_data_df.drop(['index', 'original_text', 'reference_summary', 'uid'],axis=1)

tokenized_airbus_data_df_clean['input_ids'] = tokenized_airbus_data_df_clean['input_ids'].apply(lambda x: x.tolist())
tokenized_airbus_data_df_clean['labels'] = tokenized_airbus_data_df_clean['labels'].apply(lambda x: x.tolist())

airbus_dataset = Dataset.from_pandas(tokenized_airbus_data_df_clean).train_test_split(test_size=0.05, seed = 42)

In [11]:
airbus_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 392
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 21
    })
})

In [12]:
import configparser

config = configparser.ConfigParser()
config.read('config.ini')

output_dir = f'./legal-doc-summary-training-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=float(config['PARAMETERS_FINE_TUNING']['LR']),
    num_train_epochs=int(config['PARAMETERS_FINE_TUNING']['EPOCHS']),
    weight_decay=float(config['PARAMETERS_FINE_TUNING']['W_DECAY']),
    logging_steps=int(config['PARAMETERS_FINE_TUNING']['LOGGING_STEPS']),
    max_steps=int(config['PARAMETERS_FINE_TUNING']['MAX_STEPS']),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=airbus_dataset['train'],
    eval_dataset=airbus_dataset['test']
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [13]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)

# 4. qLoRA

In [14]:
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [15]:
from peft import LoraConfig, get_peft_model, TaskType

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head'], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, config)
print_number_of_trainable_model_parameters(model)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


'trainable model parameters: 1071816\nall model parameters: 140492232\npercentage of trainable model parameters: 0.76%'

In [16]:
args_training = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=10,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    optim="paged_adamw_8bit"
)

trainer = Trainer(
    model=model,
    train_dataset=airbus_dataset["train"],
    args=args_training,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).