## Installations

In [None]:
!pip install -U -q bitsandbytes langchain_experimental langchain_huggingface

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Imports

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from huggingface_hub import notebook_login
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_huggingface import HuggingFacePipeline
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    pipeline,
)

## Load subset of large dataset

In [None]:
# Load the Excel file into a pandas DataFrame.
# Later cells read the 'sentence' and 'triplets' columns of every row.
file_path = 'relationships.xlsx'
raw_df = pd.read_excel(file_path)

# Optional cap on the number of rows used for fine-tuning.
# None keeps every row; set e.g. 10_000 to use only the first 10,000 rows.
max_rows = None
df = raw_df if max_rows is None else raw_df.head(max_rows)

## Prepare the data

In [None]:
# 4. Prepare the data

# Instruction text placed in front of every sentence shown to the model
prefix = (
    "\n"
    "Extract relationship triplet from given sentence.\n"
    "Return only the target triplets in response.\n"
)


def format_for_t5(row):
    """Turn one dataset row into a prompt/target pair.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            'sentence' and 'triplets'.

    Returns:
        dict with two entries:
            "input":  the instruction prefix followed by "sentence: <sentence>".
            "target": the row's 'triplets' value, unchanged.
    """
    prompt = f"{prefix}sentence: {row['sentence']}"
    return {"input": prompt, "target": row["triplets"]}

# Convert every DataFrame row into an {"input", "target"} record
row_records = df.apply(format_for_t5, axis=1)
formatted_data = row_records.tolist()

# Wrap the records in a Dataset and hold out 10% of them for evaluation;
# the fixed seed makes the split repeatable.
records_df = pd.DataFrame(formatted_data)
dataset = Dataset.from_pandas(records_df).train_test_split(test_size=0.1, seed=42)

## Test the created dataset

In [None]:
# Show the first example of each split as a quick sanity check
for split_name in ("train", "test"):
    print(dataset[split_name][0])

{'input': '\nExtract relationship triplet from given sentence.\nReturn only the target triplets in response.\nsentence: Caesar also made Lepidus "magister equitum" ("Master of the Horse"), effectively his deputy.', 'target': '(Marcus Aemilius Lepidus ; position held ; Master of the Horse)'}
{'input': '\nExtract relationship triplet from given sentence.\nReturn only the target triplets in response.\nsentence: Twain was born Eilleen Regina Edwards in Windsor, Ontario, on August 28, 1965, to Sharon ("née" Morrison) and Clarence Edwards.', 'target': '(Shania Twain ; birth name ; Eilleen Regina Edwards)'}


## Initialize tokenizer

In [None]:
# Base checkpoint to fine-tune.
# The notebook loads it with AutoModelForSeq2SeqLM and attaches LoRA adapters to
# modules named "q" and "v", so it has to be an encoder-decoder T5 checkpoint;
# a decoder-only Llama checkpoint cannot be loaded through the seq2seq auto class.
# The recorded run (about 60.8M parameters, a `spiece.model` tokenizer file and
# `t5-small-*` output directories) corresponds to t5-small.
model_name = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Tokenize the data

In [None]:
# Token budgets: prompts are padded/truncated to 100 tokens, targets to 30
max_input_length = 100
max_target_length = 30


def tokenize_function(examples):
    """Tokenize a batch of prompt/target pairs for seq2seq training.

    Args:
        examples: Batch with the keys "input" and "target", each holding the
            texts to tokenize (as supplied by `Dataset.map(..., batched=True)`).

    Returns:
        dict with "input_ids" and "attention_mask" for the prompts and
        "labels" for the targets. Padding positions in "labels" are replaced
        by -100 so they are excluded from the loss.
    """
    encoded_inputs = tokenizer(
        examples["input"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )
    encoded_targets = tokenizer(
        examples["target"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # Swap every pad token id in the targets for the ignore index -100
    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for label_ids in encoded_targets.input_ids:
        masked_labels.append([-100 if tok == pad_id else tok for tok in label_ids])

    return {
        "input_ids": encoded_inputs.input_ids,
        "attention_mask": encoded_inputs.attention_mask,
        "labels": masked_labels,
    }

# Tokenize both splits in batches and drop the original text columns,
# leaving only the fields produced by tokenize_function.
text_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=text_columns)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Load the base model

In [None]:
# Ask bitsandbytes to load the base weights in 8-bit precision
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# device_map="auto" leaves device placement to the loader
load_options = {"quantization_config": bnb_config, "device_map": "auto"}
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **load_options)

## Configure LoRA

In [None]:
# LoRA adapter settings for sequence-to-sequence fine-tuning
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],  # adapters are attached to modules named "q" and "v"
    r=8,                        # rank of the low-rank update matrices
    lora_alpha=32,              # scaling applied to the adapter output
    lora_dropout=0.05,
    bias="none",
)

## Apply LoRA

In [None]:
# Wrap the base model with the LoRA adapters described by lora_config
model = get_peft_model(model=model, peft_config=lora_config)

# Print trainable vs. total parameter counts to confirm the adapters were attached
model.print_trainable_parameters()

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


## Set up training loop and parameters

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are written and how the run is labelled
    output_dir="./t5-small-lora-triplet-extractor",
    run_name="t5-small-triplet-extractor",
    push_to_hub=False,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=1e-3,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    label_names=["labels"],
)

## Create data collator

In [None]:
# Batches examples for the trainer; label padding uses -100 (the same value
# tokenize_function writes into padded label positions) and padded lengths
# are rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=-100,
)

## Initialize trainer object and run training loop

In [None]:
# Pick the tokenized splits used for optimisation and per-epoch evaluation
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.4379,1.213023
2,1.3176,1.148156
3,1.2541,1.127503


TrainOutput(global_step=6750, training_loss=1.3933607855902779, metrics={'train_runtime': 1246.2288, 'train_samples_per_second': 21.665, 'train_steps_per_second': 5.416, 'total_flos': 747233869824000.0, 'train_loss': 1.3933607855902779, 'epoch': 3.0})

## Save model weights (local)

In [None]:
# Write the fine-tuned model and the tokenizer files side by side in one local directory
local_save_dir = "./t5-small-lora-triplet-extractor"
model.save_pretrained(local_save_dir)
tokenizer.save_pretrained(local_save_dir)

('./t5-small-lora-triplet-extractor/tokenizer_config.json',
 './t5-small-lora-triplet-extractor/special_tokens_map.json',
 './t5-small-lora-triplet-extractor/spiece.model',
 './t5-small-lora-triplet-extractor/added_tokens.json',
 './t5-small-lora-triplet-extractor/tokenizer.json')

## Save model weights (remote: HuggingFace)

In [None]:
# Authenticate with the Hugging Face Hub (shows an interactive login widget)
notebook_login()

# Target repository on the Hub for the fine-tuned model and tokenizer
model_name_on_hub = "xxx-i-am-raahul-m-xxx/t5-small-lora-triplet-extractor-2"

# `token=True` reuses the credential stored by notebook_login(); it replaces the
# deprecated `use_auth_token` argument.
model.push_to_hub(model_name_on_hub, token=True)
tokenizer.push_to_hub(model_name_on_hub, token=True)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



Uploading...:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Uploading...:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/xxx-i-am-raahul-m-xxx/t5-small-lora-triplet-extractor-2/commit/90bf37159d9875b6fe460507a8ddd4bb4115849d', commit_message='Upload tokenizer', commit_description='', oid='90bf37159d9875b6fe460507a8ddd4bb4115849d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/xxx-i-am-raahul-m-xxx/t5-small-lora-triplet-extractor-2', endpoint='https://huggingface.co', repo_type='model', repo_id='xxx-i-am-raahul-m-xxx/t5-small-lora-triplet-extractor-2'), pr_revision=None, pr_num=None)

## Testing the model

In [None]:
# `pipeline` is already imported in the notebook's import cell.
model_name_on_hub = "xxx-i-am-raahul-m-xxx/t5-small-lora-triplet-extractor-2"

# Load the fine-tuned model from the Hub; run on the first GPU when available, else CPU
extractor = pipeline(
    "text2text-generation",
    model=model_name_on_hub,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Sample sentence
test_sentence = "A doctor treats diseased patients in the hospital premises"

# Build the prompt exactly as the training inputs were built
# (instruction prefix + "sentence: ..."); a different prompt format at
# inference time does not match what the model was fine-tuned on.
test_prompt = prefix + f"sentence: {test_sentence}"
result = extractor(test_prompt)
print(result[0]['generated_text'])

Device set to use cuda:0


(Anthony ; hospital premises ; hospital)
