In [60]:
import fitz
import torch
from transformers import pipeline 

In [61]:
def pdf_to_text(file_path):
    text = ""
    with fitz.open(file_path) as pdf:
        for page in pdf:
            text += page.get_text()
    return text

In [62]:
credit_agreement_1 = pdf_to_text('documents/credit_agreement.pdf')
credit_agreement_2 = pdf_to_text('documents/credit-agreement_2.pdf')
assumption_agreement = pdf_to_text('documents/assumption-agreement.pdf')
collateral_assignment = pdf_to_text('documents/collateral_assignment-li.pdf')
participation_agreement = pdf_to_text('documents/participation_agreement.pdf')

# Organize Extracted Texts into Labeled Data

In [63]:
training_data = [
    {"text": credit_agreement_1, "label" : "Credit Agreement"},
    {"text": credit_agreement_2, "label" : "Credit Agreement"},
    {"text": assumption_agreement, "label" : "Assumption Agreement"},
    {"text": collateral_assignment, "label" : "Collateral Assignment"},
    {"text": participation_agreement, "label" : "Participation Agreement"}
]

# Save Labeled Data for training

In [64]:
import json

with open('training_data.json', 'w') as f:
    json.dump(training_data, f)

# Load tokenizer and model

In [88]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", num_labels=4)

model.resize_token_embeddings(len(tokenizer))


  0%|          | 0/6 [11:11<?, ?it/s]

[A
Loading checkpoint shards: 100%|██████████| 2/2 [00:29<00:00, 14.71s/it]


Embedding(32000, 4096)

# Prepare dataset

In [90]:
import torch

class DocumentDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self,idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
        self.tokenizer.add_special_token({'pad_token': '\0'})
        encoding['labels'] = torch.tensor(label, dtype=torch.long)
        return encoding
    
    

# Convert labels to indices

In [91]:
label_map = {"Assumption Agreement": 0, "Collateral Assignment": 1, "Credit Agreement": 2, "Participation Agreement": 3}
texts = [item['text'] for item in training_data]
labels = [label_map[item['label']] for item in training_data]

dataset = DocumentDataset(texts,labels, tokenizer)

# Training arguments

In [92]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)



# Trainer

In [93]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset= dataset,
)

# Train the model

In [94]:
trainer.train()

RuntimeError: MPS backend out of memory (MPS allocated: 36.14 GB, other allocations: 384.00 KB, max allowed: 36.27 GB). Tried to allocate 172.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).