In [89]:
import gc

import pandas as pd
import torch
from torch.utils.data import TensorDataset
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import Trainer, TrainingArguments

In [90]:
# Read the CSV holding the 'resolution' / 'summary' text pairs into a DataFrame
data = pd.read_csv(filepath_or_buffer="data/summarization.csv")

In [91]:
# Hold out 20% of the rows for validation; the fixed seed keeps the 80% sample repeatable
train_data = data.sample(frac=0.8, random_state=42)
# Validation set = every row whose index label was not drawn into the training sample
val_data = data.loc[~data.index.isin(train_data.index)]

In [92]:
# Display the training split as a quick sanity check of the sampled rows
train_data

Unnamed: 0,resolution,summary
199,resolution miami city commission co designatin...,Miami city commission co designates southwest ...
420,ordinance miami city commission amending chapt...,Miami city commission amending chapter article...
694,ordinance miami city commission amend ing ordi...,Miami city commission passed first ading next ...
750,resolution miami city commission attachment au...,Miami city commission attachment is attached t...
507,resolution miami city commission authorizing c...,Miami city commission authorizing city manager...
...,...,...
950,ordinance miami city commission attachment ame...,Miami city commission attachment amending ordi...
619,ordinance miami city commission amending zonin...,Miami city commission amending zoning atlas is...
814,resolution miami city commission granting deny...,Miami city commission reversing affirming plan...
645,resolution miami city commission attachment ra...,Miami city commission attachment ratifying ame...


In [93]:
# Define a function to encode the data
def encode_data(data, tokenizer, max_input_length=512, max_target_length=128):
    """Tokenize source texts and target summaries into fixed-size tensors.

    Both columns are tokenized in a single batched call each, padded/truncated
    to a fixed length, and the padded positions of the targets are replaced by
    -100 so that they do not contribute to the training loss.

    Args:
        data: DataFrame-like object exposing a 'resolution' column (source
            text) and a 'summary' column (target text).
        tokenizer: Callable that accepts a list of strings together with the
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            keyword arguments and returns a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_input_length: Length every source sequence is padded/truncated to.
        max_target_length: Length every target sequence is padded/truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)``. The first two have
        one row per input row and ``max_input_length`` columns; ``labels`` has
        one row per input row and ``max_target_length`` columns, with padded
        positions set to -100.

    Raises:
        ValueError: If ``data`` contains no rows.
    """
    if len(data) == 0:
        raise ValueError("encode_data() received no rows to encode")

    # Pull the columns out once instead of indexing row by row with .iloc.
    source_texts = data['resolution'].tolist()
    target_texts = data['summary'].tolist()

    encoded_sources = tokenizer(
        source_texts,
        max_length=max_input_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    encoded_targets = tokenizer(
        target_texts,
        max_length=max_target_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # Positions where the target attention mask is 0 are padding. Marking them
    # with -100 (the index the cross-entropy loss ignores) keeps the model from
    # being trained to predict padding tokens.
    labels = encoded_targets['input_ids'].masked_fill(
        encoded_targets['attention_mask'] == 0, -100
    )

    return encoded_sources['input_ids'], encoded_sources['attention_mask'], labels

In [94]:
# Load the tokenizer and model
# One checkpoint identifier shared by both loads so they cannot drift apart.
MODEL_NAME = 'knkarthick/MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM'
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

# Encode the training and validation data
train_input_ids, train_attention_masks, train_labels = encode_data(train_data, tokenizer)
val_input_ids, val_attention_masks, val_labels = encode_data(val_data, tokenizer)

# Create PyTorch datasets
# Each item is an (input_ids, attention_mask, labels) tuple, in that order.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# Define the training arguments
# The recorded run of this notebook ended with a CUDA out-of-memory error on an
# 8 GiB GPU, so two memory-saving options are enabled:
#   * gradient_checkpointing trades extra compute for not keeping all
#     activations alive during the backward pass;
#   * optim='adafactor' swaps the default optimizer for one that keeps far less
#     per-parameter state. NOTE(review): this changes the optimizer, so
#     training dynamics differ from the default -- confirm this is acceptable.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    num_train_epochs=3,
    gradient_checkpointing=True,
    optim='adafactor',
    report_to=[],  # Disable wandb integration
)


loading file vocab.json from cache at C:\Users\Administrator/.cache\huggingface\hub\models--knkarthick--MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM\snapshots\b2f95a34d1ea84ca28fa8d0fb8075b46b140f3ef\vocab.json
loading file merges.txt from cache at C:\Users\Administrator/.cache\huggingface\hub\models--knkarthick--MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM\snapshots\b2f95a34d1ea84ca28fa8d0fb8075b46b140f3ef\merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\Administrator/.cache\huggingface\hub\models--knkarthick--MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM\snapshots\b2f95a34d1ea84ca28fa8d0fb8075b46b140f3ef\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\Administrator/.cache\huggingface\hub\models--knkarthick--MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM\snapshots\b2f95a34d1ea84ca28fa8d0fb8075b46b140f3ef\tokenizer_config.json
loading configuration file config.json fro

In [95]:
from torch.utils.data.dataloader import default_collate

def custom_data_collator(features):
    """Batch a list of (input_ids, attention_mask, labels) tuples into a dict.

    Each tuple position is collated independently with ``default_collate`` and
    stored under the keyword name used for it in the returned batch dict.

    Args:
        features: Sequence of indexable samples; positions 0, 1 and 2 hold the
            input ids, the attention mask and the labels respectively.

    Returns:
        Dict with the keys 'input_ids', 'attention_mask' and 'labels', each
        mapped to the collated values of the corresponding tuple position.
    """
    # (batch key, position inside each sample tuple)
    field_layout = (
        ('input_ids', 0),
        ('attention_mask', 1),
        ('labels', 2),
    )
    return {
        key: default_collate([sample[position] for sample in features])
        for key, position in field_layout
    }

In [96]:
# Fine-tune the model
# The recorded run of this cell ended in "CUDA out of memory" with 7.27 GiB
# already allocated on an 8 GiB GPU.
# NOTE(review): presumably that memory belonged to objects left over from
# earlier executions in the same kernel (a previous `trainer` keeps its model
# alive until the name is rebound) -- confirm. Restarting the kernel is the
# reliable remedy; the cleanup below is a best-effort mitigation for re-runs.
globals().pop('trainer', None)  # drop a trainer left by an earlier run, if any
gc.collect()
if torch.cuda.is_available():
    # Hand cached, no-longer-referenced blocks back to the CUDA driver.
    torch.cuda.empty_cache()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Converts the TensorDataset tuples into the keyword dict the model expects.
    data_collator=custom_data_collator,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 198.00 MiB (GPU 0; 8.00 GiB total capacity; 7.27 GiB already allocated; 0 bytes free; 7.33 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF