In [1]:
import pandas as pd

# Load the Q&A data from the CSV file
qa_data = pd.read_csv('/Users/ernestgaisie/Desktop/Final Projects/CANADA_MORTGAGE_RATES_ANALYSIS/qa_pairs.csv')

# Inspect the data to ensure it loaded correctly
print(qa_data.head())

                                            Question  \
0  What was the mortgage price in Corner Brook, N...   
1  What was the mortgage price in Gander, Newfoun...   
2  What was the mortgage price in Gander, Newfoun...   
3  What was the mortgage price in Gander, Newfoun...   
4  What was the mortgage price in Labrador City, ...   

                                              Answer  
0  The mortgage price in Corner Brook, Newfoundla...  
1  The mortgage price in Gander, Newfoundland and...  
2  The mortgage price in Gander, Newfoundland and...  
3  The mortgage price in Gander, Newfoundland and...  
4  The mortgage price in Labrador City, Newfoundl...  


In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Assign the eos_token as the pad_token
tokenizer.pad_token = tokenizer.eos_token



In [3]:
import torch

# Determine the maximum sequence length
max_length = max(qa_data['Question'].apply(lambda x: len(tokenizer.encode(x))) +
                 qa_data['Answer'].apply(lambda x: len(tokenizer.encode(x))))

# Tokenize and pad the Q&A pairs
qa_pairs = []

for q, a in zip(qa_data['Question'], qa_data['Answer']):
    encoded_q = tokenizer.encode(q, return_tensors='pt', padding='max_length', max_length=max_length, truncation=True)
    encoded_a = tokenizer.encode(a, return_tensors='pt', padding='max_length', max_length=max_length, truncation=True)
    qa_pairs.append({'input_ids': encoded_q.squeeze(), 'labels': encoded_a.squeeze()})

# Create a custom PyTorch dataset
class QADataset(torch.utils.data.Dataset):
    def __len__(self):
        return len(qa_pairs)

    def __getitem__(self, idx):
        return qa_pairs[idx]

dataset = QADataset()

In [4]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',            # Output directory
    per_device_train_batch_size=1,     # Batch size per device
    num_train_epochs=3,                # Number of epochs
    save_steps=10,                 # Save checkpoint every 10 steps
    save_total_limit=2,                # Limit the total amount of checkpoints
)

In [None]:
from transformers import Trainer

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()



Step,Training Loss
500,0.922
1000,0.6361
1500,0.5642
2000,0.5362
2500,0.5238
3000,0.513
3500,0.5038
4000,0.4982
4500,0.4981
5000,0.493
