In [13]:
import pandas as pd

# Load the Q&A data from the CSV file
qa_data = pd.read_csv('/Users/ernestgaisie/Desktop/Final Projects/CANADA_MORTGAGE_RATES_ANALYSIS/qa_pairs.csv')

# Display the first few rows of the dataset
print(qa_data.head())

# Check for any missing values
print(qa_data.isnull().sum())

# Inspect the distribution of question and answer lengths
qa_data['Question_length'] = qa_data['Question'].apply(len)
qa_data['Answer_length'] = qa_data['Answer'].apply(len)

print(qa_data[['Question_length', 'Answer_length']].describe())


                                            Question  \
0  What was the mortgage price in Corner Brook, N...   
1  What was the mortgage price in Gander, Newfoun...   
2  What was the mortgage price in Gander, Newfoun...   
3  What was the mortgage price in Gander, Newfoun...   
4  What was the mortgage price in Labrador City, ...   

                                              Answer  
0  The mortgage price in Corner Brook, Newfoundla...  
1  The mortgage price in Gander, Newfoundland and...  
2  The mortgage price in Gander, Newfoundland and...  
3  The mortgage price in Gander, Newfoundland and...  
4  The mortgage price in Labrador City, Newfoundl...  
Question    0
Answer      0
dtype: int64
       Question_length  Answer_length
count     68759.000000   68759.000000
mean         86.897148      87.979392
std           7.828006       7.850963
min          73.000000      74.000000
25%          81.000000      83.000000
50%          85.000000      86.000000
75%          91.000000    

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Assign the eos_token as the pad_token
tokenizer.pad_token = tokenizer.eos_token

In [4]:
import torch

# Determine the maximum sequence length
max_length = max(qa_data['Question'].apply(lambda x: len(tokenizer.encode(x))) +
                 qa_data['Answer'].apply(lambda x: len(tokenizer.encode(x))))

# Tokenize and pad the Q&A pairs
qa_pairs = []

for q, a in zip(qa_data['Question'], qa_data['Answer']):
    encoded_q = tokenizer.encode(q, return_tensors='pt', padding='max_length', max_length=max_length, truncation=True)
    encoded_a = tokenizer.encode(a, return_tensors='pt', padding='max_length', max_length=max_length, truncation=True)
    qa_pairs.append({'input_ids': encoded_q.squeeze(), 'labels': encoded_a.squeeze()})

# Create a custom PyTorch dataset
class QADataset(torch.utils.data.Dataset):
    def __len__(self):
        return len(qa_pairs)

    def __getitem__(self, idx):
        return qa_pairs[idx]

dataset = QADataset()

In [5]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results2',            # Output directory
    per_device_train_batch_size=4,     # Batch size per device
    num_train_epochs=1,                # Number of epochs
    save_steps=10_000,                 # Save checkpoint every 10 steps
    save_total_limit=2,                # Limit the total amount of checkpoints
)

In [6]:
from transformers import Trainer

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()



Step,Training Loss
500,0.7804
1000,0.5181
1500,0.497
2000,0.4878
2500,0.4808
3000,0.4789
3500,0.4793
4000,0.4753
4500,0.4735
5000,0.47


TrainOutput(global_step=17190, training_loss=0.4768675222169389, metrics={'train_runtime': 17437.0669, 'train_samples_per_second': 3.943, 'train_steps_per_second': 0.986, 'total_flos': 2140501774464000.0, 'train_loss': 0.4768675222169389, 'epoch': 1.0})

In [7]:
# Save the fine-tuned model
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [12]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')

# Example input
input_text = "What is the mortgage rate?"
input_ids = tokenizer.encode(input_text, return_tensors='pt')

output = model.generate(input_ids, max_length=30, num_return_sequences=1)
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is the mortgage rate?.,,,,,,,,,,,,,,,,,,,,,,,
