# Import necessary libraires

In [1]:
!pip install datasets torch torchvision torchaudio
import torch
import os
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Set display options to show all rows
pd.set_option('display.max_rows', None)



# Verification of PyTorch detecting the Metal device in MacBook:

This is useful to check if GPU is available in Macbook. If available, then we can make use of it to train the models.

In [2]:
if torch.backends.mps.is_available():
    print("MacBook has metal available. We can use GPU to train the model")

MacBook has metal available. We can use GPU to train the model


In [3]:
dataset_common_path = "/Users/ravkothu/Documents/Personal_items_at_Oracle/Master_Degree/University_of_San_Diego/Online_Masters/MS_in_Applied_AI/Subjects_and_Resources/AAI-520-A2_NLP/AAI-520-A2_Final_Team_Project/NLP_Datasets"
split1_train_df_path = f"{dataset_common_path}/train_dataset_split_1.csv"

# Read first part train dataset into a dataframe
split1_train_df = pd.read_csv(split1_train_df_path)

In [4]:
split1_train_df.shape

(21900, 4)

In [5]:
split1_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21900 entries, 0 to 21899
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           21900 non-null  object
 1   title        21900 non-null  object
 2   input_text   21900 non-null  object
 3   target_text  21900 non-null  object
dtypes: object(4)
memory usage: 684.5+ KB


In [6]:
split1_train_df.head()

Unnamed: 0,id,title,input_text,target_text
0,5733be284776f41900661182,University_of_Notre_Dame,['Question: To whom did the Virgin Mary allege...,['Saint Bernadette Soubirous']
1,5733be284776f4190066117f,University_of_Notre_Dame,['Question: What is in front of the Notre Dame...,['a copper statue of Christ']
2,5733be284776f41900661180,University_of_Notre_Dame,['Question: The Basilica of the Sacred heart a...,['the Main Building']
3,5733be284776f41900661181,University_of_Notre_Dame,['Question: What is the Grotto at Notre Dame? ...,['a Marian place of prayer and reflection']
4,5733be284776f4190066117e,University_of_Notre_Dame,['Question: What sits on top of the Main Build...,['a golden statue of the Virgin Mary']


In [7]:
split1_train_df.tail()

Unnamed: 0,id,title,input_text,target_text
21895,56f8da749b226e1400dd10f3,Near_East,['Question: Until what year did the Ottomans r...,['1912']
21896,56f8da749b226e1400dd10f4,Near_East,['Question: When did the Ottomans lose the ter...,['the two Balkan Wars of 1912–13']
21897,56f8dbf69b226e1400dd1118,Near_East,['Question: How was the Ottoman Empire portray...,['as the sick man of Europe']
21898,56f8dbf69b226e1400dd1119,Near_East,['Question: The Balkan states were primarily w...,['Christian']
21899,56f8dbf69b226e1400dd111a,Near_East,['Question: When did the Ottomans strike at th...,['1894']


In [8]:
split1_train_df.describe().T

Unnamed: 0,count,unique,top,freq
id,21900,21900,5733be284776f41900661182,1
title,21900,97,New_York_City,817
input_text,21900,21856,['Question: Who was a pop idol that started on...,5
target_text,21900,16283,['three'],68


In [9]:
split1_train_df_missing_values = split1_train_df.isnull().sum()
split1_train_df_missing_values

id             0
title          0
input_text     0
target_text    0
dtype: int64

In [10]:
# Access the first row
first_row = split1_train_df.iloc[0]

# Print each value separately
print(f"ID: {first_row['id']}\n")
print(f"Title: {first_row['title']}\n")
print(f"Input Text: {first_row['input_text']}\n")
print(f"Target Text: {first_row['target_text']}")

ID: 5733be284776f41900661182

Title: University_of_Notre_Dame

Input Text: ['Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? Context: Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.']

Target Text: ['Saint Bernadette Soubirous']


In [11]:
first_row

id                                      5733be284776f41900661182
title                                   University_of_Notre_Dame
input_text     ['Question: To whom did the Virgin Mary allege...
target_text                       ['Saint Bernadette Soubirous']
Name: 0, dtype: object

In [12]:
# Check the count of each class in the 'title'
class_counts = split1_train_df['title'].value_counts()

class_counts_df = class_counts.reset_index()
# Rename columns for clarity
class_counts_df.columns = ['title', 'count']

# Print the DataFrame with class counts
print("Count of each class in 'class_column':")
print(class_counts_df)

Count of each class in 'class_column':
                                                title  count
0                                       New_York_City    817
1                                       American_Idol    802
2                                             Beyoncé    758
3                                     Frédéric_Chopin    697
4                                            Buddhism    610
5                             2008_Sichuan_earthquake    521
6                    2008_Summer_Olympics_torch_relay    500
7                                            Portugal    435
8                                          Kanye_West    428
9                                         Southampton    426
10                                                Dog    392
11                Financial_crisis_of_2007%E2%80%9308    390
12                                                Pub    377
13                                           Plymouth    367
14                                            

# Decide which architecture to use

## Comparison of the architectures

To develop chatbots, we have different architectures. 
Let us better understand about them before deciding which one to use.

# Differences Between Seq2Seq, Transformers, and GPT

| Feature       | Seq2Seq                                           | Transformers                                    | GPT                                               |
|---------------|--------------------------------------------------|------------------------------------------------|--------------------------------------------------|
| **Definition**| A model that transforms an input sequence into an output sequence using an encoder and decoder. | A deep learning architecture using self-attention mechanisms to process input sequences. | A specific Transformer model designed for generating text by predicting the next word in a sequence. |
| **Usage**     | Tasks where input and output are sequences, like translation and summarization. | A wide range of NLP tasks, including translation and summarization. | Primarily used for text generation tasks like chatbots and text completion. |
| **Information**| Consists of an encoder that processes the input and a decoder that generates the output. | Composed of an encoder and decoder stack, using self-attention to capture relationships between words. | Utilizes only the decoder part of the Transformer, focusing on unidirectional text generation. |
| **Strengths** | Effective for varying output lengths; good at capturing context. | Can process sequences in parallel; captures long-range dependencies well. | Excellent at generating coherent and contextually relevant text; adapts to various topics. |
| **Limitations**| Struggles with long sequences due to fixed-length context vectors; may not capture long-range dependencies well. | Requires substantial data and computational power; complexity can make fine-tuning harder. | May generate repetitive or nonsensical outputs; unidirectional nature limits contextual understanding compared to bidirectional models. |
| **Applications**| Machine translation, text summarization, conversational agents. | Machine translation, text generation, sentiment analysis. | Chatbots, text completion, creative writing assistance. |

## Final Decision of the Architecture to use for training

For this project, I will be going with GPT architecture. 

Following are the reasons for the same:

- **Text Generation Capability:** GPT has been specifically designed for generating the text, making it highly effective for producing logically reasoned responses in conversational contexts such as to-and-fro chats with chatbot.

- **Availability of Pre-trained Models:** Models like GPT-3 are pre-trained on extensive datasets. These kind of models deliver contextually appropriate responses on a wide range of topics without the need for extensive additional training.

- **Adaptability and Versatility:** The GPT architecture's exposure to a variety of text sources, allows it to adjust to different conversational styles, thereby imprvoing the user interactions.

- **Contextual Awareness:** GPT is very good at maintaining context throughout multiple chat exchanges. It leads to smoother and more continuous dialogues.

- **Implementation and Fine-tuning:** Making use of a pre-trained GPT model, we can conserve time and resources, as it demands less data and training compared to Seq2Seq models.

- **Performance in Chatbot Applications:** GPT has shown exceptional capability in generating human-like responses.

Considering all these points, we have decided to go with the GPT architecture.

# Model Training using GPT2

Usually for these kind of usecases, model training involves so many steps.
Look at the following steps for the same:

- Load the `split1_train_df` dataframe.
- Here, there is no use of converting the rows of the dataframe into different formats. All the rows are already present in proper structure. 
- Hence, we can directly go ahead with splitting the dataframe to training (80%) and testing (20%) datasets.


## Split the dataframe into train (80%) and test (20%) datasets

In [13]:
splitted_train_df, splitted_test_df = train_test_split(
    split1_train_df, 
    test_size=0.2, 
    random_state=42
)

# Display the sizes of the train and test sets
print(f"Training set size: {len(splitted_train_df)}")
print(f"Testing set size: {len(splitted_test_df)}")

Training set size: 17520
Testing set size: 4380


## Text Tokenization using GPT2Tokenizer

- Tokenize the both `splitted_train_df` and `splitted_test_df` using `gpt2` model's `GPT2Tokenizer` tokenizer.
    - ***Pass the following custom parameters to tokenize:***
        - In models like GPT-2, padding is required to make sure that all the sequences in a batch are having same length. Hence, by setting the padding token to the `eos_token`, we make the data compatible with the model's requirements and make sure that the model understands that these tokens are simply placeholders for the space that was in shorter sequences.
        - ***Define the tokenizing actual function having the following details.***
            - Convert the input text column of the DataFrame into a list of strings, which will be tokenized.
            - `truncation=True` to be sure that if the input text exceeds the specified maximum length, it will be truncated to fit
            - `padding=True` to be sure that all sequences in the output are padded to the length of the longest sequence in the batch, so they all have the same length.
            - Return the tokenized output as PyTorch tensors, which can be used directly for training with PyTorch using `return_tensors='pt'`.
            - Set the maximum length of the tokenized sequences to `512`.

In [14]:
# Load pre-trained tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)



In [15]:
# Set the padding token to the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Define and write the logic to tokenize the data
def tokenize_data(df):
    # Tokenize input texts and target texts
    input_encodings = tokenizer(
        df['input_text'].tolist(),
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        max_length=512
    )
    
    target_encodings = tokenizer(
        df['target_text'].tolist(),
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        max_length=512
    )
    
    return input_encodings, target_encodings

# Tokenize the training data
train_input_encodings, train_target_encodings = tokenize_data(splitted_train_df)

# Tokenize the testing data
test_input_encodings, test_target_encodings = tokenize_data(splitted_test_df)

# Display shapes of the encodings
print("Display the shapes after tokenization")
print("*************************************")
print(f"Training Input Encoding Shape: {train_input_encodings['input_ids'].shape}")
print(f"Training Target Encoding Shape: {train_target_encodings['input_ids'].shape}")
print(f"Testing Input Encoding Shape: {test_input_encodings['input_ids'].shape}")
print(f"Testing Target Encoding Shape: {test_target_encodings['input_ids'].shape}")

Display the shapes after tokenization
*************************************
Training Input Encoding Shape: torch.Size([17520, 512])
Training Target Encoding Shape: torch.Size([17520, 512])
Testing Input Encoding Shape: torch.Size([4380, 512])
Testing Target Encoding Shape: torch.Size([4380, 512])


## Create Dataset Objects and DataLoader

- Now, we should create custom dataset objects for training and testing using PyTorch's Dataset class. This is usually done to allow us to handle batching and shuffling during training.
- We should use PyTorch's DataLoader to create iterable batches from our dataset objects. This makes easy to load data during training and it can also include options for shuffling and parallel loading.

Let us understand better about the same:

- ***Custom Dataset Class Creation:***
    - The `GPT2Dataset` class inherits from `torch.utils.data.Dataset`.
    - The constructor is to take the input and target encodings and stores them.
    - The `__len__` method returns the total number of samples.
    - The `__getitem__` method retrieves the input and target data for a specific index.

- ***Dataset Objects Creation:***
    - Instances of `GPT2Dataset` are created for both the training and testing datasets.
    
- ***DataLoader Objects Creation:***
    - `DataLoader` is used to create iterable batches from the datasets.
    - We have the facility to adjust the `batch_size` parameter as needed. I have set to 8 for the time being.
    - The training `DataLoader` is set to shuffle the data, while the test `DataLoader` is not.

In [17]:
# Custom Dataset class that inherits from PyTorch's Dataset class
class GPT2Dataset(Dataset):
    # Constructor to initialize the dataset object
    def __init__(self, input_encodings, target_encodings):
        # Store the input_ids from the input_encodings dictionary
        self.input_ids = input_encodings['input_ids']
        # Store the attention_mask from the input_encodings dictionary
        self.attention_mask = input_encodings['attention_mask']
        # Store the target_ids from the target_encodings dictionary
        self.target_ids = target_encodings['input_ids']

    # Get the size of the dataset
    def __len__(self):
        # Return the total number of input samples (input_ids)
        return len(self.input_ids)

    # Method to get a specific sample from the dataset
    def __getitem__(self, idx):
        # Return a dictionary containing input_ids, attention_mask, and target_ids for the specified index
        return {
            'input_ids': self.input_ids[idx],  # Get input IDs for the specified index
            'attention_mask': self.attention_mask[idx],  # Get attention mask for the specified index
            'target_ids': self.target_ids[idx],  # Get target IDs for the specified index
        }

# Create Dataset objects for training and testing datasets
# Create training dataset
train_dataset = GPT2Dataset(train_input_encodings, train_target_encodings)

# Create testing dataset
test_dataset = GPT2Dataset(test_input_encodings, test_target_encodings)

# Create DataLoader objects for both datasets to facilitate batch processing
# Create DataLoader for training with shuffling
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Create DataLoader for testing without shuffling
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Display the size of the datasets
print(f"Training dataset size: {len(train_dataset)}")
print(f"Testing dataset size: {len(test_dataset)}")

# Get an example batch from the training DataLoader
# Retrieve the first batch from the training DataLoader
example_batch = next(iter(train_dataloader))

# Print the shape of input IDs in the example batch
print("Example batch input IDs shape:", example_batch['input_ids'].shape)  

# Print the shape of target IDs in the example batch
print("Example batch target IDs shape:", example_batch['target_ids'].shape)  

Training dataset size: 17520
Testing dataset size: 4380
Example batch input IDs shape: torch.Size([8, 512])
Example batch target IDs shape: torch.Size([8, 512])


## Training the Model

In [18]:
# Check if MPS (Metal Performance Shaders) is available for GPU training in MacBook
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device '{device}' to train the model.")

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

# Set the model to training mode
model.train()

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop starts here
num_epochs = 3

# Define folder structure to create dir to save models
output_dir = "/Users/ravkothu/Documents/Personal_items_at_Oracle/Master_Degree/University_of_San_Diego/Online_Masters/MS_in_Applied_AI/Subjects_and_Resources/AAI-520-A2_NLP/AAI-520-A2_Final_Team_Project/Chatbot_Code/model_checkpoints"

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

for epoch in range(num_epochs):
    total_loss = 0
    # Add tqdm for the training loop to show progress
    with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{num_epochs}", unit="batch") as pbar:
        for batch in train_dataloader:
            # Move input data to GPU
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            target_ids = batch['target_ids'].to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=target_ids)
            
            # Get the loss from the model's outputs
            loss = outputs.loss

            # Backward pass
            loss.backward()

            # Update the parameters
            optimizer.step()

            # Accumulate the loss
            total_loss += loss.item()

            # Update the progress bar
            pbar.update(1)
            pbar.set_postfix(loss=loss.item())

    # Average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    
    print(f"Epoch-{epoch + 1}/{num_epochs} --- Average Loss: {avg_loss:.4f}")

    # Save model and optimizer state after each epoch
    print(f"Saving model for Epoch {epoch + 1}")
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'num_epochs': num_epochs,
        'batch_size': 8,
        # Save average loss for this epoch
        'avg_loss': avg_loss
    }, os.path.join(output_dir, f"epoch_{epoch + 1}_model.pt"))
    print(f"Model for Epoch {epoch + 1} is saved successfully.")

# Save the final model
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'num_epochs': num_epochs,
    'batch_size': 8,
}, os.path.join(output_dir, "final_model.pt"))

print("All models are saved successfully.")

Using device 'mps' to train the model.


Epoch 1/3: 100%|█████████████████████████████████████████████████████| 2190/2190 [1:16:36<00:00,  2.10s/batch, loss=0.0637]


Epoch-1/3 --- Average Loss: 0.0935
Saving model for Epoch 1
Model for Epoch 1 is saved successfully.


Epoch 2/3: 100%|█████████████████████████████████████████████████████| 2190/2190 [1:16:34<00:00,  2.10s/batch, loss=0.0477]


Epoch-2/3 --- Average Loss: 0.0759
Saving model for Epoch 2
Model for Epoch 2 is saved successfully.


Epoch 3/3: 100%|██████████████████████████████████████████████████████| 2190/2190 [1:16:40<00:00,  2.10s/batch, loss=0.114]


Epoch-3/3 --- Average Loss: 0.0738
Saving model for Epoch 3
Model for Epoch 3 is saved successfully.
All models are saved successfully.


In [None]:
# # Check if MPS (Metal Performance Shaders) is available for GPU training in Macbook
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# print(f"Using device '{device}' to train the model.")

# # Load the pre-trained GPT-2 model
# model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

# # Set the model to training mode
# model.train()

# # Define the optimizer
# optimizer = AdamW(model.parameters(), lr=5e-5)

# # Training loop starts here
# num_epochs = 3

# # Define folder structure to create dir to save models
# output_dir = "/Users/ravkothu/Documents/Personal_items_at_Oracle/Master_Degree/University_of_San_Diego/Online_Masters/MS_in_Applied_AI/Subjects_and_Resources/AAI-520-A2_NLP/AAI-520-A2_Final_Team_Project/Chatbot_Code/model_checkpoints"

# # Create the directory if it doesn't exist
# os.makedirs(output_dir, exist_ok=True)

# for epoch in range(num_epochs):
#     total_loss = 0
#     for batch in train_dataloader:
#         # Move input data to GPU
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         target_ids = batch['target_ids'].to(device)

#         # Zero the gradients
#         optimizer.zero_grad()

#         # Forward pass
#         outputs = model(input_ids, attention_mask=attention_mask, labels=target_ids)
        
#         # Get the loss from the model's outputs
#         loss = outputs.loss

#         # Backward pass
#         loss.backward()

#         # Update the parameters
#         optimizer.step()

#         # Accumulate the loss
#         total_loss += loss.item()

#     # Average loss for the epoch
#     avg_loss = total_loss / len(train_dataloader)
    
#     print(f"Epoch-{epoch + 1}/{num_epochs} --- Average Loss: {avg_loss:.4f}")


#     # Save model and optimizer state after each epoch
#     torch.save({
#         'model_state_dict': model.state_dict(),
#         'optimizer_state_dict': optimizer.state_dict(),
#         'num_epochs': num_epochs,
#         'batch_size': 8,
#         # Save average loss for this epoch
#         'avg_loss': avg_loss
#     }, os.path.join(output_dir, f"epoch_{epoch + 1}_model.pt"))

# # Save the final model
# torch.save({
#     'model_state_dict': model.state_dict(),
#     'optimizer_state_dict': optimizer.state_dict(),
#     'num_epochs': num_epochs,
#     'batch_size': 8,
# }, os.path.join(output_dir, "final_model.pt"))

# print("All models saved successfully.")

In [14]:
# from transformers import GPT2LMHeadModel, GPT2Tokenizer

# # Load pre-trained model and tokenizer
# model_name = 'gpt2'
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)

# # Set the pad_token to eos_token
# tokenizer.pad_token = tokenizer.eos_token

# # Resize the model embeddings to include the new pad_token
# model.resize_token_embeddings(len(tokenizer))

# # Move the model to GPU
# model = model.to(device)

# # Assuming 'target_text' is your label column
# def tokenize_function(examples):
#     # Tokenize the input_text
#     # print(examples['input_text'])
#     tokenized_inputs = tokenizer(
#         examples['input_text'],
#         padding=True,
#         truncation=True,
#         max_length=512,
#         return_tensors="pt"
#     )

#     # Tokenize the target_text
#     tokenized_targets = tokenizer(
#         examples['target_text'],
#         padding=True,
#         truncation=True,
#         max_length=512,
#         return_tensors="pt"
#     )

#     # Set labels
#     tokenized_inputs["labels"] = tokenized_targets["input_ids"]

#     return tokenized_inputs

# # Apply the tokenize function
# train_tokenized = train_dataset.map(tokenize_function, batched=False)
# val_tokenized = val_dataset.map(tokenize_function, batched=False)

# # Set the format for PyTorch, including 'labels' instead of 'target_text'
# train_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
# val_tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [15]:
# import os
# import math
# from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# # Assume 'full_dataset' is your complete dataset with 100,000 records
# chunk_size = 10000
# num_chunks = math.ceil(len(train_tokenized) / chunk_size)
# output_dir = './fine_tuned_model'

# # Ensure the output directory exists
# os.makedirs(output_dir, exist_ok=True)

# # Define training arguments with checkpointing enabled
# training_args = TrainingArguments(
#     output_dir=output_dir,
#     evaluation_strategy='epoch',
#     learning_rate=2e-5,
#     per_device_train_batch_size=11,  # Adjust based on your GPU memory
#     per_device_eval_batch_size=11,
#     # Train three epochs per chunk
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     # Save at the end of each epoch
#     save_strategy='epoch',
#     # Keep only the latest checkpoint to save space
#     save_total_limit=1,
#     # You can change this if you prefer loading the best model
#     load_best_model_at_end=False
# )

# # Initialize the data collator
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# # Function to check for the latest checkpoint
# def get_latest_checkpoint(output_dir):
#     checkpoints = [os.path.join(output_dir, d) for d in os.listdir(output_dir) if d.startswith("checkpoint")]
#     if len(checkpoints) > 0:
#         return max(checkpoints, key=os.path.getctime)  # Return the latest checkpoint
#     return None

# # Track which chunk to start from, in case of failure
# chunk_start_idx = 0
# if os.path.exists(os.path.join(output_dir, "gpt2_training_progress.txt")):
#     with open(os.path.join(output_dir, "gpt2_training_progress.txt"), "r") as f:
#         # Load the index of the chunk to resume from
#         chunk_start_idx = int(f.read().strip())

# # Loop over each chunk of the dataset
# for i in range(chunk_start_idx, num_chunks):
#     start_idx = i * chunk_size
#     end_idx = min((i + 1) * chunk_size, len(train_tokenized))
#     print(f"Training on records {start_idx} to {end_idx - 1}")

#     # Create a subset for the current chunk
#     train_chunk = train_tokenized.select(range(start_idx, end_idx))

#     # Check if there is a saved checkpoint to load from
#     last_checkpoint = get_latest_checkpoint(output_dir)

#     # Initialize the trainer with the saved model or start fresh
#     trainer = Trainer(
#         # Ensure the same model is used
#         model=model,
#         args=training_args,
#         train_dataset=train_chunk,
#         # Assuming 'val_tokenized' remains the same
#         eval_dataset=val_tokenized,
#         data_collator=data_collator
#     )

#     # Train from the last checkpoint if it exists, otherwise from scratch
#     trainer.train(resume_from_checkpoint=last_checkpoint)

#     # Save the model and tokenizer after each chunk
#     trainer.save_model(output_dir)
#     tokenizer.save_pretrained(output_dir)

#     # Save the progress to a text file, so we know which chunk to resume from if something fails
#     with open(os.path.join(output_dir, "gpt2_training_progress.txt"), "w") as f:
#         f.write(str(i + 1))  # Write the index of the next chunk to train

#     # Optionally clear the CUDA cache to free memory
#     torch.cuda.empty_cache()

# print("Incremental training completed.")

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=11,
    per_device_eval_batch_size=11,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Define a data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
)

# Train the model
trainer.train()

model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

In [None]:
# Load the fine-tuned model
# model = GPT2LMHeadModel.from_pretrained('fine_tuned_model')
# tokenizer = GPT2Tokenizer.from_pretrained('fine_tuned_model')