\**Downloading, Installing & Importing Required Libraries**

In [1]:
import os
import h5py
import math
import torch
from torch.utils.data import Dataset

In [2]:
!pip install transformers
!pip install accelerate -U

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
from transformers import (
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
    TrainerCallback
)
import accelerate



**Mounting Google Drive for importing the Data Files which will be used in the Tokenization**

**Selecting the GPU to Train the Model**

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["CUDA_LAUNCH_BLOCKING"]="0"

**Defining the Method that will Create the Pytorch Compatible Dataset Class**

In [5]:
class H5Dataset(Dataset):
    def __init__(self, tokenizer, file_path="C:\Users\devth\Desktop\NLP_Project\train_temp.txt", block_size=512):
        cached_features_file = "./data_temp.h5"

        # logger.info("Loading features from cached file %s", cached_features_file)
        print(("Loading features from cached file %s", cached_features_file))
        with h5py.File(cached_features_file, 'r') as f:
            if file_path=="C:\Users\devth\Desktop\NLP_Project\test_temp.txt":
                self.samples = f[file_path][:] #this is a dev set, 30% of a test set
            else:
                self.samples = f[file_path][:]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, item):
        return torch.tensor(self.samples[item])

In [15]:
def get_dataset( tokenizer, evaluate=False, local_rank=-1):
  file_path = "./test_temp.txt" if evaluate else "./train_temp.txt"
  return H5Dataset(tokenizer=tokenizer, file_path=file_path)

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1201534707.py, line 2)

**Performing Transformer Configuration**

In [7]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [8]:
print(torch.cuda.is_available())  # Should return True
print(torch.version.cuda)         # Should print the CUDA version


False
None


In [9]:
config = AutoConfig.from_pretrained('gpt2', cache_dir='cache')
set_seed(20)

**Defining the Tokenizer for the Model Training**

In [10]:
tokenizer = AutoTokenizer.from_pretrained('gpt2', cache_dir= 'cache')

**Initialising the GPT2 Model**

In [11]:
model = AutoModelWithLMHead.from_pretrained('gpt2',config=config,cache_dir='cache',)



**Adding the Special Recipe Token to the Tokenizer**

In [12]:
special_tokens = {
    "additional_special_tokens": [
        '<RECIPE_START>',
        '<INPUT_START>',
        '<NEXT_INPUT>',
        '<INPUT_END>',
        '<INGR_START>',
        '<NEXT_INGR>',
        '<INGR_END>',
        '<INSTR_START>',
        '<NEXT_INSTR>',
        '<INSTR_END>',
        '<TITLE_START>',
        '<TITLE_END>',
        '<RECIPE_END>'
    ]
}

**Resizeing the Model to Fit the Tokenizer with Special Tokens**

In [13]:
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

Embedding(50270, 768)

**Converting the Train and Validation Dataset to Pytorch Dataset so as it can be given to the Model as Input for Training**

In [14]:
train_dataset = (get_dataset(tokenizer=tokenizer))
eval_dataset = (get_dataset(tokenizer=tokenizer, evaluate=True))

('Loading features from cached file %s', './data_temp.h5')


KeyError: "Unable to synchronously open object (object 'train_temp.txt' doesn't exist)"

**To be able to build batches, data collators may apply some processing (like padding).Some of them (like DataCollatorForLanguageModeling) also apply some random data augmentation (like random masking) oin the formed batch.
Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train_dataset or eval_dataset.Forming the batches to dataset to be trained
source :- Hugginface.co**

In [None]:
data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False, mlm_probability=0.15  )

: 

In [None]:
training_args = TrainingArguments(
    output_dir="./project_model",
    logging_dir="./logs",  # Directory for the logs
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    eval_steps=500,  # Add this to perform evaluation every 500 steps
    save_strategy="steps",
    save_steps=500,  # Checkpoints will be saved every 500 steps
    logging_steps=10,  # Log training information every 10 steps
    fp16=True,
    fp16_opt_level='O1',
    warmup_steps=100,
    learning_rate=5e-4,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    save_total_limit=3,  # Keep only the 3 most recent checkpoints
    metric_for_best_model="loss",  # Assumes you want to use loss to determine the best model
    greater_is_better=False,  # Lower loss indicates a better model
    report_to="tensorboard",  # Enable logging to TensorBoard
)


: 

**Initializing PyTorch Trainer**

In [None]:

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


: 

**Saving the Tokenizer Object & Starting Training and Saving the model after Finishing the training**

In [None]:
tokenizer.save_pretrained('./project_model')
trainer.train()
trainer.save_model()

: 

**Saving the Tokenizer**

In [None]:
tokenizer.save_pretrained('./project_model')

: 

In [None]:
import os

model_directory = './project_model'

for filename in os.listdir(model_directory):
    file_path = os.path.join(model_directory, filename)
    
    print(f"File available at: {file_path}")

: 