# Importing modules that we will later use

In [1]:
import re
import json
import os
from sklearn.model_selection import train_test_split
import wandb
from transformers import GPT2Tokenizer
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel
from transformers.integrations import WandbCallback


wandb.login(key = "b8d79798e68fbc5835e01fa69f72e0edcd567f96") 

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkuldeepbishnoi[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\kulde/.netrc


True

In [2]:
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="gpt2"
os.environ["WANDB_NOTEBOOK_NAME"]="model.ipynb"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

# Data Preprocessing and Data Loading

In [3]:
with open('scraped_data.json') as f:
    scraped_json = json.load(f)

def build_text_files(data_json, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for texts in data_json:
        # print(texts['content']['p'])
        # break
        if texts:
            if 'content' in texts:
                if 'p' in texts['content']:
                    # print(f'{p}')
                    summary = str(texts['content']['p']).strip()
                    # print(f'{summary}')
                    # break
                    summary = re.sub(r"\s", " ", summary)
                    summary = re.sub(r"\\t", " ", summary)
                    summary = re.sub(r"\\n", " ", summary)
                    summary = re.sub(r"\\u200b", " ", summary)
                    data += summary + "  "
    print(f'{dest_path} is {len(data)}')
    f.write(data)

train_json, test_json = train_test_split(scraped_json, test_size=0.15)

build_text_files(train_json,'train_dataset.txt')
build_text_files(test_json,'test_dataset.txt')

train_dataset.txt is 71149
test_dataset.txt is 9594


In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

In [5]:
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)



# Training the Model

In [6]:
# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2")
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to('cuda')

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned", #The output directory
    report_to="wandb",
    logging_steps=5, 
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps = 20, # Number of update steps between two evaluations.
    save_steps=100, # after # steps model is saved 
    prediction_loss_only=True,
    learning_rate = 5e-7
    )


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[WandbCallback()]
)

You are adding a <class 'transformers.integrations.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback


In [7]:
trainer.train()



  0%|          | 0/21 [00:00<?, ?it/s]

{'loss': 4.1864, 'learning_rate': 3.809523809523809e-05, 'epoch': 0.71}
{'loss': 3.8865, 'learning_rate': 2.6190476190476192e-05, 'epoch': 1.43}
{'loss': 3.6935, 'learning_rate': 1.4285714285714285e-05, 'epoch': 2.14}
{'loss': 3.68, 'learning_rate': 2.3809523809523808e-06, 'epoch': 2.86}
{'train_runtime': 465.8951, 'train_samples_per_second': 1.41, 'train_steps_per_second': 0.045, 'train_loss': 3.8640074729919434, 'epoch': 3.0}


TrainOutput(global_step=21, training_loss=3.8640074729919434, metrics={'train_runtime': 465.8951, 'train_samples_per_second': 1.41, 'train_steps_per_second': 0.045, 'train_loss': 3.8640074729919434, 'epoch': 3.0})

# Saving, Loading & Running

In [8]:
trainer.save_model()

In [9]:
from transformers import pipeline

model = pipeline('text-generation',model='./gpt2-finetuned', 
                 tokenizer=tokenizer)

In [10]:
from transformers import set_seed
set_seed(42)
model("hello friends", max_length=30, num_return_sequences=5)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "hello friends: what you need you need to know to make everything easier!'\n\nThe pair will be making an appearance at the Edinburgh Comedy Festival,"},
 {'generated_text': "hello friends', 'I would love to see you at your best.', 'I am a bit overworked. I've been in my first job"},
 {'generated_text': 'hello friends, family and friends, if anyone needs support help or advice, please contact us directly at (812) 576-4226 or'},
 {'generated_text': "hello friends, and I'm so sorry you're not able to come to my house today, but your friend's family needs you. I'm gonna"},
 {'generated_text': "hello friends. I don't know where to start. I haven't been to the store recently and am waiting. I will definitely stay here to check"}]