<a href="https://colab.research.google.com/github/krishnavenirouthu/Dl-Assignment-2/blob/main/DL_A2_Q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

def load_model_and_tokenizer(model_name="gpt2"):
    """Load a pretrained GPT-2 language model and its tokenizer.

    The end-of-sequence token is reused as the padding token, both on the
    tokenizer and in the model configuration.

    Args:
        model_name: Checkpoint name or path forwarded to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Tokenizer first: configure padding to reuse the EOS token.
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    # Model second: keep its pad token id in sync with the tokenizer.
    gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)
    gpt2_model.config.pad_token_id = gpt2_tokenizer.eos_token_id

    return gpt2_tokenizer, gpt2_model

def prepare_dataset(tokenizer, file_path="data/lyrics.txt", block_size=128):
    """Build a ``TextDataset`` from a plain-text training file.

    Args:
        tokenizer: Tokenizer handed to ``TextDataset``.
        file_path: Path of the text file to read.
        block_size: Forwarded to ``TextDataset`` as ``block_size``
            (presumably the number of tokens per example - confirm against
            the ``transformers`` documentation).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    lyrics_dataset = TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)
    return lyrics_dataset

def train_model(
    model,
    tokenizer,
    dataset,
    output_dir="./model",
    num_train_epochs=3,
    per_device_train_batch_size=2,
):
    """Fine-tune ``model`` on ``dataset`` and save the model and tokenizer.

    Checkpoints, the final model and the tokenizer are all written to the
    same ``output_dir`` so the result can be reloaded from a single path.

    Args:
        model: The language model to fine-tune.
        tokenizer: Tokenizer used by the data collator; saved next to the
            model once training finishes.
        dataset: Training dataset passed to ``Trainer`` as ``train_dataset``.
        output_dir: Directory for checkpoints and the final artifacts.
            Existing contents may be overwritten.
        num_train_epochs: Number of training epochs.
        per_device_train_batch_size: Batch size per device.

    Returns:
        None. The trained artifacts are written to ``output_dir``.
    """
    # mlm=False turns off masked-language-model collation (causal LM training).
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()

    # Save model and tokenizer side by side so one path reloads both.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Run the training pipeline end to end with the default settings: load the
# base "gpt2" checkpoint, build the dataset from data/lyrics.txt, then
# fine-tune and save the result under ./model.
tokenizer, model = load_model_and_tokenizer()
dataset = prepare_dataset(tokenizer)
train_model(model, tokenizer, dataset)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,4.5794
200,4.4909
300,4.2509
400,4.0657
500,3.9915
600,3.8419
700,3.8178


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

def load_finetuned_model(model_path="./model"):
    """Load the fine-tuned GPT-2 model and tokenizer from ``model_path``.

    Args:
        model_path: Directory (or checkpoint name) passed to
            ``from_pretrained`` for both the tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple, with the model's pad token id set to
        the tokenizer's end-of-sequence token id.
    """
    finetuned_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    finetuned_model = GPT2LMHeadModel.from_pretrained(model_path)

    # Pad with the EOS token id.
    finetuned_model.config.pad_token_id = finetuned_tokenizer.eos_token_id

    return finetuned_tokenizer, finetuned_model

def generate_lyrics(prompt="Yeah, I told you once and", max_length=100, model_path="./model"):
    """Sample one lyric continuation of ``prompt`` from the fine-tuned model.

    The model and tokenizer are loaded from ``model_path`` on every call.

    Args:
        prompt: Text the generated lyrics should start with.
        max_length: Passed to ``model.generate`` as ``max_length``; per the
            ``transformers`` documentation this bounds the total sequence
            length, prompt tokens included.
        model_path: Location of the fine-tuned checkpoint, forwarded to
            ``load_finetuned_model``.

    Returns:
        The decoded text (prompt plus continuation) with special tokens
        removed. Output varies between calls because sampling is enabled.
    """
    tokenizer, model = load_finetuned_model(model_path)

    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # Single, unpadded prompt: every position should be attended to.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            # Stochastic decoding: temperature + top-k + nucleus sampling.
            do_sample=True,
            temperature=0.8,
            top_k=40,
            top_p=0.9,
            # Values above 1.0 discourage repeating already-generated tokens.
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Only one sequence was requested, so decode the first (and only) row.
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Demo: generate one continuation of the prompt and print it.
print(generate_lyrics("Yeah, I told you once and"))


Yeah, I told you once and Im back to my old self
I guess that was the last time we had sex again now she still talkin bout me like a girl who cant wait til late in her life get it all over with us soon enough

We gon hit up some bars at least for one night but thats just about everything else so far as chilltime goings on dont be mad if they cause its too cold or rain outside wont kill this place well know where these girls come from
