In [1]:
#!pip install pytorch_lightning==1.5.10
#!pip install sentencepiece==0.1.97
#!pip install transformers==4.27.2

#!pip show sentencepiece --version

In [1]:
from sklearn.model_selection import train_test_split

from transformers import T5Tokenizer, T5ForConditionalGeneration

from transformers import AdamW
import pandas as pd
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
# from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

pl.seed_everything(100)
import warnings
warnings.filterwarnings("ignore")

INFO:pytorch_lightning.utilities.seed:Global seed set to 100


In [2]:
data = pd.read_csv('gamedev.csv')
# take first 100 rows ony from question and answer columns
data = data[['question', 'answer']].head(100)
data.head()

Unnamed: 0,question,answer
0,How can I pause my game?,"In the Editor, you can just click the pause bu..."
1,What is the best way to pause my game?,"In the Editor, you can just click the pause bu..."
2,"When I play a game, how do I pause it?","In the Editor, you can just click the pause bu..."
3,Could you please tell me how I can pause my game?,"In the Editor, you can just click the pause bu..."
4,"In order to pause my game, what should I do?","In the Editor, you can just click the pause bu..."


In [2]:
data = pd.read_csv('Conversation.csv')
data.drop(columns = ['Unnamed: 0'], inplace=True)
data.head()

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
INPUT_MAX_LEN = 128 #input length
OUTPUT_MAX_LEN = 128 # output length
TRAIN_BATCH_SIZE = 8 # batch size of training
VAL_BATCH_SIZE = 2 # batch size for validation
EPOCHS = 5 # number of epoch

In [4]:
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, model_max_length=512)

In [5]:
class T5Dataset:

  def __init__(self,question,answer):

    self.question = question
    self.answer = answer
    self.tokenizer = tokenizer
    self.input_max_len = INPUT_MAX_LEN
    self.output_max_len = OUTPUT_MAX_LEN

  def __len__(self):                      # This method retrives the number of item from the dataset
    return len(self.question)

  def __getitem__(self,item):             # This method retrieves the item at the specified index item.

    question = str(self.question[item])
    question = ''.join(question.split())

    answer = str(self.answer[item])
    answer = ''.join(answer.split())

    input_tokenize = self.tokenizer(
            question,
            add_special_tokens=True,
            max_length=self.input_max_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask=True,
            return_tensors="pt"
        )
    output_tokenize = self.tokenizer(
            answer,
            add_special_tokens=True,
            max_length=self.output_max_len,
            padding = 'max_length',
            truncation = True,
            return_attention_mask=True,
            return_tensors="pt"

        )


    input_ids = input_tokenize["input_ids"].flatten()
    attention_mask = input_tokenize["attention_mask"].flatten()
    labels = output_tokenize['input_ids'].flatten()

    out = {
            'question':question,
            'answer':answer,
            'input_ids': input_ids,
            'attention_mask':attention_mask,
            'target':labels
        }

    return out

In [6]:
class T5DataLoad(pl.LightningDataModule):

    def __init__(self,df_train,df_test):
        super().__init__()
        self.df_train = df_train
        self.df_test = df_test
        self.tokenizer = tokenizer
        self.input_max_len = INPUT_MAX_LEN
        self.out_max_len = OUTPUT_MAX_LEN

    def setup(self, stage=None):

        self.train_data = T5Dataset(
            question = self.df_train.question.values,
            answer = self.df_train.answer.values
        )

        self.valid_data = T5Dataset(
            question = self.df_test.question.values,
            answer = self.df_test.answer.values
        )
    def train_dataloader(self):
        return torch.utils.data.DataLoader(
         self.train_data,
         batch_size= TRAIN_BATCH_SIZE,
         shuffle=True,
         num_workers=2
        )
    def val_dataloader(self):
        return torch.utils.data.DataLoader(
        self.valid_data,
        batch_size= VAL_BATCH_SIZE,
        num_workers = 2
        )

In [9]:
class T5Model(pl.LightningModule):

    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)


    def forward(self, input_ids, attention_mask, labels=None):

        output = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['target']

        loss, logits = self(input_ids, attention_mask, labels)

        preds = logits.argmax(dim = -1).cpu()
        targets = labels.cpu()

        accuracies = []
        for pred, target in zip(preds, targets):
            seq_acc = accuracy_score(pred, target)
            accuracies.append(seq_acc)

        mean_acc = sum(accuracies) / len(accuracies)

        f1 = []
        for pred, target in zip(preds, targets):
            seq_f1 = precision_recall_fscore_support(pred, target, average='macro')[2]
            f1.append(seq_f1)

        f1 = sum(f1) / len(f1)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_accuracy', mean_acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_f1', f1, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        return {'loss': loss, 'train_acc': mean_acc, 'train_f1': f1, 'logits': logits}

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['target']

        loss, logits = self(input_ids, attention_mask, labels)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        preds = logits.argmax(dim = -1).cpu()
        targets = labels.cpu()

        accuracies = []
        for pred, target in zip(preds, targets):
            seq_acc = accuracy_score(pred, target)
            accuracies.append(seq_acc)

        mean_acc = sum(accuracies) / len(accuracies)

        self.log('val_acc', mean_acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        f1 = []
        for pred, target in zip(preds, targets):
            seq_f1 = precision_recall_fscore_support(pred, target, average='macro')[2]
            f1.append(seq_f1)

        f1 = sum(f1) / len(f1)

        self.log('val_f1', f1, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        return {'val_loss': loss, 'val_acc': mean_acc, 'val_f1': f1}

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)

In [11]:
def run():
    df_train, df_test = train_test_split(data,test_size = 0.2, random_state=100)
    dataload = T5DataLoad(df_train,df_test)
    dataload.setup()
    device = DEVICE
    model = T5Model()
    model.to(device)

    checkpoint = ModelCheckpoint(
        dirpath="/kaggle/working",
        filename='best-model',
        save_top_k=2,
        verbose=True,
        monitor="val_loss",
        mode="min"
    )
    trainer = pl.Trainer(
        callbacks = checkpoint,
        max_epochs= 3,
        gpus=1,
        accelerator="gpu"
    )
    trainer.fit(model, dataload)
run()

INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.seed:Global seed set to 100


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 0, global step 372: val_loss reached 0.28988 (best 0.28988), saving model to "/kaggle/working/best-model-v9.ckpt" as top 2


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 1, global step 745: val_loss reached 0.27059 (best 0.27059), saving model to "/kaggle/working/best-model-v10.ckpt" as top 2


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 2, global step 1118: val_loss reached 0.26100 (best 0.26100), saving model to "/kaggle/working/best-model-v9.ckpt" as top 2


In [12]:
train_model = T5Model.load_from_checkpoint('/kaggle/working/best-model.ckpt')
train_model.freeze()

def generate_question(question):

    inputs_encoding =  tokenizer(
        question,
        add_special_tokens=True,
        max_length= INPUT_MAX_LEN,
        padding = 'max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt"
        )
    train_model.to(DEVICE)

    inputs_encoding = {key: val.to(DEVICE) for key, val in inputs_encoding.items()}

    generate_ids = train_model.model.generate(
        input_ids = inputs_encoding["input_ids"],
        attention_mask = inputs_encoding["attention_mask"],
        max_length = INPUT_MAX_LEN,
        num_beams = 4,
        num_return_sequences = 1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        )

    preds = [
        tokenizer.decode(gen_id,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True)
        for gen_id in generate_ids
    ]

    return "".join(preds)

In [13]:
ques = "hello"
print("Quesiton: ", ques)
print("Answer: ", generate_question(ques))

Quesiton:  hello
Answer:  i'm so glad you're here.


In [None]:
data.head()

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.


In [None]:
data.head(2)

Unnamed: 0,question,answer
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.


Quesiton:  What is game development?
Answer:  
