# Install and import required libraries

In [1]:
#!pip install accelerate -U
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the FAQ dataset from the project-relative CSV file and preview its first two rows
data = pd.read_csv(filepath_or_buffer='data/gamedev.csv')
data.head(n=2)

Unnamed: 0,id,user_id,tags,question,answer
0,7544,4450,"['pause', 'timescale']",How can I pause my game?,"In the Editor, you can just click the pause bu..."
1,7544,4450,"['pause', 'timescale']",What is the best way to pause my game?,"In the Editor, you can just click the pause bu..."


## Load pre-trained BERT Model

In [4]:
# Count the missing values in each column
data.isna().sum()

id           0
user_id      0
tags         0
question     0
answer      20
dtype: int64

In [5]:
# Drop every row that contains at least one missing value
data = data.dropna(axis=0, how='any')

# Confirm that no missing values remain
data.isna().sum()

id          0
user_id     0
tags        0
question    0
answer      0
dtype: int64

In [6]:
# Build the label vocabulary: one class per distinct answer text, in order of
# first appearance.
# Whitespace is stripped BEFORE de-duplicating so that answers differing only in
# leading/trailing whitespace collapse into a single class. Stripping after
# `unique()` could leave duplicate entries in `labels`, which inflates the class
# count and leaves ids that no sample ever maps to.
labels = data['answer'].astype(str).str.strip().unique().tolist()
labels

['In the Editor, you can just click the pause button.',
 'Log into Apple\'s developer portal and click the "iTunes Connect" link in the sidebar.',
 'You can attach the debugger to the editor process or the running built debug exe.',
 "To attempt to solve the editor crash, create a folder in a separate location where you can move files and directories you've moved or created within the project folder since the last time Unity ran successfully.",
 'Set the time of AnimationState of the relevant clip in the Update function.',
 'Vamos',
 'You can use the SharpUSBLib to access USB applications through Unity.',
 'CharacterController',
 'Keep your allocations small, and if you allocate a large buffer consider re-using it instead of destroying it (for instance, wiping out a large matrix of values and using the same one, instead of re-allocating different sizes across iterations).',
 'Make it inactive.',
 '1. Compressing images 2. Compressing JavaScript and CSS 3. Caching data 4. Streamlining s

In [7]:
# Number of target classes, plus the two lookup tables between class ids and answer texts
num_labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {answer: idx for idx, answer in enumerate(labels)}

**Prepare FAQ Data**

In [8]:
# Display the class id -> answer text mapping
id2label

{0: 'In the Editor, you can just click the pause button.',
 1: 'Log into Apple\'s developer portal and click the "iTunes Connect" link in the sidebar.',
 2: 'You can attach the debugger to the editor process or the running built debug exe.',
 3: "To attempt to solve the editor crash, create a folder in a separate location where you can move files and directories you've moved or created within the project folder since the last time Unity ran successfully.",
 4: 'Set the time of AnimationState of the relevant clip in the Update function.',
 5: 'Vamos',
 6: 'You can use the SharpUSBLib to access USB applications through Unity.',
 7: 'CharacterController',
 8: 'Keep your allocations small, and if you allocate a large buffer consider re-using it instead of destroying it (for instance, wiping out a large matrix of values and using the same one, instead of re-allocating different sizes across iterations).',
 9: 'Make it inactive.',
 10: '1. Compressing images 2. Compressing JavaScript and CSS

In [9]:
# Display the answer text -> class id mapping
label2id

{'In the Editor, you can just click the pause button.': 0,
 'Log into Apple\'s developer portal and click the "iTunes Connect" link in the sidebar.': 1,
 'You can attach the debugger to the editor process or the running built debug exe.': 2,
 "To attempt to solve the editor crash, create a folder in a separate location where you can move files and directories you've moved or created within the project folder since the last time Unity ran successfully.": 3,
 'Set the time of AnimationState of the relevant clip in the Update function.': 4,
 'Vamos': 5,
 'You can use the SharpUSBLib to access USB applications through Unity.': 6,
 'CharacterController': 7,
 'Keep your allocations small, and if you allocate a large buffer consider re-using it instead of destroying it (for instance, wiping out a large matrix of values and using the same one, instead of re-allocating different sizes across iterations).': 8,
 'Make it inactive.': 9,
 '1. Compressing images 2. Compressing JavaScript and CSS 3. 

In [10]:
# Attach the integer class id of each answer; non-string answers get None
data['labels'] = data['answer'].map(
    lambda answer: None if not isinstance(answer, str) else label2id[str(answer).strip()]
)
data.head(n=5)

Unnamed: 0,id,user_id,tags,question,answer,labels
0,7544,4450,"['pause', 'timescale']",How can I pause my game?,"In the Editor, you can just click the pause bu...",0
1,7544,4450,"['pause', 'timescale']",What is the best way to pause my game?,"In the Editor, you can just click the pause bu...",0
2,7544,4450,"['pause', 'timescale']","When I play a game, how do I pause it?","In the Editor, you can just click the pause bu...",0
3,7544,4450,"['pause', 'timescale']",Could you please tell me how I can pause my game?,"In the Editor, you can just click the pause bu...",0
4,7544,4450,"['pause', 'timescale']","In order to pause my game, what should I do?","In the Editor, you can just click the pause bu...",0


In [11]:
# Model inputs: the first 1000 question strings
X = data['question'].tolist()[:1000]
X[0:5]

# print only questions and answers


['How can I pause my game?',
 'What is the best way to pause my game?',
 'When I play a game, how do I pause it?',
 'Could you please tell me how I can pause my game?',
 'In order to pause my game, what should I do?']

In [12]:
# Targets: the class ids of the same first 1000 rows
y = data['labels'].tolist()[:1000]
y[0:5]

[0, 0, 0, 0, 0]

In [13]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

# Build Data Loader

In [14]:
# Checkpoint to fine-tune and the maximum number of tokens kept per question
model_name = "bert-base-uncased"
max_len = 256

# `model_max_length` is the tokenizer setting that `truncation=True` falls back to
# when a call passes no explicit `max_length`. The previous `max_length=` keyword is
# not a tokenizer length limit, so `max_len` was never applied.
tokenizer = BertTokenizer.from_pretrained(model_name,
                                          model_max_length=max_len)

# Classification head sized to the label vocabulary; the id/label maps are stored
# in the model config so predictions can be decoded back to answer texts.
model = BertForSequenceClassification.from_pretrained(model_name,
                                                      num_labels=num_labels,
                                                      id2label=id2label,
                                                      label2id = label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [15]:
# Tokenize the complete question list and each split separately.
# padding=True pads every sequence of a call to that call's longest sequence;
# truncation=True cuts sequences that exceed the tokenizer's length limit.
full_data = tokenizer(X, padding=True, truncation=True)

test_encoding = tokenizer(X_test, padding=True, truncation=True)
train_encoding = tokenizer(X_train, padding=True, truncation=True)

In [16]:
class DataLoader(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Despite its name this is a ``torch.utils.data.Dataset`` subclass, not a
    batching loader: it serves one sample per index.
    """

    def __init__(self, encodings, labels):
        """Store the inputs without copying.

        Args:
            encodings: mapping of feature name -> indexable of per-sample values.
            labels: indexable of class ids, one per sample.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the target under 'labels'."""
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        # Class ids are stored as int64 tensors
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

## Training Arguments

In [17]:
# Wrap the tokenized splits and their labels as torch datasets for the Trainer
train_dataloader = DataLoader(encodings=train_encoding, labels=y_train)
test_dataloader = DataLoader(encodings=test_encoding, labels=y_test)

In [18]:
# Dataset over ALL selected questions. `full_data` encodes the whole of `X`, so it
# must be paired with the matching full label list `y`; pairing it with `y_test`
# would shorten the dataset to the test-split size and attach unrelated labels.
fullDataLoader = DataLoader(full_data, y)

In [19]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores); the arg-max over the last axis is taken as the
            predicted class.

    Returns:
        dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall'.
    """
    true_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    accuracy = accuracy_score(true_ids, predicted_ids)
    macro_scores = precision_recall_fscore_support(true_ids, predicted_ids, average='macro')
    # The fourth element of the result is not used
    macro_precision, macro_recall, macro_f1 = macro_scores[:3]

    return dict(
        Accuracy=accuracy,
        F1=macro_f1,
        Precision=macro_precision,
        Recall=macro_recall,
    )

In [20]:
# Trainer configuration, grouped by concern
training_args = TrainingArguments(
    # Output locations for checkpoints and logs
    output_dir='./output',
    logging_dir='./multi-class-logs',
    # Enable both the training and the evaluation phase
    do_train=True,
    do_eval=True,
    # Optimisation schedule
    num_train_epochs=10, #100
    warmup_steps=100,
    weight_decay=0.05,
    # Per-device batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    # Log and evaluate every 50 optimisation steps
    logging_strategy='steps',
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    # Checkpoint on a step basis and reload the best checkpoint when training ends.
    # NOTE(review): save_steps is not set here, so the checkpoint interval is the
    # library default -- confirm it is frequent enough for the best-model
    # selection to be meaningful.
    save_strategy="steps",
    load_best_model_at_end=True
)

In [21]:
# Wire the model, its training configuration, both datasets and the metric callback together
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataloader,
    train_dataset=train_dataloader,
)

In [22]:
# Run fine-tuning with the configuration above
trainer.train()

 10%|█         | 50/500 [00:41<06:03,  1.24it/s]

{'loss': 9.3026, 'learning_rate': 2.5e-05, 'epoch': 1.0}


                                                
 10%|█         | 50/500 [00:43<06:03,  1.24it/s]

{'eval_loss': 8.933938980102539, 'eval_Accuracy': 0.005, 'eval_F1': 0.0007598784194528875, 'eval_Precision': 0.0004625346901017576, 'eval_Recall': 0.002127659574468085, 'eval_runtime': 2.7268, 'eval_samples_per_second': 73.345, 'eval_steps_per_second': 9.168, 'epoch': 1.0}


 20%|██        | 100/500 [01:24<05:22,  1.24it/s]

{'loss': 8.0523, 'learning_rate': 5e-05, 'epoch': 2.0}


                                                 
 20%|██        | 100/500 [01:27<05:22,  1.24it/s]

{'eval_loss': 6.912737846374512, 'eval_Accuracy': 0.005, 'eval_F1': 0.0015822784810126582, 'eval_Precision': 0.0008438818565400844, 'eval_Recall': 0.012658227848101266, 'eval_runtime': 2.7844, 'eval_samples_per_second': 71.828, 'eval_steps_per_second': 8.978, 'epoch': 2.0}


 30%|███       | 150/500 [02:08<04:49,  1.21it/s]

{'loss': 5.8222, 'learning_rate': 4.375e-05, 'epoch': 3.0}


                                                 
 30%|███       | 150/500 [02:10<04:49,  1.21it/s]

{'eval_loss': 5.137669086456299, 'eval_Accuracy': 0.045, 'eval_F1': 0.05681818181818181, 'eval_Precision': 0.0546875, 'eval_Recall': 0.0625, 'eval_runtime': 2.7448, 'eval_samples_per_second': 72.866, 'eval_steps_per_second': 9.108, 'epoch': 3.0}


 40%|████      | 200/500 [02:51<04:04,  1.23it/s]

{'loss': 4.6894, 'learning_rate': 3.7500000000000003e-05, 'epoch': 4.0}


                                                 
 40%|████      | 200/500 [02:54<04:04,  1.23it/s]

{'eval_loss': 4.539067268371582, 'eval_Accuracy': 0.15, 'eval_F1': 0.18807676063773623, 'eval_Precision': 0.1945121951219512, 'eval_Recall': 0.20121951219512196, 'eval_runtime': 3.2607, 'eval_samples_per_second': 61.336, 'eval_steps_per_second': 7.667, 'epoch': 4.0}


 50%|█████     | 250/500 [03:35<03:26,  1.21it/s]

{'loss': 4.1856, 'learning_rate': 3.125e-05, 'epoch': 5.0}


                                                 
 50%|█████     | 250/500 [03:37<03:26,  1.21it/s]

{'eval_loss': 4.120138645172119, 'eval_Accuracy': 0.365, 'eval_F1': 0.37095420543696406, 'eval_Precision': 0.3599890530925014, 'eval_Recall': 0.41954022988505746, 'eval_runtime': 2.6191, 'eval_samples_per_second': 76.361, 'eval_steps_per_second': 9.545, 'epoch': 5.0}


 60%|██████    | 300/500 [04:17<02:38,  1.26it/s]

{'loss': 3.7676, 'learning_rate': 2.5e-05, 'epoch': 6.0}


                                                 
 60%|██████    | 300/500 [04:19<02:38,  1.26it/s]

{'eval_loss': 3.757772922515869, 'eval_Accuracy': 0.645, 'eval_F1': 0.6175697865353038, 'eval_Precision': 0.6534756431308155, 'eval_Recall': 0.632183908045977, 'eval_runtime': 2.4163, 'eval_samples_per_second': 82.77, 'eval_steps_per_second': 10.346, 'epoch': 6.0}


 70%|███████   | 350/500 [04:59<01:57,  1.28it/s]

{'loss': 3.4042, 'learning_rate': 1.8750000000000002e-05, 'epoch': 7.0}


                                                 
 70%|███████   | 350/500 [05:01<01:57,  1.28it/s]

{'eval_loss': 3.4394872188568115, 'eval_Accuracy': 0.795, 'eval_F1': 0.7442414174972315, 'eval_Precision': 0.7643410852713178, 'eval_Recall': 0.7523255813953489, 'eval_runtime': 2.4253, 'eval_samples_per_second': 82.465, 'eval_steps_per_second': 10.308, 'epoch': 7.0}


 80%|████████  | 400/500 [05:41<01:20,  1.25it/s]

{'loss': 3.1282, 'learning_rate': 1.25e-05, 'epoch': 8.0}


                                                 
 80%|████████  | 400/500 [05:43<01:20,  1.25it/s]

{'eval_loss': 3.193568706512451, 'eval_Accuracy': 0.88, 'eval_F1': 0.8233333333333334, 'eval_Precision': 0.8305882352941176, 'eval_Recall': 0.8274509803921568, 'eval_runtime': 2.6706, 'eval_samples_per_second': 74.891, 'eval_steps_per_second': 9.361, 'epoch': 8.0}


 90%|█████████ | 450/500 [06:23<00:40,  1.25it/s]

{'loss': 2.9262, 'learning_rate': 6.25e-06, 'epoch': 9.0}


                                                 
 90%|█████████ | 450/500 [06:26<00:40,  1.25it/s]

{'eval_loss': 3.0478515625, 'eval_Accuracy': 0.895, 'eval_F1': 0.8518072289156625, 'eval_Precision': 0.857429718875502, 'eval_Recall': 0.859437751004016, 'eval_runtime': 2.6791, 'eval_samples_per_second': 74.652, 'eval_steps_per_second': 9.331, 'epoch': 9.0}


100%|██████████| 500/500 [07:05<00:00,  1.30it/s]

{'loss': 2.8072, 'learning_rate': 0.0, 'epoch': 10.0}


                                                 
100%|██████████| 500/500 [07:07<00:00,  1.30it/s]

{'eval_loss': 2.994574546813965, 'eval_Accuracy': 0.92, 'eval_F1': 0.8763052208835341, 'eval_Precision': 0.8995983935742973, 'eval_Recall': 0.8759036144578312, 'eval_runtime': 2.4394, 'eval_samples_per_second': 81.989, 'eval_steps_per_second': 10.249, 'epoch': 10.0}


100%|██████████| 500/500 [07:10<00:00,  1.16it/s]

{'train_runtime': 430.2969, 'train_samples_per_second': 18.592, 'train_steps_per_second': 1.162, 'train_loss': 4.808554565429687, 'epoch': 10.0}





TrainOutput(global_step=500, training_loss=4.808554565429687, metrics={'train_runtime': 430.2969, 'train_samples_per_second': 18.592, 'train_steps_per_second': 1.162, 'train_loss': 4.808554565429687, 'epoch': 10.0})

In [23]:
# Evaluate the trained model on both splits and show the first five metric columns side by side
q = [
    trainer.evaluate(eval_dataset=split_dataset)
    for split_dataset in (train_dataloader, test_dataloader)
]

pd.DataFrame(q, index=['train', 'test']).iloc[:, :5]

100%|██████████| 100/100 [00:10<00:00,  9.09it/s]
100%|██████████| 25/25 [00:02<00:00, 10.85it/s]


Unnamed: 0,eval_loss,eval_Accuracy,eval_F1,eval_Precision,eval_Recall
train,2.63726,0.9875,0.985278,0.991748,0.983424
test,2.994575,0.92,0.876305,0.899598,0.875904


In [24]:
def predict(text):
    """Classify a single question and decode the predicted answer.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: the question string to classify.

    Returns:
        Tuple ``(probs, pred_label_idx, pred_label)``:
        ``probs`` is the softmax over the class scores for the input,
        ``pred_label_idx`` is a 0-d tensor holding the arg-max over ``probs``,
        ``pred_label`` is the answer text looked up in ``model.config.id2label``.
    """
    # Move the inputs to whatever device the model currently lives on instead of
    # assuming CUDA, so the function also works on CPU-only machines.
    device = model.device
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)

    # Inference only: switch off dropout and skip gradient tracking
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # The first output element holds the class scores (logits)
    probs = outputs[0].softmax(1)
    # argmax() without a dim flattens `probs`; this is only a valid class id for a
    # single input text.
    pred_label_idx = probs.argmax()
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label

In [None]:
# Try the classifier on a sample question
text = "How to make a game?"
predict(text)