# Install & Load Packages

In [1]:
%pip install -U "sentence-transformers[train]" " transformers[torch]" accelerate datasets pandas matplotlib seaborn numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [41]:
# Import libraries
from datasets import load_dataset, Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments
)
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import pandas as pd
import numpy as np

# Fine-tuning Model

In [3]:
# Base checkpoint that the rest of the notebook encodes with and fine-tunes
model = SentenceTransformer(model_name_or_path="firqaaa/indo-sentence-bert-base")

# CoSENT (Cosine Sentence) loss bound to that model; the training pairs built
# later carry a float similarity score as their label
loss = CoSENTLoss(model=model)

In [4]:
# Load the three CSV files in one pass (train, question/test, eval — in that order)
train_dataset, test_dataset, eval_dataset = (
    load_dataset("csv", data_files=csv_file)
    for csv_file in (
        "data/preprocessed-data.csv",
        "data/dataset-question.csv",
        "data/dataset-eval.csv",
    )
)
# Show the first two so their splits/columns can be inspected
train_dataset, test_dataset

Generating train split: 247 examples [00:00, 3249.79 examples/s]


(DatasetDict({
     train: Dataset({
         features: ['pattern', 'tag'],
         num_rows: 247
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['pattern', 'tag'],
         num_rows: 259
     })
 }))

In [39]:
# Training configuration for the SentenceTransformerTrainer
args = SentenceTransformerTrainingArguments(
    # Where the trainer writes its checkpoints
    output_dir="fine-tuned/sbert-fine-tuned-chatPMB",
    # Optimisation schedule
    num_train_epochs=100,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # Batching
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Evaluate, checkpoint and log on the same 100-step cadence
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    save_total_limit=2,
)

In [11]:
# Source sentences for the pair dataset: the "pattern" column of each CSV
train_patterns = pd.read_csv("data/preprocessed-data.csv").loc[:, "pattern"]
# Drop the last 12 question rows so both Series end up the same length
test_patterns = pd.read_csv("data/dataset-question.csv")["pattern"].iloc[:-12]
print(f"{train_patterns.shape} {test_patterns.shape}")

(247,) (247,)


In [12]:
# Encode both pattern Series with `model`, keeping row order
embed_train, embed_test = (
    model.encode(patterns) for patterns in (train_patterns, test_patterns)
)

In [71]:
# Build the (sentence_1, sentence_2, label) table: row i pairs train pattern i
# with test pattern i, and the label is the model's similarity for that pair.
# NOTE(review): the labels are produced by `model` itself, so they reflect the
# current model's opinion rather than an independent gold score.
n_pairs = len(test_patterns)
assert len(train_patterns) >= n_pairs, "need one train pattern per test pattern"
assert len(embed_train) >= n_pairs and len(embed_test) == n_pairs, (
    "embeddings are out of sync with the pattern Series; re-run the encode cell"
)

# One vectorised call scores every aligned pair, replacing the per-row loop
# that called model.similarity on single vectors.
scores = model.similarity_pairwise(embed_train[:n_pairs], embed_test[:n_pairs]).tolist()

# Positional access (.iloc / .tolist) instead of label lookups, so this does
# not depend on the Series carrying a default 0..n-1 index.
sentences_1 = train_patterns.iloc[:n_pairs].tolist()
sentences_2 = test_patterns.tolist()

df = pd.DataFrame({
    "sentence_1": sentences_1,
    "sentence_2": sentences_2,
    "label": scores,
})
# The DataFrame index is written as the first CSV column (pandas default).
df.to_csv('data/fine-tuned-dataset.csv')
df.tail()

Unnamed: 0,sentence_1,sentence_2,label
242,uin sunan gunung djati bandung punya beasiswa,Bisakah Anda memberitahu saya apakah di UIN Su...,0.845474
243,info soal beasiswa uin sunan gunung djati bandung,Dapatkah Anda memberikan informasi tentang ada...,0.864547
244,beasiswa uin sunan gunung djati bandung info l...,Mohon penjelasan apakah ada program beasiswa d...,0.815647
245,minta info dong uin sunan gunung djati bandung...,Saya ingin menanyakan apakah di UIN Sunan Gunu...,0.86283
246,beasiswa uin sunan gunung djati bandung gak sih,Mohon konfirmasi apakah UIN Sunan Gunung Djati...,0.779388


In [47]:
# Create evaluator & evaluate the base model
# NOTE(review): df["label"] was computed with this same `model` in the
# pair-scoring cell, so the cosine correlations printed below are ~1.0 by
# construction; they are a reference point, not an independent quality measure.
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=df["sentence_1"],
    sentences2=df["sentence_2"],
    scores=df["label"],
    main_similarity=SimilarityFunction.COSINE,
    show_progress_bar=True,
    precision="float32",
    name="train-evaluator",  # becomes the prefix of the metric keys shown in the output
)
# Calling the evaluator returns the metric dict displayed as this cell's output
dev_evaluator(model)


[A


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 16/16 [00:19<00:00,  1.22s/it]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 16/16 [00:19<00:00,  1.21s/it]


{'train-evaluator_pearson_cosine': 0.9999999999998636,
 'train-evaluator_spearman_cosine': 0.999995022854554,
 'train-evaluator_pearson_manhattan': 0.9370126899993185,
 'train-evaluator_spearman_manhattan': 0.9988644150622566,
 'train-evaluator_pearson_euclidean': 0.9386299766186968,
 'train-evaluator_spearman_euclidean': 0.9999920365732318,
 'train-evaluator_pearson_dot': 0.9999999999999291,
 'train-evaluator_spearman_dot': 0.9999962173872572,
 'train-evaluator_pearson_max': 0.9999999999999291,
 'train-evaluator_spearman_max': 0.9999962173872572}

In [48]:
# Convert the scored sentence pairs into a Hugging Face Dataset for the trainer.
# from_pandas is the DataFrame entry point; from_dict expects a mapping of
# column name -> list/array. preserve_index=False keeps only the three columns.
training_data = Dataset.from_pandas(df, preserve_index=False)
# Display next to the raw CSV dataset to compare columns and row counts
training_data, train_dataset

(Dataset({
     features: ['sentence_1', 'sentence_2', 'label'],
     num_rows: 247
 }),
 DatasetDict({
     train: Dataset({
         features: ['pattern', 'tag'],
         num_rows: 247
     })
 }))

In [49]:
# Fine-tune `model` on the scored sentence pairs using the loss and training
# arguments defined in earlier cells.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=training_data,
    # eval_dataset is left disabled here; periodic evaluation during training
    # comes from `evaluator` (the "eval_train-evaluator_*" entries in the log).
    loss=loss,
    evaluator=dev_evaluator,
)
# Runs the full training loop; the returned TrainOutput is this cell's display value
trainer.train()

  0%|          | 0/1100 [6:13:51<?, ?it/s]
  0%|          | 0/1100 [03:24<?, ?it/s]
  9%|▉         | 100/1100 [20:12<2:44:34,  9.87s/it]

{'loss': 4.1953, 'grad_norm': 155.7838592529297, 'learning_rate': 1.8181818181818182e-05, 'epoch': 9.09}


Batches: 100%|██████████| 16/16 [00:13<00:00,  1.17it/s]
Batches: 100%|██████████| 16/16 [00:18<00:00,  1.14s/it]
  9%|▉         | 100/1100 [20:45<2:44:34,  9.87s/it]

{'eval_train-evaluator_pearson_cosine': 0.8093898640561992, 'eval_train-evaluator_spearman_cosine': 0.824334797888826, 'eval_train-evaluator_pearson_manhattan': 0.7798400142510288, 'eval_train-evaluator_spearman_manhattan': 0.8241380285194678, 'eval_train-evaluator_pearson_euclidean': 0.7826685689888642, 'eval_train-evaluator_spearman_euclidean': 0.8243323361809636, 'eval_train-evaluator_pearson_dot': 0.8093898617977469, 'eval_train-evaluator_spearman_dot': 0.8243353547107994, 'eval_train-evaluator_pearson_max': 0.8093898640561992, 'eval_train-evaluator_spearman_max': 0.8243353547107994, 'eval_runtime': 32.1151, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 9.09}


 18%|█▊        | 200/1100 [42:25<2:37:33, 10.50s/it]

{'loss': 3.8985, 'grad_norm': 38.27452087402344, 'learning_rate': 1.8181818181818182e-05, 'epoch': 18.18}


Batches: 100%|██████████| 16/16 [00:12<00:00,  1.29it/s]
Batches: 100%|██████████| 16/16 [00:17<00:00,  1.07s/it]
 18%|█▊        | 200/1100 [42:55<2:37:33, 10.50s/it]

{'eval_train-evaluator_pearson_cosine': 0.7442912049979398, 'eval_train-evaluator_spearman_cosine': 0.7888823035658308, 'eval_train-evaluator_pearson_manhattan': 0.7274242660573856, 'eval_train-evaluator_spearman_manhattan': 0.7867099133952896, 'eval_train-evaluator_pearson_euclidean': 0.7317339537045231, 'eval_train-evaluator_spearman_euclidean': 0.788879947729617, 'eval_train-evaluator_pearson_dot': 0.7442912135059014, 'eval_train-evaluator_spearman_dot': 0.7888815736169931, 'eval_train-evaluator_pearson_max': 0.7442912135059014, 'eval_train-evaluator_spearman_max': 0.7888823035658308, 'eval_runtime': 29.6626, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 18.18}


 27%|██▋       | 300/1100 [1:03:29<2:41:00, 12.08s/it]

{'loss': 3.4653, 'grad_norm': 33.83391571044922, 'learning_rate': 1.616161616161616e-05, 'epoch': 27.27}


Batches: 100%|██████████| 16/16 [00:13<00:00,  1.18it/s]
Batches: 100%|██████████| 16/16 [00:18<00:00,  1.14s/it]
 27%|██▋       | 300/1100 [1:04:01<2:41:00, 12.08s/it]

{'eval_train-evaluator_pearson_cosine': 0.7314217791145274, 'eval_train-evaluator_spearman_cosine': 0.8063740270502889, 'eval_train-evaluator_pearson_manhattan': 0.7153677105089545, 'eval_train-evaluator_spearman_manhattan': 0.8049270530033896, 'eval_train-evaluator_pearson_euclidean': 0.7192350350987506, 'eval_train-evaluator_spearman_euclidean': 0.8063716189786079, 'eval_train-evaluator_pearson_dot': 0.731421782423749, 'eval_train-evaluator_spearman_dot': 0.8063701741459546, 'eval_train-evaluator_pearson_max': 0.731421782423749, 'eval_train-evaluator_spearman_max': 0.8063740270502889, 'eval_runtime': 31.8666, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 27.27}


 36%|███▋      | 400/1100 [1:22:51<2:04:50, 10.70s/it]

{'loss': 3.2363, 'grad_norm': 57.30267333984375, 'learning_rate': 1.4141414141414143e-05, 'epoch': 36.36}


Batches: 100%|██████████| 16/16 [00:12<00:00,  1.32it/s]
Batches: 100%|██████████| 16/16 [00:15<00:00,  1.03it/s]
 36%|███▋      | 400/1100 [1:23:19<2:04:50, 10.70s/it]

{'eval_train-evaluator_pearson_cosine': 0.7352591247935228, 'eval_train-evaluator_spearman_cosine': 0.8007669601231745, 'eval_train-evaluator_pearson_manhattan': 0.7145743915983467, 'eval_train-evaluator_spearman_manhattan': 0.8001083822668705, 'eval_train-evaluator_pearson_euclidean': 0.7178974343838966, 'eval_train-evaluator_spearman_euclidean': 0.8007645687958561, 'eval_train-evaluator_pearson_dot': 0.7352591439893935, 'eval_train-evaluator_spearman_dot': 0.8007645687958561, 'eval_train-evaluator_pearson_max': 0.7352591439893935, 'eval_train-evaluator_spearman_max': 0.8007669601231745, 'eval_runtime': 27.698, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 36.36}


 45%|████▌     | 500/1100 [1:41:29<2:04:50, 12.48s/it]

{'loss': 2.9939, 'grad_norm': 40.00443649291992, 'learning_rate': 1.2121212121212122e-05, 'epoch': 45.45}


Batches: 100%|██████████| 16/16 [00:14<00:00,  1.13it/s]
Batches: 100%|██████████| 16/16 [00:19<00:00,  1.20s/it]
 45%|████▌     | 500/1100 [1:42:02<2:04:50, 12.48s/it]

{'eval_train-evaluator_pearson_cosine': 0.7199298898939839, 'eval_train-evaluator_spearman_cosine': 0.7922882722133623, 'eval_train-evaluator_pearson_manhattan': 0.6975104350879451, 'eval_train-evaluator_spearman_manhattan': 0.7919888703135487, 'eval_train-evaluator_pearson_euclidean': 0.7015712284662784, 'eval_train-evaluator_spearman_euclidean': 0.7922859062059172, 'eval_train-evaluator_pearson_dot': 0.7199298913934264, 'eval_train-evaluator_spearman_dot': 0.7922865264965321, 'eval_train-evaluator_pearson_max': 0.7199298913934264, 'eval_train-evaluator_spearman_max': 0.7922882722133623, 'eval_runtime': 33.359, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 45.45}


 55%|█████▍    | 600/1100 [2:00:39<1:37:08, 11.66s/it]

{'loss': 2.7431, 'grad_norm': 114.22269439697266, 'learning_rate': 1.0101010101010103e-05, 'epoch': 54.55}


Batches: 100%|██████████| 16/16 [00:11<00:00,  1.36it/s]
Batches: 100%|██████████| 16/16 [00:16<00:00,  1.01s/it]
 55%|█████▍    | 600/1100 [2:01:07<1:37:08, 11.66s/it]

{'eval_train-evaluator_pearson_cosine': 0.7072685301315268, 'eval_train-evaluator_spearman_cosine': 0.7847795328218323, 'eval_train-evaluator_pearson_manhattan': 0.6803517783032225, 'eval_train-evaluator_spearman_manhattan': 0.7847102964362085, 'eval_train-evaluator_pearson_euclidean': 0.6823756592952265, 'eval_train-evaluator_spearman_euclidean': 0.7847771892377071, 'eval_train-evaluator_pearson_dot': 0.7072685016882372, 'eval_train-evaluator_spearman_dot': 0.7847707077360904, 'eval_train-evaluator_pearson_max': 0.7072685301315268, 'eval_train-evaluator_spearman_max': 0.7847795328218323, 'eval_runtime': 28.0607, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 54.55}


 64%|██████▎   | 700/1100 [2:19:56<1:14:49, 11.22s/it]

{'loss': 2.6148, 'grad_norm': 23.569095611572266, 'learning_rate': 8.08080808080808e-06, 'epoch': 63.64}


Batches: 100%|██████████| 16/16 [00:12<00:00,  1.32it/s]
Batches: 100%|██████████| 16/16 [00:16<00:00,  1.05s/it]
 64%|██████▎   | 700/1100 [2:20:26<1:14:49, 11.22s/it]

{'eval_train-evaluator_pearson_cosine': 0.6941265754467635, 'eval_train-evaluator_spearman_cosine': 0.77256439298445, 'eval_train-evaluator_pearson_manhattan': 0.6674279804719248, 'eval_train-evaluator_spearman_manhattan': 0.7725158979916013, 'eval_train-evaluator_pearson_euclidean': 0.6696985162968796, 'eval_train-evaluator_spearman_euclidean': 0.7725620858783503, 'eval_train-evaluator_pearson_dot': 0.6941265935031556, 'eval_train-evaluator_spearman_dot': 0.772563760495404, 'eval_train-evaluator_pearson_max': 0.6941265935031556, 'eval_train-evaluator_spearman_max': 0.77256439298445, 'eval_runtime': 29.6088, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 63.64}


 73%|███████▎  | 800/1100 [2:39:22<1:03:24, 12.68s/it]

{'loss': 2.4904, 'grad_norm': 59.78976821899414, 'learning_rate': 6.060606060606061e-06, 'epoch': 72.73}


Batches: 100%|██████████| 16/16 [00:12<00:00,  1.27it/s]
Batches: 100%|██████████| 16/16 [00:17<00:00,  1.09s/it]
 73%|███████▎  | 800/1100 [2:39:52<1:03:24, 12.68s/it]

{'eval_train-evaluator_pearson_cosine': 0.6912692980546019, 'eval_train-evaluator_spearman_cosine': 0.779272009051356, 'eval_train-evaluator_pearson_manhattan': 0.6618673555257739, 'eval_train-evaluator_spearman_manhattan': 0.7780464992583549, 'eval_train-evaluator_pearson_euclidean': 0.6634079620704592, 'eval_train-evaluator_spearman_euclidean': 0.779269681914328, 'eval_train-evaluator_pearson_dot': 0.6912692868418887, 'eval_train-evaluator_spearman_dot': 0.779267199657811, 'eval_train-evaluator_pearson_max': 0.6912692980546019, 'eval_train-evaluator_spearman_max': 0.779272009051356, 'eval_runtime': 30.1735, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 72.73}


 82%|████████▏ | 900/1100 [2:59:29<45:49, 13.75s/it]  

{'loss': 2.362, 'grad_norm': 38.84254837036133, 'learning_rate': 4.04040404040404e-06, 'epoch': 81.82}


Batches: 100%|██████████| 16/16 [00:12<00:00,  1.24it/s]
Batches: 100%|██████████| 16/16 [00:17<00:00,  1.09s/it]
 82%|████████▏ | 900/1100 [2:59:59<45:49, 13.75s/it]

{'eval_train-evaluator_pearson_cosine': 0.692645173001219, 'eval_train-evaluator_spearman_cosine': 0.7752767449165267, 'eval_train-evaluator_pearson_manhattan': 0.6616407193160773, 'eval_train-evaluator_spearman_manhattan': 0.7748340521006754, 'eval_train-evaluator_pearson_euclidean': 0.6629959255977005, 'eval_train-evaluator_spearman_euclidean': 0.7752744297105407, 'eval_train-evaluator_pearson_dot': 0.692645171433253, 'eval_train-evaluator_spearman_dot': 0.7752739510311966, 'eval_train-evaluator_pearson_max': 0.692645173001219, 'eval_train-evaluator_spearman_max': 0.7752767449165267, 'eval_runtime': 30.4977, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 81.82}


 91%|█████████ | 1000/1100 [3:18:58<18:05, 10.86s/it] 

{'loss': 2.2524, 'grad_norm': 16.314697265625, 'learning_rate': 2.02020202020202e-06, 'epoch': 90.91}


Batches: 100%|██████████| 16/16 [00:11<00:00,  1.37it/s]
Batches: 100%|██████████| 16/16 [00:15<00:00,  1.02it/s]
 91%|█████████ | 1000/1100 [3:19:25<18:05, 10.86s/it]

{'eval_train-evaluator_pearson_cosine': 0.6895443176920923, 'eval_train-evaluator_spearman_cosine': 0.7747806218238421, 'eval_train-evaluator_pearson_manhattan': 0.6579397980099163, 'eval_train-evaluator_spearman_manhattan': 0.7735869791584528, 'eval_train-evaluator_pearson_euclidean': 0.6587958282062126, 'eval_train-evaluator_spearman_euclidean': 0.7747783080994266, 'eval_train-evaluator_pearson_dot': 0.6895443400293891, 'eval_train-evaluator_spearman_dot': 0.7747799760982238, 'eval_train-evaluator_pearson_max': 0.6895443400293891, 'eval_train-evaluator_spearman_max': 0.7747806218238421, 'eval_runtime': 27.513, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 90.91}


100%|██████████| 1100/1100 [3:38:02<00:00, 10.31s/it]

{'loss': 2.1732, 'grad_norm': 2.187600612640381, 'learning_rate': 0.0, 'epoch': 100.0}


Batches: 100%|██████████| 16/16 [00:15<00:00,  1.02it/s]
Batches: 100%|██████████| 16/16 [00:23<00:00,  1.46s/it]
100%|██████████| 1100/1100 [3:38:42<00:00, 10.31s/it]

{'eval_train-evaluator_pearson_cosine': 0.6899927880800748, 'eval_train-evaluator_spearman_cosine': 0.77506730611496, 'eval_train-evaluator_pearson_manhattan': 0.6575149776113081, 'eval_train-evaluator_spearman_manhattan': 0.7749200571311735, 'eval_train-evaluator_pearson_euclidean': 0.6583362837014505, 'eval_train-evaluator_spearman_euclidean': 0.7750649915344203, 'eval_train-evaluator_pearson_dot': 0.6899927847975237, 'eval_train-evaluator_spearman_dot': 0.7750636027960498, 'eval_train-evaluator_pearson_max': 0.6899927880800748, 'eval_train-evaluator_spearman_max': 0.77506730611496, 'eval_runtime': 39.2342, 'eval_samples_per_second': 0.0, 'eval_steps_per_second': 0.0, 'epoch': 100.0}


100%|██████████| 1100/1100 [3:38:49<00:00, 10.31s/it]

{'train_runtime': 13129.7054, 'train_samples_per_second': 1.881, 'train_steps_per_second': 0.084, 'train_loss': 2.947750868363814, 'epoch': 100.0}


100%|██████████| 1100/1100 [3:38:50<00:00, 11.94s/it]


TrainOutput(global_step=1100, training_loss=2.947750868363814, metrics={'train_runtime': 13129.7054, 'train_samples_per_second': 1.881, 'train_steps_per_second': 0.084, 'total_flos': 0.0, 'train_loss': 2.947750868363814, 'epoch': 100.0})

In [61]:
# Save the current in-memory model to its own directory, separate from the
# trainer's checkpoint output_dir ("fine-tuned/sbert-fine-tuned-chatPMB").
model.save_pretrained("fine-tuned/models/chatPMB-pretrained-2")

# Get Tags by index

In [84]:
# Tags for the question rows, with the last 12 rows dropped so the positions
# line up with the 247 sentence pairs
labels = pd.read_csv("data/dataset-question.csv").iloc[:-12]["tag"]
labels

0         gr_hi
1         gr_ha
2         gr_pa
3         gr_si
4         gr_so
         ...   
242    beasiswa
243    beasiswa
244    beasiswa
245    beasiswa
246    beasiswa
Name: tag, Length: 247, dtype: object

# Using the Fine-tuned Model

In [63]:
# Reload the model from the directory written by the save_pretrained cell;
# this rebinds `model`, so later cells use the reloaded copy.
model = SentenceTransformer("fine-tuned/models/chatPMB-pretrained-2")

In [74]:
# Candidate questions: the sentence_2 column of the pair table
to_test = model.encode(sentences=df['sentence_2'])
# Example user query to match against the candidates
sentence = model.encode(
    sentences="Saya sedang mencari informasi seputar biaya UKT di UIN Sunan Gunung Djati Bandung"
)

In [86]:
# Similarity of the query against every candidate; a single row of scores
result = np.array(model.similarity(sentence, to_test))
# Position of the best-scoring candidate (flat argmax over the single row)
index = int(np.argmax(result))
# argmax yields a position, so look the tag up positionally with .iloc rather
# than by index label (the two only coincide for a default 0..n-1 index)
detected_label = labels.iloc[index]
index, detected_label, result

(224,
 'ukt',
 array([[-0.08102501, -0.09274451, -0.0194903 , -0.09801254, -0.07346375,
          0.01122001, -0.04107093,  0.02444862,  0.00458639,  0.16658112,
          0.376706  ,  0.2224128 ,  0.39654344,  0.12525932,  0.1698928 ,
          0.33939266,  0.18692902,  0.42380178,  0.15407269, -0.02946443,
          0.0310082 , -0.04368572,  0.10426041,  0.04318966,  0.11615118,
          0.08738878,  0.00334094,  0.05256866,  0.0368742 ,  0.0036604 ,
         -0.08304228, -0.1332272 , -0.13874346,  0.08654732,  0.2074686 ,
          0.1313885 ,  0.44935143,  0.14125264,  0.2567071 ,  0.4249447 ,
          0.18028516,  0.20853628,  0.37663478,  0.05177975,  0.14704998,
          0.17423037,  0.21683279,  0.17834032,  0.02997787,  0.05023821,
          0.12785752,  0.0825028 ,  0.04757181,  0.05177975,  0.14704998,
          0.17423037,  0.21683279,  0.17834032,  0.02997787,  0.05023821,
          0.12785752,  0.0825028 ,  0.04757181,  0.12470879,  0.08080865,
          0.15623328,  0