# Install & Load Packages

In [2]:
%pip install -U "sentence-transformers[train]" " transformers[torch]" accelerate datasets pandas matplotlib seaborn numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Import libraries
from datasets import load_dataset, Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments
)
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# Fine-tuning Model

In [38]:
# Define model
model = SentenceTransformer("firqaaa/indo-sentence-bert-base")

# Define loss function (CoSENTLoss | Cosine Sentence Loss -> Returning float similarity score)
loss = CoSENTLoss(model)



In [39]:
# Specify training args
args = SentenceTransformerTrainingArguments(
    output_dir="fine-tuned/sbert-fine-tuned-chatPMB-lite",
    num_train_epochs=1,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
)

In [40]:
# Creating eval dataset
train_patterns = pd.read_csv('data/preprocessed-data-v2.csv')['pattern']
test_patterns = pd.read_csv('data/dataset-question-v2.csv')['pattern']
# test_patterns = test_patterns.iloc[:-154]
print(train_patterns.shape, test_patterns.shape)

(558,) (558,)


In [41]:
embed_train = model.encode(train_patterns)
embed_test = model.encode(test_patterns)

In [42]:
scores = []
sentences_1 = []
sentences_2 = []

for i in range(len(test_patterns)):
    data = model.similarity(embed_train[int(i)], embed_test[int(i)])
    data = float(data[0][0])
    s1 = train_patterns[int(i)]
    s2 = test_patterns[int(i)]
    sentences_1.append(s1)
    sentences_2.append(s2)
    scores.append(data)

df = pd.DataFrame({
    "sentence_1": sentences_1,
    "sentence_2": sentences_2,
    "label": scores,
})
df.to_csv('data/fine-tuned-dataset.csv')
df.tail()

Unnamed: 0,sentence_1,sentence_2,label
553,program beasiswa sedia uin sunan gunung djati ...,Apakah program beasiswa disediakan oleh UIN Su...,0.90367
554,program beasiswa uin sunan gunung djati bandun...,Program beasiswa di UIN Sunan Gunung Djati Ban...,0.887859
555,beasiswa uin sunan gunung djati bandung ada,Apakah beasiswa di UIN Sunan Gunung Djati Band...,0.888745
556,program beasiswa uin sunan gunung djati bandun...,Program beasiswa di UIN Sunan Gunung Djati Ban...,0.888634
557,beasiswa uin sunan gunung djati bandung beri,Apakah beasiswa di UIN Sunan Gunung Djati Band...,0.892302


In [43]:
# Create evaluator & evaluate the base model
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=df["sentence_1"],
    sentences2=df["sentence_2"],
    scores=df["label"],
    main_similarity=SimilarityFunction.COSINE,
    show_progress_bar=True,
    precision="float32",
    name="train-evaluator",
)
dev_evaluator(model)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 35/35 [00:19<00:00,  1.81it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 35/35 [00:22<00:00,  1.57it/s]


{'train-evaluator_pearson_cosine': 0.9999999999984652,
 'train-evaluator_spearman_cosine': 0.9999993956485574,
 'train-evaluator_pearson_manhattan': 0.9754313537197102,
 'train-evaluator_spearman_manhattan': 0.9992904394150622,
 'train-evaluator_pearson_euclidean': 0.9749967817818792,
 'train-evaluator_spearman_euclidean': 0.9999992747782398,
 'train-evaluator_pearson_dot': 0.9999999999990659,
 'train-evaluator_spearman_dot': 0.9999993956487196,
 'train-evaluator_pearson_max': 0.9999999999990659,
 'train-evaluator_spearman_max': 0.9999993956487196}

In [44]:
df.shape

(558, 3)

In [45]:
training_data = Dataset.from_dict(df)
training_data

Dataset({
    features: ['sentence_1', 'sentence_2', 'label'],
    num_rows: 558
})

In [46]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=training_data,
    # eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

[A[A

[A[A

[A[A

[A[A                                         
                                                 
100%|██████████| 24/24 [03:02<00:00,  7.59s/it]t]

{'train_runtime': 182.1946, 'train_samples_per_second': 3.063, 'train_steps_per_second': 0.132, 'train_loss': 4.762681007385254, 'epoch': 1.0}





TrainOutput(global_step=24, training_loss=4.762681007385254, metrics={'train_runtime': 182.1946, 'train_samples_per_second': 3.063, 'train_steps_per_second': 0.132, 'total_flos': 0.0, 'train_loss': 4.762681007385254, 'epoch': 1.0})

In [47]:
model.save_pretrained("fine-tuned/models/chatPMB-pretrained-3")

# Get Tags by index

In [48]:
labels = pd.read_csv('data/dataset-question-v2.csv')
labels = labels.iloc[:-12]
labels = labels['tag']
labels

0         gr_hi
1         gr_ha
2         gr_pa
3         gr_si
4         gr_so
         ...   
541    beasiswa
542    beasiswa
543    beasiswa
544    beasiswa
545    beasiswa
Name: tag, Length: 546, dtype: object

# Usage of Pretrained Model

In [49]:
model = SentenceTransformer("fine-tuned/models/chatPMB-pretrained-3")

In [52]:
sentence = model.encode("berikan saya informasi seputar jurusan yang tersedia di UIN Bandung")
to_test = model.encode(df['sentence_2'])

In [53]:
result = np.array(model.similarity(sentence, to_test))
index = np.argmax(result)
confidence = np.max(result)
detected_label = labels[index]
index, detected_label, result, confidence

(451,
 'portal_pmb',
 array([[0.18736595, 0.11880892, 0.1242495 , 0.12559177, 0.1277437 ,
         0.08699293, 0.55312836, 0.572655  , 0.56964266, 0.54294395,
         0.52441937, 0.50616586, 0.4651739 , 0.56614757, 0.54704666,
         0.6884822 , 0.62849593, 0.46074635, 0.58449197, 0.43678868,
         0.47267354, 0.4620644 , 0.54724437, 0.48340786, 0.51290834,
         0.6049539 , 0.48184198, 0.5259249 , 0.58709013, 0.59267443,
         0.50119734, 0.5464079 , 0.5603583 , 0.5822822 , 0.52441937,
         0.54704666, 0.5400777 , 0.46074635, 0.56964266, 0.45083416,
         0.46101725, 0.51525605, 0.5265329 , 0.5822822 , 0.4351947 ,
         0.5365207 , 0.5059828 , 0.578082  , 0.5015778 , 0.5443532 ,
         0.53259087, 0.5335934 , 0.56620836, 0.47344503, 0.46369505,
         0.5756334 , 0.48916224, 0.6273665 , 0.6197324 , 0.5795686 ,
         0.54166627, 0.5372874 , 0.54149604, 0.47164226, 0.5908709 ,
         0.49474216, 0.555352  , 0.6080853 , 0.5779739 , 0.52568346,
         0.59