# Data Preparation

## Fetch Data from CSV

In [1]:
%pip install numpy pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd

In [3]:
question_path = './data/dataset-question.csv'
answer_path = './data/dataset-answer.csv'

question_attr = ['pattern', 'tag']
answer_attr = ['responses', 'tag']

question_data = pd.read_csv(question_path)[question_attr]
answer_data = pd.read_csv(answer_path)[answer_attr]

# data_answer.head(10)
print("Jumlah pertanyaan: ", question_data.shape)
print("Jumlah pertanyaan: ", answer_data.shape)

Jumlah pertanyaan:  (259, 2)
Jumlah pertanyaan:  (17, 2)


In [4]:
question_data.head(10)

Unnamed: 0,pattern,tag
0,Hi,gr_hi
1,Halo,gr_ha
2,Selamat Pagi,gr_pa
3,Selamat Siang,gr_si
4,Selamat Sore,gr_so
5,Selamat Malam,gr_ma
6,Ada program studi apa saja di UIN Sunan Gunung...,prodi
7,Apa saja jurusan di UIN Bandung,prodi
8,Ada jurusan apa saja?,prodi
9,Mohon informasi mengenai program studi yang te...,prodi


In [5]:
answer_data.head(10)

Unnamed: 0,responses,tag
0,Halo! Apa yang ingin anda cari tahu hari ini?,gr_hi
1,Hi! Apa yang bisa kami bantu?,gr_ha
2,Selamat Pagi! Apa yang bisa kami bantu untuk m...,gr_pa
3,Selamat Siang! Apa yang ingin anda cari tahu?,gr_si
4,Selamat Sore! Apa yang ingin anda ketahui?,gr_so
5,Selamat Malam! Apa yang bisa kami bantu?,gr_ma
6,UIN Sunan Gunung Djati Bandung menyelenggaraka...,prodi
7,SNBP adalah singkatan dari Seleksi Nasional Be...,jm_s1
8,Untuk jurusan yang bisa dipilih pada Seleksi M...,snbp_jur
9,Untuk jurusan yang bisa dipilih pada Seleksi M...,snbt_jur


## Data Cleaning

In [6]:
%pip install Sastrawi transformers tensorflow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from transformers import AutoTokenizer
import re
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
SBERT_TOKENIZER = 'firqaaa/indo-sentence-bert-base'

class Preprocess():
    def __init__(self, max_len=128):
        self.stemmer = StemmerFactory().create_stemmer()
        self.stopword = StopWordRemoverFactory().create_stop_word_remover()
        self.tokenizer = AutoTokenizer.from_pretrained(SBERT_TOKENIZER)
        self.max_len = max_len

    def cleaning(self, val):
        val = re.sub(r'\s+', ' ', val)
        val = re.sub("[^a-zA-Z0-9;]", " ", val)
        return val
    
    def casefolding(self, val):
        return str(val).lower()
    
    def stemming(self, val):
        return self.stemmer.stem(str(val))
    
    def stopwordRemoval(self, val):
        return self.stopword.remove(str(val))
    
    def embedding(self, val):
        return self.tokenizer.encode_plus(
            val,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='tf'
        )
    
    def preprocessing(self, sentences):
        for i in range(len(sentences)):
            input = sentences[i]
            input = self.cleaning(input)
            input = self.casefolding(input)
            input = self.stemming(input)
            input = self.stopwordremove(input)
            sentences[i] = input
        return sentences

    def tokenizing(self, sentences):
        input_ids, attention_mask = [], []
        for sentence in sentences:
            output = self.embedding(sentence)
            input_ids.append(output['input_ids'])
            attention_mask.append(output['attention_mask'])
        return {
            'input_ids': tf.convert_to_tensor(
                np.asarray(input_ids).squeeze(),
                dtype=tf.int32
            ),
            'attention_mask': tf.convert_to_tensor(
                np.asarray(attention_mask).squeeze(),
                dtype=tf.int32
            )
        }

    def preprocess_get_token(self, sentences, display_len=20):
        preprocessing = self.preprocessing(sentences)
        tokenized = self.tokenizing(preprocessing)
        return [self.tokenizer.convert_ids_to_tokens(tokenized['input_ids'][i][:display_len]) for i in range(len(sentences))]

preprocess = Preprocess()    

### Casefolding

In [9]:
sample_texts = question_data['pattern']
print(sample_texts[7])
print(sample_texts[8])
print(sample_texts[9])

Apa saja jurusan di UIN Bandung
Ada jurusan apa saja?
Mohon informasi mengenai program studi yang tersedia di UIN Sunan Gunung Djati Bandung.


In [10]:
preprocess = Preprocess()
hasil_casefolding = [preprocess.casefolding(sample) for sample in sample_texts]

bf_cf = sample_texts
af_cf = hasil_casefolding

df = pd.DataFrame({
    'Before Casefolding': bf_cf,
    'After Casefolding': af_cf 
})

df.tail(10)

Unnamed: 0,Before Casefolding,After Casefolding
249,Ada beasiswa di UIN Sunan Gunung Djati Bandung?,ada beasiswa di uin sunan gunung djati bandung?
250,Beasiswa ada di UIN Sunan Gunung Djati Bandung...,beasiswa ada di uin sunan gunung djati bandung...
251,Di UIN Sunan Gunung Djati Bandung ada beasiswa...,di uin sunan gunung djati bandung ada beasiswa...
252,Beasiswa di UIN Sunan Gunung Djati Bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
253,UIN Sunan Gunung Djati Bandung punya beasiswa?,uin sunan gunung djati bandung punya beasiswa?
254,Ada info soal beasiswa di UIN Sunan Gunung Dja...,ada info soal beasiswa di uin sunan gunung dja...
255,Beasiswa di UIN Sunan Gunung Djati Bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
256,"Minta info dong, di UIN Sunan Gunung Djati Ban...","minta info dong, di uin sunan gunung djati ban..."
257,Beasiswa di UIN Sunan Gunung Djati Bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
258,UIN Sunan Gunung Djati Bandung ada beasiswa?,uin sunan gunung djati bandung ada beasiswa?


### Cleaning

In [11]:
preprocess = Preprocess()
hasil_cleaning = [preprocess.cleaning(sample) for sample in hasil_casefolding]
bf_cl = hasil_casefolding
af_cl = hasil_cleaning
df = pd.DataFrame({
    'Before Cleaning': bf_cl,
    'After Cleaning': af_cl
})
df.tail(10)

Unnamed: 0,Before Cleaning,After Cleaning
249,ada beasiswa di uin sunan gunung djati bandung?,ada beasiswa di uin sunan gunung djati bandung
250,beasiswa ada di uin sunan gunung djati bandung...,beasiswa ada di uin sunan gunung djati bandung...
251,di uin sunan gunung djati bandung ada beasiswa...,di uin sunan gunung djati bandung ada beasiswa...
252,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
253,uin sunan gunung djati bandung punya beasiswa?,uin sunan gunung djati bandung punya beasiswa
254,ada info soal beasiswa di uin sunan gunung dja...,ada info soal beasiswa di uin sunan gunung dja...
255,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
256,"minta info dong, di uin sunan gunung djati ban...",minta info dong di uin sunan gunung djati ban...
257,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
258,uin sunan gunung djati bandung ada beasiswa?,uin sunan gunung djati bandung ada beasiswa


### Stemming

In [12]:
preprocess = Preprocess()
hasil_stemming = [preprocess.stemming(sample) for sample in hasil_cleaning]
bf_st = hasil_cleaning
af_st = hasil_stemming
df = pd.DataFrame({
    "Before Stemming" : bf_st,
    "After Stemming" : af_st
})
df.tail(10)

Unnamed: 0,Before Stemming,After Stemming
249,ada beasiswa di uin sunan gunung djati bandung,ada beasiswa di uin sunan gunung djati bandung
250,beasiswa ada di uin sunan gunung djati bandung...,beasiswa ada di uin sunan gunung djati bandung...
251,di uin sunan gunung djati bandung ada beasiswa...,di uin sunan gunung djati bandung ada beasiswa...
252,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
253,uin sunan gunung djati bandung punya beasiswa,uin sunan gunung djati bandung punya beasiswa
254,ada info soal beasiswa di uin sunan gunung dja...,ada info soal beasiswa di uin sunan gunung dja...
255,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
256,minta info dong di uin sunan gunung djati ban...,minta info dong di uin sunan gunung djati band...
257,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
258,uin sunan gunung djati bandung ada beasiswa,uin sunan gunung djati bandung ada beasiswa


### Stopword Removal

In [13]:
preprocess = Preprocess()
hasil_stopword_removal = [preprocess.stopwordRemoval(sample) for sample in hasil_stemming]
bf_sr = hasil_stemming
af_sr = hasil_stopword_removal
df = pd.DataFrame({
    "Before Stopword Removal": bf_sr,
    "After Stopword Removal": af_sr
})
df.tail(10)

Unnamed: 0,Before Stopword Removal,After Stopword Removal
249,ada beasiswa di uin sunan gunung djati bandung,beasiswa uin sunan gunung djati bandung
250,beasiswa ada di uin sunan gunung djati bandung...,beasiswa di uin sunan gunung djati bandung
251,di uin sunan gunung djati bandung ada beasiswa...,uin sunan gunung djati bandung beasiswa
252,beasiswa di uin sunan gunung djati bandung ada...,beasiswa uin sunan gunung djati bandung nggak
253,uin sunan gunung djati bandung punya beasiswa,uin sunan gunung djati bandung punya beasiswa
254,ada info soal beasiswa di uin sunan gunung dja...,info soal beasiswa uin sunan gunung djati bandung
255,beasiswa di uin sunan gunung djati bandung ada...,beasiswa uin sunan gunung djati bandung info l...
256,minta info dong di uin sunan gunung djati band...,minta info dong uin sunan gunung djati bandung...
257,beasiswa di uin sunan gunung djati bandung ada...,beasiswa uin sunan gunung djati bandung gak sih
258,uin sunan gunung djati bandung ada beasiswa,uin sunan gunung djati bandung beasiswa


### Filtering (Drop Duplicate Data)

In [14]:
to_be_filtered = pd.DataFrame({
    "pattern": hasil_stopword_removal,
    "tag": question_data['tag']
})
to_be_filtered.head(10)
hasil_filtering = to_be_filtered.drop_duplicates(subset=['pattern'])

hasil_filtering.to_csv('data/preprocessed-data.csv', index=False)

print('Filtered data', hasil_filtering.shape)
hasil_filtering.tail(10)

Filtered data (247, 2)


Unnamed: 0,pattern,tag
248,mohon klarifikasi ada beasiswa tawar uin sunan...,beasiswa
249,beasiswa uin sunan gunung djati bandung,beasiswa
250,beasiswa di uin sunan gunung djati bandung,beasiswa
251,uin sunan gunung djati bandung beasiswa,beasiswa
252,beasiswa uin sunan gunung djati bandung nggak,beasiswa
253,uin sunan gunung djati bandung punya beasiswa,beasiswa
254,info soal beasiswa uin sunan gunung djati bandung,beasiswa
255,beasiswa uin sunan gunung djati bandung info l...,beasiswa
256,minta info dong uin sunan gunung djati bandung...,beasiswa
257,beasiswa uin sunan gunung djati bandung gak sih,beasiswa


### Preprocessed Data

In [15]:
dataset = pd.read_csv('data/preprocessed-data.csv')
print("Total data: ", dataset.shape)
dataset.tail(10)

Total data:  (247, 2)


Unnamed: 0,pattern,tag
237,mohon klarifikasi ada beasiswa tawar uin sunan...,beasiswa
238,beasiswa uin sunan gunung djati bandung,beasiswa
239,beasiswa di uin sunan gunung djati bandung,beasiswa
240,uin sunan gunung djati bandung beasiswa,beasiswa
241,beasiswa uin sunan gunung djati bandung nggak,beasiswa
242,uin sunan gunung djati bandung punya beasiswa,beasiswa
243,info soal beasiswa uin sunan gunung djati bandung,beasiswa
244,beasiswa uin sunan gunung djati bandung info l...,beasiswa
245,minta info dong uin sunan gunung djati bandung...,beasiswa
246,beasiswa uin sunan gunung djati bandung gak sih,beasiswa


## Data Labelling

In [16]:
# # Defining labels
# label_list = dataset['tag'].drop_duplicates()
# LABELS = label_list.tolist()
# len(LABELS), LABELS

In [17]:
# # Give 0 as initial value
# labelled_data = dataset.copy()
# labelled_data[LABELS] = 0
# labels = pd.Series(label_list)

# for i, row in labelled_data.iterrows():
#     tag = row['tag'].split(';')
#     labelled_data.loc[i, tag] = 1
# labelled_data.tail(1)
# # labelled_data[LABELS].sum()

## Working Model

In [18]:
# from sentence_transformers import SentenceTransformer
# sentences = []
# for sentence in dataset['pattern']:
#     sentences.append(sentence)

# model = SentenceTransformer('firqaaa/indo-sentence-bert-base')
# embeddings = model.encode(sentences)

In [19]:
# def low_confident(indexes):
#     tags = []
#     data = dataset.to_numpy().tolist()
#     pos = 0
#     for tag in data:
#         for id in indexes:
#             if(id == pos):
#                 tags.append(tag)
#         pos += 1
#     tags = pd.DataFrame(tags)
#     tags = tags.drop_duplicates()
#     return tags

In [20]:
# asked_quetion = 'Informasi seputar biaya UKT di UIN Sunan Gunung Djati Bandung'
# encoded_question = model.encode(asked_quetion)
# result = np.array(model.similarity(encoded_question, embeddings))
# highestIndex = np.argmax(result)
# highestVal = np.max(result)
# tag = dataset.loc[highestIndex]['tag']

# print
# highestIndex, highestVal, tag

In [21]:
# # get confidents higher than 0.5
# confidents = []
# indexes = []
# pos = 0

# for item in result:
#     for value in item:
#         if(value >= 0.8):
#             confidents.append(value)
#             indexes.append(pos)
#         elif(value < 0.8):
#             recommended_tags = low_confident(indexes)
#             recommended_tags
#         pos += 1
# confidents, indexes

### Manual

In [22]:
# from transformers import AutoTokenizer, AutoModel
# import torch


# #Mean Pooling - Take attention mask into account for correct averaging
# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# # Sentences we want sentence embeddings for
# sentences = ["Ibukota Perancis adalah Paris", 
#              "Menara Eifel terletak di Paris, Perancis", 
#              "Pizza adalah makanan khas Italia", 
#              "Saya kuliah di Carneige Mellon University"]


# # Load model from HuggingFace Hub
# tokenizer = AutoTokenizer.from_pretrained('firqaaa/indo-sentence-bert-base')
# model = AutoModel.from_pretrained('firqaaa/indo-sentence-bert-base')

# # Tokenize sentences
# encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# # Compute token embeddings
# with torch.no_grad():
#     model_output = model(**encoded_input)

# # Perform pooling. In this case, mean pooling.
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# print("Sentence embeddings:")
# print(sentence_embeddings)

## Fine-tuned Model

In [23]:
%pip install -U "sentence-transformers[train]" " transformers[torch]" accelerate datasets

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
# Import libraries
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments
)
from sentence_transformers.losses import CoSENTLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

In [50]:
# Define model
model = SentenceTransformer("firqaaa/indo-sentence-bert-base")

In [38]:
# Define datasets
train_dataset = load_dataset("csv", data_files="data/preprocessed-data.csv")
test_dataset = load_dataset("csv", data_files="data/dataset-question.csv")
eval_dataset = load_dataset("csv", data_files="data/dataset-eval.csv")
train_dataset, test_dataset

Generating train split: 247 examples [00:00, 22000.75 examples/s]


(DatasetDict({
     train: Dataset({
         features: ['pattern', 'tag'],
         num_rows: 247
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['pattern', 'tag'],
         num_rows: 259
     })
 }))

In [37]:
# Define loss function (CoSENTLoss | Cosine Sentence Loss -> Returning float similarity score)
loss = CoSENTLoss(model)

In [51]:
# Specify training args
args = SentenceTransformerTrainingArguments(
    output_dir="fine-tuned/sbert-fine-tuned-chatPMB",
    num_train_epochs=50,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
)

In [78]:
# Creating eval dataset
train_patterns = pd.read_csv('data/preprocessed-data.csv')['pattern']
test_patterns = pd.read_csv('data/dataset-question.csv')['pattern']
test_patterns = test_patterns.iloc[:-12]
train_patterns, test_patterns

(0                                                     hi
 1                                                   halo
 2                                           selamat pagi
 3                                          selamat siang
 4                                           selamat sore
                              ...                        
 242        uin sunan gunung djati bandung punya beasiswa
 243    info soal beasiswa uin sunan gunung djati bandung
 244    beasiswa uin sunan gunung djati bandung info l...
 245    minta info dong uin sunan gunung djati bandung...
 246      beasiswa uin sunan gunung djati bandung gak sih
 Name: pattern, Length: 247, dtype: object,
 0                                                     Hi
 1                                                   Halo
 2                                           Selamat Pagi
 3                                          Selamat Siang
 4                                           Selamat Sore
                            

In [79]:
embed_train = model.encode(train_patterns)
embed_test = model.encode(test_patterns)
# embed_test, embed_train

In [82]:
scores = []
temp = 0
result = np.array(model.similarity(embed_train, embed_test))
# result
for val in result:
    for data in val:
        if(temp<data):
            temp=data
    scores.append(temp)
dev = pd.DataFrame({
    "train_patterns": train_patterns,
    "test_patterns": test_patterns,
    "score": scores
})
# dev.to_csv('data/dataset-eval.csv')
dev

Unnamed: 0,train_patterns,test_patterns,score
0,hi,Hi,1.0
1,halo,Halo,1.0
2,selamat pagi,Selamat Pagi,1.0
3,selamat siang,Selamat Siang,1.0
4,selamat sore,Selamat Sore,1.0
...,...,...,...
242,uin sunan gunung djati bandung punya beasiswa,Bisakah Anda memberitahu saya apakah di UIN Su...,1.0
243,info soal beasiswa uin sunan gunung djati bandung,Dapatkah Anda memberikan informasi tentang ada...,1.0
244,beasiswa uin sunan gunung djati bandung info l...,Mohon penjelasan apakah ada program beasiswa d...,1.0
245,minta info dong uin sunan gunung djati bandung...,Saya ingin menanyakan apakah di UIN Sunan Gunu...,1.0


In [None]:
# %pip install seaborn matplotlib

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# plt.figure(figsize=(15, 15))
# sns.heatmap(
#     result,
#     annot=True,
#     fmt="d",
#     cmap="Blues",
#     xticklabels=train_patterns,
#     yticklabels=test_patterns,
# )
# plt.xlabel("Predicted labels")
# plt.ylabel("True labels")
# plt.title("Training Data")
# plt.show()

In [34]:
# Create evaluator & evaluate the base model
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=dev["train_patterns"],
    sentences2=dev["test_patterns"],
    scores=dev["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)
dev_evaluator(model)

{'sts-dev_pearson_cosine': 0.6937476450106933,
 'sts-dev_spearman_cosine': 0.11990915114298019,
 'sts-dev_pearson_manhattan': 0.6584539015552953,
 'sts-dev_spearman_manhattan': 0.1382666264786833,
 'sts-dev_pearson_euclidean': 0.6609010834465888,
 'sts-dev_spearman_euclidean': 0.13864217644517413,
 'sts-dev_pearson_dot': 0.7086632913413077,
 'sts-dev_spearman_dot': 0.09030494260026221,
 'sts-dev_pearson_max': 0.7086632913413077,
 'sts-dev_spearman_max': 0.13864217644517413}

In [35]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

  9%|▉         | 100/1100 [09:48<1:37:06,  5.83s/it]

{'loss': 3.9566, 'grad_norm': 127.71992492675781, 'learning_rate': 1.8181818181818182e-05, 'epoch': 9.09}


TypeError: 'int' object is not subscriptable

In [None]:
model.save_pretrained("fine-tuned/models/chatPMB-SBERT-pretrained")

In [None]:
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=dev["train_patterns"],
    sentences2=dev["test_patterns"],
    scores=dev["score"],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)
test_evaluator(model)

{'sts-dev_pearson_cosine': 0.6937476450106933,
 'sts-dev_spearman_cosine': 0.11990915114298019,
 'sts-dev_pearson_manhattan': 0.6584539015552953,
 'sts-dev_spearman_manhattan': 0.1382666264786833,
 'sts-dev_pearson_euclidean': 0.6609010834465888,
 'sts-dev_spearman_euclidean': 0.13864217644517413,
 'sts-dev_pearson_dot': 0.7086632913413077,
 'sts-dev_spearman_dot': 0.09030494260026221,
 'sts-dev_pearson_max': 0.7086632913413077,
 'sts-dev_spearman_max': 0.13864217644517413}