# Data Preparation

## Fetch Data from CSV

In [1]:
%pip install numpy pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import pandas as pd

In [3]:
question_path = './data/dataset-question.csv'
answer_path = './data/dataset-answer.csv'

question_attr = ['pattern', 'tag']
answer_attr = ['responses', 'tag']

question_data = pd.read_csv(question_path)[question_attr]
answer_data = pd.read_csv(answer_path)[answer_attr]

# data_answer.head(10)
print("Jumlah pertanyaan: ", question_data.shape)
print("Jumlah pertanyaan: ", answer_data.shape)

Jumlah pertanyaan:  (259, 2)
Jumlah pertanyaan:  (17, 2)


In [4]:
question_data.head(10)

Unnamed: 0,pattern,tag
0,Hi,gr_hi
1,Halo,gr_ha
2,Selamat Pagi,gr_pa
3,Selamat Siang,gr_si
4,Selamat Sore,gr_so
5,Selamat Malam,gr_ma
6,Ada program studi apa saja di UIN Sunan Gunung...,prodi
7,Apa saja jurusan di UIN Bandung,prodi
8,Ada jurusan apa saja?,prodi
9,Mohon informasi mengenai program studi yang te...,prodi


In [5]:
answer_data.head(10)

Unnamed: 0,responses,tag
0,Halo! Apa yang ingin anda cari tahu hari ini?,gr_hi
1,Hi! Apa yang bisa kami bantu?,gr_ha
2,Selamat Pagi! Apa yang bisa kami bantu untuk m...,gr_pa
3,Selamat Siang! Apa yang ingin anda cari tahu?,gr_si
4,Selamat Sore! Apa yang ingin anda ketahui?,gr_so
5,Selamat Malam! Apa yang bisa kami bantu?,gr_ma
6,UIN Sunan Gunung Djati Bandung menyelenggaraka...,prodi
7,SNBP adalah singkatan dari Seleksi Nasional Be...,jm_s1
8,Untuk jurusan yang bisa dipilih pada Seleksi M...,snbp_jur
9,Untuk jurusan yang bisa dipilih pada Seleksi M...,snbt_jur


## Data Cleaning

In [6]:
%pip install Sastrawi transformers tensorflow





[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from transformers import AutoTokenizer
import re
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
SBERT_TOKENIZER = 'firqaaa/indo-sentence-bert-base'

class Preprocess():
    def __init__(self, max_len=128):
        self.stemmer = StemmerFactory().create_stemmer()
        self.stopword = StopWordRemoverFactory().create_stop_word_remover()
        self.tokenizer = AutoTokenizer.from_pretrained(SBERT_TOKENIZER)
        self.max_len = max_len

    def cleaning(self, val):
        val = re.sub(r'\s+', ' ', val)
        val = re.sub("[^a-zA-Z0-9;]", " ", val)
        return val
    
    def casefolding(self, val):
        return str(val).lower()
    
    def stemming(self, val):
        return self.stemmer.stem(str(val))
    
    def stopwordRemoval(self, val):
        return self.stopword.remove(str(val))
    
    def embedding(self, val):
        return self.tokenizer.encode_plus(
            val,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='tf'
        )
    
    def preprocessing(self, sentences):
        for i in range(len(sentences)):
            input = sentences[i]
            input = self.cleaning(input)
            input = self.casefolding(input)
            input = self.stemming(input)
            input = self.stopwordremove(input)
            sentences[i] = input
        return sentences

    def tokenizing(self, sentences):
        input_ids, attention_mask = [], []
        for sentence in sentences:
            output = self.embedding(sentence)
            input_ids.append(output['input_ids'])
            attention_mask.append(output['attention_mask'])
        return {
            'input_ids': tf.convert_to_tensor(
                np.asarray(input_ids).squeeze(),
                dtype=tf.int32
            ),
            'attention_mask': tf.convert_to_tensor(
                np.asarray(attention_mask).squeeze(),
                dtype=tf.int32
            )
        }

    def preprocess_get_token(self, sentences, display_len=20):
        preprocessing = self.preprocessing(sentences)
        tokenized = self.tokenizing(preprocessing)
        return [self.tokenizer.convert_ids_to_tokens(tokenized['input_ids'][i][:display_len]) for i in range(len(sentences))]

preprocess = Preprocess()    

### Casefolding

In [26]:
sample_texts = question_data['pattern']
print(sample_texts[7])
print(sample_texts[8])
print(sample_texts[9])

Apa saja jurusan di UIN Bandung
Ada jurusan apa saja?
Mohon informasi mengenai program studi yang tersedia di UIN Sunan Gunung Djati Bandung.


In [28]:
preprocess = Preprocess()
hasil_casefolding = [preprocess.casefolding(sample) for sample in sample_texts]

bf_cf = sample_texts
af_cf = hasil_casefolding

df = pd.DataFrame({
    'Before Casefolding': bf_cf,
    'After Casefolding': af_cf 
})

df.tail(10)

Unnamed: 0,Before Casefolding,After Casefolding
249,Ada beasiswa di UIN Sunan Gunung Djati Bandung?,ada beasiswa di uin sunan gunung djati bandung?
250,Beasiswa ada di UIN Sunan Gunung Djati Bandung...,beasiswa ada di uin sunan gunung djati bandung...
251,Di UIN Sunan Gunung Djati Bandung ada beasiswa...,di uin sunan gunung djati bandung ada beasiswa...
252,Beasiswa di UIN Sunan Gunung Djati Bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
253,UIN Sunan Gunung Djati Bandung punya beasiswa?,uin sunan gunung djati bandung punya beasiswa?
254,Ada info soal beasiswa di UIN Sunan Gunung Dja...,ada info soal beasiswa di uin sunan gunung dja...
255,Beasiswa di UIN Sunan Gunung Djati Bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
256,"Minta info dong, di UIN Sunan Gunung Djati Ban...","minta info dong, di uin sunan gunung djati ban..."
257,Beasiswa di UIN Sunan Gunung Djati Bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
258,UIN Sunan Gunung Djati Bandung ada beasiswa?,uin sunan gunung djati bandung ada beasiswa?


### Cleaning

In [29]:
preprocess = Preprocess()
hasil_cleaning = [preprocess.cleaning(sample) for sample in hasil_casefolding]
bf_cl = hasil_casefolding
af_cl = hasil_cleaning
df = pd.DataFrame({
    'Before Cleaning': bf_cl,
    'After Cleaning': af_cl
})
df.tail(10)

Unnamed: 0,Before Cleaning,After Cleaning
249,ada beasiswa di uin sunan gunung djati bandung?,ada beasiswa di uin sunan gunung djati bandung
250,beasiswa ada di uin sunan gunung djati bandung...,beasiswa ada di uin sunan gunung djati bandung...
251,di uin sunan gunung djati bandung ada beasiswa...,di uin sunan gunung djati bandung ada beasiswa...
252,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
253,uin sunan gunung djati bandung punya beasiswa?,uin sunan gunung djati bandung punya beasiswa
254,ada info soal beasiswa di uin sunan gunung dja...,ada info soal beasiswa di uin sunan gunung dja...
255,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
256,"minta info dong, di uin sunan gunung djati ban...",minta info dong di uin sunan gunung djati ban...
257,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
258,uin sunan gunung djati bandung ada beasiswa?,uin sunan gunung djati bandung ada beasiswa


### Stemming

In [30]:
preprocess = Preprocess()
hasil_stemming = [preprocess.stemming(sample) for sample in hasil_cleaning]
bf_st = hasil_cleaning
af_st = hasil_stemming
df = pd.DataFrame({
    "Before Stemming" : bf_st,
    "After Stemming" : af_st
})
df.tail(10)

Unnamed: 0,Before Stemming,After Stemming
249,ada beasiswa di uin sunan gunung djati bandung,ada beasiswa di uin sunan gunung djati bandung
250,beasiswa ada di uin sunan gunung djati bandung...,beasiswa ada di uin sunan gunung djati bandung...
251,di uin sunan gunung djati bandung ada beasiswa...,di uin sunan gunung djati bandung ada beasiswa...
252,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
253,uin sunan gunung djati bandung punya beasiswa,uin sunan gunung djati bandung punya beasiswa
254,ada info soal beasiswa di uin sunan gunung dja...,ada info soal beasiswa di uin sunan gunung dja...
255,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
256,minta info dong di uin sunan gunung djati ban...,minta info dong di uin sunan gunung djati band...
257,beasiswa di uin sunan gunung djati bandung ada...,beasiswa di uin sunan gunung djati bandung ada...
258,uin sunan gunung djati bandung ada beasiswa,uin sunan gunung djati bandung ada beasiswa


### Stopword Removal

In [31]:
preprocess = Preprocess()
hasil_stopword_removal = [preprocess.stopwordRemoval(sample) for sample in hasil_stemming]
bf_sr = hasil_stemming
af_sr = hasil_stopword_removal
df = pd.DataFrame({
    "Before Stopword Removal": bf_sr,
    "After Stopword Removal": af_sr
})
df.tail(10)

Unnamed: 0,Before Stopword Removal,After Stopword Removal
249,ada beasiswa di uin sunan gunung djati bandung,beasiswa uin sunan gunung djati bandung
250,beasiswa ada di uin sunan gunung djati bandung...,beasiswa di uin sunan gunung djati bandung
251,di uin sunan gunung djati bandung ada beasiswa...,uin sunan gunung djati bandung beasiswa
252,beasiswa di uin sunan gunung djati bandung ada...,beasiswa uin sunan gunung djati bandung nggak
253,uin sunan gunung djati bandung punya beasiswa,uin sunan gunung djati bandung punya beasiswa
254,ada info soal beasiswa di uin sunan gunung dja...,info soal beasiswa uin sunan gunung djati bandung
255,beasiswa di uin sunan gunung djati bandung ada...,beasiswa uin sunan gunung djati bandung info l...
256,minta info dong di uin sunan gunung djati band...,minta info dong uin sunan gunung djati bandung...
257,beasiswa di uin sunan gunung djati bandung ada...,beasiswa uin sunan gunung djati bandung gak sih
258,uin sunan gunung djati bandung ada beasiswa,uin sunan gunung djati bandung beasiswa


### Filtering (Drop Duplicate Data)

In [87]:
to_be_filtered = pd.DataFrame({
    "pattern": hasil_stopword_removal,
    "tag": question_data['tag']
})
to_be_filtered.head(10)
hasil_filtering = to_be_filtered.drop_duplicates(subset=['pattern'])

hasil_filtering.to_csv('data/preprocessed-data.csv', index=False)

print('Filtered data', hasil_filtering.shape)
hasil_filtering.tail(10)

Filtered data (247, 2)


Unnamed: 0,pattern,tag
248,mohon klarifikasi ada beasiswa tawar uin sunan...,beasiswa
249,beasiswa uin sunan gunung djati bandung,beasiswa
250,beasiswa di uin sunan gunung djati bandung,beasiswa
251,uin sunan gunung djati bandung beasiswa,beasiswa
252,beasiswa uin sunan gunung djati bandung nggak,beasiswa
253,uin sunan gunung djati bandung punya beasiswa,beasiswa
254,info soal beasiswa uin sunan gunung djati bandung,beasiswa
255,beasiswa uin sunan gunung djati bandung info l...,beasiswa
256,minta info dong uin sunan gunung djati bandung...,beasiswa
257,beasiswa uin sunan gunung djati bandung gak sih,beasiswa


### Preprocessed Data

In [89]:
dataset = pd.read_csv('data/preprocessed-data.csv')
print("Total data: ", dataset.shape)
dataset.tail(10)

Total data:  (247, 2)


Unnamed: 0,pattern,tag
237,mohon klarifikasi ada beasiswa tawar uin sunan...,beasiswa
238,beasiswa uin sunan gunung djati bandung,beasiswa
239,beasiswa di uin sunan gunung djati bandung,beasiswa
240,uin sunan gunung djati bandung beasiswa,beasiswa
241,beasiswa uin sunan gunung djati bandung nggak,beasiswa
242,uin sunan gunung djati bandung punya beasiswa,beasiswa
243,info soal beasiswa uin sunan gunung djati bandung,beasiswa
244,beasiswa uin sunan gunung djati bandung info l...,beasiswa
245,minta info dong uin sunan gunung djati bandung...,beasiswa
246,beasiswa uin sunan gunung djati bandung gak sih,beasiswa


## Data Labelling

In [99]:
# Defining labels
label_list = dataset['tag'].drop_duplicates()
LABELS = label_list.tolist()
len(LABELS), LABELS

(17,
 ['gr_hi',
  'gr_ha',
  'gr_pa',
  'gr_si',
  'gr_so',
  'gr_ma',
  'prodi',
  'jm_s1',
  'snbp_jur',
  'snbt_jur',
  'spanpt_jur',
  'umpt_jur',
  'man_jur',
  'jm_s2',
  'portal_pmb',
  'ukt',
  'beasiswa'],
 0           gr_hi
 1           gr_ha
 2           gr_pa
 3           gr_si
 4           gr_so
 5           gr_ma
 6           prodi
 29          jm_s1
 53       snbp_jur
 77       snbt_jur
 101    spanpt_jur
 123      umpt_jur
 146       man_jur
 166         jm_s2
 186    portal_pmb
 207           ukt
 228      beasiswa
 Name: tag, dtype: object)

In [110]:
# Give 0 as initial value
labelled_data = dataset.copy()
labelled_data[LABELS] = 0
labels = pd.Series(label_list)

for i, row in labelled_data.iterrows():
    tag = row['tag'].split(';')
    labelled_data.loc[i, tag] = 1
labelled_data.tail(1)

Unnamed: 0,pattern,tag,gr_hi,gr_ha,gr_pa,gr_si,gr_so,gr_ma,prodi,jm_s1,snbp_jur,snbt_jur,spanpt_jur,umpt_jur,man_jur,jm_s2,portal_pmb,ukt,beasiswa
246,beasiswa uin sunan gunung djati bandung gak sih,beasiswa,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [144]:
labelled_data[LABELS].sum()

gr_hi          1
gr_ha          1
gr_pa          1
gr_si          1
gr_so          1
gr_ma          1
prodi         23
jm_s1         24
snbp_jur      24
snbt_jur      24
spanpt_jur    22
umpt_jur      23
man_jur       20
jm_s2         20
portal_pmb    21
ukt           21
beasiswa      19
dtype: int64

## Model

In [124]:
from sentence_transformers import SentenceTransformer
sentences = []
for sentence in dataset['pattern']:
    sentences.append(sentence)

model = SentenceTransformer('firqaaa/indo-sentence-bert-base')
embeddings = model.encode(sentences)

In [142]:
ask = model.encode('apa saja jurusan yang ada di UIN Sunan Gunung Djati Bandung')
result = np.array(model.similarity(ask, embeddings))
highestIndex = np.argmax(result)
highestVal = np.max(result)
tag = dataset.loc[highestIndex]['tag']
result, highestIndex, highestVal, tag

(array([[ 0.14246768, -0.0332774 , -0.11128758, -0.09922208, -0.10121701,
         -0.09191533,  0.92224646,  0.6029278 ,  0.16917452,  0.82638973,
          0.8563111 ,  0.7417173 ,  0.91295755,  0.8853363 ,  0.89134014,
          0.8555141 ,  0.7679645 ,  0.88985664,  0.87490296,  0.91470546,
          0.8597665 ,  0.9110863 ,  0.8889887 ,  0.82661545,  0.91447175,
          0.78812456,  0.91206825,  0.9384594 ,  0.89888513,  0.8826097 ,
          0.72193253,  0.4582982 ,  0.38770458,  0.8656471 ,  0.8339616 ,
          0.8278644 ,  0.8366456 ,  0.8459759 ,  0.83853406,  0.81521404,
          0.84032124,  0.8268362 ,  0.8315866 ,  0.8888906 ,  0.8608991 ,
          0.8309269 ,  0.8508012 ,  0.85775715,  0.8372898 ,  0.9040065 ,
          0.8950766 ,  0.85438824,  0.8662636 ,  0.7130393 ,  0.78527796,
          0.7273698 ,  0.7037092 ,  0.7910934 ,  0.72961277,  0.7358496 ,
          0.7325377 ,  0.7546714 ,  0.714992  ,  0.7356234 ,  0.5540387 ,
          0.02495227,  0.13145578,  0.

In [143]:
# get confidents higher than 0.5
confidents = []
pos = 0
for item in result:
    for value in item:
        if(value > 0.8):
            print(pos, value)
        pos += 1
        #     confidents['val'].append(value)
        #     confidents['index'].append(pos)
confidents

6 0.92224646
9 0.82638973
10 0.8563111
12 0.91295755
13 0.8853363
14 0.89134014
15 0.8555141
17 0.88985664
18 0.87490296
19 0.91470546
20 0.8597665
21 0.9110863
22 0.8889887
23 0.82661545
24 0.91447175
26 0.91206825
27 0.9384594
28 0.89888513
29 0.8826097
33 0.8656471
34 0.8339616
35 0.8278644
36 0.8366456
37 0.8459759
38 0.83853406
39 0.81521404
40 0.84032124
41 0.8268362
42 0.8315866
43 0.8888906
44 0.8608991
45 0.8309269
46 0.8508012
47 0.85775715
48 0.8372898
49 0.9040065
50 0.8950766
51 0.85438824
52 0.8662636
109 0.80350506
166 0.83263385
167 0.82219356
169 0.80327344
170 0.8187289
173 0.84568435
174 0.8115903
176 0.8878231
177 0.88434905
178 0.89005196
179 0.84026444
180 0.8422848
181 0.86802065
183 0.845219
185 0.91059804
186 0.8310725
187 0.83498335
190 0.8001512
191 0.8252986
194 0.8163625
195 0.8405067
196 0.81038386
207 0.82747316
209 0.80806273
210 0.8700903
213 0.8419852
216 0.818751
217 0.81137717
218 0.8616997
219 0.8225026
220 0.83090043
221 0.8585019
222 0.8309124
223

[]

In [12]:
# from transformers import AutoTokenizer, AutoModel
# import torch


# #Mean Pooling - Take attention mask into account for correct averaging
# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# # Sentences we want sentence embeddings for
# sentences = ["Ibukota Perancis adalah Paris", 
#              "Menara Eifel terletak di Paris, Perancis", 
#              "Pizza adalah makanan khas Italia", 
#              "Saya kuliah di Carneige Mellon University"]


# # Load model from HuggingFace Hub
# tokenizer = AutoTokenizer.from_pretrained('firqaaa/indo-sentence-bert-base')
# model = AutoModel.from_pretrained('firqaaa/indo-sentence-bert-base')

# # Tokenize sentences
# encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# # Compute token embeddings
# with torch.no_grad():
#     model_output = model(**encoded_input)

# # Perform pooling. In this case, mean pooling.
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# print("Sentence embeddings:")
# print(sentence_embeddings)