In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Flatten, Dense, Dot, Lambda, Embedding, TextVectorization
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools
from collections import Counter
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from utils import data_merger, filter_data
import random
import re

In [2]:
# Read the spreadsheet and hand its twenty answer columns (a1 .. a20) to
# data_merger.
data = pd.read_excel('./dataset.xlsx')
df = pd.DataFrame(data)
answer_columns = [df[f'a{n}'] for n in range(1, 21)]
listAnswers = data_merger(answer_columns)

# Concatenate every entry of listAnswers, in order, into one flat list.
combinedAnswers = []
for group_idx in range(len(listAnswers)):
    combinedAnswers.extend(listAnswers[group_idx])

In [3]:
# items = []
# index = 1
# for i in range(0, 110):
#     if i%10 == 0 and i != 0:
#         index += 1
#     if index == 11 :
#         index = 1
#     for j in range(len(listAnswers[i])):
#         items.append(f"({index+5}, '"+listAnswers[i][j]+"'),")


In [4]:
# len(listAnswers)

In [5]:
# Fixed token length that the corpus sequences are padded / truncated to
# when pad_sequences is called below.
max_sequence_length = 35

In [6]:
# Fit a Keras Tokenizer on every answer. num_words=5000 caps the vocabulary
# used when converting text to ids, and words not seen during fitting are
# mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(combinedAnswers)

In [7]:
# Convert each answer to a list of integer token ids, then force every list
# to max_sequence_length: shorter lists are padded at the end and longer
# lists lose their tail (padding='post', truncating='post').
sequences = tokenizer.texts_to_sequences(combinedAnswers)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

In [8]:
# Learn the TF-IDF vocabulary from the answers and encode those same answers
# in one pass, materialising the result as a dense array.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(combinedAnswers).toarray()

In [9]:
# Two-input model that outputs the cosine similarity between a pair of padded
# token-id sequences. It is built only from Input, Flatten and Lambda layers.
# NOTE(review): the similarity is taken over raw token ids rather than learned
# embeddings or the TF-IDF vectors -- confirm this is the intended metric.
input_a = Input(shape=(max_sequence_length,))
input_b = Input(shape=(max_sequence_length,))

# The inputs are already (batch, max_sequence_length); Flatten is retained so
# the layer graph keeps its existing structure.
flatten_a = Flatten()(input_a)
flatten_b = Flatten()(input_b)


def _pairwise_cosine(tensors):
    """Return the row-wise cosine similarity of two equally shaped tensors."""
    vec_a, vec_b = tensors
    dot_product = tf.reduce_sum(tf.multiply(vec_a, vec_b), axis=-1)
    norm_product = tf.norm(vec_a, axis=-1) * tf.norm(vec_b, axis=-1)
    # An all-padding sequence (e.g. an empty string) has zero norm; yield 0 for
    # that pair instead of the NaN a plain division would produce.
    return tf.math.divide_no_nan(dot_product, norm_product)


# The output tensor gets its own name so it does not shadow sklearn's
# `cosine_similarity` imported at the top of the notebook. output_shape is
# left for Keras to infer: the reduction over the last axis yields one score
# per row.
similarity_output = Lambda(_pairwise_cosine)([flatten_a, flatten_b])

model = Model(inputs=[input_a, input_b], outputs=similarity_output)

In [10]:
# Compile the model.
# NOTE(review): the model is built only from Input, Flatten and Lambda layers,
# so there appear to be no weights for the optimizer/loss to train; this call
# looks unnecessary for predict() -- confirm before relying on the loss/metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [11]:
def predict_similarity(strA, strB):
    """Score how similar two sentences are using the notebook's model.

    Both strings are tokenized with the fitted ``tokenizer`` and then padded /
    truncated the same way as the corpus sequences (``max_sequence_length``
    tokens, with padding and truncation applied at the end) before being
    passed to ``model``.

    Args:
        strA: First sentence.
        strB: Second sentence.

    Returns:
        The first (and only) element of ``model.predict`` for the pair.
    """
    # truncating='post' matters: pad_sequences defaults to 'pre', which would
    # cut long inputs from the front while the corpus was cut from the end.
    pad_kwargs = dict(maxlen=max_sequence_length, padding='post', truncating='post')

    padded_seqA = pad_sequences(tokenizer.texts_to_sequences([strA]), **pad_kwargs)
    padded_seqB = pad_sequences(tokenizer.texts_to_sequences([strB]), **pad_kwargs)

    # One pair in, one score out; unwrap the single-element batch.
    similarity = model.predict([padded_seqA, padded_seqB])[0]

    return similarity

In [12]:
# Smoke test: sentence_b is a leading fragment of sentence_a, so the pair
# shares its first tokens and differs only in the tail.
sentence_a = "Menurut saya, kurangnyangan anggaran yang terbatas, yang dapat memengaruhi kualitas pendidikan yang mereka tawarkan."
sentence_b = "Menurut saya, kurangnyangan anggaran yang terbatas, yang "

# Run the pair through the model and show the resulting score.
similarity_score = predict_similarity(sentence_a, sentence_b)
print(f"Similarity Score: {similarity_score}")

Similarity Score: 0.6474320292472839


In [14]:
# Persist the model to 'similarity.h5'.
# NOTE(review): the model contains a Lambda layer; Keras stores Lambda layers
# by serializing Python bytecode, which may not reload in a different
# environment -- confirm the file loads where it is consumed.
model.save('similarity.h5')