In [1]:
filepath = '/content/FullChatbotDataset.xlsx'

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
df = pd.read_excel(filepath)
df.head()

Unnamed: 0,Topic,SubTopic,Intent,Question,Answer
0,Fintech,BlockChain,Define,What is blockchain technology?,Blockchain is a decentralized digital ledger t...
1,Fintech,BlockChain,Define,What are the key components of a blockchain?,"Key components include nodes (computers), bloc..."
2,Fintech,BlockChain,Define,Can you define smart contracts in the context ...,Smart contracts are self-executing contracts w...
3,Fintech,BlockChain,Define,What does decentralization mean in blockchain?,Decentralization refers to distributing contro...
4,Fintech,BlockChain,Define,What is a cryptocurrency and how does it relat...,Cryptocurrency is a digital or virtual currenc...


In [4]:
# Encode the topics into numerical labels
label_map = {"Fintech": 0, "ArTech": 1, "PropTech": 2, "Digitalization": 3, "Irrelevant": 4}
df['Topic'] = df['Topic'].map(label_map)

In [5]:
# Get questions as X and topic as y
X = []
y = []
for rows in df.iterrows():
  X.append(rows[1]['Question'])
  y.append(rows[1]['Topic'])


In [6]:
# initalize variables
vocab_size = 1000
embedding_dim = 16
max_len = 20
oov_token = "<OOV>"
num_class = len(label_map)

In [7]:
# tokenize the words then add padding to the sequences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
padded_sequences = pad_sequences(sequences, truncating='post', maxlen=max_len)

In [8]:
# Data split
from sklearn.model_selection import train_test_split
X = padded_sequences
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Neural Network Model
nn_model = Sequential()
nn_model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
nn_model.add(GlobalAveragePooling1D())
nn_model.add(Dense(24, activation='relu'))
nn_model.add(Dense(128, activation='relu'))
nn_model.add(Dense(24, activation='relu'))
nn_model.add(Dense(num_class, activation='softmax'))

nn_model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

nn_model.summary()



In [10]:
# Train model

epochs = 100
nn_model.fit(X_train, y_train, epochs=epochs)

Epoch 1/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3353 - loss: 1.5925
Epoch 2/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3111 - loss: 1.5053 
Epoch 3/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3348 - loss: 1.4524 
Epoch 4/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3019 - loss: 1.4706 
Epoch 5/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3335 - loss: 1.4247  
Epoch 6/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4061 - loss: 1.3587 
Epoch 7/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5564 - loss: 1.2111 
Epoch 8/100
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6309 - loss: 0.9592 
Epoch 9/100
[1m18/18[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d84dec4ae30>

In [11]:
# predict y_test
y_logits = nn_model.predict(X_test)
y_pred = [np.argmax(predict_value) for predict_value in y_logits]

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


In [13]:
# metrics of the model
from sklearn.metrics import classification_report
print("Performance of Neural Network:")
print(classification_report(y_test, y_pred, target_names=label_map.keys()))

Performance of Neural Network:
                precision    recall  f1-score   support

       Fintech       0.93      0.96      0.95        56
        ArTech       0.88      0.88      0.88        33
      PropTech       0.17      0.14      0.15         7
Digitalization       0.94      1.00      0.97        29
    Irrelevant       1.00      0.85      0.92        20

      accuracy                           0.90       145
     macro avg       0.78      0.77      0.77       145
  weighted avg       0.89      0.90      0.89       145



# SVM still the best option compared to neural network