In [1]:
!pip install numpy==1.19.5 --user



In [2]:
!pip install h5py==2.10.0 --user



In [3]:
!pip install gensim==3.6.0 --user



In [4]:
!pip install deepcut --user



In [5]:
!pip install pythainlp --user



In [6]:
!pip install nltk --user



In [7]:
!pip install flask_ngrok



In [8]:
# MaLSTM import
import deepcut
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re

import itertools
import datetime
import tensorflow
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adadelta
from tensorflow.keras.callbacks import ModelCheckpoint
import difflib

# Question import
import requests

# Category model import
import pickle
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import joblib



In [9]:
from flask_ngrok import run_with_ngrok
from flask import Flask, jsonify, request
import json

# MaLSTM Preparation

In [10]:
#Clean Text
def remove_repettition(text):
    token_list = list(text)
    if len(token_list) > 2:
        filter_list = [True, True]
        n = len(token_list)
        for i in range(2, n):
            if (token_list[i] == token_list[i-1]) and (token_list[i] == token_list[i-2]):
                filter_list.append(False)
            else:
                filter_list.append(True)

        output = ''.join(np.array(token_list)[filter_list])
    else:
        output = text
    return output

def cleansing(text):
    # \t, \n, \xa0 and other special characters. Replace by blank string
    text = re.sub('[\t\n\xa0\"\'!?\/\(\)%\:\=\-\+\*\_ๆ]', '', text)
    
    # Numbers. Replace by space
    text = re.sub('[0-9]', ' ', text)
    
    # Dot. Replace by space
    text = re.sub('[\.]', ' ', text)
    
    # One or more consecutive space. Replace by single space
    text = re.sub('\s+',' ',text)
    
    # Remove 2 or more repettition
    text = remove_repettition(text)
    
    return text

In [11]:
import gensim
wv_model = gensim.models.Word2Vec.load('corpus.th.model')

In [12]:
def word2idx(word):
    index = 0
    index = wv_model.wv.vocab[word].index
    return index

In [13]:
def word_index(listword):
    dataset = []
    vocabulary = dict()
    inverse_vocabulary = ['<unk>']  # '<unk>' will never be used, it is only a placeholder for the [0, 0, ....0] embedding
    for sentence in listword:
        tmp = []
        for w in sentence:
            if w not in wv_model:
                continue

            if w not in vocabulary:
                vocabulary[w] = len(inverse_vocabulary)
                tmp.append(len(inverse_vocabulary))
                inverse_vocabulary.append(w)
            else:
                tmp.append(word2idx(w))
        dataset.append(tmp)
    return np.array(dataset)

In [14]:
# define word embedding
vocab_list = [(k, wv_model.wv[k]) for k, v in wv_model.wv.vocab.items()]
embeddings_matrix = np.zeros((len(wv_model.wv.vocab.items()) + 1, wv_model.vector_size))
for i in range(len(vocab_list)):
    word = vocab_list[i][0]
    embeddings_matrix[i + 1] = vocab_list[i][1]

In [15]:
# vocab_list

In [16]:
EMBEDDING_DIM = 300
embeddings_matrix = 1 * np.random.randn(len(vocab_list) + 1, EMBEDDING_DIM)  # This will be the embedding matrix
embeddings_matrix[0] = 0  # So that the padding will be ignored

In [17]:
# Model variables
n_hidden = 256
batch_size = 128
n_epoch = 100
max_seq_length = 2704

In [18]:
# embeddings_matrix

In [19]:
def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

In [20]:
# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

embedding_layer = Embedding(len(embeddings_matrix), EMBEDDING_DIM, weights=[embeddings_matrix], input_length=max_seq_length, trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model
malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])


malstm.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

# Start training
training_start_time = time()

In [21]:
malstm.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2704)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 2704)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 2704, 300)    9468300     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 256)          570368      embedding[0][0]              

In [22]:
# Load best weight from model
malstm.load_weights('sm_colab_ka.h5')

#Test with Text

In [23]:
def prepare_for_predict(input_questions):
    q_input= []
    cleansing(input_questions)
    tokenized_input_1 =deepcut.tokenize(input_questions)
    for sentence in tokenized_input_1:
      q_input.append(sentence)
    q_input= word_index(tokenized_input_1)
    q_input = pad_sequences(q_input, maxlen=max_seq_length)
    return q_input

In [24]:
max_word = 19219
max_seq_length = 2704

In [25]:
from itertools import chain
def tokenize_text_list(ls):
    """Tokenize list of text"""
    return list(chain.from_iterable([deepcut.tokenize(ls)]))

In [26]:
#Duplicate list
def duplicate(testList, n):
    return [ele for ele in testList for _ in range(n)]

# Question Preparation

In [27]:
cookies = {'user': 'j%3A%7B%22id%22%3A%22root-id%22%2C%22username%22%3A%22root%22%2C%22role%22%3A%22root%22%7D',
          'authorization':'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6InJvb3QtaWQiLCJuYW1lIjoicm9vdGFkbWluIiwidXNlcm5hbWUiOiJyb290Iiwicm9sZSI6InJvb3QiLCJpYXQiOjE2MjA3NTUxMzIsImV4cCI6MTYyMDc5ODMzMn0.-uNCuzyIYxA-4cTWPYlM_hyTfFXZhsgVeoa6Xis3POg'}
url = 'https://natthawat.live/api'

In [28]:
response = requests.get('%s/km/faq' % url, cookies=cookies)
faqs = json.loads(response.text)

response = requests.get('%s/km/category' % url, cookies=cookies)
categories = json.loads(response.text)

In [29]:
questions_data = {}
for category in categories:
    questions_data[category['category']] = []
    for faq in faqs:
        if faq['category']['category'] == category['category']:
            questions_data[category['category']].append(faq['question'])

In [30]:
questions_data

{'หลักสูตร': ['วิชาภาคบังคับที่มีวิชาต่อเนื่อง มีอะไรบ้าง หลักสูตรปกติ?',
  'ถ้าไม่ได้เป็นนักศึกษาชั้นปีที่ 3 สามารถฝึกงานได้หรือไม่？',
  'วิชา CPE223/Digital มีวิชาตัวต่อมั้ยครับ?',
  'วิชา CPE332/Professional issuesมีวิชาตัวต่อมั้ยครับ?',
  'วิศวะคอม/หลักสูตรวิทยาศาสตรบัณฑิต สาขาวิชาวิทยาศาสตร์ข้อมูลสุขภาพ \nปี 2 เทอม 1 ต้องลงเรียนกี่หน่วยกิต? เรียนวิชาอะไรบ้าง?',
  'วิศวะคอม/หลักสูตรวิทยาศาสตรบัณฑิต สาขาวิชาวิทยาศาสตร์ข้อมูลสุขภาพ \nปี 4 เทอม 1 ต้องลงเรียนกี่หน่วยกิต? เรียนวิชาอะไรบ้าง?',
  'วิศวะคอมหลักสูตรวิทยาศาตร์ข้อมูลสุขภาพมีอัตราค่าเรียนเท่าไหร่',
  'วิชา CPE100/Programming มีวิชาตัวต่อมั้ยครับ?',
  'วิชา CPE343/Object orientedมีวิชาตัวต่อมั้ยครับ?',
  'วิชา CPE375/Interactive computingมีวิชาตัวต่อมั้ยครับ?',
  'วิศวะคอม/หลักสูตรวิทยาศาสตรบัณฑิต สาขาวิชาวิทยาศาสตร์ข้อมูลสุขภาพ \nปี 3 เทอม 1 ต้องลงเรียนกี่หน่วยกิต? เรียนวิชาอะไรบ้าง?',
  'วิศวะคอม/หลักสูตรวิทยาศาสตรบัณฑิต สาขาวิชาวิทยาศาสตร์ข้อมูลสุขภาพ \nปี 1 เทอม 1 ต้องลงเรียนกี่หน่วยกิต? เรียนวิชาอะไรบ้าง?',
  'วิศวกรรมคอมพ

# Category Model Preparation

In [31]:
data = pd.read_excel("Category.xlsx")
data

Unnamed: 0,Category
0,หลักสูตร
1,ฝึกงาน
2,ลงทะเบียนเรียน
3,การรับเข้านักศึกษา
4,ทุนการศึกษา
5,คำถามทั่วไป


In [32]:
#Load File
with open('token_text_category.data', 'rb') as filehandle:
    # read the data as binary data stream
    tokenized_texts = pickle.load(filehandle)

In [33]:
def text_to_bow(tokenized_text, vocabulary_):
    n_doc = len(tokenized_text)
    values, row_indices, col_indices = [], [], []
    for r, tokens in enumerate(tokenized_text):
        feature = {}
        for token in tokens:
            word_index = vocabulary_.get(token)
            if word_index is not None:
                if word_index not in feature.keys():
                    feature[word_index] = 1
                else:
                    feature[word_index] += 1
        for c, v in feature.items():
            values.append(v)
            row_indices.append(r)
            col_indices.append(c)
        #print(feature)

    # document-term matrix in sparse CSR format
    X = sp.csr_matrix((values, (row_indices, col_indices)),
                      shape=(n_doc, len(vocabulary_)))
    return X

vocabulary_ = {v: k for k, v in enumerate(set(chain.from_iterable(tokenized_texts)))}
X = text_to_bow(tokenized_texts, vocabulary_)

In [34]:
transformer = TfidfTransformer()
svd_model = TruncatedSVD(n_components=100,
                         algorithm='arpack', n_iter=100)
X_tfidf = transformer.fit_transform(X)
X_svd = svd_model.fit_transform(X_tfidf)

In [35]:
tag = pd.get_dummies(data.Category).columns

In [36]:
#Load Model
logist_models = joblib.load("category_model.pkl")



In [37]:
y_pred = np.argmax(np.vstack([model.predict_proba(X_svd)[:, 1] for model in logist_models]).T, axis=1)
y_pred = np.array([tag[yi] for yi in y_pred])
y_true = data.Category.values
print(tag[0:6])

Index(['การรับเข้านักศึกษา', 'คำถามทั่วไป', 'ทุนการศึกษา', 'ฝึกงาน',
       'ลงทะเบียนเรียน', 'หลักสูตร'],
      dtype='object')


# time total

In [38]:
%%time
#         questions = data["questions"]
inputQuestion = "วิศวคอมมีหลักสูตรอะไรบ้าง"
print('input: ' + inputQuestion)
inputQuestion = cleansing(inputQuestion)

# Category model
tokenized_input_2 = inputQuestion
tokenized_text = deepcut.tokenize(tokenized_input_2)
x = text_to_bow([tokenized_text], vocabulary_)
x_tfidf = transformer.transform(x)
x_svd = svd_model.transform(x_tfidf)
pred = [model.predict_proba(x_svd.reshape(-1, 1).T).ravel()[1] for model in logist_models]

print(list(zip(tag, pred)))
predict_category = max(list(zip(tag, pred)))
max_value = 0
max_category = ''
pred_results = list(zip(tag, pred))

for pred_result in pred_results:
  # print(pred_result)
  if pred_result[1] > max_value:
    max_value = pred_result[1]
    max_category = pred_result[0]
questions=questions_data[max_category]

# maLSTM
newQuestion = {'question1':questions}
questionsDB = pd.DataFrame(data=newQuestion)
tokenized_category =questionsDB.question1.map(tokenize_text_list)
#         print(tokenized_category)

max_word = 19219
max_seq_length = 2704
q_category= []
for sentence in tokenized_category:
    q_category.append(sentence)
q_category = word_index(q_category)
all_Question_categorylen = len(q_category)

tokenized_dup_input_2= duplicate([tokenized_input_2],all_Question_categorylen)
q_user = word_index(tokenized_dup_input_2)
# Split to dicts
M_input = {'left': q_category, 'right': q_user}
# Zero padding
for model_input, side in itertools.product([M_input], ['left', 'right']):
    model_input[side] = pad_sequences(model_input[side], maxlen=max_seq_length)

# Make sure everything is ok
assert M_input['left'].shape == M_input['right'].shape
play_predict = malstm.predict(x=[M_input['left'],  M_input['right']])
max_question_percentage = max(play_predict)
question_index = np.where(play_predict == max_question_percentage)
predictedQuestion = questionsDB.loc[question_index[0][0],'question1']
print('output: '+ predictedQuestion+ ' ' + "%lf" % max_question_percentage)
value = {
  "predictedQuestion": predictedQuestion,
  "similarity": "%lf" % max_question_percentage
}

input: วิศวคอมมีหลักสูตรอะไรบ้าง
[('การรับเข้านักศึกษา', 0.13460088729294664), ('คำถามทั่วไป', 0.2034823474505967), ('ทุนการศึกษา', 0.050783028033583755), ('ฝึกงาน', 0.0294225354977537), ('ลงทะเบียนเรียน', 0.08308430089106947), ('หลักสูตร', 0.41979603189236847)]


  if w not in wv_model:
  return np.array(dataset)


output: Work Integrate Leaning หรือ Wil จำเป็นที่ลงเลือกวิชาเลือกเสรีหรือไม่? 0.635234
Wall time: 22.6 s


# time seperate

In [39]:
%%time
#category model
inputQuestion = "วิศวคอมมีหลักสูตรอะไรบ้าง"
tokenized_input_2= cleansing(inputQuestion)

Wall time: 0 ns


In [40]:
%%time
tokenized_text = deepcut.tokenize(tokenized_input_2)

Wall time: 64.9 ms


In [41]:
%%time
x = text_to_bow([tokenized_text], vocabulary_)
x_tfidf = transformer.transform(x)
x_svd = svd_model.transform(x_tfidf)

Wall time: 968 µs


In [42]:
%%time
pred = [model.predict_proba(x_svd.reshape(-1, 1).T).ravel()[1] for model in logist_models]

Wall time: 998 µs


In [43]:
%%time
# print(list(zip(tag, pred)))
predict_category = max(list(zip(tag, pred)))
max_value = 0
max_category = ''
pred_results = list(zip(tag, pred))

Wall time: 0 ns


In [44]:
%%time
for pred_result in pred_results:
  # print(pred_result)
  if pred_result[1] > max_value:
    max_value = pred_result[1]
    max_category = pred_result[0]

Wall time: 0 ns


In [45]:
%%time
# maLSTM
questions=questions_data[max_category]
newQuestion = {'question1':questions}
questionsDB = pd.DataFrame(data=newQuestion)
tokenized_category =questionsDB.question1.map(tokenize_text_list)
#         print(tokenized_category)

Wall time: 5.01 s


In [46]:
%%time
max_word = 19219
max_seq_length = 2704
q_category= []

Wall time: 0 ns


In [47]:
%%time
for sentence in tokenized_category:
    q_category.append(sentence)

Wall time: 0 ns


In [48]:
%%time
q_category = word_index(q_category)
all_Question_categorylen = len(q_category)

Wall time: 4.99 ms


  if w not in wv_model:
  return np.array(dataset)


In [49]:
%%time
tokenized_dup_input_2 = duplicate([tokenized_input_2],all_Question_categorylen)

Wall time: 0 ns


In [50]:
%%time
q_user = word_index(tokenized_dup_input_2)
# Split to dicts
M_input = {'left': q_category, 'right': q_user}
# Zero padding
for model_input, side in itertools.product([M_input], ['left', 'right']):
    model_input[side] = pad_sequences(model_input[side], maxlen=max_seq_length)

Wall time: 8.98 ms


  if w not in wv_model:


In [51]:
%%time
# Make sure everything is ok
assert M_input['left'].shape == M_input['right'].shape

Wall time: 0 ns


In [52]:
%%time
play_predict = malstm.predict(x=[M_input['left'],  M_input['right']])

Wall time: 15.6 s


In [53]:
%%time
max_question_percentage = max(play_predict)
question_index = np.where(play_predict == max_question_percentage)
predictedQuestion = questionsDB.loc[question_index[0][0],'question1']
# print('output: '+ predictedQuestion+ ' ' + "%lf" % max_question_percentage)

Wall time: 0 ns


In [54]:
%%time
value = {
  "predictedQuestion": predictedQuestion,
  "similarity": "%lf" % max_question_percentage
}

Wall time: 0 ns
