In [2]:
!pip install deepcut --user



In [3]:
!pip install flask_ngrok --user



In [1]:
import deepcut
import pandas as pd
import numpy as np
from itertools import chain
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
import joblib
import pickle
from flask_ngrok import run_with_ngrok
from flask import Flask
import json

In [2]:
# Load the category spreadsheet; its `Category` column supplies the label set and y_true below.
data = pd.read_excel("Category.xlsx")
data

Unnamed: 0,Category
0,หลักสูตร
1,ฝึกงาน
2,ลงทะเบียนเรียน
3,การรับเข้านักศึกษา
4,ทุนการศึกษา
5,คำถามทั่วไป


In [3]:
# Load the pickled collection of token sequences (one per document) used to build the vocabulary.
# NOTE(review): pickle.load can execute arbitrary code — only open files from a trusted source.
with open('token_text_category.data', 'rb') as filehandle:
    # read the data as binary data stream
    tokenized_texts = pickle.load(filehandle)

In [4]:
# tokenized_texts

In [5]:
def text_to_bow(tokenized_text, vocabulary_):
    """Build a sparse document-term count matrix.

    Parameters
    ----------
    tokenized_text : sequence of token sequences, one entry per document.
    vocabulary_ : dict mapping each known token to its column index.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (len(tokenized_text), len(vocabulary_)).
    Entry (r, c) is the number of times the token mapped to column c occurs
    in document r. Tokens that are absent from ``vocabulary_`` are ignored.
    """
    counts_flat, rows, cols = [], [], []
    for doc_idx, doc_tokens in enumerate(tokenized_text):
        # Per-document tally: column index -> occurrence count.
        tally = {}
        for col in map(vocabulary_.get, doc_tokens):
            if col is None:
                continue  # out-of-vocabulary token
            tally[col] = tally.get(col, 0) + 1
        cols.extend(tally.keys())
        counts_flat.extend(tally.values())
        rows.extend([doc_idx] * len(tally))

    # Assemble the (value, (row, col)) triplets into CSR format.
    return sp.csr_matrix((counts_flat, (rows, cols)),
                         shape=(len(tokenized_text), len(vocabulary_)))

# Sort the unique tokens before numbering them. Iterating a bare set of strings follows hash
# order, which changes between interpreter runs, so the token -> column mapping (and the
# column layout of X) would otherwise differ every time the kernel is restarted.
vocabulary_ = {token: index
               for index, token in enumerate(sorted(set(chain.from_iterable(tokenized_texts))))}
X = text_to_bow(tokenized_texts, vocabulary_)

In [6]:
# TF-IDF weighting of the count matrix X, followed by a 100-component truncated SVD.
# NOTE(review): both transforms are refit here rather than loaded from disk — confirm that
# category_model.pkl was trained on features produced by the same fit.
transformer = TfidfTransformer()
# random_state makes the solver's initialisation reproducible across runs.
# n_iter only affects the 'randomized' solver; with algorithm='arpack' it is ignored.
svd_model = TruncatedSVD(n_components=100,
                         algorithm='arpack', n_iter=100, random_state=0)
X_tfidf = transformer.fit_transform(X)
X_svd = svd_model.fit_transform(X_tfidf)

In [7]:
# Category labels, taken as the column Index of the one-hot encoding of `Category`.
# NOTE(review): later zipped positionally with logist_models — confirm the models were saved in this order.
tag = pd.get_dummies(data.Category).columns

In [8]:
# Load the pre-trained per-category classifiers.
# NOTE(review): joblib.load unpickles the file — only load artifacts from a trusted source.
logist_models = joblib.load("category_model.pkl")



In [9]:
# Inspect what was loaded from category_model.pkl.
logist_models

[LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression(),
 LogisticRegression()]

In [10]:
# Score every training document with each per-category model: column j holds model j's
# positive-class probability, and the predicted label is the column with the highest value.
positive_probs = np.column_stack([clf.predict_proba(X_svd)[:, 1] for clf in logist_models])
y_pred = np.array([tag[best] for best in positive_probs.argmax(axis=1)])
y_true = data.Category.values
print(tag[0:6])

Index(['การรับเข้านักศึกษา', 'คำถามทั่วไป', 'ทุนการศึกษา', 'ฝึกงาน',
       'ลงทะเบียนเรียน', 'หลักสูตร'],
      dtype='object')


# time total

In [11]:
%%time
text =  'วิศวคอมพิวเตอร์มีหลักสูตรอะไรบ้าง'
tokenized_text = deepcut.tokenize(text)
x = text_to_bow([tokenized_text], vocabulary_)
x_tfidf = transformer.transform(x)
x_svd = svd_model.transform(x_tfidf)
pred = [model.predict_proba(x_svd.reshape(-1, 1).T).ravel()[1] for model in logist_models]
print(list(zip(tag, pred)))
predict_category = max(list(zip(tag, pred)))
max_value = 0
max_category = ''
pred_results = list(zip(tag, pred))
for pred_result in pred_results:
  # print(pred_result)
  if pred_result[1] > max_value:
    max_value = pred_result[1]
    max_category = pred_result[0]
print(max_category, max_value)
value = {
  "category": max_category,
  "accuracy": max_value
}

[('การรับเข้านักศึกษา', 0.1251187945483281), ('คำถามทั่วไป', 0.14597382228539618), ('ทุนการศึกษา', 0.05706560577766554), ('ฝึกงาน', 0.031089886042377998), ('ลงทะเบียนเรียน', 0.09189157863959775), ('หลักสูตร', 0.4811685284649409)]
หลักสูตร 0.4811685284649409
Wall time: 937 ms


# time separate

In [12]:
%%time
# Timed in isolation: word segmentation of the sample question with deepcut.
text =  'วิศวคอมพิวเตอร์มีหลักสูตรอะไรบ้าง'
tokenized_text = deepcut.tokenize(text)

Wall time: 56.4 ms


In [13]:
%%time
# Timed in isolation: bag-of-words row for the single tokenized question.
x = text_to_bow([tokenized_text], vocabulary_)

Wall time: 1.01 ms


In [14]:
%%time
# Timed in isolation: apply the already-fitted TF-IDF weighting.
x_tfidf = transformer.transform(x)

Wall time: 0 ns


In [15]:
%%time
# Timed in isolation: project the TF-IDF row with the already-fitted SVD.
x_svd = svd_model.transform(x_tfidf)

Wall time: 1.02 ms


In [16]:
%%time
pred = [model.predict_proba(x_svd.reshape(-1, 1).T).ravel()[1] for model in logist_models]

Wall time: 978 µs


In [17]:
%%time
# Timed in isolation: print every (category, probability) pair.
print(list(zip(tag, pred)))

[('การรับเข้านักศึกษา', 0.1251187945483281), ('คำถามทั่วไป', 0.14597382228539618), ('ทุนการศึกษา', 0.05706560577766554), ('ฝึกงาน', 0.031089886042377998), ('ลงทะเบียนเรียน', 0.09189157863959775), ('หลักสูตร', 0.4811685284649409)]
Wall time: 0 ns


In [18]:
%%time
predict_category = max(list(zip(tag, pred)))

Wall time: 0 ns


In [19]:
%%time
# Timed in isolation: reset the running best score and label used by the selection loop.
max_value = 0
max_category = ''

Wall time: 0 ns


In [20]:
%%time
# Timed in isolation: pair each category label with its predicted probability.
pred_results = list(zip(tag, pred))

Wall time: 0 ns


In [21]:
%%time
for pred_result in pred_results:
  # print(pred_result)
  if pred_result[1] > max_value:
    max_value = pred_result[1]
    max_category = pred_result[0]
print(max_category, max_value)

หลักสูตร 0.4811685284649409
Wall time: 0 ns


In [22]:
%%time
# Timed in isolation: assemble the result payload.
value = {
  "category": max_category,
  # NOTE(review): max_value is the winning model's predicted probability, not an accuracy figure.
  "accuracy": max_value
}

Wall time: 0 ns
