In [None]:
import numpy as np
from keras.utils import to_categorical
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import json
import random
from keras.models import Sequential
from keras import layers
from keras.layers import Dense, Dropout, Activation, Flatten
import matplotlib.pyplot as plt

In [None]:
!unzip category_preprocess.zip

Archive:  category_preprocess.zip
   creating: category_preprocess/
  inflating: category_preprocess/nhi.json  
  inflating: category_preprocess/liên khoa mắt tai mũi họng răng hàm mặt da liễu.json  
  inflating: category_preprocess/ngoại.json  
  inflating: category_preprocess/sản.json  
  inflating: category_preprocess/lâm sàng cận lâm sàng.json  
  inflating: category_preprocess/nội.json  
  inflating: category_preprocess/truyền nhiễm.json  


In [None]:
# Category name -> integer class id. Each name is also the stem of the matching
# JSON file in category_preprocess/; ids follow the order of the tuple below.
labels = {
    name: class_id
    for class_id, name in enumerate((
        'lâm sàng cận lâm sàng',
        'liên khoa mắt tai mũi họng răng hàm mặt da liễu',
        'ngoại',
        'nhi',
        'nội',
        'sản',
        'truyền nhiễm',
    ))
}

In [None]:
# Directory that holds one JSON file per category (extracted from category_preprocess.zip).
dir_name = 'category_preprocess'

In [None]:
# Collect the path of every category JSON file under dir_name.
# - Paths are built from `dirpath` (the directory os.walk is currently visiting), so files
#   in nested sub-directories resolve correctly instead of being attributed to dir_name.
# - Non-JSON entries are skipped, because the label is later looked up from the file stem.
# - The result is sorted so the processing order does not depend on the filesystem.
filenames = sorted(
    f'{dirpath}/{fp}'
    for dirpath, _dirnames, filenames1 in os.walk(dir_name)
    for fp in filenames1
    if fp.endswith('.json')
)

In [None]:
# Display the collected paths to confirm every category file was found.
filenames

['category_preprocess/truyền nhiễm.json',
 'category_preprocess/nhi.json',
 'category_preprocess/sản.json',
 'category_preprocess/ngoại.json',
 'category_preprocess/lâm sàng cận lâm sàng.json',
 'category_preprocess/liên khoa mắt tai mũi họng răng hàm mặt da liễu.json',
 'category_preprocess/nội.json']

In [None]:
# Sanity check: report how many records each category file contains.
for fpath in filenames:
    # The files hold Vietnamese text; decode explicitly as UTF-8 rather than relying on
    # the platform's default encoding (the notebook also writes its JSON output as UTF-8).
    with open(fpath, 'r', encoding='utf-8') as fp:
        data = json.load(fp)

    print(fpath, len(data))

category_preprocess/truyền nhiễm.json 2118
category_preprocess/nhi.json 1676
category_preprocess/sản.json 7769
category_preprocess/ngoại.json 1839
category_preprocess/lâm sàng cận lâm sàng.json 1976
category_preprocess/liên khoa mắt tai mũi họng răng hàm mặt da liễu.json 2121
category_preprocess/nội.json 3277


In [None]:
# Upper bound used by the split cell when adding synthetic training samples:
# a class receives one extra sample for each of its first
# (TARGET_TRAIN_LEN - train_cutoff) records.
TARGET_TRAIN_LEN = 3000

# class id -> [train_cutoff, used_total].
# After shuffling, records 1..train_cutoff go to the training set, records up to
# used_total go to the test set, and anything beyond used_total is not used.
TRAIN_MIN_MAX = dict(enumerate([
    [1800, 1976],  # class 0
    [2000, 2121],  # class 1
    [1700, 1839],  # class 2
    [1600, 1676],  # class 3
    [3000, 3277],  # class 4
    [3000, 4277],  # class 5
    [2000, 2118],  # class 6
]))

In [None]:
# Build the train/test split one category file at a time, oversampling small classes.
# NOTE(review): no random seed is set in the visible cells, so the split and the
# synthetic samples differ between runs.
train_text = []
train_label = []
test_text = []
test_label = []

for fpath in filenames:
    # The file stem is the category name, which maps to the class id.
    label = labels[fpath.split('/')[-1].split('.')[0]]
    print(fpath, label)

    train_cutoff, used_total = TRAIN_MIN_MAX[label]
    # Number of synthetic training samples to add for this class.
    n_augment = TARGET_TRAIN_LEN - train_cutoff

    # Decode explicitly as UTF-8: the files contain Vietnamese text.
    with open(fpath, 'r', encoding='utf-8') as fp:
        data = json.load(fp)
    random.shuffle(data)

    for count, obj in enumerate(data, start=1):
        if count > used_total:
            break

        # Concatenate title and question into a single document.
        text = obj['title'] + ' ' + obj['question']
        if count <= train_cutoff:
            train_label.append(label)
            train_text.append(text)
        else:
            test_label.append(label)
            test_text.append(text)

        # Classes 4 and 5 are excluded from oversampling. The previous condition,
        # `label != 5 or label != 4`, was true for every label; it only appeared to work
        # because n_augment is 0 for those two classes with the current TRAIN_MIN_MAX.
        if label not in (4, 5) and count <= n_augment:
            # A synthetic sample is two random records glued together. Both are drawn
            # from the first `train_cutoff` records, i.e. from the training portion only.
            obj1 = data[random.randrange(0, train_cutoff)]
            obj2 = data[random.randrange(0, train_cutoff)]
            train_label.append(label)
            train_text.append(obj2['title'] + ' ' + obj2['question'] +
                              ' ' + obj1['title'] + ' ' + obj1['question'])

category_preprocess/truyền nhiễm.json 6
category_preprocess/nhi.json 3
category_preprocess/sản.json 5
category_preprocess/ngoại.json 2
category_preprocess/lâm sàng cận lâm sàng.json 0
category_preprocess/liên khoa mắt tai mũi họng răng hàm mặt da liễu.json 1
category_preprocess/nội.json 4


In [None]:
# Spot-check the first training document and its class id.
print(train_text[0], train_label[0])

virus hp trong dạ_dày xét_nghiệm máu phát_hiện vỉut hp trong da_day 6


In [None]:
# Show the label map, then for each split: number of texts, number of labels,
# and the highest class id present.
print(labels)
print(*map(len, (train_text, train_label)), max(train_label))
print(*map(len, (test_text, test_label)), max(test_label))

{'lâm sàng cận lâm sàng': 0, 'liên khoa mắt tai mũi họng răng hàm mặt da liễu': 1, 'ngoại': 2, 'nhi': 3, 'nội': 4, 'sản': 5, 'truyền nhiễm': 6}
21000 21000 6
2184 2184 6


In [None]:
# Serialize the whole corpus (training texts followed by test texts) as an indented
# JSON array with non-ASCII characters left unescaped, then write it out as UTF-8.
json_object = json.dumps(train_text + test_text, indent=4, ensure_ascii=False)
with open('text_vectorize.json', 'w', encoding='utf8') as fp:
    fp.write(json_object)

In [None]:
# TF-IDF features with the vocabulary capped at 20,000 terms.
# NOTE(review): the vectorizer is fitted on train AND test text, so the vocabulary and
# IDF weights have already seen the evaluation data. Consider fitting on train_text only,
# after confirming nothing else depends on the combined corpus (text_vectorize.json
# holds the same train+test texts).
word_vectorizer = TfidfVectorizer(max_features=20000).fit(train_text + test_text)

x_train, x_test = (word_vectorizer.transform(docs) for docs in (train_text, test_text))

print(type(x_train))  # sparse matrix at this point
# Convert the sparse matrices to dense arrays before handing them to Keras.
x_train, x_test = x_train.toarray(), x_test.toarray()
print(type(x_train))  # numpy.ndarray after densifying
print(x_train.shape, x_test.shape)

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>
(21000, 16331) (2184, 16331)


In [None]:
# One-hot encode the class ids.
# num_classes is pinned to the size of the label map so both splits always get the same
# number of columns. Without it, to_categorical infers the width from the largest id
# present, so a split lacking the highest class would come out narrower.
y_train = to_categorical(train_label, num_classes=len(labels))
y_test = to_categorical(test_label, num_classes=len(labels))
print(y_train.shape, y_test.shape)

(21000, 7) (2184, 7)


In [None]:
from keras.regularizers import l2, l1, l1_l2

In [None]:
def build_model(feature_num, label_num):
    """Build and compile a dense softmax classifier with dropout between layers.

    Args:
        feature_num: Length of each input feature vector.
        label_num: Number of output classes.

    Returns:
        A compiled Sequential model (Adam optimizer, categorical cross-entropy loss,
        accuracy metric).
    """
    net = Sequential([
        # Input layer
        layers.Dense(1024, activation="relu", input_shape=(feature_num,)),
        # Hidden layers
        layers.Dropout(0.95),
        layers.Dense(512, activation="relu"),
        layers.Dropout(0.9),
        layers.Dense(32, activation="relu"),
        layers.Dropout(0.4),
        layers.Dense(16, activation="relu"),
        # Output layer
        layers.Dense(label_num, activation='softmax'),
    ])
    net.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return net

In [None]:
# Instantiate the model built without weight regularizers, sized from the feature matrix
# and the one-hot label matrix, and print its layer summary.
mymodel = build_model(x_train.shape[1], y_train.shape[1])
mymodel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              15950848  
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                16416     
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                5

In [None]:
def build_model_regularizer(feature_num, label_num):
    """Build and compile a dense softmax classifier with L2 weight penalties and dropout.

    Args:
        feature_num: Length of each input feature vector.
        label_num: Number of output classes.

    Returns:
        A compiled Sequential model (Adam optimizer, categorical cross-entropy loss,
        accuracy metric).
    """
    print(feature_num, label_num)
    model = Sequential()
    # Input layer. `shape` must be a tuple: the previous `layers.Input(feature_num,)`
    # passed a bare int, because a trailing comma inside a call does not create a tuple.
    model.add(layers.Input(shape=(feature_num,)))
    model.add(layers.Dense(1024, activation="relu", kernel_regularizer=l2(0.0001)))
    # Hidden layers
    model.add(layers.Dropout(0.8))
    model.add(layers.Dense(256, activation="relu", kernel_regularizer=l2(0.0025)))
    model.add(layers.Dropout(0.6))
    model.add(layers.Dense(32, activation="relu", kernel_regularizer=l2(0.0025)))
    model.add(layers.Dropout(0.4))
    model.add(layers.Dense(16, activation="relu"))
    # Output layer
    model.add(layers.Dense(label_num, activation='softmax'))
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    return model

In [None]:
# Replace `mymodel` with the L2-regularized variant and train it for 40 epochs
# in batches of 32.
# NOTE(review): the test split is passed as validation_data, so the val_* metrics in
# `history` come from the same data that is called "test" elsewhere in this notebook;
# no separate hold-out set is visible here.
mymodel = build_model_regularizer(x_train.shape[1], y_train.shape[1])
# mymodel.summary()
history = mymodel.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=40, batch_size=32)

16331 7
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
# Dump the per-epoch metrics recorded by fit() (the recorded output shows 'loss' and 'accuracy' lists).
print(history.history)

{'loss': [1.7254222631454468, 1.4257522821426392, 1.3531931638717651, 1.3123018741607666, 1.285187005996704, 1.2514296770095825, 1.2383445501327515, 1.2265673875808716, 1.200355887413025, 1.191537618637085, 1.1764121055603027, 1.1766852140426636, 1.162326693534851, 1.1480756998062134, 1.144690752029419, 1.1421871185302734, 1.1246676445007324, 1.1316888332366943, 1.1183007955551147, 1.124900460243225, 1.1189393997192383, 1.1220908164978027, 1.119411826133728, 1.1201292276382446, 1.106142520904541, 1.1122881174087524, 1.1004084348678589, 1.100053071975708, 1.1035252809524536, 1.0976183414459229, 1.1008559465408325, 1.1091275215148926, 1.0991895198822021, 1.1018792390823364, 1.093274474143982, 1.0909608602523804, 1.0872482061386108, 1.0839135646820068, 1.0886529684066772, 1.0812745094299316], 'accuracy': [0.44247618317604065, 0.6538095474243164, 0.7065714001655579, 0.7422381043434143, 0.7576666474342346, 0.7770476341247559, 0.7876190543174744, 0.7971428632736206, 0.8036190271377563, 0.810

In [None]:
# Save the trained model to model.h5.
# NOTE(review): the fitted word_vectorizer is not persisted in any visible cell, yet
# predict() needs it to turn text into features — confirm how it is restored at inference time.
mymodel.save('model.h5')

In [None]:
def predict(q):
    """Classify a single question and print the class probabilities and the top class id.

    Args:
        q: Question text; it is vectorized as a one-document batch with the
            notebook-level `word_vectorizer` and scored by `mymodel`.
    """
    features = word_vectorizer.transform([q]).toarray()
    probabilities = mymodel.predict(features)
    print(probabilities, np.argmax(probabilities))

# Smoke-test the classifier on one sample question; the recorded output below picks class 5.
predict('xét_nghiệm beta dạ bsĩ mới chuyển phôi bviện mỹ_đức ngày phôi trữ ngày loại hôm_nay đi xét_nghiệm beta dc u thai fai bsĩ mà thấy nhìu bà mẹ dc trên u bình_thường hay bình thương cam_on')

[[0.04174808 0.00191995 0.04804575 0.00693974 0.13331932 0.72503
  0.04299719]] 5
