In [6]:
import pandas as pd

# Load the tab-separated ratings files.
# `sep` is passed by keyword: positional use of it is deprecated in pandas 1.3+
# and rejected by pandas 2.x, where every argument after the path is keyword-only.
train_df = pd.read_csv("C:/pill/nsmc-master/ratings_train.txt", sep="\t")
test_df = pd.read_csv("C:/pill/nsmc-master/ratings_test.txt", sep="\t")

In [7]:
from konlpy.tag import Okt
# Module-level Okt tagger instance; tokenize() relies on this global.
okt = Okt()

# Quick sanity check: tag one sample sentence and display the (morpheme, POS) pairs.
sample_sentence = u'흔들리는 꽃들 속에서 네 샴푸향이 느껴진거야'
okt.pos(sample_sentence)

[('흔들리는', 'Verb'),
 ('꽃', 'Noun'),
 ('들', 'Suffix'),
 ('속', 'Noun'),
 ('에서', 'Josa'),
 ('네', 'Noun'),
 ('샴푸', 'Noun'),
 ('향', 'Noun'),
 ('이', 'Josa'),
 ('느껴진거야', 'Verb')]

In [8]:
def tokenize(doc):
    """Split a document into 'morpheme/POS' tokens using the global Okt tagger.

    The tagger is called with its ``norm`` and ``stem`` options enabled, and
    each (morpheme, tag) pair it returns is joined with a slash.
    """
    tagged_pairs = okt.pos(doc, norm=True, stem=True)
    joined_tokens = []
    for pair in tagged_pairs:
        joined_tokens.append('/'.join(pair))
    return joined_tokens

In [9]:
# The 'document' column contains null values; these checks are how that was found.
train_df.isnull().any()
test_df.isnull().any()

# Replace null documents with the empty string in both frames.
for ratings_frame in (train_df, test_df):
    ratings_frame['document'] = ratings_frame['document'].fillna('')

In [10]:
# Tokenizing every row can take a long time.
# Each row is read positionally: column 1 is the document text, column 2 its label.
train_docs = [
    (tokenize(document), label)
    for _row_id, document, label, *_rest in train_df.values
]
test_docs = [
    (tokenize(document), label)
    for _row_id, document, label, *_rest in test_df.values
]

In [11]:
# Show the first (tokens, label) pair of each split, one per line.
print(train_docs[0], test_docs[0], sep='\n')

(['이/Determiner', '걸/Noun', '영화/Noun', '라고/Josa', '만들다/Verb', 'ㅡ/KoreanParticle', ',./Punctuation', 'ㅡ/KoreanParticle'], 0)
(['이/Determiner', '걸/Noun', '영화/Noun', '라고/Josa', '만들다/Verb', 'ㅡ/KoreanParticle', ',./Punctuation', 'ㅡ/KoreanParticle'], 0)


In [12]:
# Flatten the token lists of all training documents into one list.
tokens = []
for doc_tokens, _label in train_docs:
    tokens.extend(doc_tokens)
print("토큰개수:", len(tokens))

토큰개수: 491497


In [13]:
import nltk
# Wrap the flat token list in an nltk.Text to get vocabulary statistics.
text = nltk.Text(tokens, name='NMSC')
corpus_tokens = text.tokens

# Total number of tokens.
print(len(corpus_tokens))

# Number of distinct tokens.
print(len(set(corpus_tokens)))

# The 10 most frequent tokens with their counts.
print(text.vocab().most_common(10))

491497
25592
[('./Punctuation', 19949), ('mg/Alpha', 11515), ('정/Noun', 11280), ('밀리그램/Noun', 8466), ('하다/Verb', 7788), ('캡슐/Noun', 6797), ('이/Josa', 5964), ('영화/Noun', 5481), ('보다/Verb', 5134), ('가/Josa', 4341)]


In [14]:
# Vocabulary size used for the feature vectors (try 10000 if time allows).
FREQUENCY_COUNT = 10000
# Keep only the words of the (word, count) pairs for the most frequent tokens.
selected_words = [
    word for word, _count in text.vocab().most_common(FREQUENCY_COUNT)
]
def term_frequency(doc, vocabulary=None):
    """Count how often each vocabulary word occurs in a tokenized document.

    Args:
        doc: Iterable of hashable tokens, e.g. the list returned by tokenize().
        vocabulary: Ordered sequence of words to count. When omitted, the
            module-level ``selected_words`` list is used.

    Returns:
        A list of ints, one per vocabulary word and in the same order, giving
        the number of occurrences of that word in ``doc``.
    """
    from collections import Counter

    if vocabulary is None:
        vocabulary = selected_words
    # Count the document once instead of rescanning it with doc.count()
    # for every vocabulary word.
    counts = Counter(doc)
    return [counts[word] for word in vocabulary]
# Per-document word counts over the selected vocabulary.
x_train = [term_frequency(doc_tokens) for doc_tokens, _label in train_docs]
x_test = [term_frequency(doc_tokens) for doc_tokens, _label in test_docs]
# Labels (1 or 0), in the same order as the documents.
y_train = [label for _doc_tokens, label in train_docs]
y_test = [label for _doc_tokens, label in test_docs]


In [15]:
import numpy as np
# Convert the Python lists to float32 arrays for Keras.
# Passing dtype to asarray builds the float32 array directly, instead of first
# materialising a full integer array and then copying it with astype().
x_train = np.asarray(x_train, dtype=np.float32)
x_test = np.asarray(x_test, dtype=np.float32)

y_train = np.asarray(y_train, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.float32)

In [16]:
import tensorflow as tf

# Layer stack: two 64-unit ReLU hidden layers on top of a FREQUENCY_COUNT-wide
# input, followed by a single sigmoid output unit.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(FREQUENCY_COUNT,)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [17]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, and
# binary accuracy as the reported metric.
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# releases no longer accept.
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001),
    loss=tf.keras.losses.binary_crossentropy,
    metrics=[tf.keras.metrics.binary_accuracy],
)

In [18]:
# Train on the full training set for 100 epochs in mini-batches of 512.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=512,
    epochs=100,
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x2a3bfd18508>

In [19]:
# Evaluate the trained model on the test arrays and keep the returned values.
results = model.evaluate(x=x_test, y=y_test)




In [20]:
# Display what model.evaluate() returned.
# NOTE(review): presumably [loss, binary_accuracy] given the compile() settings — confirm.
results


[0.008208808489143848, 0.9986448287963867]

In [21]:
# Persist the trained model to an HDF5 file.
model.save(filepath='C:/pill/pill_prescription2.h5')


In [22]:
def predict_pill(review):
    """Classify one text with the trained model and print the verdict.

    The text is tokenized with tokenize(), turned into a term-frequency vector
    with term_frequency(), and scored by the global ``model``. A score above
    0.5 is printed as the positive class, anything else as the negative class,
    together with the confidence as a rounded percentage.

    Args:
        review: The raw text to classify.

    Returns:
        None. The result is printed.
    """
    token = tokenize(review)
    tfq = term_frequency(token)
    # Add a leading batch axis so the model sees a single-sample batch.
    data = np.expand_dims(np.asarray(tfq).astype('float32'), axis=0)
    # predict() yields a (1, 1) array for one sample and one output unit.
    # Index the scalar explicitly: float() on an array with ndim > 0 is
    # deprecated in NumPy 1.25+.
    score = float(model.predict(data)[0, 0])
    if score > 0.5:
        print(f"{review} ==> 약 ({round(score*100)}%)")
    else:
        print(f"{review} ==> 약 아님 ({round((1-score)*100)}%)")

In [23]:
# Try the classifier on a sample drug-name string.
predict_pill(review="테라싸이클 린캅셀250말일리그람(항생제")


테라싸이클 린캅셀250말일리그람(항생제 ==> 약 (100%)
