In [220]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [221]:
train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)

In [222]:
train = train.dropna()
test = test.dropna()

In [223]:
print(train.shape, test.shape)
train.head()

(24000, 2) (6000, 2)


Unnamed: 0,x,y
0,"받다,홀리데이,구매하다,개인적,케이스,디자인,아쉽다,립스틱,색깔,예쁘다",디자인
1,"좋다,가격,구매,잘하다,같다",가격
2,"전,이니,노세범,쓰다,퍼프,작다,불편하다,퍼프,적당하다,용량,많다,케이스,고급,지다...",용량
3,"색깔,예쁘다,맘,들다",디자인
4,"화,신청,하다,신정,끼다,토요일,오전,받다,생각,늦다,디자인,맘,들다,처음,바르다,...",디자인


In [224]:
train_corpus = []
for words in train['x']:
    a = []
    a.append(words)
    train_corpus.append(a)
    
test_corpus = []
for words in test['x']:
    a = []
    a.append(words)
    test_corpus.append(a)

In [225]:
train_corpus[:2]

[['받다,홀리데이,구매하다,개인적,케이스,디자인,아쉽다,립스틱,색깔,예쁘다'], ['좋다,가격,구매,잘하다,같다']]

In [226]:
train_token_review = []
for corpus in train_corpus:
    c = corpus[0].split(",")
    train_token_review.append(c)
        
test_token_review = []
for corpus in test_corpus:
    c = corpus[0].split(",")
    test_token_review.append(c)

In [248]:
train_token_review[:2]

[['받다', '홀리데이', '구매하다', '개인적', '케이스', '디자인', '아쉽다', '립스틱', '색깔', '예쁘다'],
 ['좋다', '가격', '구매', '잘하다', '같다']]

In [227]:
from collections import namedtuple

TaggedDocument = namedtuple('TaggedDocument', 'words tags')

In [253]:
from gensim.models.doc2vec import Doc2Vec

tagged_train_docs = [TaggedDocument(d, c) for d, c in zip(train_token_review, train['y'].values)]
tagged_test_docs = [TaggedDocument(d, c) for d, c in zip(test_token_review, test['y'].values)]

In [254]:
tagged_train_docs[:2]

[TaggedDocument(words=['받다', '홀리데이', '구매하다', '개인적', '케이스', '디자인', '아쉽다', '립스틱', '색깔', '예쁘다'], tags='디자인'),
 TaggedDocument(words=['좋다', '가격', '구매', '잘하다', '같다'], tags='가격')]

In [255]:
tagged_test_docs[:2]

[TaggedDocument(words=['은은하다', '고급스럽다', '좋다', '색', '자연스럽다', '고급', '지다', '싸다', '잘산거', '같다', '쓰다', '마켓', '아이폰', '앱', '작성'], tags='디자인'),
 TaggedDocument(words=['제품', '받다', '색상', '예쁘다', '다르다', '사람', '사다', '반품', '하다', '보내다', '반품하다', '팔수', '이다', '그렇다', '제품', '확인하다', '보내다', '하다', '아니다', '색상', '손가락', '테스트하다', '제품', '인터넷', '물건', '사다', '정도', '확인', '하다', '보내다'], tags='색상')]

In [256]:
doc_vectorizer = Doc2Vec(dm=0,
                         min_count=3,
                         vector_size=256, 
                         window=5,
                         negative=20)

In [257]:
doc_vectorizer.build_vocab(tagged_train_docs)

In [258]:
from time import time

start = time()
for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = train_model.alpha # fix the learning rate, no decay
end = time()
print("During Time: {}".format(end-start))

  """


During Time: 112.86632919311523


In [259]:
X_train = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
y_train = [doc.tags for doc in tagged_train_docs]

In [260]:
X_test = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_test_docs]
y_test = [doc.tags for doc in tagged_test_docs]

In [261]:
X_train_np = np.asarray(X_train)
X_test_np = np.array(X_test)

In [285]:
y_train_np = np.asarray(y_train, dtype=str)
y_test_np = np.asarray(y_test, dtype=str)

In [310]:
print(X_train_np.shape)
print(X_test_np.shape)

(24000, 256)
(6000, 256)


In [311]:
print(y_train_np.shape)
print(y_test_np.shape)

(24000,)
(6000,)


In [312]:
y_test_np

array(['디자인', '색상', '감촉', ..., '향기', '용량', '색상'], dtype='<U3')

In [319]:
target_names = list(set(y_test_np))
target_mapping_table = {}
for idx, names in enumerate(target_names):
    target_mapping_table[names] = idx

In [320]:
target_mapping_table

{'가격': 3, '감촉': 5, '디자인': 4, '색상': 0, '용량': 1, '향기': 2}

In [325]:
y_train_np

array(['디자인', '가격', '용량', ..., '용량', '디자인', '가격'], dtype='<U3')

In [None]:
print(y_train_np)
print(y_test_np)
print(y_train_pred)
print(y_test_pred)

In [342]:
train_prediction = fnn_clf.predict(X_train_np)
test_prediction = fnn_clf.predict(X_test_np)

train_y_pred = []
for i in range(len(train_prediction)):
    train_y_pred.append(np.argmax(train_prediction[i]))

test_y_pred = []
for i in range(len(test_prediction)):
    test_y_pred.append(np.argmax(test_prediction[i]))

In [345]:
tr_y_pred = le.inverse_transform(train_y_pred)
te_y_pred = le.inverse_transform(test_y_pred)

  if diff:
  if diff:


In [346]:
np.array(tr_y_pred)

array(['디자인', '가격', '용량', ..., '용량', '향기', '색상'], dtype='<U3')

In [321]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_label = le.fit_transform(y_train_np)

In [322]:
y_label

array([2, 0, 4, ..., 4, 2, 0])

In [329]:
b = le.inverse_transform(y_label)

  if diff:


In [327]:
y_train_np == b

array([ True,  True,  True, ...,  True,  True,  True])

In [337]:
b.shape

(24000,)

In [338]:
y_test_np.shape

(6000,)

In [347]:
train_df = pd.DataFrame(confusion_matrix(tr_y_pred, b),
                            index=target_names,
                            columns=target_names)

In [348]:
train_df

Unnamed: 0,색상,용량,향기,가격,디자인,감촉
색상,3009,389,284,226,222,264
용량,268,2573,159,339,164,422
향기,160,88,2543,468,57,177
가격,181,348,685,2649,123,179
디자인,210,169,89,138,3297,163
감촉,172,433,240,180,137,2795


In [349]:
print(len(y_train))
print(len(train_y_pred))
print(target_names)

24000
24000
['색상', '용량', '향기', '가격', '디자인', '감촉']


In [293]:
import tensorflow as tf

fnn_clf = tf.keras.Sequential()
fnn_clf.add(tf.keras.layers.Dense(128, activation='relu', input_shape=(len(X_train[0]), )))
fnn_clf.add(tf.keras.layers.Dense(128, activation='relu'))
fnn_clf.add(tf.keras.layers.Dense(6, activation='softmax'))

fnn_clf.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['sparse_categorical_accuracy'])

print(X_train)
print(y_train)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [290]:
fnn_clf

<tensorflow.python.keras.engine.sequential.Sequential at 0x7f088446f5c0>

In [313]:
fnn_clf.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_10 (Dense)             (None, 128)               16512     
_________________________________________________________________
dense_11 (Dense)             (None, 6)                 774       
Total params: 50,182
Trainable params: 50,182
Non-trainable params: 0
_________________________________________________________________


In [291]:
y_train_np

array(['디자인', '가격', '용량', ..., '용량', '디자인', '가격'], dtype='<U3')

In [318]:
fnn_clf.fit(X_train_np, y_label, epochs=10, steps_per_epoch=30)

Train on 24000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0820501da0>

In [None]:
# save the model to disk
fnn_clf.save('C:/Users/daumsoft/PycharmProjects/visualization/model/fnn_model.h5')

train_prediction = fnn_clf.predict(X_train)
test_prediction = fnn_clf.predict(X_test)

train_y_pred = []
for i in range(len(train_prediction)):
    train_y_pred.append(np.argmax(train_prediction[i]))

test_y_pred = []
for i in range(len(test_prediction)):
    test_y_pred.append(np.argmax(test_prediction[i]))