In [None]:
import numpy as np
import pandas as pd
# Load the CSV; row 0 is the header and the first CSV column becomes the index.
df=pd.read_csv('카테고리_상품.csv', header=0, encoding='utf-8', index_col=0)
# Display the loaded frame as the cell output.
df

In [None]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only NumPy-backed signed-integer, unsigned-integer and floating-point
    columns are touched. Every other column (object/string, bool, datetime,
    categorical, pandas extension dtypes, ...) is left exactly as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    * Bounds are inclusive, so a column whose maximum is exactly 127 still
      fits ``int8``.
    * Columns whose min/max is NaN or infinite (empty or all-NaN columns,
      columns containing ``inf``) match no candidate and keep their dtype.
    * Floats are narrowed purely by *range*. ``float16`` keeps only about
      three significant decimal digits, so values may be rounded.
    """
    import numpy as np

    signed_ladder = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ladder = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_ladder = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes are not np.dtype instances; kind 'i'/'u'/'f' selects
        # plain signed / unsigned / floating columns and skips bool, datetime, etc.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "f":
            ladder, limits_of = float_ladder, np.finfo
        elif col_type.kind == "u":
            ladder, limits_of = unsigned_ladder, np.iinfo
        else:
            ladder, limits_of = signed_ladder, np.iinfo

        # Take the first (narrowest) candidate whose representable range covers
        # the observed range; comparisons with NaN are False, so such columns
        # simply fall through unchanged.
        for candidate in ladder:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    return df

In [None]:
# Downcast numeric columns; reduce_mem_usage modifies and returns the frame it is given.
df = reduce_mem_usage(df)
# Peek at the first review text (positional row 0).
df["review"].iloc[0]

In [None]:
# Remove rows that are duplicated across all columns; surviving rows keep their index labels.
df=df.drop_duplicates()

In [None]:
# Summarise dtypes and non-null counts after de-duplication.
df.info()

In [None]:
# Drop rows without review text. dropna() filters row-by-row; dropping by index
# label would also remove non-null rows that happen to share a label with a
# null-review row (labels are not guaranteed unique at this point).
df = df.dropna(subset=['review'])
# Renumber rows 0..n-1 so later label-based lookups line up with positions.
df = df.reset_index(drop=True)
df.info()

In [None]:
from konlpy.tag import Okt

# Instantiate KoNLPy's Okt analyser.
okt = Okt()
# Sanity check: split the review stored under index label 1 into morphemes.
okt.morphs(df['review'].loc[1])

In [None]:
# Keep only Hangul (compatibility jamo consonants ㄱ-ㅎ, vowels ㅏ-ㅣ, syllables 가-힣);
# every other character, including existing whitespace, becomes a single space.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string otherwise.
df['doc'] = df['review'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣]", " ", regex=True)
# Mark empty strings as missing. Assign the result back instead of calling
# replace(..., inplace=True) on the column selection, which may act on a temporary copy.
df['doc'] = df['doc'].replace("", np.nan)
df

In [None]:
# Stop-word setup
#-*- coding: utf-8 -*-
# Read the whitespace-separated stop-word file; the context manager closes the handle.
with open("korean_stopwords.txt", 'r', encoding='utf-8') as f:
    read = f.read()
stop_words = read.split()

name ="상품명"
okt = Okt()
# Start the stop list with the morphemes of the literal above, then add the file entries.
st= okt.morphs(name)
st.extend(stop_words)
st

In [None]:
# Bucket the numeric score into a sentiment label: >= 4 -> '1', == 3 -> '0', <= 2 -> '-1'.
aa = [(df['score'] >= 4), (df['score'] == 3), (df['score'] <= 2)]
# Choices are strings so they share a dtype with the string default; integer choices
# plus a string default have no common dtype and make np.select raise on recent NumPy.
# Later cells compare 'rate' against the strings '1', '0' and '-1'.
bb = ['1', '0', '-1']
# Rows matching none of the conditions (e.g. a missing score) get 'Not Specified'.
df['rate'] = np.select(aa, bb, default='Not Specified')
df

In [None]:
def clean(dfdf):
    """Add a 'word' column holding the stop-word-filtered nouns of each 'doc'.

    Parameters
    ----------
    dfdf : pandas.DataFrame
        Frame with a 'doc' column of cleaned review text. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``reduce_mem_usage`` with a new object-dtype
        'word' column; each cell is a list of nouns not present in the
        module-level stop list ``st``.

    Notes
    -----
    Rows whose 'doc' is not a string (e.g. NaN) get an empty list instead of
    being passed to the tokenizer.
    """
    okt = Okt()
    dfdf = reduce_mem_usage(dfdf)

    # Set membership is O(1); the stop list itself is scanned linearly.
    stop_set = set(st)

    words = [
        [w for w in okt.nouns(doc) if w not in stop_set] if isinstance(doc, str) else []
        for doc in dfdf['doc']
    ]
    # Build the whole column at once, aligned on the existing index, rather than
    # writing list cells one by one through chained ['word'].loc[i] assignment.
    dfdf['word'] = pd.Series(words, index=dfdf.index, dtype=object)
    return dfdf

# Tokenise every row. clean() returns the frame it was handed, so df_clean and df
# refer to the same object and the `del` below also removes 'doc' from df.
df_clean=clean(df)
# The cleaned text is no longer needed once 'word' has been derived from it.
del df_clean["doc"]
df_clean

In [None]:
# Remove 'doc' only if it is still present; an unconditional `del` raises KeyError
# when an earlier cell has already deleted the column.
df_clean = df_clean.drop(columns=["doc"], errors="ignore")
# Reviews labelled '1' or '-1' form the training set; reviews labelled '0' are
# held out to be labelled by the model. Rows with any other label are in neither split.
# .copy() makes both splits independent frames so later writes do not target a slice.
df_train = df_clean[(df_clean['rate'] == '1') | (df_clean['rate'] == '-1')].copy()
df_test = df_clean[df_clean['rate']== '0'].copy()

In [None]:
# Collect the per-review token lists of each split, in row order, as plain Python lists.
X_train = list(df_train['word'])

X_test = list(df_test['word'])

In [None]:
# Report how many samples each split contains.
train_size, test_size = len(X_train), len(X_test)
print("x_train 크기 :", train_size)
print("x_test 크기 :", test_size)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# First pass: fit an unrestricted tokenizer on the training tokens to collect word counts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Vocabulary statistics: a word is "rare" when it occurs fewer than 3 times.
word_counts = tokenizer.word_counts
rare_values = [count for count in word_counts.values() if count < 3]

total_cnt = len(tokenizer.word_index)
rare_cnt = len(rare_values)
total_freq = sum(word_counts.values())
rare_freq = sum(rare_values)

# Non-rare vocabulary size, plus one.
vocab_size = total_cnt - rare_cnt + 1
vocab_size

In [None]:
# Second pass: rebuild the tokenizer with vocab_size as its first positional argument
# (Keras' num_words), refit on the training tokens, then encode both splits as id sequences.
tokenizer = Tokenizer(vocab_size) 
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train) 
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# Label arrays taken from the 'rate' column of each split.
# NOTE(review): df_test was selected as rate == '0', so y_test holds only that label.
y_train = np.array(df_train['rate'])
y_test = np.array(df_test['rate'])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad each split to the length of its own longest sequence (no maxlen is given),
# so X_train and X_test may end up with different widths.
X_train = pad_sequences(X_train)
X_test = pad_sequences(X_test)

In [None]:
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential
from keras.models import load_model

# Embedding (vocab_size x 100) -> LSTM(128) -> single ReLU output unit.
# NOTE(review): no compile(), fit() or load_model() call is visible in this notebook, so
# the predictions made below would come from freshly initialised weights — confirm whether
# a training or loading cell is missing.
# NOTE(review): layers are imported from `keras` while the Tokenizer and pad_sequences come
# from `tensorflow.keras`; presumably one namespace should be used throughout — verify.
model = Sequential()
model.add(Embedding(vocab_size, 100))
model.add(LSTM(128))
model.add(Dense(1, activation='relu'))

In [None]:
# Score every held-out review with a single predict() call instead of re-running
# inference over the whole test set on each loop iteration.
if len(df_test) > 0:
    scores = np.asarray(model.predict(X_test)).reshape(-1) * 100
    # Work on an explicit copy and write through .loc so the labels reliably land in
    # df_test; chained `df_test["rate"].iloc[i] = ...` may update a temporary object.
    df_test = df_test.copy()
    # Same cut-off as before: scaled score >= 0.05 -> '1', below 0.05 -> '-1'.
    # Rows whose score is NaN match neither mask and keep their current label.
    df_test.loc[scores >= 0.05, "rate"] = '1'
    df_test.loc[scores < 0.05, "rate"] = '-1'

In [None]:
# Distribution of the labels now stored on the held-out reviews.
df_test["rate"].value_counts()

In [None]:
# Stack the training split on top of the held-out split; this rebinds the name `df`.
df= pd.concat([df_train,df_test])
df

In [None]:
# Renumber the concatenated rows and discard the token-list column in one chain.
df = df.reset_index(drop=True).drop(columns="word")

In [None]:
# reduce_mem_usage returns the frame it was given, so df1 is the same object as df.
df1=reduce_mem_usage(df)
df1["rate"].value_counts()

In [None]:
# Translate the string labels to their Korean display names ('1' -> 긍정 "positive",
# '-1' -> 부정 "negative") in one vectorised step. Assigning the column back avoids the
# row-by-row chained `df1["rate"].iloc[i] = ...` writes, which may not reach df1.
# Any other label is left unchanged.
df1["rate"] = df1["rate"].replace({'1': '긍정', '-1': '부정'})
df1

In [None]:
# Final label distribution after relabelling.
df1["rate"].value_counts()