In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there (Colab only).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive/감성분석

In [None]:
!pip install konlpy

In [None]:
!pip install sentencepiece

# 라이브러리 불러오기

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import urllib.request
from konlpy.tag import Okt
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 텍스트 전처리 함수 만들기

In [None]:
# Keep only Hangul, digits and spaces
def hangul_only(df : pd.DataFrame, convert_column : str) -> pd.DataFrame:
    """Build a cleaned ``sentence`` column and drop rows that end up unusable.

    Every character that is not Hangul (jamo or syllable), a digit or a space is
    removed from ``df[convert_column]``; leading spaces are then stripped and
    sentences that became empty are turned into NaN.

    Args:
        df: input frame. A ``sentence`` column is written onto it.
        convert_column: name of the column holding the raw text.

    Returns:
        ``df`` without any row that contains a missing value in any column.
    """
    cleaned = (
        df[convert_column]
        # regex = True is required: recent pandas treats the pattern as a literal by default.
        .str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣0-9 ]", "", regex = True)
        # .str.replace works on substrings; Series.replace would only match whole values.
        .str.replace(r"^ +", "", regex = True)
        .replace("", np.nan)
    )
    df['sentence'] = cleaned
    return df.dropna(how = 'any')

# Morphological analysis with Mecab: drop particle/ending morphemes, keep everything else
def mecab_preprocessing(df : pd.DataFrame, convert_column : str) -> pd.DataFrame:
    """Add a ``preprocessed_sentence`` column with particles and endings removed.

    Each value of ``df['sentence']`` is POS-tagged with Mecab; morphemes whose tag
    is in ``tags`` (the J* particle tags and E* ending tags listed below) are
    discarded and the remaining morphemes are joined with single spaces.

    Args:
        df: frame with a ``sentence`` column. It is modified in place.
        convert_column: accepted for interface compatibility; the text is always
            read from ``df['sentence']``.

    Returns:
        The same ``df`` with the ``preprocessed_sentence`` column filled in.
    """
    tags = ['JK', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC', 'EP', 'EF', 'EC', 'ETN', 'ETM']
    excluded_tags = frozenset(tags)

    # Imported lazily so the notebook still loads when Mecab is not installed.
    from konlpy.tag import Mecab

    mecab = Mecab()

    # Build the whole column in one pass and assign it positionally: no per-row
    # .loc writes, and no surprises when index labels are not unique.
    df['preprocessed_sentence'] = [
        ' '.join(morph for morph, tag in mecab.pos(sentence) if tag not in excluded_tags)
        for sentence in tqdm(df['sentence'], desc = 'removing josa', total = len(df))
    ]

    return df

# Trim a frame down to the columns used downstream
def get_data_only_review(df : pd.DataFrame) -> pd.DataFrame:
    """Return only the ``sentence``, ``preprocessed_sentence`` and ``label`` columns, in that order."""
    wanted_columns = ['sentence', 'preprocessed_sentence', 'label']
    return df[wanted_columns]

# Stack the individual frames on top of each other
def concat_individual_data(data_list : list) -> pd.DataFrame:
    """Concatenate several DataFrames row-wise into one frame with a fresh 0..n-1 index.

    Args:
        data_list: non-empty sequence of DataFrames.

    Returns:
        A new DataFrame; the inputs are left untouched.

    Raises:
        IndexError: if ``data_list`` is empty (same exception type as before).
    """
    frames = list(data_list)
    if not frames:
        raise IndexError('data_list must contain at least one DataFrame')
    # One concat call instead of re-concatenating the growing frame once per input.
    return pd.concat(frames, axis = 0, ignore_index = True)

In [None]:
## Movie-review data; the 'index' and 'id' columns are discarded right after loading
review_df = pd.read_csv('mecab_data.csv').drop(columns = ['index', 'id'])
review_df.head()

In [None]:
shopping_df = pd.read_csv('shopping_review.csv') ## shopping review data
shopping_df.head()

In [None]:
# Binarise the rating: 4 and above -> 1, anything lower -> 0, then show the class counts.
is_positive = shopping_df['ratings'] >= 4
shopping_df['label'] = is_positive.astype(int)
shopping_df['label'].value_counts()

In [None]:
## Keep only the columns needed downstream (sentence, preprocessed_sentence, label)
review_data = get_data_only_review(review_df)
shopping_data = get_data_only_review(shopping_df)

## Stack the movie-review and shopping-review frames
train_data = concat_individual_data([review_data, shopping_data])
print('합쳐진 데이터의 개수 : {}'.format(len(train_data)))
print('셔플 전 처음 세 개 : ', train_data.head(3))

## Shuffle with a fixed seed so that Restart & Run All reproduces the same row order
## (2022 is the seed this notebook also passes to train_test_split).
train_data = train_data.sample(frac = 1, random_state = 2022).reset_index(drop = True)
print('셔플 후 처음 세 개 : ', train_data.head(3))

In [None]:
# (rows, columns) of the combined frame
train_data.shape

In [None]:
## Bar chart of how many samples each class label has
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
label_counts = train_data['label'].value_counts()
plt.bar(label_counts.index, label_counts, color = 'skyblue')
plt.title('label distribution of data')
plt.show()

In [None]:
# Character length of every preprocessed sentence (values are cast to str first).
document_length = list(map(len, train_data['preprocessed_sentence'].astype(str)))
plt.hist(document_length, bins = 30)
plt.title('document length distribution')

In [None]:
# Summary statistics of the document lengths computed in the previous cell.
numpy_document_length = np.array(document_length)

length_mean, length_std = numpy_document_length.mean(), numpy_document_length.std()
print('전체 문서의 (평균, 표준편차) : ({}, {}):'.format(length_mean, length_std))

length_min, length_max = numpy_document_length.min(), numpy_document_length.max()
print('전체 문서의 (최소, 최대) : ({}, {})'.format(length_min, length_max))

print('전체 문서의 개수 : {}'.format(len(document_length)))

longer_than_three = numpy_document_length[numpy_document_length > 3]
print('문서의 길이가 3보다 큰 것의 개수 : {}'.format(len(longer_than_three)))

In [None]:
# Dump one raw sentence per line; the SentencePiece trainer reads this file as its input.
with open('review_1120.txt', mode = 'w', encoding = 'utf8') as corpus_file:
    corpus_lines = train_data['sentence']
    corpus_file.write('\n'.join(corpus_lines))

In [None]:
import sentencepiece as spm

sp = spm.SentencePieceProcessor()

# Train a BPE model on the dumped corpus; outputs are prefixed with "tokenizer".
trainer_flags = [
    '--input=review_1120.txt',
    '--model_prefix=tokenizer',
    '--vocab_size=32000',
    '--model_type=bpe',
    '--max_sentence_length=9999',
]
spm.SentencePieceTrainer.Train(' '.join(trainer_flags))

In [None]:
import csv
import sentencepiece as spm

# Peek at the learned vocabulary (tab-separated, no header row, quote characters kept literal).
vocab_list = pd.read_csv('tokenizer.vocab', sep = '\t', header = None, quoting = csv.QUOTE_NONE)
print(vocab_list.head(10))

# Load the trained model into a fresh processor.
vocab_file = "tokenizer.model"
sp = spm.SentencePieceProcessor()
sp.load(vocab_file)

# Sanity check on a few sample sentences: raw text, pieces, ids, then a blank line.
lines = [
  "뭐 이딴 것도 영화냐.",
  "진짜 최고의 영화입니다 ㅋㅋ",
  "커버력 좋아서 투명한 피부로 보이게 만들어 줌 피지도 잘 가려줌 밀착력도 있고 보송보송함"
]
for line in lines:
  for rendered in (line, sp.encode_as_pieces(line), sp.encode_as_ids(line)):
    print(rendered)
  print()

In [None]:
# Store each sentence as its SentencePiece pieces joined by single spaces.
train_data['sentence_tokenized'] = train_data['sentence'].map(lambda line : ' '.join(sp.encode_as_pieces(line)))

In [None]:
# Number of space-separated chunks per sentence, i.e. number of spaces + 1; then list the one-chunk rows.
train_data['공백 길이'] = train_data['sentence'].map(lambda text : text.count(' ') + 1)
train_data[train_data['공백 길이'] == 1]

In [None]:
# Keep only sentences made of at least three space-separated chunks.
has_enough_chunks = train_data['공백 길이'].ge(3)
train = train_data.loc[has_enough_chunks].reset_index(drop = True)
train

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Stratified 80/20 split with a fixed seed; X keeps every column except the label.
y = train['label']
X = train.drop(columns = 'label')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 2022)
# Not the last expression of the cell, so this tuple is evaluated but not displayed.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Turn the space-joined piece strings back into lists of tokens.
train_X = [sentence.split(' ') for sentence in tqdm(X_train['sentence_tokenized'])]
test_X = [sentence.split(' ') for sentence in tqdm(X_test['sentence_tokenized'])]

In [None]:
# First pass: fit on all training tokens (no vocabulary cap) so token frequencies can be inspected.
tokenizer = Tokenizer(oov_token = '<OOV>') ## map the tokens to integer ids
tokenizer.fit_on_texts(train_X)
print(tokenizer.word_index)

In [None]:
# Persist the tokenizer object as it stands at this point in the notebook.
with open('tokenizer_1120.pickle', mode = 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
threshold = 20
token_counts = tokenizer.word_counts   # token -> number of occurrences seen while fitting
total_cnt = len(tokenizer.word_index)  # number of entries in the vocabulary

# Occurrence counts of the tokens that appear fewer than `threshold` times.
rare_occurrences = [count for count in token_counts.values() if count < threshold]

rare_cnt = len(rare_occurrences)          # how many distinct rare tokens there are
rare_freq = sum(rare_occurrences)         # how many occurrences those rare tokens account for
total_freq = sum(token_counts.values())   # occurrences of all tokens together

print('단어 집합(vocabulary)의 크기 :',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수: %s'%(threshold - 1, rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 등장 빈도 비율:", (rare_freq / total_freq)*100)

In [None]:
# Vocabulary size after leaving out the tokens seen fewer than `threshold` times.
# + 1 accounts for index 0, which is reserved for padding.
vocab_size = total_cnt - rare_cnt + 1
print('단어 집합의 크기 :',vocab_size)

# Re-fit with the vocabulary capped at vocab_size, then integer-encode both splits.
tokenizer = Tokenizer(num_words = vocab_size, oov_token = '<OOV>')
tokenizer.fit_on_texts(train_X)
train_X, test_X = (tokenizer.texts_to_sequences(texts) for texts in (train_X, test_X))

In [None]:
# Convert both label containers to NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

In [None]:
# Length statistics of the integer-encoded training reviews.
review_lengths = [len(review) for review in train_X]
print('리뷰의 최대 길이 :',max(review_lengths))
print('리뷰의 평균 길이 :',sum(review_lengths)/len(train_X))
plt.hist(review_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
def below_threshold_len(max_len, nested_list):
  """Print the percentage of sequences in ``nested_list`` whose length is at most ``max_len``.

  Nothing is returned; the ratio is only printed.
  """
  within_limit = sum(1 for sentence in nested_list if len(sentence) <= max_len)
  ratio = (within_limit / len(nested_list))*100
  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, ratio))

In [None]:
# Sequence length used for padding below; print the share of training reviews that fit within it.
max_len = 50
below_threshold_len(max_len, train_X)

In [None]:
# Bring every encoded review in both splits to exactly max_len ids.
train_X, test_X = (pad_sequences(sequences, maxlen=max_len) for sequences in (train_X, test_X))

# LSTM 모델 구축

In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, LeakyReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, AdamW
# NOTE(review): CosineDecay is not used in this cell and this import path may be missing in newer Keras releases -- confirm.
from tensorflow.keras.experimental import CosineDecay

embedding_dim = 128
hidden_units = 128

# Embedding -> LSTM -> two LeakyReLU dense layers -> sigmoid score (label 1 = positive review).
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(hidden_units))
# LeakyReLU is added as its own layer (its documented usage) rather than passed as
# `activation=`. The computation is the same.
# NOTE(review): this should also keep the saved .h5 loadable without custom objects -- confirm on the target Keras version.
model.add(Dense(128))
model.add(LeakyReLU(alpha = 0.03))
model.add(Dropout(0.3))
model.add(Dense(32))
model.add(LeakyReLU(alpha = 0.03))
model.add(Dense(1, activation='sigmoid'))

# Stop after 3 epochs without val_acc improvement; keep only the best checkpoint on disk.
es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=3)
mc = ModelCheckpoint('sentiment_model_1120.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['acc'])
# Validate on a 20% hold-out of the *training* data. Using the test split here would let
# early stopping and checkpoint selection peek at the data the test accuracy is reported on.
history = model.fit(train_X, y_train, epochs = 30, callbacks = [es, mc], batch_size = 128, validation_split = 0.2)

In [None]:
# Reload the checkpoint file from disk and score it on the test split.
loaded_model = load_model('sentiment_model_1120.h5')
test_metrics = loaded_model.evaluate(test_X, y_test)  # index 1 is the 'acc' metric
print("\n 테스트 정확도: %.4f" % test_metrics[1])

In [None]:
# Write the current tokenizer to disk, then load it back from the same file.
with open('tokenizer.pickle', mode = 'wb') as out_file:
     pickle.dump(tokenizer, out_file)

with open('tokenizer.pickle', mode = 'rb') as in_file:
    tokenizer = pickle.load(in_file)

In [None]:
from konlpy.tag import Okt

In [None]:
# Korean stop-word list, loaded as a DataFrame.
# NOTE(review): because this is a DataFrame, `x in stopwords` tests the column labels, not the
# words stored in the cells; convert the word column to a set before using it for membership tests.
stopwords = pd.read_excel('한국어 불용어 목록.xlsx')

In [None]:
def sentiment_predict(new_sentence):
  """Classify one review as positive or negative with the trained model.

  The text is tokenised with the SentencePiece processor ``sp``, i.e. the same way the
  ``sentence_tokenized`` training column was produced, so the Keras tokenizer sees
  tokens from its own vocabulary. No stop-word filtering is applied because none was
  applied to the training data either.

  Relies on notebook globals: ``sp``, ``tokenizer``, ``max_len``, ``loaded_model``.

  Args:
    new_sentence: raw review text (str).

  Returns:
    (result, score): ``result`` is "긍정 리뷰" when ``score`` > 0.5, otherwise
    "부정 리뷰"; ``score`` is the model output as a Python float.
  """
  # NOTE(review): this also strips digits and punctuation -- confirm it matches how the training sentences were cleaned.
  cleaned = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','', new_sentence)
  pieces = sp.encode_as_pieces(cleaned) # same tokenisation as the training data
  encoded = tokenizer.texts_to_sequences([pieces]) # integer encoding
  pad_new = pad_sequences(encoded, maxlen = max_len) # padding
  # predict returns a (1, 1) array; index the scalar explicitly instead of calling float() on the array.
  score = float(loaded_model.predict(pad_new)[0][0])
  result = "긍정 리뷰" if score > 0.5 else "부정 리뷰"
  return result, score

# 훈련된 모델을 새로운 리뷰 데이터에 적용

In [None]:
# Load the new review data that the trained model will be applied to.
review = pd.read_csv('Review.csv')
review.head()

In [None]:
# Keep only the rows whose Type is 'Order'. The mask is built from `review` itself:
# `us_review` is not defined anywhere in the visible notebook and raised NameError.
reviews = review[review['Type'] == 'Order']

# Columns needed for scoring and for reporting the result.
selected_columns = ['Order ID', 'User ID', 'Rating', 'Content']
review_data = reviews[selected_columns]
review_data

In [None]:
sentiment_results = []
okt = Okt()

# Score every selected review, print the outcome and collect one record per review.
review_fields = zip(review_data['Content'], review_data['User ID'], review_data['Order ID'], review_data['Rating'])
for sentence, user_id, order_id, rating in review_fields:
    result, score = sentiment_predict(sentence)
    print("User ID : ", user_id, "Order ID : ", order_id, "리뷰 : ", sentence, "결과 : ", result, "점수 : ", score, "평점 : ", rating)

    record = {
        'User ID': user_id,
        'Order ID': order_id,
        'sentence': sentence,
        'sentiment': result,
        'score': score,
        'rating':rating
    }
    sentiment_results.append(record)

In [None]:
# Turn the collected records into a DataFrame (the name is rebound from the list to the frame).
sentiment_results= pd.DataFrame(sentiment_results)
sentiment_results.head()