# 코드

In [None]:
from konlpy.tag import Okt
from collections import Counter

In [None]:
!pip install konlpy

In [None]:
# Install the fonts-nanum package via apt (-y answers prompts automatically).
!sudo apt-get install -y fonts-nanum
# Force a rebuild of the fontconfig cache (-f) with verbose output (-v).
!sudo fc-cache -fv
# Delete matplotlib's cache directory.
# NOTE(review): presumably so matplotlib rescans fonts and sees the new one — confirm.
!rm ~/.cache/matplotlib -rf

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Use the NanumBarunGothic font family for all matplotlib text.
plt.rcParams['font.family'] = 'NanumBarunGothic'

# 샵다이소_감성분석

In [None]:
df = pd.read_csv('샵다이소_구글앱_리뷰_별점.csv')
df.head()

In [None]:
df = df.iloc[3:]
df.head()

In [None]:
df = df.reset_index()
df.head()

In [None]:
df.drop(['index','Unnamed: 0'],axis=1,inplace=True)
df.head()

In [None]:
# Drop rows whose review text is missing, then keep the first occurrence of
# each distinct text. A missing text is a float NaN, which would make the
# later ' '.join(...) and regex substitution raise TypeError.
df = df.dropna(subset=['text']).drop_duplicates(subset=['text'])
df.info()

In [None]:
df['score'].value_counts()

In [None]:
list(df['text'])

In [None]:
text_data = ' '.join(list(df['text']))
text_data

In [None]:
okt = Okt()
nouns = okt.nouns(text_data)
print(nouns)

In [None]:
counter = Counter(nouns)
print(counter)

In [None]:
noun_data = Counter({x: counter[x] for x in counter if len(x)>1})
noun_data.most_common(10)

In [None]:
import re
from tqdm import tqdm

# Compile the "any character outside the Hangul ranges" pattern once, instead
# of rebuilding it on every loop iteration.
han = re.compile('[^ㄱ-ㅣ가-힣]')

# For each review: replace non-Hangul characters with spaces, extract nouns
# with Okt, and keep only nouns longer than one character. The result holds
# one token list per review, in the same order as df['text'].
noun_result = [
    [noun for noun in okt.nouns(han.sub(' ', review)) if len(noun) > 1]
    for review in tqdm(df['text'])
]

print(noun_result)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Each document is already a list of nouns, so the tokenizer passes it through
# unchanged and lowercasing is disabled. token_pattern=None states that the
# default regex is intentionally unused, which avoids scikit-learn's
# "token_pattern will not be used" UserWarning when a tokenizer is supplied.
count_vectorizer = CountVectorizer(
    tokenizer=lambda tokens: tokens,
    lowercase=False,
    token_pattern=None,
)
# Rows are reviews, columns are vocabulary terms, values are occurrence counts.
bow_vect = count_vectorizer.fit_transform(noun_result)
word_list = count_vectorizer.get_feature_names_out()
print(word_list)
print(count_vectorizer.vocabulary_)

In [None]:
count_list = bow_vect.toarray().sum(axis=0)
count_list

In [None]:
bow_vect.toarray()

In [None]:
word_count = dict(zip(word_list,count_list))
word_count

In [None]:
bow_vect.shape

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn IDF weights from the count matrix, then re-weight that same matrix.
# NOTE(review): the weights are fitted on every review before the train/test
# split that follows, so held-out rows influence the features — confirm this
# is acceptable for the reported metrics.
tf_vectorizer = TfidfTransformer().fit(bow_vect)
tf_idf = tf_vectorizer.transform(bow_vect)
tf_idf.toarray()

In [None]:
# Binary label: 1 when the score is 4 or higher, otherwise 0.
is_positive = df['score'] >= 4
df['score_real'] = np.where(is_positive, 1, 0)
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the TF-IDF matrix; the target is the binary label column.
x, y = tf_idf, df['score_real']

# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

for part in (x_train, x_test):
    print(part.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Fit a logistic-regression classifier on the training split.
lr = LogisticRegression(random_state=0).fit(x_train, y_train)

# Predicted labels for the held-out split.
y_pred = lr.predict(x_test)
print(y_pred)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
precision_score(y_test,y_pred)

In [None]:
recall_score(y_test,y_pred)

In [None]:
f1_score(y_test,y_pred)

In [None]:
lr.coef_

In [None]:
invert_index_vectorizer = {}
for temp,temp1 in count_vectorizer.vocabulary_.items():
    invert_index_vectorizer[temp1] = temp

invert_index_vectorizer

In [None]:
lr.coef_

In [None]:
for index,value in enumerate(lr.coef_[0]):
    print(index,value)

In [None]:
coef_pos_index = sorted( ((value,index) for index,value in enumerate(lr.coef_[0])) ,reverse=True  )
for temp in coef_pos_index[0:20]:
    print(invert_index_vectorizer[temp[1]])

In [None]:
coef_neg_index = sorted( ((value,index) for index,value in enumerate(lr.coef_[0])) ,reverse=False  )
for temp in coef_neg_index[0:20]:
    print(invert_index_vectorizer[temp[1]])

In [None]:
def comment_predict(text):
    """Classify one review text as '긍정' or '부정'.

    The text has every character outside the Hangul ranges replaced by a
    space, is tokenised into nouns longer than one character with the global
    ``okt`` tagger, vectorised with the global ``count_vectorizer`` and
    ``tf_vectorizer``, and scored by the global ``lr`` model.

    Args:
        text: Raw review string.

    Returns:
        '긍정' when the model predicts class 1, otherwise '부정'.
    """
    hangul_only = re.sub('[^ㄱ-ㅣ가-힣]', ' ', text)
    tokens = [noun for noun in okt.nouns(hangul_only) if len(noun) > 1]
    # transform() expects a collection of documents, hence the one-item list.
    features = tf_vectorizer.transform(count_vectorizer.transform([tokens]))
    return '긍정' if lr.predict(features)[0] == 1 else '부정'
# Run the classifier over every review text and tally '긍정' predictions
# against all other results.
p_cnt = sum(comment_predict(review) == '긍정' for review in df['text'])
n_cnt = len(df['text']) - p_cnt

print(p_cnt,'///', n_cnt)



In [None]:
comment_predict('어플 사용용할때 오류가 많이 나고 재고가 파악이 안되요요')

In [None]:
df[df['text'].str.contains('오류')]

In [None]:
df[df['text'].str.contains('배송비')]

In [None]:
df[df['text'].str.contains('픽업')]

In [None]:
df[df['text'].str.contains('가능')]

In [None]:
pd.options.display.max_rows = 700
pd.options.display.max_columns = 4
df

In [None]:
# row 생략 없이 출력
pd.set_option('display.max_rows', None)
# col 생략 없이 출력
pd.set_option('display.max_columns', None)
df