In [1]:
import pandas as pd
pd.__version__

'1.4.4'

In [2]:
import os
# NOTE(review): PYTHONIOENCODING is normally read at interpreter start-up, so setting it
# here presumably only affects child processes started from this kernel — confirm intent.
os.environ["PYTHONIOENCODING"] = "utf-8"
import warnings
# Suppresses every warning for the rest of the session, deprecation notices included.
warnings.filterwarnings(action='ignore')

In [3]:
# Read the tab-separated training ratings file and preview the first rows.
nsmc_train_df = pd.read_csv(
    './data/ratings_train.txt',
    sep='\t',
    encoding='utf8',
    engine='python',
)
nsmc_train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [4]:
# Summarize dtypes and non-null counts to spot missing values before cleaning.
nsmc_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB


In [5]:
# Keep only rows that have review text. The explicit .copy() detaches the result from
# the original frame so that later column assignments write to an independent object
# instead of a filtered view (avoids SettingWithCopyWarning).
nsmc_train_df = nsmc_train_df[nsmc_train_df['document'].notnull()].copy()

In [6]:
# Re-check the summary after filtering: every column should now be fully non-null.
nsmc_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149995 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        149995 non-null  int64 
 1   document  149995 non-null  object
 2   label     149995 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ MB


In [7]:
# Count how many rows carry each label value to check class balance.
nsmc_train_df['label'].value_counts()

0    75170
1    74825
Name: label, dtype: int64

In [8]:
import re

In [9]:
# Replace every run of characters other than spaces, Hangul jamo (ㄱ-ㅣ) and Hangul
# syllables (가-힣) with a single space. assign() builds a new frame, so nothing is
# written into a filtered slice, and .str.replace applies the regex column-wise.
nsmc_train_df = nsmc_train_df.assign(
    document=nsmc_train_df['document'].str.replace(r"[^ ㄱ-ㅣ가-힣]+", " ", regex=True)
)

In [10]:
# Load the test ratings and apply exactly the same cleaning as the training data.
nsmc_test_df = pd.read_csv('./data/ratings_test.txt', encoding='utf8', sep='\t', engine='python')
# .copy() detaches the filtered rows so the column assignment below targets an independent frame.
nsmc_test_df = nsmc_test_df[nsmc_test_df['document'].notnull()].copy()
# Replace non-Hangul runs with a space (not an empty string): the training text is
# cleaned with " ", and deleting the characters outright would glue neighbouring
# words together and make the test text tokenize differently from the training text.
nsmc_test_df['document'] = nsmc_test_df['document'].str.replace(r'[^ ㄱ-ㅣ가-힣]+', " ", regex=True)

In [11]:
from konlpy.tag import Okt

# One shared tagger instance; the tokenizer function and the noun-extraction cells reuse it.
okt = Okt()

In [12]:
def okt_tokenizer(text):
    """Tokenize ``text`` into morphemes with the notebook-level ``okt`` tagger.

    Returns the result of ``okt.morphs(text)`` unchanged; passed as the
    ``tokenizer`` callable of the TF-IDF vectorizers in this notebook.
    """
    return okt.morphs(text)


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features; terms seen in fewer than 3 documents or in more
# than 90% of documents are dropped.
tfidf = TfidfVectorizer(tokenizer = okt_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)
# fit_transform learns the vocabulary and builds the matrix in a single pass. A separate
# fit() followed by transform() runs the (slow) Okt tokenizer over every document twice.
nsmc_train_tfidf = tfidf.fit_transform(nsmc_train_df['document'])

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline logistic-regression classifier trained on the TF-IDF matrix and the labels.
SA_lr = LogisticRegression(random_state=0)
SA_lr.fit(
    nsmc_train_tfidf,
    nsmc_train_df['label'],
)

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values of C to compare with 3-fold cross-validated accuracy.
params = {'C': [1, 3, 3.5, 4, 4.5, 5]}
SA_lr_grid_cv = GridSearchCV(
    SA_lr,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)

In [None]:
# Run the grid search on the training TF-IDF matrix and labels.
SA_lr_grid_cv.fit(nsmc_train_tfidf, nsmc_train_df['label'])

In [None]:
# Report the winning parameter set and its mean cross-validated score (4 decimals).
print(SA_lr_grid_cv.best_params_, round(SA_lr_grid_cv.best_score_, 4))

In [None]:
# Keep the best estimator found by the grid search; all later predictions use it.
SA_lr_best = SA_lr_grid_cv.best_estimator_

In [None]:
# Vectorize the test text with the already-fitted vectorizer (transform only, no refit).
nsmc_test_tfidf = tfidf.transform(nsmc_test_df['document'])

In [None]:
# Predict a label for every test document.
test_predict = SA_lr_best.predict(nsmc_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score
# Compare predictions with the true test labels and print the accuracy to 3 decimals.
test_accuracy = accuracy_score(nsmc_test_df['label'], test_predict)
print('감성 분석 정확도 : ', round(test_accuracy, 3))

In [None]:
# Interactive step: read one sentence from the user for ad-hoc sentiment prediction.
# NOTE(review): this blocks a non-interactive "Run All"; consider a constant for batch runs.
st = input('감성 분석할 문장입력 >> ')

In [None]:
# Pull out the Hangul runs from the typed sentence, then re-join them with single
# spaces into a one-element list, which is the shape the vectorizer call below receives.
hangul_runs = re.findall(r'[ㄱ-ㅣ가-힣]+', st)
print(hangul_runs)
st = [" ".join(hangul_runs)]
print(st)


In [None]:
# Vectorize the single cleaned sentence with the fitted vectorizer and classify it.
st_tfidf = tfidf.transform(st)
st_predict = SA_lr_best.predict(st_tfidf)

In [None]:
# st_predict holds the prediction for the one input sentence; 0 is reported as
# negative sentiment, anything else as positive.
sentiment_text = '==> 부정 감성' if st_predict[0] == 0 else '==> 긍정 감성'
print(st, sentiment_text)

# 네이버 기사(코로나) 감성 분석하기

In [None]:
import json

# Parse the saved news JSON file into Python objects.
with open('data/코로나_naver_news.json', encoding='utf8') as news_file:
    data = json.load(news_file)

In [None]:
# Collect the 'title' and 'description' fields of every news item, in file order.
data_title = [item['title'] for item in data]
data_description = [item['description'] for item in data]

In [None]:
# Two-column frame (title, description) built from the collected lists; preview the top rows.
data_df = pd.DataFrame({'title':data_title, 'description':data_description})
data_df.head()

In [None]:
# Apply the same cleaning to both text columns: every run of characters other than
# spaces, Hangul jamo and Hangul syllables becomes a single space.
for text_column in ('title', 'description'):
    data_df[text_column] = data_df[text_column].apply(
        lambda x : re.sub(r'[^ ㄱ-ㅣ 가-힣]+', " ", x)
    )

In [None]:
# Preview the cleaned title/description text.
data_df.head()

In [None]:
# Vectorize the titles with the vectorizer fitted on the review corpus.
data_title_tfidf = tfidf.transform(data_df['title'])

# Classify each title with the tuned review-sentiment model.
data_title_predict = SA_lr_best.predict(data_title_tfidf)

# Store the predicted label next to the title.
data_df['title_label'] = data_title_predict

In [None]:
# Vectorize the descriptions with the vectorizer fitted on the review corpus.
data_description_tfidf = tfidf.transform(data_df['description'])

# Classify each description with the tuned review-sentiment model.
data_description_predict = SA_lr_best.predict(data_description_tfidf)

# Store the predicted label next to the description.
data_df['description_label'] = data_description_predict

In [None]:
# Save the labelled news to CSV ---------------------------------
# cp949 is a superset of euc-kr that can encode every modern Hangul syllable. Plain
# euc-kr covers only a subset of the 가-힣 range the cleaning step keeps, so it can
# raise UnicodeEncodeError on less common syllables. Files that euc-kr could write
# come out byte-identical under cp949.
data_df.to_csv('./data/코로나new_label.csv', encoding='cp949')

In [None]:
# Preview the frame with the two predicted-label columns added.
data_df.head()

In [None]:
columns_name = ['title','title_label','description','description_label']

# Split the news by the predicted description label with a boolean mask. This replaces
# a row-by-row DataFrame.append loop: append is deprecated since pandas 1.4 and removed
# in 2.0, it copies the whole frame on every row, and the old loop variable `data`
# overwrote the JSON list loaded earlier in the notebook.
is_negative = data_df['description_label'] == 0
NEG_data_df = data_df.loc[is_negative, columns_name].reset_index(drop=True)   # label 0: negative
POS_data_df = data_df.loc[~is_negative, columns_name].reset_index(drop=True)  # any other label: positive

# Save both subsets. cp949 (a superset of euc-kr) is used so that uncommon Hangul
# syllables cannot trigger UnicodeEncodeError.
# NOTE(review): 'NES' in the first file name looks like a typo for 'NEG'; the path is
# left unchanged in case other code already reads this file.
NEG_data_df.to_csv('./data/코로나_news_NES.csv', encoding='cp949')
POS_data_df.to_csv('./data/코로나_news_POS.csv', encoding='cp949')


In [None]:
# Number of news items in the negative and positive subsets.
len(NEG_data_df), len(POS_data_df)

In [None]:
POS_description = POS_data_df['description']

# Extract only the nouns of each positive description.
POS_description_noun_tk = [okt.nouns(text) for text in POS_description]

In [None]:
# Drop one-character nouns and join the remaining nouns of each document with spaces.
POS_description_noun_join = [
    " ".join(noun for noun in nouns if len(noun) > 1)
    for nouns in POS_description_noun_tk
]

In [None]:
# TF-IDF over the positive descriptions, keeping terms that occur in at least 2 documents.
# NOTE(review): the input is already noun-only text, yet okt_tokenizer re-runs okt.morphs
# on it — confirm the second tokenization pass is intended.
POS_tfidf = TfidfVectorizer(tokenizer = okt_tokenizer, min_df=2 )
POS_dtm = POS_tfidf.fit_transform(POS_description_noun_join)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2. Use its
# replacement when available and fall back to the old name on older releases.
_pos_feature_names = getattr(POS_tfidf, 'get_feature_names_out', None) or POS_tfidf.get_feature_names

# Map each vocabulary term to the sum of its TF-IDF weights over all positive documents.
POS_vocab = dict()
for idx, word in enumerate(_pos_feature_names()):
    POS_vocab[word] = POS_dtm.getcol(idx).sum()

# (term, total weight) pairs, heaviest first.
POS_words = sorted(POS_vocab.items(), key=lambda x: x[1], reverse=True)

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Use a Korean-capable font for plot text when the Windows "Malgun Gothic" file exists.
# The former fm.get_fontconfig_fonts() call was dropped: its result was never used and
# the function is deprecated/removed in recent matplotlib releases.
font_location = 'C:/Windows/Fonts/malgun.ttf'
font_name = None
if os.path.exists(font_location):
    font_name = fm.FontProperties(fname=font_location).get_name()
    matplotlib.rc('font', family=font_name)
else:
    # Keep matplotlib's default font instead of failing on machines without this file.
    print('Font file not found: %s (Korean labels may not render correctly)' % font_location)

# Number of words shown in the bar charts.
# NOTE(review): this name shadows the built-in max() for the rest of the notebook; it is
# kept because the plotting cells read it.
max = 15

In [None]:
# Bar chart of the highest-weighted words in positive news. Positions come from the
# actual number of words available, so a vocabulary smaller than `max` no longer makes
# plt.bar fail with mismatched x/height lengths.
POS_top_words = POS_words[:max]
POS_positions = range(len(POS_top_words))

plt.bar(POS_positions, [weight for _, weight in POS_top_words], color="blue")
plt.title("긍정 뉴스의 단어 상위 %d개" %len(POS_top_words), fontsize=15)
plt.xlabel("단어", fontsize=12)
plt.ylabel("TF-IDF의 합", fontsize=12)
plt.xticks(POS_positions, [term for term, _ in POS_top_words], rotation=70)

plt.show()

In [None]:
NEG_description = NEG_data_df['description']

# Extract only the nouns of each negative description.
NEG_description_noun_tk = [okt.nouns(text) for text in NEG_description]

# Drop one-character nouns and join the rest of each document with spaces.
NEG_description_noun_join = [
    " ".join(noun for noun in nouns if len(noun) > 1)
    for nouns in NEG_description_noun_tk
]

# TF-IDF over the negative descriptions, keeping terms seen in at least 2 documents.
NEG_tfidf = TfidfVectorizer(tokenizer = okt_tokenizer, min_df=2 )
NEG_dtm = NEG_tfidf.fit_transform(NEG_description_noun_join)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and fall
# back to the old name on older releases.
_neg_feature_names = getattr(NEG_tfidf, 'get_feature_names_out', None) or NEG_tfidf.get_feature_names

# Map each term to the sum of its TF-IDF weights over all negative documents.
NEG_vocab = dict()
for idx, word in enumerate(_neg_feature_names()):
    NEG_vocab[word] = NEG_dtm.getcol(idx).sum()

NEG_words = sorted(NEG_vocab.items(), key=lambda x: x[1], reverse=True)

# Plot from the number of words actually available so that a vocabulary smaller than
# `max` does not make plt.bar fail with mismatched x/height lengths.
NEG_top_words = NEG_words[:max]
NEG_positions = range(len(NEG_top_words))

plt.bar(NEG_positions, [weight for _, weight in NEG_top_words], color="red")
plt.title("부정 뉴스의 단어 상위 %d개" %len(NEG_top_words), fontsize=15)
plt.xlabel("단어", fontsize=12)
plt.ylabel("TF-IDF의 합", fontsize=12)
plt.xticks(NEG_positions, [term for term, _ in NEG_top_words], rotation=70)

plt.show()

In [None]:
# All cleaned descriptions (both labels) are the input to the topic-modelling steps.
description = data_df['description']


In [None]:
# Noun list for every description.
description_noun_tk = [okt.nouns(text) for text in description]


In [None]:
# Same noun lists with one-character nouns removed.
description_noun_tk2 = [
    [noun for noun in nouns if len(noun) > 1]
    for nouns in description_noun_tk
]

In [None]:
# Peek at the filtered noun lists of the first five documents.
print(description_noun_tk2[:5])

In [None]:
# Install through the %pip magic so the package lands in the environment of the running
# kernel; a shell `pip` may belong to a different Python installation.
get_ipython().run_line_magic('pip', 'install gensim')

In [None]:
import gensim
import gensim.corpora as corpora

In [None]:
# Build the token <-> integer-id mapping from the noun lists and show the token with id 0.
dictionary = corpora.Dictionary(description_noun_tk2)
print(dictionary[0])

In [None]:
# Convert each document's noun list to its bag-of-words form and show the first three.
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in description_noun_tk2]
print(corpus[:3])

In [None]:
# Number of topics; used when fitting the LDA model and when printing its topics.
k = 4

In [None]:
# Fit a k-topic LDA model on the bag-of-words corpus. random_state seeds the model's
# random initialisation so that re-running the notebook does not start from a different
# random state each time.
# NOTE(review): workers=10 is hard-coded and may exceed the available CPU cores.
lda_model = gensim.models.ldamulticore.LdaMulticore(
    corpus,
    num_topics = k,
    id2word = dictionary,
    passes = 1,
    iterations = 12,
    workers = 10,
    random_state = 0,
)

In [None]:
# Show the 15 highest-weighted words of each of the k topics.
print(lda_model.print_topics(num_topics = k, num_words = 15))


In [None]:
# Install through the %pip magic so the package lands in the environment of the running
# kernel; a shell `pip` may belong to a different Python installation.
get_ipython().run_line_magic('pip', 'install pyLDAvis')

In [None]:
import pyLDAvis.gensim_models

# Prepare the visualization data from the fitted model, the corpus and the dictionary.
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)


In [None]:
# Render the prepared topic visualization inline in the notebook.
pyLDAvis.display(lda_vis)

In [None]:
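# Optional: uncomment to export the visualization as an HTML file (replace the placeholder file name).
#pyLDAvis.save_html(lda_vis,'./data/저장할파일이름')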