## 라이브러리

In [22]:
import numpy as np
import pandas as pd
import re
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
from wordcloud import WordCloud
from collections import Counter

# LDA 구현을 위한 라이브러리
from sklearn.decomposition import LatentDirichletAllocation

## 폰트 설정

In [2]:
# Global matplotlib defaults for every figure in this notebook:
# font size, font family (Malgun Gothic, a Korean-capable font), figure size.
plt.rcParams.update({
    'font.size': 25,
    'font.family': 'Malgun Gothic',
    'figure.figsize': (16, 18),
})

## 함수 모음

In [3]:
# Age-group coding function
def age_split(age):
    """Map an age to a decade-based group code.

    Returns 1 for ages below 20, 2 for 20-29, 3 for 30-39, 4 for 40-49,
    5 for 50-59, and 6 for anything that is not below 60.
    """
    upper_bounds = (20, 30, 40, 50, 60)
    for code, bound in enumerate(upper_bounds, start=1):
        if age < bound:
            return code
    return 6

In [4]:
# Region coding function
def location_split(location):
    """Map a raw location code to one of six region codes.

    1 = capital area, 2 = Chungcheong, 3 = Jeolla, 4 = Gyeongsang,
    5 = Gangwon. Every code not listed below falls through to 6 (Jeju).
    """
    region_members = (
        (1, (1, 4, 8)),           # capital area
        (2, (6, 10, 11, 17)),     # Chungcheong
        (3, (5, 12, 13)),         # Jeolla
        (4, (2, 3, 7, 14, 15)),   # Gyeongsang
        (5, (9,)),                # Gangwon
    )
    for region, members in region_members:
        if location in members:
            return region
    return 6  # Jeju (also the fallback for any unlisted code)

In [5]:
# Remove special symbols and English letters.
# Everything except Hangul syllables and digits is stripped (digits are kept,
# e.g. for "4강").
def text_preprocessing(text):
    """Strip every character that is not a Hangul syllable or a digit.

    The input is converted to ``str`` and split on single spaces; each piece
    is cleaned separately and the pieces are joined with single spaces again,
    so the original spacing is preserved.
    """
    pieces = str(text).split(' ')
    return ' '.join(re.sub(r'[^가-힣0-9]', '', piece) for piece in pieces)

In [6]:
# Tokenization with the Okt morphological analyzer
def tokenize(text):
    """Tokenize ``text`` with Okt and drop particles and suffixes.

    Tokens tagged 'Josa' or 'Suffix' are removed; the remaining surface forms
    are returned as one space-joined string.
    """
    # This function is mapped over whole DataFrame columns, so building a new
    # Okt() per call repeats the analyzer setup for every row. Create it once
    # and keep it on the function object.
    okt = getattr(tokenize, '_okt', None)
    if okt is None:
        okt = tokenize._okt = Okt()

    kept = [word for word, tag in okt.pos(text)
            if tag not in ('Josa', 'Suffix')]
    return ' '.join(kept)

In [7]:
# Frequency analysis function
def count_vectorize(text, vectorizer, top_n=20):
    """Return the most frequent vocabulary terms across all of ``text``.

    Parameters
    ----------
    text : object exposing ``.values`` as an iterable of strings
        (e.g. a pandas Series of tokenized documents).
    vectorizer : fitted vectorizer exposing ``vocabulary_`` and ``transform``.
    top_n : int, default 20
        Maximum number of terms to return. Fewer are returned when the
        vocabulary is smaller.

    Returns
    -------
    (count_word, count_vector) : two lists ordered from the least to the most
        frequent of the selected terms, which puts the top term at the top of
        a horizontal bar chart.
    """
    idx2word = {idx: word for word, idx in vectorizer.vocabulary_.items()}

    # All documents are merged into one so the counts are corpus-wide totals.
    merged = [' '.join(text.values)]

    # Use the vectorizer that was passed in (the previous code silently used
    # the global `count_vectorizer`), and densify / sort only once.
    counts = vectorizer.transform(merged).toarray()[0]
    order = (-counts).argsort()[:max(top_n, 0)][::-1]

    count_word = [idx2word[i] for i in order]
    count_vector = [counts[i] for i in order]
    return count_word, count_vector

In [8]:
# TF-IDF analysis
def tfidf_data(text_data, stop_word, tfidf, top_n=25):
    """Return the highest-weighted TF-IDF terms across all of ``text_data``.

    Parameters
    ----------
    text_data : object exposing ``.map`` and ``.values`` (e.g. a pandas
        Series of strings). Each entry is passed through ``tokenize``.
    stop_word : container of words to drop before vectorizing.
    tfidf : fitted vectorizer exposing ``vocabulary_`` and ``transform``.
    top_n : int, default 25
        Maximum number of terms to return. Fewer are returned when the
        vocabulary is smaller.

    Returns
    -------
    (_tfidf_word, _tfidf) : two lists ordered from the lowest to the highest
        weight among the selected terms.
    """
    tokenized = text_data.map(tokenize)

    kept_words = [word
                  for sentence in tokenized.values
                  for word in sentence.split(' ')
                  if word not in stop_word]

    # All documents are merged into one so the weights describe the whole group.
    matrix = tfidf.transform([' '.join(kept_words)])

    idx2word = {idx: word for word, idx in tfidf.vocabulary_.items()}

    # Densify and sort once instead of once per selected term.
    scores = matrix.toarray()[0]
    order = (-scores).argsort()[:max(top_n, 0)][::-1]

    _tfidf_word = [idx2word[i] for i in order]
    _tfidf = [scores[i] for i in order]
    return _tfidf_word, _tfidf

In [9]:
# Draw a horizontal bar chart
def count_graph(word, vector, color, title):
    """Plot ``vector`` against ``word`` as horizontal bars, save, and show.

    The figure is saved as '<title>.png' in the working directory before it
    is displayed.
    """
    chart_title = '{} 텍스트 빈도 분석 Top20'.format(title)
    output_file = '{}.png'.format(title)

    plt.barh(word, vector, label='단어 빈도', color=color)
    plt.xlabel('빈도')
    plt.ylabel('단어')
    plt.yticks(word)
    plt.legend()
    plt.title(chart_title)
    plt.tight_layout()
    plt.savefig(output_file)
    plt.show()

In [67]:
# Replace truncated tokens with their proper forms
def change_word(word):
    """Repair known truncated tokens in ``word`` in place and return it.

    Each entry of the mutable sequence ``word`` is rewritten so that
    '관왕' -> '3관왕', '했지만' -> '못했지만', '김제' -> '김제덕' and
    '체전' -> '단체전'.

    The lookarounds skip text that already has the repaired form. Plain
    ``str.replace`` turned '3관왕' into '33관왕' (and '김제덕' into '김제덕덕')
    whenever the function ran twice on the same list, which happens easily
    because the list is mutated in place.
    """
    fixes = (
        (r'(?<!3)관왕', '3관왕'),
        (r'(?<!못)했지만', '못했지만'),
        (r'김제(?!덕)', '김제덕'),
        (r'(?<!단)체전', '단체전'),
    )
    for idx, val in enumerate(word):
        for pattern, replacement in fixes:
            val = re.sub(pattern, replacement, val)
        word[idx] = val
    return word

## 데이터 불러오기

In [10]:
# Survey workbook to analyze; only the first sheet is read.
path = './2_올림픽에서 가장 인상 깊었던 것_30591.xlsx'
df = pd.read_excel(path, sheet_name=0)
# Bare expression so the notebook displays the frame.
df

Unnamed: 0,NO,UID,START,END,TIME,Q1,Q2t1,Q2t2,Q2_1,HQ_Q2_1,Q3t1
0,1000006,C327036214_1-4-4-orsiuqoem97enw8npjjes5d4,2021/08/11-00:00:16,2021/08/11-00:00:49,00:00:33,2,2000,21,3,46,안산선수
1,1000007,C327036214_1-3-3-d94uyxnq0gtmxiylnwzph10d,2021/08/11-00:00:16,2021/08/11-00:00:52,00:00:36,1,1997,24,3,47,여자배구 4강진출
2,1000011,C327036214_1-8-8-1zk22sg9o22c136k0kmh29v7,2021/08/11-00:00:20,2021/08/11-00:00:58,00:00:38,2,1978,43,8,75,메달 못따도 즐거워하는것
3,1000010,C327036214_1-5-5-3tk4BveNZppGGFhe9xYjaUtl,2021/08/11-00:00:20,2021/08/11-00:00:59,00:00:39,1,1983,38,4,54,안봄
4,1000008,C327036214_1-4-4-333y69w5o6sfk85ttlaaz15z,2021/08/11-00:00:16,2021/08/11-00:01:06,00:00:50,2,2001,20,8,82,수영에서 아시아 기록 깬것
...,...,...,...,...,...,...,...,...,...,...,...
30586,1031602,C327036214_1-1-1-lii8dwyp46v1opx4q333qlrd,2021/08/13-09:11:07,2021/08/13-09:16:14,00:05:07,2,1995,26,3,48,"긴장감넘치는 에페 펜싱경기와 최고의 경기력을 볼 수 있었던 여자 배구, 그리고 역시..."
30587,1021545,C327036214_1-7-7-wxukm1vvghgg5cuqln8v904s,2021/08/11-16:41:40,2021/08/13-09:16:19,1900-01-01 16:34:39,1,1980,41,1,15,배구 사강 .안산의 양궁
30588,1031618,C327036214_1-7-7-jjRTZVWV4uvLUlbUlHwRONHI,2021/08/13-09:16:05,2021/08/13-09:16:38,00:00:33,1,1981,40,4,55,여자배구 한일전에서 짜릿한 역전승
30589,1031617,C327036214_1-5-5-18q23pc3kmyg11q0pkia4pvh,2021/08/13-09:15:50,2021/08/13-09:16:44,00:00:54,1,1990,31,8,77,양궁 결승전에서 끝! 이라는 외마디!


In [11]:
# Keep only the columns from position 5 onward (Q1 through Q3t1 in the
# displayed output), dropping the leading id/timestamp columns.
new_df = df.iloc[:,5:]
new_df

Unnamed: 0,Q1,Q2t1,Q2t2,Q2_1,HQ_Q2_1,Q3t1
0,2,2000,21,3,46,안산선수
1,1,1997,24,3,47,여자배구 4강진출
2,2,1978,43,8,75,메달 못따도 즐거워하는것
3,1,1983,38,4,54,안봄
4,2,2001,20,8,82,수영에서 아시아 기록 깬것
...,...,...,...,...,...,...
30586,2,1995,26,3,48,"긴장감넘치는 에페 펜싱경기와 최고의 경기력을 볼 수 있었던 여자 배구, 그리고 역시..."
30587,1,1980,41,1,15,배구 사강 .안산의 양궁
30588,1,1981,40,4,55,여자배구 한일전에서 짜릿한 역전승
30589,1,1990,31,8,77,양궁 결승전에서 끝! 이라는 외마디!


## 데이터 전처리

In [12]:
# Remove rows whose age column (Q2t2) holds the '/NA/' placeholder
is_missing_age = (new_df['Q2t2'] == '/NA/').to_numpy()
new_df = new_df.drop(index=new_df.index[is_missing_age])

In [13]:
# Cast the age column to int64
new_df = new_df.astype({'Q2t2':'int64'})
# Check for missing values
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30569 entries, 0 to 30590
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Q1       30569 non-null  int64 
 1   Q2t1     30569 non-null  int64 
 2   Q2t2     30569 non-null  int64 
 3   Q2_1     30569 non-null  int64 
 4   HQ_Q2_1  30569 non-null  int64 
 5   Q3t1     30566 non-null  object
dtypes: int64(5), object(1)
memory usage: 1.6+ MB


In [14]:
# Drop every row that contains a missing value (per the info() output above,
# only the text column Q3t1 has any), then confirm none remain.
new_df = new_df.dropna(how='any', axis=0)
new_df.isna().sum()

Q1         0
Q2t1       0
Q2t2       0
Q2_1       0
HQ_Q2_1    0
Q3t1       0
dtype: int64

In [15]:
# Strip special symbols and English letters from the free-text answers
new_df['prepro_text'] = new_df['Q3t1'].map(text_preprocessing)

In [16]:
# Tokenize the cleaned text; the column is overwritten with the space-joined tokens
new_df['prepro_text'] = new_df['prepro_text'].map(tokenize)

### 성별

In [17]:
# Tokenized responses split by the gender code in Q1
# Q1 == 1: male
male = new_df.loc[new_df['Q1'] == 1, 'prepro_text']
# Q1 == 2: female
female = new_df.loc[new_df['Q1'] == 2, 'prepro_text']

In [38]:
# Number of responses per gender: male first, then female, one per line.
print(len(male), len(female), sep='\n')

10624
19942


### 나이별

In [18]:
# Recode ages into age-group codes via age_split.
# NOTE: Q2t2 is overwritten in place, so re-running this cell would feed the
# group codes (1-6) back into age_split, which maps every value below 20 to 1.
new_df['Q2t2'] = new_df['Q2t2'].map(age_split)
new_df['Q2t2'].value_counts()

3    9263
2    8124
4    6172
1    3000
5    2995
6    1012
Name: Q2t2, dtype: int64

In [19]:
# Tokenized responses per age-group code (1 = under 20 ... 6 = 60 and over)
age10 = new_df.loc[new_df['Q2t2'] == 1, 'prepro_text']
age20 = new_df.loc[new_df['Q2t2'] == 2, 'prepro_text']
age30 = new_df.loc[new_df['Q2t2'] == 3, 'prepro_text']
age40 = new_df.loc[new_df['Q2t2'] == 4, 'prepro_text']
age50 = new_df.loc[new_df['Q2t2'] == 5, 'prepro_text']
age60 = new_df.loc[new_df['Q2t2'] == 6, 'prepro_text']

# Younger (groups 1-3) vs older (groups 4-6) respondents
age1030 = new_df.loc[new_df['Q2t2'] <= 3, 'prepro_text']
age4060 = new_df.loc[new_df['Q2t2'] >= 4, 'prepro_text']

### 지역별

In [20]:
# Recode the location column (Q2_1) into region codes via location_split,
# stored in a new column
new_df['지역'] = new_df['Q2_1'].map(location_split)

In [54]:
# Number of responses per region code
new_df['지역'].value_counts()

1    18033
4     6698
2     2707
3     2241
5      610
6      277
Name: 지역, dtype: int64

In [21]:
# Tokenized responses for each of the six region codes
sudo = new_df.loc[new_df['지역'] == 1, 'prepro_text']
choong = new_df.loc[new_df['지역'] == 2, 'prepro_text']
junla = new_df.loc[new_df['지역'] == 3, 'prepro_text']
kyungsang = new_df.loc[new_df['지역'] == 4, 'prepro_text']
kangwon = new_df.loc[new_df['지역'] == 5, 'prepro_text']
jeju = new_df.loc[new_df['지역'] == 6, 'prepro_text']

# Capital area (code 1) vs everywhere else (codes 2 and above)
ok_sudo = new_df.loc[new_df['지역'] == 1, 'prepro_text']
no_sudo = new_df.loc[new_df['지역'] >= 2, 'prepro_text']

# 텍스트 분석

## 불용어 사전

In [24]:
# Custom stop words; passed to CountVectorizer below so these tokens are
# excluded from the vocabulary.
stop_words = ['10', '선수', '배구', '경기', '모습', '제일', '이번', '인상', '기억', 
              '하는', '우상', '가장', '종목', '수의', '깊었다', '올림픽', '안산선', '수가']

In [25]:
# Build the vocabulary for frequency analysis from all tokenized responses:
# unigrams and bigrams that appear in at least 100 documents, capped at
# 1000 features, with the custom stop words removed.
count_vectorizer = CountVectorizer(max_features=1000, 
                                   min_df=100,
                                   ngram_range =(1,2),
                                   stop_words=stop_words).fit(new_df['prepro_text'])

In [26]:
# Vectorize the male responses.
# NOTE(review): fit_transform() refits count_vectorizer on `male`, replacing
# the vocabulary fitted on new_df['prepro_text'] in the previous cell. If the
# shared vocabulary was intended, this should be transform(male) - confirm.
feature_vec = count_vectorizer.fit_transform(male)

In [30]:
# Fit LDA with 4 topics; random_state is fixed for reproducible results
lda = LatentDirichletAllocation(n_components=4, random_state=0)
lda.fit(feature_vec)

LatentDirichletAllocation(n_components=4, random_state=0)

In [31]:
# Topic-by-term weight matrix of the fitted model
lda.components_

array([[2.39901704e+02, 1.43249645e+02, 1.68237488e+02, 2.52548381e-01,
        1.35984107e+02, 1.14828117e+02, 1.99404443e+02, 2.03378312e+00,
        2.54988579e-01, 2.55631263e-01, 1.58958975e+02, 1.94452196e+02,
        2.56971799e-01, 2.61920241e+02, 2.12164936e+02, 2.54741739e-01,
        1.05241322e+02, 2.56139340e-01, 3.60994551e+00, 5.21018552e-01,
        1.58972895e+02, 6.01246382e+00, 1.13178159e+02, 1.30621077e+02,
        5.41414816e+02, 2.88012048e-01, 1.38967233e+02, 1.49711869e+02,
        2.67513793e-01, 2.54204659e-01, 1.06906232e+02, 7.78146824e+01,
        1.91672098e+02, 7.59550644e+00, 3.39717959e-01, 5.89010053e-01,
        2.56424718e-01, 1.07082811e+02, 1.62945669e+02, 2.53285731e-01,
        2.53523304e-01, 2.51104705e-01, 1.11622145e+02, 5.59419678e-01,
        2.56043448e-01, 9.12289894e+01, 2.52298739e-01, 2.51098598e-01,
        2.52364381e-01, 2.52155123e-01, 8.14124447e-01, 2.52348124e-01,
        2.73338938e+00, 2.51542768e-01, 2.50869557e-01, 1.913287

In [32]:
# (number of topics, number of vocabulary terms)
lda.components_.shape

(4, 93)

In [37]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() and fall back to the
# old method only on versions that predate it.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# Print the 20 highest-weighted terms of each topic with their weights.
for idx, topic in enumerate(lda.components_):
    print('{}번째 토픽'.format(idx+1))
    # argsort sorts ascending; negating the weights first yields the indices
    # from the largest weight down.
    topic_word_idx = (-topic).argsort()
    top20_idx = topic_word_idx[:20]

    # Cast to a plain float so the printed tuples look the same regardless
    # of how the installed NumPy formats its scalar types.
    features = [(feature_names[i], round(float(topic[i]), 2)) for i in top20_idx]
    print(features)

1번째 토픽
[('메달', 541.41), ('최선', 495.23), ('일본', 397.21), ('우리나라', 274.01), ('너무', 261.92), ('장면', 243.8), ('감동', 239.9), ('화이팅', 219.74), ('노력', 212.16), ('활약', 211.88), ('근대', 199.4), ('깊었습니다', 194.45), ('생각', 191.67), ('열심히', 191.33), ('입니다', 188.8), ('우리', 183.23), ('코로나', 175.61), ('인기', 171.39), ('관심', 168.24), ('승리', 162.95)]
2번째 토픽
[('높이뛰기', 804.24), ('야구', 470.2), ('한국', 460.23), ('대표팀', 274.2), ('대한민국', 262.8), ('남자', 246.85), ('신기록', 231.24), ('축구', 209.24), ('수영', 189.16), ('성적', 170.8), ('메달', 168.58), ('한국 신기록', 156.25), ('육상', 145.24), ('긍정', 136.14), ('투혼', 128.69), ('터키', 122.65), ('도전', 120.97), ('세계', 117.86), ('좋은', 115.3), ('선전', 57.04)]
3번째 토픽
[('양궁', 1484.06), ('여자배구', 1293.5), ('금메달', 491.46), ('안산', 482.24), ('진출', 398.23), ('관왕', 339.24), ('여자', 211.62), ('획득', 185.21), ('양궁 금메달', 162.25), ('안산 관왕', 139.25), ('선전', 138.25), ('여자배구 진출', 129.25), ('양궁 안산', 123.25), ('없음', 119.25), ('양궁 관왕', 117.24), ('메달 획득', 110.18), ('무관', 105.53), ('펜싱', 86.1), ('투지', 33.15), (