## 라이브러리

In [None]:
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer

## 상수

In [None]:
# Global matplotlib appearance settings (font family and base font size)
plt.rcParams.update({
    'font.family': 'D2coding',
    # 'figure.figsize': (12, 16),  # disabled: no global figure-size override
    'font.size': 15,
})

# Morphological analyzer (KoNLPy Okt) used for noun extraction
okt = Okt()

## EDA

In [None]:
def _normalize_text(text):
    """Keep only Hangul syllables, ASCII letters and digits; collapse space runs."""
    # Every character outside the allowed set becomes a single space ...
    cleaned = re.sub(r"[^가-힣a-zA-Z0-9]", ' ', text)
    # ... and consecutive spaces are then squeezed into one.
    return re.sub(" +", " ", cleaned)


df['column'] = df['column'].map(_normalize_text)

In [None]:
# Length of every question text, followed by its summary statistics
question_len = df['column'].map(len)
question_len.describe()

In [None]:
# Box plot of the question text lengths
question_box_fig, question_box_ax = plt.subplots()
question_box_ax.boxplot([question_len], labels=['Question 박스 플랏'])
plt.show()

In [None]:
# Histogram of the question text lengths: 45 bins over the range 0-250
question_hist_fig, question_hist_ax = plt.subplots()
question_hist_ax.hist([question_len], bins=45, range=[0, 250], color='g', label='Question')
question_hist_ax.legend()
question_hist_ax.set_title('Question Text 길이 히스토그램')
question_hist_ax.set_xlabel('Text Length')
plt.show()

In [None]:
# Length of every answer text, followed by its summary statistics
# NOTE(review): this reads the same `df['column']` key as the question-length
# cell - confirm that the intended answer column is selected here.
answer_len = df['column'].map(len)
answer_len.describe()

In [None]:
# Box plot of the answer text lengths
answer_box_fig, answer_box_ax = plt.subplots()
answer_box_ax.boxplot([answer_len], labels=['Answer 박스 플랏'])
plt.show()

In [None]:
# Histogram of the answer text lengths: 100 bins over the range 0-1500
answer_hist_fig, answer_hist_ax = plt.subplots()
answer_hist_ax.hist([answer_len], bins=100, range=[0, 1500], color='r', label='Answer')
answer_hist_ax.legend()
answer_hist_ax.set_title('Answer Text 길이 히스토그램')
answer_hist_ax.set_xlabel('Text Length')
plt.show()

### 빈도분석

In [None]:
# Flat list of every noun Okt extracts from the question texts
question_nouns = [noun for text in df['column'] for noun in okt.nouns(text)]

# Flat list of every noun Okt extracts from the answer texts
answer_nouns = [noun for text in df['column'] for noun in okt.nouns(text)]

In [None]:
def _fit_noun_vectorizer(nouns):
    """Fit a unigram CountVectorizer on ``nouns`` and return the fitted instance.

    The vocabulary is capped at 100 features, each of which must reach a
    document frequency of at least 10; no stop words are removed.
    """
    vectorizer = CountVectorizer(
        max_features=100,
        min_df=10,
        ngram_range=(1, 1),
        stop_words=[],
    )
    return vectorizer.fit(nouns)


# Frequency analysis: one vectorizer per noun list
question_count_vectorizer = _fit_noun_vectorizer(question_nouns)

answer_count_vectorizer = _fit_noun_vectorizer(answer_nouns)

In [None]:
def _index_to_word(vocabulary):
    """Invert a word -> index mapping into index -> word, visiting words in sorted order."""
    lookup = {}
    for word, idx in sorted(vocabulary.items()):
        lookup[idx] = word
    return lookup


# Index -> word dictionary for the question vocabulary
question_idx2word = _index_to_word(question_count_vectorizer.vocabulary_)

# Index -> word dictionary for the answer vocabulary
answer_idx2word = _index_to_word(answer_count_vectorizer.vocabulary_)

In [None]:
# Question word-count matrix: all question nouns joined into a single document
question_document = ' '.join(question_nouns)
question_count_matrix = question_count_vectorizer.transform([question_document])

# Answer word-count matrix: all answer nouns joined into a single document
answer_document = ' '.join(answer_nouns)
answer_count_matrix = answer_count_vectorizer.transform([answer_document])

In [None]:
def _top_counts(count_matrix, idx2word, top_n=20):
    """Return the most frequent words and their counts, highest count first.

    Args:
        count_matrix: Sparse count matrix; only its first row is read.
        idx2word: Mapping from feature index to the word it represents.
        top_n: Maximum number of entries to return.

    Returns:
        A ``(words, counts)`` pair of equally long lists. They hold fewer than
        ``top_n`` entries when the vocabulary itself is smaller, instead of
        raising ``IndexError``.
    """
    # Densify and rank once; doing this inside a loop repeats the same
    # conversion and sort for every extracted word.
    counts = count_matrix.toarray()[0]
    # argsort is ascending, so reverse it and keep the leading top_n indices.
    ranked = counts.argsort()[::-1][:top_n]
    words = [idx2word[idx] for idx in ranked]
    nums = [counts[idx] for idx in ranked]
    return words, nums


# Top-20 most frequent question words and their counts
question_count_top20_word, question_count_top20_num = _top_counts(
    question_count_matrix, question_idx2word
)

# Top-20 most frequent answer words and their counts
answer_count_top20_word, answer_count_top20_num = _top_counts(
    answer_count_matrix, answer_idx2word
)

In [None]:
# Horizontal bar chart of the most frequent question words
colors = sns.color_palette('hls', 25)

# Reverse both lists once so that words and counts stay aligned for plotting
question_words = question_count_top20_word[::-1]
question_nums = question_count_top20_num[::-1]

plt.figure(figsize=(10,15))
plt.barh(question_words, question_nums, label='Question 단어 빈도', color=colors)
plt.legend()
plt.ylabel('Question 단어')
plt.xlabel('단어 빈도')
plt.title('Question 단어 빈도 Top20')
plt.yticks(question_words)

# Write each count at the end of its bar
for word, count_score in zip(question_words, question_nums):
    plt.text(count_score, word, count_score, color='#000000', horizontalalignment='right', verticalalignment='center')

plt.show()

In [None]:
# Horizontal bar chart of the most frequent answer words
colors = sns.color_palette('hls', 25)

# Reverse both lists once so that words and counts stay aligned for plotting
answer_words = answer_count_top20_word[::-1]
answer_nums = answer_count_top20_num[::-1]

plt.figure(figsize=(10,15))
plt.barh(answer_words, answer_nums, label='Answer 단어 빈도', color=colors)
plt.legend()
plt.ylabel('Answer 단어')
plt.xlabel('단어 빈도')
plt.title('Answer 단어 빈도 Top20')
plt.yticks(answer_words)

# Write each count at the end of its bar
for word, count_score in zip(answer_words, answer_nums):
    plt.text(count_score, word, count_score, color='#000000', horizontalalignment='right', verticalalignment='center')

plt.show()