# 월간식당 빅데이터 분석

* 키워드별 단어 출현 빈도

In [1]:
import re
import numpy as np
import pandas as pd
import json
import string
from pandas import read_excel

import warnings
warnings.simplefilter("ignore")

from konlpy.utils import pprint
from konlpy.tag import Hannanum
hannanum = Hannanum()

from konlpy.tag import Kkma
kkma = Kkma()

from gensim.models.word2vec import Word2Vec
from tqdm import tqdm

from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer

from msba import posts as p
from msba import stopwords as stopwords

In [8]:
def interested_words():
    """Load the keyword columns from the desk-research workbook.

    Reads the '소비키워드' sheet of 'deskresearch_.xlsx', taking the column
    names from the row at index 1 (``header=1``).

    Returns:
        tuple: the '핵심단어', '대체어' and '키워드' columns, in that order.
    """
    sheet = read_excel('deskresearch_.xlsx', sheet_name='소비키워드', header=1)
    return sheet['핵심단어'], sheet['대체어'], sheet['키워드']

def preprocessing(text):
    """Strip escaped newlines and quote characters from a raw text field.

    Args:
        text (str): raw text to clean.

    Returns:
        str: ``text`` with every literal backslash + ``n`` sequence replaced by
        a space and every single or double quote character removed. Digits and
        real newline characters are left untouched.
    """
    # The pattern matches the two-character sequence backslash + "n" (an
    # escaped newline left in the text), not an actual newline character.
    text = re.sub(r'\\n', ' ', text)
    text = re.sub("['\"]", '', text)
    # A digit-stripped copy used to be built here and then thrown away; it
    # never affected the returned value, so it has been dropped.
    return text

def preprocessing_2(rows):
    """Split rows into unique sentence fragments.

    Each row has non-breaking spaces and tabs removed and is stripped, then is
    split on any of the characters ``. , ? ! ★ ~``.

    Args:
        rows: iterable of strings.

    Returns:
        list: the distinct fragments, in order of first appearance. Fragments
        are not stripped individually, and an empty fragment (e.g. after a
        trailing delimiter) is kept as ``""``.
    """
    delimiters = re.compile(r"[.,?!★~]")
    # A dict is used as an insertion-ordered set: de-duplication happens once,
    # in a single pass, instead of rebuilding a list and a set for every row.
    unique = {}
    for row in rows:
        cleaned = row.replace("\xa0", "").replace("\t", "").strip()
        unique.update(dict.fromkeys(delimiters.split(cleaned)))
    return list(unique)

def convert_list_to_string(org_list, seperator=' '):
    """Join the items of ``org_list`` into one string.

    Args:
        org_list: iterable of strings to concatenate.
        seperator: string placed between consecutive items (default: a space).

    Returns:
        The concatenated string.
    """
    joined = seperator.join(org_list)
    return joined

# 실전 : 말뭉치 생성 및 전처리
def preprocessing_3(sentences):
    """Build the corpus: normalise spacing and drop blank or advertisement sentences.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        list: the sentences, stripped and with every run of spaces collapsed to
        a single space, excluding those that are empty after normalisation and
        those matched by the '광고정보' pattern.
    """
    ad_marker = re.compile(r".*(광고정보).*")
    space_run = re.compile(r" {2,}")

    corpus = []
    for sentence in sentences:
        # Normalise before the emptiness check so whitespace-only input is
        # dropped rather than stored as "". Runs of any length are collapsed
        # (a single replace("  ", " ") pass leaves "a  b" for "a    b").
        sentence = space_run.sub(" ", sentence.strip())
        if not sentence:
            continue
        if ad_marker.match(sentence):
            continue
        corpus.append(sentence)
    return corpus

def preprocessing_4(docs):
    """Return the 100 most frequent terms in ``docs`` as a DataFrame.

    Terms are counted with a ``CountVectorizer`` that uses
    ``stopwords.stopwords_kr`` as stop words and keeps only terms whose
    document frequency is between ``min_df=5`` and ``max_df=200``.

    Args:
        docs: iterable of document strings.

    Returns:
        pandas.DataFrame: columns '단어' (term) and '빈도' (total count),
        sorted by descending count, at most 100 rows.
    """
    vect = CountVectorizer(stop_words=stopwords.stopwords_kr, min_df=5, max_df=200)
    term_doc = vect.fit_transform(docs)

    # Sum columns on the sparse matrix; converting it to a dense array first
    # would allocate documents x vocabulary cells just to add them up.
    count = np.asarray(term_doc.sum(axis=0)).ravel()
    idx = np.argsort(-count)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    get_names = getattr(vect, "get_feature_names_out", None) or vect.get_feature_names
    feature_name = np.asarray(get_names())[idx]

    tf_list = list(zip(feature_name, count[idx]))[:100]
    return pd.DataFrame(tf_list, columns=['단어', '빈도'])

def remove_stopwords(sentence):
    """Split ``sentence`` on single spaces and drop the stop words.

    Args:
        sentence (str): text to tokenise.

    Returns:
        list: the space-separated tokens that are not in
        ``stopwords.stopwords_kr``, in their original order.
    """
    return [
        token
        for token in sentence.split(' ')
        if token not in stopwords.stopwords_kr
    ]

def save_to_csv(keyword, tf_df):
    """Write ``tf_df`` to './output/유관어_<keyword>.csv'.

    Spaces are removed from ``keyword`` before it is used in the file name.
    The file is written as UTF-8 with a BOM ('utf-8-sig') and dates are
    formatted as YYYYMMDD.

    Args:
        keyword (str): keyword used to build the file name.
        tf_df: object exposing ``to_csv`` (a pandas DataFrame in this notebook).
    """
    compact_keyword = keyword.replace(" ", "")
    target = "./output/유관어_{}.csv".format(compact_keyword)
    tf_df.to_csv(target, date_format='%Y%m%d', encoding='utf-8-sig')

## 키워드/관심어 불러오기

In [3]:
# NOTE(review): this rebinds the name `interested_words` from the loader
# function to its third return value, so this statement cannot be executed a
# second time without redefining the function first.
keywords, subkeywords, interested_words = interested_words()

# Turn each comma-separated term list into a "|"-separated string with all
# spaces removed. The values are only held in the loop variables; nothing is
# collected.
for keyword, subkeyword, interested_word in zip(keywords, subkeywords, interested_words):
    subkeyword = "|".join(subkeyword.replace(" ", "").split(","))
    interested_word = "{}|{}".format(
        subkeyword,
        "|".join(interested_word.replace(" ", "").split(",")),
    )

# 자신의 관심사에 맞는 단어로 데이터 가져오기

In [7]:
# Run the pipeline for one hard-coded keyword (the loop over every
# keyword / sub-keyword / interested-word triple is disabled).
keyword = 'RMR'.replace(" ", "")

# Load the posts for the keyword, keep 2019-07-01 <= date < 2020-07-01 and
# drop duplicate rows.
df = p.readall(keyword)
df = df[(df['date'] >= '2019-07-01') & (df['date'] < '2020-07-01')].drop_duplicates()

# Clean title and content, then add the two Series element-wise.
# NOTE(review): assuming both columns hold strings, title and content are
# joined with no separator between them — confirm that is intended.
rows = df['title'].apply(preprocessing) + df['content'].apply(preprocessing)

# Split into unique sentence fragments, then normalise and filter them.
sentences = preprocessing_3(preprocessing_2(rows))

# Extract nouns from every sentence after stop-word removal.
# NOTE(review): remove_stopwords returns a list of tokens, which is what gets
# passed to kkma.nouns here — confirm the installed konlpy accepts that.
dataset = []
for sentence in tqdm(sentences):
    sentence = remove_stopwords(sentence)
    dataset.append(kkma.nouns(sentence))

# Drop one-character tokens and tokens made up only of digits.
dataset = [
    [noun for noun in nouns if len(noun) != 1 and not noun.isdigit()]
    for nouns in dataset
]

# Build the model.
model = Word2Vec(
    dataset,       # noun lists produced after stop-word removal
    sg=1,          # skip-gram: predict the surrounding words from the centre word
    window=5,      # use up to 5 words on each side of the centre word
    min_count=1,   # train on every word that appears at least once
)
model.init_sims(replace=True)

# Map each vocabulary word to its occurrence count.
w2c = {item: model.wv.vocab[item].count for item in model.wv.vocab}

100%|█████████▉| 21178/21195 [06:00<00:00, 70.87it/s] 

KeyboardInterrupt: 

In [None]:
# Word counts ordered from most to least frequent.
w2cSorted = {
    word: freq
    for word, freq in sorted(w2c.items(), key=lambda pair: pair[1], reverse=True)
}

# The 100 words most similar to the keyword, with their similarity scores,
# saved through save_to_csv.
df_co = pd.DataFrame(
    model.wv.most_similar(keyword, topn=100),
    columns=['단어', '유사도'],
)
save_to_csv(keyword, df_co)

In [None]:
from konlpy.tag import Okt

# Quick check of the Okt tagger: tag one sample sentence with the norm and
# stem options switched on, then print the result.
okt = Okt()
text = okt.pos("이수는 노래를 잘 부릅니다.", norm=True, stem=True)
print(text)

In [None]:
# from soynlp.noun import LRNounExtractor
# noun_extractor = LRNounExtractor(verbose=True)
# from soykeyword.lasso import LassoKeywordExtractor

# from soynlp.noun import LRNounExtractor
# noun_extractor = LRNounExtractor(verbose=True)
# noun_extractor.train(sentences)
# nouns = noun_extractor.extract(min_noun_score=0.99,
#     min_noun_frequency=20)
# nouns

# noun_extractor = LRNounExtractor(verbose=True)
# noun_extractor.train(sentences)
# nouns = noun_extractor.extract()


# from soykeyword.lasso import LassoKeywordExtractor

# lassobased_extractor = LassoKeywordExtractor(
#     costs=[500, 200, 100, 50, 10, 5, 1, 0.1],
#     min_tf=20, 
#     min_df=10
# )

# lassobased_extractor.train(sentences)
# words = lassobased_extractor.extract_from_word(
#     5537,
#     min_num_of_keywords=30
# )

# dataset = words