In [1]:
import pandas as pd

import gensim
from gensim import corpora

import re

from IPython.core.display import HTML
import pyLDAvis
from pyLDAvis import gensim_models


In [7]:
file = '네이버_전처리.csv'

try: # CSV 파일 읽기 시도 (UTF-8 인코딩)
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:  # CSV 파일 읽기 시도 (CP949 인코딩)
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError: # CSV 파일 읽기 시도 (EUC-KR 인코딩)
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

In [4]:
df.head()

Unnamed: 0,url,title,content,preprocessed_content
0,https://n.news.naver.com/mnews/article/366/000...,"[전문] 민희진 “네이버·두나무, 투자 무관한 사적 만남”",민 대표 “하이브가 제시한 증거 자료 모두 불법”하이브 “증거 짜깁기한 적 없어”…...,"['민', '대표', '하이브', '제시', '증거', '자료', '불법', '하이..."
1,https://n.news.naver.com/mnews/article/015/000...,"""네이버·두나무와의 만남은…"" 민희진 첫 입장 표명","민희진 어도어 대표, 기자회견 후 첫 입장""네이버, 두나무, 하이브와 4자 대면 하...","['민희', '진', '어도어', '대표', '기자', '회견', '후', '입장'..."
2,https://n.news.naver.com/mnews/article/011/000...,"[전문] 기자회견 후 첫 입장 발표…민희진 “네이버·두나무, 투자 무관한 사적 만남""",서울경제스타DB[서울경제]민희진 어도어 대표가 경영권 확보를 위해 네이버와 업비트 ...,"['서울', '경제', '스타', '서울경제', '민희', '진', '어도어', '..."
3,https://n.news.naver.com/mnews/article/003/001...,"SKT ""T로밍 첫 이용 고객, 요금 50% 네이버페이 포인트로 돌려드려요""",'바로' 요금제 개편 1주년 맞아 9월까지 캐시백 이벤트최근 60개월 로밍 요금제 ...,"['요금', '개편', '캐시백', '이벤트', '로밍', '요금', '이용', '..."
4,https://n.news.naver.com/mnews/article/079/000...,"민희진 ""네이버·두나무 인수 제안 NO…뉴진스와 더 돈독""","민희진 어도어 대표 19일 공식 입장문 발표""하이브 제시 증거 모두 불법 취득""민희...","['민희', '진', '어도어', '대표', '공식', '입장', '문', '발표'..."


Couduct LDA

In [8]:
# 'preprocessed_content' 열에 있는 각 항목에 대해 하나의 문자열로 만드는 작업을 수행
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)

In [9]:
def remove_t(my_str):
    # 대괄호([]), 쉼표(,), 작은 따옴표(')를 제거하여 반환
    return re.sub('[\[\],\']', '', my_str)

# 'preprocessed_content' 열에 있는 각 항목에 대해 특정 문자들을 제거하는 작업을 수행
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)

In [10]:
# 'preprocessed_content' 열에 있는 각 항목에 대해 문자열을 공백을 기준으로 분할하는 작업을 수행
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)

In [11]:
df['preprocessed_content']

0       [민, 대표, 하이브, 제시, 증거, 자료, 불법, 하이브, 증거, 짜깁기, 민, ...
1       [민희, 진, 어도어, 대표, 기자, 회견, 후, 입장, 나무, 하이브, 자, 대면...
2       [서울, 경제, 스타, 서울경제, 민희, 진, 어도어, 대표, 경영, 확보, 업, ...
3       [요금, 개편, 캐시백, 이벤트, 로밍, 요금, 이용, 이력, 고객, 대상, 텔레콤...
4       [민희, 진, 어도어, 대표, 공식, 입장, 문, 발표, 하이브, 제시, 증거, 모...
                              ...                        
1687    [현황, 일본, 정부, 라인, 지분, 일본, 압박, 미국, 틱톡, 금지, 일본, 라...
1688    [플러스, 멤버십, 무료, 가입, 마자, 해지, 가입, 방법, 해지, 방법, 간, ...
1689    [작년, 연재, 웹툰, 전체, 썸, 네일, 대대적, 작업, 만우절, 네이버, 웹툰,...
1690    [피부, 요즘, 관심, 도, 탈모, 관리, 발양, 샴푸, 탈모, 샴푸, 기능, 제품...
1691    [애슐리, 신상, 등장, 진리, 송, 영철, 공작, 소, 제작, 라이브, 커머스, ...
Name: preprocessed_content, Length: 1692, dtype: object

In [12]:
# 데이터프레임의 'preprocessed_content' 열에서 단어 사전 생성
word_dict = corpora.Dictionary(df['preprocessed_content'])

In [13]:
# 단어 사전을 기반으로 문서를 BoW 형식으로 변환하여 코퍼스 생성
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

LDA 모델 훈련 (총 20개 토픽)

In [14]:
N_TOPICS = 20
# LDA 모델 생성
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

In [15]:
topics = ldamodel.print_topics(num_words=4) # LDA 모델에서 상위 단어 출력
for topic in topics: # 각 주제에 대해 상위 단어 출력
    print(topic)

(0, '0.018*"블로그" + 0.010*"광고" + 0.008*"검색" + 0.008*"정보"')
(1, '0.014*"라인" + 0.013*"지분" + 0.011*"일본" + 0.010*"정부"')
(2, '0.018*"라인" + 0.016*"지분" + 0.012*"정부" + 0.012*"일본"')
(3, '0.040*"라인" + 0.025*"지분" + 0.023*"일본" + 0.018*"정부"')
(4, '0.009*"키워드" + 0.009*"방법" + 0.008*"검색" + 0.008*"블로그"')
(5, '0.020*"일본" + 0.019*"라인" + 0.017*"정부" + 0.014*"야후"')
(6, '0.019*"블로그" + 0.008*"멤버십" + 0.008*"검색" + 0.008*"방법"')
(7, '0.017*"라인" + 0.013*"일본" + 0.013*"광고" + 0.012*"지분"')
(8, '0.024*"정부" + 0.024*"라인" + 0.019*"일본" + 0.018*"야후"')
(9, '0.028*"라인" + 0.013*"일본" + 0.012*"정부" + 0.011*"지분"')
(10, '0.026*"라인" + 0.017*"지분" + 0.015*"야후" + 0.015*"정부"')
(11, '0.009*"라인" + 0.008*"사용" + 0.007*"글" + 0.006*"블로그"')
(12, '0.008*"사용" + 0.008*"일본" + 0.007*"방법" + 0.007*"스토어"')
(13, '0.009*"사용" + 0.008*"방법" + 0.008*"일본" + 0.007*"확인"')
(14, '0.015*"일본" + 0.013*"라인" + 0.012*"검색" + 0.009*"블로그"')
(15, '0.010*"포인트" + 0.009*"문화" + 0.008*"방법" + 0.007*"페이"')
(16, '0.020*"페이" + 0.012*"결제" + 0.011*"방법" + 0.010*"사용"')
(17, '0.010*"광고"

LDA 결과 시각화

In [18]:
#LDA 결과 시각화
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

나머지 키워드들로 동일하게 진행

In [27]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = '카카오_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.021*"카카오톡" + 0.015*"선물" + 0.013*"사용" + 0.009*"향"')
(1, '0.011*"카카오톡" + 0.007*"서비스" + 0.007*"카카오" + 0.007*"방법"')
(2, '0.023*"카카오톡" + 0.013*"대화" + 0.010*"카카오페이" + 0.010*"삭제"')
(3, '0.020*"증가" + 0.018*"매출" + 0.015*"대비" + 0.015*"전년"')
(4, '0.017*"선물" + 0.013*"카카오톡" + 0.010*"가능" + 0.009*"사용"')
(5, '0.012*"카카오톡" + 0.011*"친구" + 0.010*"카카오" + 0.009*"대출"')
(6, '0.013*"카카오톡" + 0.011*"방법" + 0.010*"뱅크" + 0.009*"통장"')
(7, '0.014*"선물" + 0.011*"카카오톡" + 0.006*"시장" + 0.006*"서비스"')
(8, '0.011*"카카오톡" + 0.009*"가능" + 0.008*"선물" + 0.007*"사용"')
(9, '0.020*"카카오톡" + 0.012*"방법" + 0.010*"설정" + 0.010*"친구"')
(10, '0.013*"선물" + 0.009*"카카오" + 0.008*"카카오톡" + 0.006*"서비스"')
(11, '0.010*"카카오톡" + 0.008*"방법" + 0.008*"뱅크" + 0.007*"사진"')
(12, '0.031*"카카오톡" + 0.013*"친구" + 0.012*"서비스" + 0.011*"방법"')
(13, '0.010*"뱅크" + 0.009*"카카오" + 0.006*"사업" + 0.006*"금리"')
(14, '0.009*"뱅크" + 0.006*"카카오톡" + 0.006*"카카오" + 0.006*"사용"')
(15, '0.011*"카카오톡" + 0.008*"채널" + 0.008*"카카오" + 0.008*"서비스"')
(16, '0.023*"카카오톡" + 0.014*"선물" + 0.010*"

In [28]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = '라인_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.011*"라인" + 0.010*"네이버" + 0.007*"프렌즈" + 0.006*"일본"')
(1, '0.030*"네이버" + 0.017*"일본" + 0.012*"라인" + 0.011*"프렌즈"')
(2, '0.019*"네이버" + 0.018*"정부" + 0.015*"일본" + 0.008*"기업"')
(3, '0.013*"라인" + 0.011*"프렌즈" + 0.011*"피자" + 0.011*"샐리"')
(4, '0.039*"네이버" + 0.027*"정부" + 0.026*"일본" + 0.015*"야후"')
(5, '0.037*"일본" + 0.037*"네이버" + 0.022*"정부" + 0.018*"지분"')
(6, '0.024*"네이버" + 0.015*"야후" + 0.014*"일본" + 0.012*"지분"')
(7, '0.031*"일본" + 0.030*"네이버" + 0.022*"정부" + 0.014*"야후"')
(8, '0.036*"일본" + 0.025*"네이버" + 0.020*"정부" + 0.014*"지분"')
(9, '0.029*"네이버" + 0.021*"야후" + 0.019*"지분" + 0.019*"일본"')
(10, '0.012*"네이버" + 0.011*"어휘" + 0.007*"프렌즈" + 0.007*"라인"')
(11, '0.020*"일본" + 0.018*"네이버" + 0.014*"프렌즈" + 0.013*"지분"')
(12, '0.020*"네이버" + 0.015*"일본" + 0.013*"정부" + 0.009*"야후"')
(13, '0.013*"일본" + 0.012*"네이버" + 0.011*"프렌즈" + 0.011*"라인"')
(14, '0.022*"네이버" + 0.017*"일본" + 0.008*"라인" + 0.007*"야후"')
(15, '0.035*"네이버" + 0.020*"일본" + 0.016*"지분" + 0.014*"야후"')
(16, '0.018*"라인" + 0.017*"일본" + 0.014*"프렌즈" + 0.009*"네이버"')
(

In [29]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = '쿠팡_전처리.csv'


all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.010*"멤버십" + 0.009*"회원" + 0.009*"제품" + 0.008*"구매"')
(1, '0.014*"카드" + 0.012*"와우" + 0.010*"이용" + 0.009*"혜택"')
(2, '0.007*"분기" + 0.007*"가격" + 0.007*"멤버십" + 0.006*"고객"')
(3, '0.007*"수익" + 0.005*"집" + 0.005*"시간" + 0.005*"전"')
(4, '0.011*"상품" + 0.009*"제품" + 0.009*"배송" + 0.007*"분기"')
(5, '0.024*"상품" + 0.014*"판매" + 0.009*"시간" + 0.008*"등록"')
(6, '0.010*"상품" + 0.010*"멤버십" + 0.009*"판매" + 0.008*"와우"')
(7, '0.010*"사용" + 0.010*"플레이" + 0.008*"구매" + 0.005*"앱"')
(8, '0.021*"기업" + 0.020*"집단" + 0.019*"지정" + 0.015*"동일인"')
(9, '0.012*"멤버십" + 0.009*"인상" + 0.008*"무료" + 0.007*"가격"')
(10, '0.006*"가능" + 0.006*"사용" + 0.006*"제품" + 0.005*"생각"')
(11, '0.011*"체험" + 0.008*"지정" + 0.007*"광고" + 0.006*"집단"')
(12, '0.013*"기업" + 0.011*"집단" + 0.010*"지정" + 0.008*"동일인"')
(13, '0.015*"상품" + 0.007*"판매" + 0.007*"사용" + 0.006*"분기"')
(14, '0.013*"상품" + 0.012*"판매" + 0.008*"조사" + 0.007*"체험"')
(15, '0.024*"배달" + 0.009*"집단" + 0.008*"기업" + 0.008*"배송"')
(16, '0.010*"매출" + 0.009*"수익" + 0.009*"상품" + 0.006*"제품"')
(17, '0.011*"배달" + 0

In [30]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = '배달의민족_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.014*"주문" + 0.011*"쿠폰" + 0.010*"무료" + 0.009*"입점"')
(1, '0.015*"입점" + 0.013*"사장" + 0.011*"주문" + 0.010*"결제"')
(2, '0.009*"주문" + 0.008*"메뉴" + 0.007*"음식" + 0.007*"무료"')
(3, '0.021*"주문" + 0.015*"족발" + 0.009*"포장" + 0.008*"마왕"')
(4, '0.015*"주문" + 0.009*"앱" + 0.007*"맛" + 0.007*"음식"')
(5, '0.017*"주문" + 0.017*"족발" + 0.011*"쿠폰" + 0.011*"할인"')
(6, '0.021*"쿠폰" + 0.014*"주문" + 0.013*"할인" + 0.008*"사용"')
(7, '0.012*"주문" + 0.007*"쿠폰" + 0.007*"서비스" + 0.007*"배미"')
(8, '0.020*"족발" + 0.012*"주문" + 0.010*"맛" + 0.009*"맛집"')
(9, '0.015*"배미" + 0.014*"주문" + 0.009*"피자" + 0.008*"집"')
(10, '0.022*"주문" + 0.010*"메뉴" + 0.009*"집" + 0.008*"가게"')
(11, '0.014*"주문" + 0.011*"할인" + 0.009*"족발" + 0.008*"쿠폰"')
(12, '0.016*"주문" + 0.014*"할인" + 0.012*"쿠폰" + 0.009*"배미"')
(13, '0.015*"결제" + 0.013*"앱" + 0.010*"주문" + 0.010*"카드"')
(14, '0.008*"배미" + 0.008*"가능" + 0.007*"사용" + 0.007*"이용"')
(15, '0.016*"주문" + 0.012*"배미" + 0.012*"입점" + 0.012*"광고"')
(16, '0.014*"주문" + 0.009*"메뉴" + 0.009*"앱" + 0.007*"배미"')
(17, '0.019*"주문" + 0.008*"족발" 

In [31]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = '당근마켓_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.016*"거래" + 0.010*"중고" + 0.007*"판매" + 0.006*"서비스"')
(1, '0.028*"거래" + 0.012*"동네" + 0.010*"중고" + 0.009*"판매"')
(2, '0.009*"거래" + 0.007*"알바" + 0.006*"중고" + 0.006*"이벤트"')
(3, '0.020*"거래" + 0.012*"동네" + 0.011*"판매" + 0.010*"중고"')
(4, '0.014*"거래" + 0.010*"물건" + 0.009*"구매" + 0.009*"생각"')
(5, '0.018*"거래" + 0.016*"광고" + 0.012*"중고" + 0.011*"판매"')
(6, '0.023*"거래" + 0.008*"광고" + 0.008*"중고" + 0.008*"물건"')
(7, '0.016*"거래" + 0.007*"중고" + 0.006*"서비스" + 0.006*"플랫폼"')
(8, '0.024*"거래" + 0.014*"중고" + 0.013*"플랫폼" + 0.008*"서비스"')
(9, '0.019*"거래" + 0.010*"물건" + 0.008*"모임" + 0.008*"동네"')
(10, '0.015*"프로필" + 0.013*"거래" + 0.007*"비즈" + 0.006*"등록"')
(11, '0.020*"거래" + 0.013*"중고" + 0.008*"판매" + 0.007*"물건"')
(12, '0.024*"거래" + 0.013*"판매" + 0.008*"중고" + 0.007*"식품"')
(13, '0.030*"거래" + 0.008*"물건" + 0.008*"중고" + 0.008*"판매"')
(14, '0.029*"거래" + 0.014*"중고" + 0.012*"판매" + 0.009*"경우"')
(15, '0.026*"거래" + 0.012*"중고" + 0.008*"판매" + 0.005*"상품"')
(16, '0.017*"거래" + 0.007*"계좌" + 0.007*"구매" + 0.006*"돈"')
(17, '0.010*"판매" +

In [33]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = '토스_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.015*"뱅크" + 0.014*"이벤트" + 0.012*"링크" + 0.009*"수수료"')
(1, '0.012*"현금" + 0.011*"결제" + 0.011*"통장" + 0.010*"진행"')
(2, '0.014*"카드" + 0.010*"계좌" + 0.009*"투자" + 0.009*"뱅크"')
(3, '0.019*"뱅크" + 0.015*"증권" + 0.012*"주식" + 0.010*"분기"')
(4, '0.057*"이벤트" + 0.025*"돈" + 0.018*"친구" + 0.016*"뱅크"')
(5, '0.011*"퀴즈" + 0.010*"고양이" + 0.009*"통장" + 0.009*"결제"')
(6, '0.017*"결제" + 0.012*"통장" + 0.011*"금리" + 0.010*"금액"')
(7, '0.014*"카드" + 0.010*"이벤트" + 0.008*"결제" + 0.008*"스피킹"')
(8, '0.014*"토익" + 0.013*"스피킹" + 0.011*"시험" + 0.008*"현금"')
(9, '0.015*"토익" + 0.012*"스피킹" + 0.010*"이벤트" + 0.009*"진행"')
(10, '0.011*"뱅크" + 0.009*"주식" + 0.009*"이벤트" + 0.008*"가능"')
(11, '0.013*"결제" + 0.011*"현금" + 0.011*"진행" + 0.010*"외화"')
(12, '0.023*"뱅크" + 0.015*"통장" + 0.013*"은행" + 0.010*"금리"')
(13, '0.016*"적금" + 0.012*"스피킹" + 0.010*"토익" + 0.010*"주식"')
(14, '0.012*"주식" + 0.009*"결제" + 0.009*"증권" + 0.007*"앱"')
(15, '0.011*"돈" + 0.011*"통장" + 0.008*"카드" + 0.008*"뱅크"')
(16, '0.022*"뱅크" + 0.012*"대출" + 0.012*"환전" + 0.011*"외화"')
(17, '0.033*"뱅크"

In [34]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = '직방_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.012*"락" + 0.009*"부동산" + 0.008*"도어" + 0.008*"서비스"')
(1, '0.014*"아파트" + 0.014*"도어" + 0.013*"제품" + 0.011*"거래"')
(2, '0.016*"신탁" + 0.012*"중개" + 0.012*"거래" + 0.011*"서비스"')
(3, '0.014*"중개" + 0.013*"집" + 0.010*"도어" + 0.009*"락"')
(4, '0.021*"설치" + 0.014*"제품" + 0.012*"폰" + 0.012*"비디오"')
(5, '0.023*"락" + 0.021*"설치" + 0.017*"도어" + 0.009*"삼성"')
(6, '0.019*"거래" + 0.016*"설치" + 0.014*"아파트" + 0.008*"키"')
(7, '0.020*"분양" + 0.019*"단지" + 0.019*"락" + 0.015*"청약"')
(8, '0.026*"락" + 0.024*"도어" + 0.019*"설치" + 0.016*"지문"')
(9, '0.013*"중개" + 0.010*"청약" + 0.010*"제품" + 0.010*"매물"')
(10, '0.014*"서비스" + 0.014*"중개" + 0.012*"부동산" + 0.008*"기업"')
(11, '0.020*"도어" + 0.019*"설치" + 0.019*"락" + 0.016*"기능"')
(12, '0.032*"설치" + 0.023*"도어" + 0.018*"락" + 0.017*"키"')
(13, '0.011*"락" + 0.009*"가능" + 0.009*"도어" + 0.008*"아파트"')
(14, '0.027*"설치" + 0.024*"락" + 0.020*"도어" + 0.019*"제품"')
(15, '0.033*"설치" + 0.030*"폰" + 0.019*"도어" + 0.017*"아파트"')
(16, '0.016*"부동산" + 0.013*"매물" + 0.011*"설치" + 0.009*"아파트"')
(17, '0.015*"아파트" + 0.014*

In [35]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = '야놀자_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.014*"할인" + 0.013*"호텔" + 0.011*"쿠폰" + 0.010*"야놀자"')
(1, '0.025*"쿠폰" + 0.021*"할인" + 0.012*"최대" + 0.012*"여행"')
(2, '0.012*"쿠폰" + 0.012*"호텔" + 0.011*"할인" + 0.009*"야놀자"')
(3, '0.026*"할인" + 0.024*"여행" + 0.018*"쿠폰" + 0.015*"야놀자"')
(4, '0.021*"여행" + 0.016*"야놀자" + 0.013*"할인" + 0.012*"쿠폰"')
(5, '0.048*"할인" + 0.032*"쿠폰" + 0.020*"여행" + 0.015*"예약"')
(6, '0.015*"할인" + 0.010*"야놀자" + 0.008*"호텔" + 0.008*"여행"')
(7, '0.009*"호텔" + 0.008*"야놀자" + 0.007*"할인" + 0.007*"사용"')
(8, '0.012*"쿠폰" + 0.010*"숙박" + 0.009*"혜택" + 0.008*"여행"')
(9, '0.025*"할인" + 0.014*"여행" + 0.012*"쿠폰" + 0.012*"혜택"')
(10, '0.020*"노른자" + 0.014*"여행" + 0.013*"호텔" + 0.013*"예약"')
(11, '0.024*"쿠폰" + 0.020*"할인" + 0.015*"최대" + 0.013*"숙소"')
(12, '0.005*"전문" + 0.005*"여행" + 0.005*"할인" + 0.004*"예약"')
(13, '0.028*"쿠폰" + 0.020*"할인" + 0.015*"야놀자" + 0.013*"예약"')
(14, '0.036*"쿠폰" + 0.025*"할인" + 0.018*"여행" + 0.016*"예약"')
(15, '0.012*"호텔" + 0.009*"할인" + 0.009*"쿠폰" + 0.008*"예약"')
(16, '0.024*"할인" + 0.013*"야놀자" + 0.013*"예약" + 0.012*"여행"')
(17, '0.016*"호텔

In [36]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = '삼성전자_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.018*"시장" + 0.011*"반도체" + 0.009*"기업" + 0.007*"제품"')
(1, '0.010*"제품" + 0.009*"삼성" + 0.005*"주행" + 0.005*"자율"')
(2, '0.022*"배당금" + 0.019*"배당" + 0.009*"분기" + 0.009*"제품"')
(3, '0.011*"시장" + 0.007*"제품" + 0.007*"주식" + 0.006*"반도체"')
(4, '0.009*"반도체" + 0.008*"갤럭시" + 0.006*"투자" + 0.006*"삼성"')
(5, '0.012*"반도체" + 0.009*"제품" + 0.008*"기업" + 0.008*"기능"')
(6, '0.010*"시장" + 0.007*"주가" + 0.006*"가능" + 0.006*"사용"')
(7, '0.016*"냉장고" + 0.011*"제품" + 0.009*"기능" + 0.007*"삼성"')
(8, '0.008*"제품" + 0.007*"사용" + 0.007*"가능" + 0.007*"투자"')
(9, '0.019*"배당금" + 0.018*"반도체" + 0.016*"배당" + 0.016*"분기"')
(10, '0.018*"갤럭시" + 0.010*"기능" + 0.010*"제품" + 0.008*"기술"')
(11, '0.012*"삼성" + 0.009*"베트남" + 0.007*"제품" + 0.007*"기업"')
(12, '0.015*"배당" + 0.012*"면접" + 0.009*"삼성" + 0.007*"배당금"')
(13, '0.016*"분기" + 0.010*"주가" + 0.009*"시장" + 0.006*"영업"')
(14, '0.010*"반도체" + 0.007*"주가" + 0.007*"시장" + 0.006*"기업"')
(15, '0.013*"배당금" + 0.010*"주식" + 0.009*"지급" + 0.007*"스팀"')
(16, '0.008*"반도체" + 0.007*"에어컨" + 0.007*"사용" + 0.006*"제품"')
(17, '0.

In [37]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = 'SK하이닉스_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.021*"반도체" + 0.017*"주가" + 0.011*"하이닉스" + 0.010*"투자"')
(1, '0.022*"반도체" + 0.016*"시장" + 0.011*"기업" + 0.010*"삼성전자"')
(2, '0.010*"반도체" + 0.010*"파두" + 0.008*"투자" + 0.008*"메모리"')
(3, '0.018*"반도체" + 0.010*"하이닉스" + 0.007*"투자" + 0.006*"용인"')
(4, '0.019*"메모리" + 0.016*"반도체" + 0.011*"기업" + 0.011*"제품"')
(5, '0.017*"반도체" + 0.014*"메모리" + 0.011*"실적" + 0.011*"시장"')
(6, '0.012*"반도체" + 0.011*"분기" + 0.010*"시장" + 0.008*"하이닉스"')
(7, '0.021*"반도체" + 0.009*"기업" + 0.008*"메모리" + 0.008*"개발"')
(8, '0.015*"메모리" + 0.013*"시장" + 0.012*"반도체" + 0.012*"투자"')
(9, '0.013*"반도체" + 0.013*"삼성전자" + 0.009*"주가" + 0.008*"시장"')
(10, '0.021*"제품" + 0.014*"메모리" + 0.013*"반도체" + 0.012*"시장"')
(11, '0.014*"하이닉스" + 0.014*"반도체" + 0.008*"투자" + 0.008*"시장"')
(12, '0.013*"시장" + 0.012*"반도체" + 0.007*"투자" + 0.006*"하이닉스"')
(13, '0.009*"제품" + 0.009*"반도체" + 0.008*"하이닉스" + 0.006*"투자"')
(14, '0.036*"반도체" + 0.013*"메모리" + 0.012*"삼성전자" + 0.010*"시장"')
(15, '0.012*"투자" + 0.012*"반도체" + 0.011*"주가" + 0.009*"시장"')
(16, '0.013*"반도체" + 0.012*"분기" + 0.011*"시

In [38]:
def remove_t(my_str):
    return re.sub('[\[\],\']', '', my_str)

file = 'DB하이텍_전처리.csv'

all_N = []
try:
    df = pd.read_csv('csv_pre/'+file, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv('csv_pre/'+file, encoding='cp949')
    except UnicodeDecodeError:
        df = pd.read_csv('csv_pre/'+file, encoding='euc-kr')

df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: ''.join(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: remove_t(x) if isinstance(x, str) else x)
df['preprocessed_content'] = df['preprocessed_content'].apply(lambda x: x.split() if isinstance(x, str) else x)
word_dict = corpora.Dictionary(df['preprocessed_content'])
corpus = [word_dict.doc2bow(text) for text in df['preprocessed_content']]

N_TOPICS = 20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = N_TOPICS, id2word=word_dict)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, word_dict)
pyLDAvis.display(vis)

  return re.sub('[\[\],\']', '', my_str)


(0, '0.025*"반도체" + 0.011*"하이텍" + 0.011*"주주" + 0.010*"사업"')
(1, '0.017*"반도체" + 0.010*"파운드리" + 0.009*"기업" + 0.008*"상승"')
(2, '0.015*"반도체" + 0.010*"주주" + 0.010*"회사" + 0.009*"기업"')
(3, '0.027*"반도체" + 0.013*"주가" + 0.012*"하이텍" + 0.011*"주주"')
(4, '0.020*"반도체" + 0.011*"기업" + 0.009*"주주" + 0.009*"사업"')
(5, '0.026*"반도체" + 0.014*"주주" + 0.014*"파운드리" + 0.013*"분할"')
(6, '0.014*"주주" + 0.013*"분할" + 0.010*"물" + 0.007*"기업"')
(7, '0.015*"반도체" + 0.013*"기업" + 0.010*"주가" + 0.008*"파운드리"')
(8, '0.019*"반도체" + 0.012*"주주" + 0.011*"기업" + 0.010*"생산"')
(9, '0.015*"반도체" + 0.011*"생산" + 0.010*"주가" + 0.010*"지분"')
(10, '0.013*"주가" + 0.012*"회사" + 0.010*"반도체" + 0.009*"주주"')
(11, '0.018*"반도체" + 0.016*"하이텍" + 0.013*"주주" + 0.012*"기업"')
(12, '0.016*"반도체" + 0.010*"전력" + 0.010*"주주" + 0.009*"이익"')
(13, '0.021*"반도체" + 0.015*"기업" + 0.012*"주가" + 0.009*"하이텍"')
(14, '0.021*"반도체" + 0.013*"주주" + 0.011*"지분" + 0.009*"회사"')
(15, '0.016*"주주" + 0.009*"하이텍" + 0.007*"주" + 0.007*"회사"')
(16, '0.022*"주주" + 0.012*"지분" + 0.012*"하이텍" + 0.009*"기업"')
