In [1]:
sentences = [
    "오늘 날씨가 좋아서 나들이 가고 싶다.", # -> [오늘, 날씨가, 좋아서 ...]
    "이 영화는 정말 재미있었어요.",
    "맛있는 음식을 먹으러 갈까요?",
    "운동을 하면 건강에 좋아지는 것 같아요.",
    "공부하기 싫어서 미루고 있어요.",
    "여행 계획을 세우고 있는데 어디로 갈까요?",
    "좋은 책을 읽으면 마음이 편안해져요.",
    "오늘은 친구들과 만나서 재미있게 놀았어요.",
    "새로운 언어를 배우는 것은 어려워도 흥미로워요.",
    "주말에 가족들과 함께 시간을 보내기로 했습니다."
]

## 자연어 전처리

In [2]:
from mecab import MeCab
mecab = MeCab()
# 불용어 리스트 생성 (예시)
stopwords = ['가', '고', '을', '를', '이', '는']
from konlpy.tag import Okt

# Okt 형태소 분석기 인스턴스 생성
okt = Okt()

# 불용어 리스트 생성 (예시)
stopwords = ['가', '고', '을', '를', '이', '는']

# 토크나이징 함수 정의
def tokenizer(raw, pos=["Noun","Alpha","Verb","Number"], stopword=stopwords):
    return [
        word for word, tag in okt.pos(
            raw, 
            norm=True,   # normalize 그랰ㅋㅏ -> 그래ㅋㅋ
            stem=True    # stemming 바뀌나->바뀌다
            )
            if len(word) > 1 and tag in pos and word not in stopword

    ]

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer = tokenizer, use_idf=True)
features = vectorizer.fit_transform(sentences)
features.toarray()



array([[0.46015789, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.46015789, 0.46015789, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.46015789, 0.        , 0.        ,
        0.        , 0.        , 0.39117625, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.51519219, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
      

## LDA 적용

In [4]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model = LatentDirichletAllocation(n_components=3, random_state=111)

In [5]:
lda_model.fit(features)

In [6]:
lda_model.components_
# len(lda_model.components_[0])
# topics_list = lda_model.components_[0]  # 첫번째 토픽

array([[0.33440991, 0.79926725, 0.33449536, 0.86083679, 0.33438975,
        0.95573491, 0.33440991, 0.33440991, 0.33452947, 0.33498098,
        0.33452947, 0.33467772, 0.95573491, 1.03622195, 0.79926725,
        0.33438975, 0.79926725, 0.33440991, 0.33438975, 1.03622195,
        0.33438975, 0.33501138, 0.33441824, 0.86083679, 0.33467772,
        0.33498098, 0.33501138, 0.86083679, 0.79926725, 0.33452947,
        1.53902864],
       [0.79119095, 0.33467677, 0.33462884, 0.33482318, 0.33451201,
        0.33503673, 0.79119095, 0.79119095, 0.8490693 , 0.33517004,
        0.8490693 , 0.33483168, 0.33503673, 0.33542304, 0.33467677,
        0.33451201, 0.33467677, 0.79119095, 0.33451201, 0.33542304,
        0.33451201, 1.03685684, 1.16278664, 0.33482318, 0.33483168,
        0.33517004, 1.03685684, 0.33482318, 0.33467677, 0.8490693 ,
        0.33475147],
       [0.33455703, 0.33470187, 1.23724423, 0.33485083, 0.79125614,
        0.33506824, 0.33455703, 0.33455703, 0.33469213, 1.03695576,
      

## 토픽별 단어 표시

In [7]:
dictionary_list = vectorizer.get_feature_names_out()
dictionary_list
# len(vectorizer.get_feature_names_out())

array(['가다', '가족', '갈다', '건강', '계획', '공부', '나들이', '날씨', '놀다', '마음', '만나다',
       '먹다', '미루다', '배우다', '보내다', '세우다', '시간', '싶다', '어디', '언어', '여행',
       '영화', '오늘', '운동', '음식', '읽다', '정말', '좋아지다', '주말', '친구', '하다'],
      dtype=object)

In [12]:
import pandas as pd
# df_datas = [topics_list, dictionary_list]
# df_topics = pd.DataFrame(data=df_datas)
# df_topics = df_topics.T
# df_topics[:2]

In [9]:
# df_topics.columns

In [10]:
# df_topics.sort_values(0, ascending=False)

## 댓글과 주요 토픽 연결

## 상위 단어 추출
topics_list = list()
for topic in lda_model.components_: 
    df_datas = [topic, dictionary_list]
    df_topics = pd.DataFrame(data=df_datas)
    df_topics = df_topics.T
    df_topics = df_topics.sort_values(0, ascending=False)
    # print(df_topics[:3])
    topics_text = ' '.join(df_topics[1].values[:4]) # get values form Series (4 = word 갯수)
    print(topics_text)
    topics_list.append(topics_text)

In [14]:
# ' '.join(df_topics[1].values[:4]) # 시리즈를 하나의 문장 생성

In [16]:
# topics_list_add = [topics_list, ['Topic0', 'Topic1', 'Topic2']]
# df_topic_keywords = pd.DataFrame(topics_list_add)
# df_topic_keywords.T

In [17]:
topics_output = lda_model.transform(features)
topics_output

array([[0.10462299, 0.7905512 , 0.10482582],
       [0.13946382, 0.72088285, 0.13965333],
       [0.12347941, 0.12362429, 0.7528963 ],
       [0.77339137, 0.11328861, 0.11332002],
       [0.75164634, 0.12416304, 0.12419063],
       [0.10458222, 0.10474923, 0.79066855],
       [0.13943843, 0.13959605, 0.72096551],
       [0.11268585, 0.77443415, 0.11288   ],
       [0.72035161, 0.13980797, 0.13984042],
       [0.78957076, 0.10519796, 0.10523128]])

## 각 댓글별 topic 분류

In [18]:
lda_model.n_components

3

In [19]:
df_topics_score = pd.DataFrame(topics_output) # 각 댓글 당 토픽 분류 점수
df_topics_score

Unnamed: 0,0,1,2
0,0.104623,0.790551,0.104826
1,0.139464,0.720883,0.139653
2,0.123479,0.123624,0.752896
3,0.773391,0.113289,0.11332
4,0.751646,0.124163,0.124191
5,0.104582,0.104749,0.790669
6,0.139438,0.139596,0.720966
7,0.112686,0.774434,0.11288
8,0.720352,0.139808,0.13984
9,0.789571,0.105198,0.105231


In [20]:
# 각 댓글 마다 토픽 분류 지정
import numpy as np
dominant_in_topic = np.argmax(topics_output)
dominant_in_topic

17

In [21]:
df_topics_score['dominant_topic'] = dominant_in_topic
df_topics_score[:3]

Unnamed: 0,0,1,2,dominant_topic
0,0.104623,0.790551,0.104826,17
1,0.139464,0.720883,0.139653,17
2,0.123479,0.123624,0.752896,17


In [23]:
df_topics_score['sentences'] = sentences
# df_topics_score[:3]

In [24]:
df_topics_score

Unnamed: 0,0,1,2,dominant_topic,sentences
0,0.104623,0.790551,0.104826,17,오늘 날씨가 좋아서 나들이 가고 싶다.
1,0.139464,0.720883,0.139653,17,이 영화는 정말 재미있었어요.
2,0.123479,0.123624,0.752896,17,맛있는 음식을 먹으러 갈까요?
3,0.773391,0.113289,0.11332,17,운동을 하면 건강에 좋아지는 것 같아요.
4,0.751646,0.124163,0.124191,17,공부하기 싫어서 미루고 있어요.
5,0.104582,0.104749,0.790669,17,여행 계획을 세우고 있는데 어디로 갈까요?
6,0.139438,0.139596,0.720966,17,좋은 책을 읽으면 마음이 편안해져요.
7,0.112686,0.774434,0.11288,17,오늘은 친구들과 만나서 재미있게 놀았어요.
8,0.720352,0.139808,0.13984,17,새로운 언어를 배우는 것은 어려워도 흥미로워요.
9,0.789571,0.105198,0.105231,17,주말에 가족들과 함께 시간을 보내기로 했습니다.
