In [None]:
!pip install konlpy

In [None]:
!pip install -U numpy

In [None]:
!pip install -U pyLDAvis

In [None]:
import pandas as pd

In [None]:
# Load the review CSV (path is relative to the notebook's working directory)
# and preview the first rows.
review_csv_path = '대학백과_삼육대_리뷰.csv'
df = pd.read_csv(review_csv_path)
df.head()

In [None]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# Non-inplace drop with errors='ignore' keeps this cell safe to re-run:
# the original in-place drop raised KeyError on a second execution because
# the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

In [None]:
# Give the review-text column (named '0' in the CSV) a descriptive name.
column_renames = {'0': '평가'}
df = df.rename(columns=column_renames)

In [None]:
# Preview the frame again to confirm the review text now sits under '평가'.
df.head()

In [None]:
from konlpy.tag import Okt
import re

In [None]:
# Everything except Hangul jamo/syllables and whitespace is stripped before tagging.
_NON_KOREAN_RE = re.compile(r"[^ㄱ-ㅣ가-힣\s]")

# Part-of-speech tags whose words are kept in the output.
_KEPT_POS_TAGS = frozenset(('Adjective', 'Verb', 'Noun'))

# Shared Okt tagger, created on first use (see tokenize_text).
_okt_tagger = None


def tokenize_text(text):
    """Reduce one review to a space-separated string of its content words.

    The input is converted with ``str()``, every character that is not Hangul
    or whitespace is removed, and the remainder is POS-tagged with Okt. Only
    words tagged 'Adjective', 'Verb' or 'Noun' are kept, in their original
    order.

    Args:
        text: The raw review. Any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        str: The kept words joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    global _okt_tagger
    # Build the tagger once and reuse it: the original constructed a new
    # Okt() for every single review, which is pure repeated setup cost.
    if _okt_tagger is None:
        _okt_tagger = Okt()

    cleaned = _NON_KOREAN_RE.sub("", str(text))
    kept_words = [word for word, pos in _okt_tagger.pos(cleaned) if pos in _KEPT_POS_TAGS]
    return ' '.join(kept_words)

In [None]:
from tqdm import tqdm

# Tokenize every review in the '평가' column, with a progress bar,
# producing one space-separated token string per row of df.
token_list = [tokenize_text(review) for review in tqdm(df['평가'])]
token_list

In [None]:
# Drop documents that contain fewer than 3 distinct tokens.
#
# The matching rows are removed from df as well, so that position i in
# token_list keeps referring to row i of df. The original only shrank
# token_list, which shifted every later position and made the final cell
# (which looks reviews up with df['평가'].iloc[no]) print the wrong reviews.
if len(token_list) != len(df):
    raise ValueError(
        'token_list and df have different lengths; re-run the tokenization cell first'
    )

# True for documents that are long enough to keep.
keep_mask = [len(set(corpus.split())) >= 3 for corpus in token_list]

# The token strings that are filtered out, kept for inspection.
drop_corpus = [corpus for corpus, keep in zip(token_list, keep_mask) if not keep]

# Single-pass filter; replaces the quadratic list.remove() loop and yields
# the same surviving documents in the same order.
token_list = [corpus for corpus, keep in zip(token_list, keep_mask) if keep]
df = df.loc[keep_mask].reset_index(drop=True)

token_list

In [None]:
# NOTE(review): numpy is already upgraded by an earlier install cell, and pandas
# has been imported by this point. Upgrading numpy here presumably needs a
# kernel restart to take effect - confirm whether this cell is still needed.
!pip install --upgrade pip
!pip install --upgrade numpy

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Bag-of-words settings: unigrams and bigrams, document-frequency bounds,
# and a cap on the vocabulary size.
vectorizer_options = dict(
    max_df=0.2,
    max_features=1000,
    min_df=3,
    ngram_range=(1,2),
)
count_vec = CountVectorizer(**vectorizer_options)

# Fit the vocabulary on the tokenized reviews and build the document-term matrix.
feat_vect = count_vec.fit_transform(token_list)
print(feat_vect.shape)
print(count_vec.vocabulary_)

In [None]:
# Terms of the fitted vectorizer; display_topics() later indexes this by the
# column positions it takes from lda.components_.
feature_names = count_vec.get_feature_names_out()

In [None]:
# Fit a 5-topic LDA model on the document-term matrix.
# random_state is fixed so that Restart & Run All reproduces the same topics;
# without it the topics (and their numbering) change on every run.
lda = LatentDirichletAllocation(n_components=5,max_iter=20,random_state=42)
lda.fit(feat_vect)

In [None]:
# Inspect the fitted components_ array; display_topics() below treats each
# row as one topic and ranks feature indices by the values in that row.
lda.components_

In [None]:
def display_topics(model,feature_names,num_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` this prints a '토픽 <index>' header
    line followed by one line with the top terms, separated by spaces and
    ordered from highest to lowest weight.

    Args:
        model: Object exposing ``components_``, iterated row by row (one row
            per topic).
        feature_names: Indexable collection mapping a column position of
            ``components_`` to its term.
        num_top_words: How many terms to print per topic.
    """
    for topic_no, weights in enumerate(model.components_):
        print('토픽',topic_no)
        # argsort is ascending, so reverse it before taking the leading entries.
        ranked_term_ids = weights.argsort()[::-1][:num_top_words]
        print(' '.join([feature_names[term_id] for term_id in ranked_term_ids]))
    

In [None]:
# Print the 15 highest-weighted terms of each fitted topic.
display_topics(lda,feature_names,15)

In [None]:
# Build and show the pyLDAvis view of the fitted model.
import pyLDAvis.lda_model
pyLDAvis.enable_notebook()
# prepare() is given the fitted LDA model, the document-term matrix it was
# fitted on, and the vectorizer that produced that matrix.
vis = pyLDAvis.lda_model.prepare(lda,feat_vect,count_vec)
pyLDAvis.display(vis)

In [None]:
# Transform the document-term matrix with the fitted model. The next cell
# reads sent_topic row by row (one row per document) and takes the argmax of
# each row as that document's topic number.
sent_topic = lda.transform(feat_vect)
# Show the first document's row as a quick check.
print(sent_topic[0])

In [None]:
# One record per document: its position in sent_topic, the index of its
# largest topic value, and that value.
doc_per_topic_list = [
    [doc_no, topic_dist.argmax(), topic_dist.max()]
    for doc_no, topic_dist in enumerate(sent_topic)
]

topic_columns = ['no','토픽번호','확률']
doc_topic_df = pd.DataFrame(doc_per_topic_list,columns=topic_columns)
doc_topic_df



In [None]:
# For every topic that has at least one document assigned to it, print up to
# three reviews with the highest probability for that topic.
#
# Iterating over the topic ids that actually occur (instead of
# range(number of unique ids)) avoids skipping or mislabelling topics when
# some topic has no documents, and head(3) avoids an IndexError when a topic
# has fewer than three documents.
#
# NOTE(review): 'no' is a position in sent_topic. The lookup below assumes the
# rows of df are in the same order and count as the documents that were
# vectorized - confirm that any document filtering was applied to df as well.
for topic in sorted(doc_topic_df['토픽번호'].unique()):
    print('토픽',topic)
    top_topic = doc_topic_df[doc_topic_df['토픽번호']==topic].sort_values(by='확률',ascending=False)
    for doc_no in top_topic['no'].head(3):
        print(df['평가'].iloc[doc_no])