# Latent Dirichlet Allocation (LDA)

자연어 처리에서 잠재 디리클레 할당(Latent Dirichlet allocation, LDA)은 주어진 문서에 대하여 각 문서에 어떤 주제들이 존재하는지를 서술하는 확률적 토픽 모델 기법

미리 알고 있는 주제별 단어수 분포를 바탕으로, 주어진 문서에서 발견된 단어수 분포를 분석함으로써 해당 문서가 어떤 주제들을 함께 다루고 있을지를 예측

1) TF-IDF   
2) 잠재 의미 분석(Latent semantic analysis, LSA) = 잠재 의미 인덱싱(Latent semantic indexing, LSI)   
3) 확률 잠재 의미 분석(Probabilistic latent semantic analysis, pLSA) = 확률 잠재 의미 인덱싱(Probabilistic latent semantic indexing, pLSI)   
4) 잠재 디리클레 할당(Latent Dirichlet allocation, LDA)   
5) 토픽 모델링 분야 탄생   

In [1]:
# Put the parent directory ('..') at the front of the module search path.
# NOTE(review): presumably this is what makes the project-local `util`
# package importable from this notebook's folder -- confirm the layout.
import sys; sys.path.insert(0, '..')

from util.data_loader import DataLoader
from util.metric_calculator import MetricCalculator

In [2]:
# Load the MovieLens data through the project's DataLoader.
# NOTE(review): `num_users` / `num_test_items` presumably cap the number of
# users and the per-user test items -- confirm against DataLoader.
data_loader = DataLoader(
    num_users=1000,
    num_test_items=5,
    data_path='../data/ml-10M100K/',
)
movielens = data_loader.load()

In [3]:
# Recommendation with LDACollaborationRecommender.
# The recommender is built with its default arguments and run on the loaded
# `movielens` dataset; the result is kept in `recommend_result` for the
# evaluation cell.
from src.lda_collaboration import LDACollaborationRecommender

recommender = LDACollaborationRecommender()
recommend_result = recommender.recommend(movielens)

  "class": algorithms.Blowfish,
2024-03-13 19:24:36,082 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-03-13 19:24:36,212 : INFO : built Dictionary<4987 unique tokens: ['185', '231', '292', '316', '329']...> from 997 documents (total 67731 corpus positions)
2024-03-13 19:24:36,219 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<4987 unique tokens: ['185', '231', '292', '316', '329']...> from 997 documents (total 67731 corpus positions)", 'datetime': '2024-03-13T19:24:36.219038', 'gensim': '4.3.2', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2024-03-13 19:24:36,293 : INFO : using symmetric alpha at 0.02
2024-03-13 19:24:36,293 : INFO : using symmetric eta at 0.02
2024-03-13 19:24:36,302 : INFO : using serial LDA version on this node
2024-03-13 19:24:36,336 : INFO : running online (multi-pass) LDA training, 50 topics, 30 passes over the supplied corpus of 99

2024-03-13 19:25:07,337 : INFO : topic #15 (0.020): 0.007*"50" + 0.005*"260" + 0.005*"1196" + 0.005*"541" + 0.005*"608" + 0.004*"1199" + 0.004*"1270" + 0.004*"1089" + 0.004*"1288" + 0.004*"1136"
2024-03-13 19:25:07,337 : INFO : topic #37 (0.020): 0.009*"2571" + 0.009*"2959" + 0.008*"4993" + 0.008*"2706" + 0.007*"4306" + 0.006*"6377" + 0.006*"2683" + 0.006*"2762" + 0.006*"7438" + 0.006*"2858"
2024-03-13 19:25:07,337 : INFO : topic diff=1.788786, rho=0.377964
2024-03-13 19:25:10,076 : INFO : -8.407 per-word bound, 339.3 perplexity estimate based on a held-out corpus of 997 documents with 67731 words
2024-03-13 19:25:10,076 : INFO : PROGRESS: pass 6, at document #997/997
2024-03-13 19:25:12,874 : INFO : topic #42 (0.020): 0.010*"1234" + 0.010*"593" + 0.010*"608" + 0.009*"318" + 0.009*"2683" + 0.007*"1210" + 0.007*"1230" + 0.007*"1213" + 0.007*"1304" + 0.006*"1617"
2024-03-13 19:25:12,874 : INFO : topic #18 (0.020): 0.009*"908" + 0.009*"912" + 0.009*"1210" + 0.009*"4563" + 0.009*"5584" + 0

2024-03-13 19:25:46,287 : INFO : topic #8 (0.020): 0.013*"318" + 0.011*"1193" + 0.008*"2858" + 0.007*"678" + 0.007*"1914" + 0.007*"32" + 0.007*"457" + 0.007*"1617" + 0.007*"2124" + 0.007*"2150"
2024-03-13 19:25:46,289 : INFO : topic #16 (0.020): 0.012*"7153" + 0.009*"356" + 0.009*"2571" + 0.009*"480" + 0.008*"6377" + 0.008*"110" + 0.008*"1210" + 0.008*"589" + 0.007*"1036" + 0.007*"1270"
2024-03-13 19:25:46,289 : INFO : topic #9 (0.020): 0.010*"3387" + 0.010*"1431" + 0.010*"4675" + 0.010*"511" + 0.009*"5391" + 0.007*"3861" + 0.007*"4121" + 0.007*"4066" + 0.007*"5292" + 0.006*"260"
2024-03-13 19:25:46,289 : INFO : topic #20 (0.020): 0.008*"2336" + 0.006*"3052" + 0.005*"4903" + 0.005*"2442" + 0.005*"2724" + 0.005*"2762" + 0.005*"357" + 0.004*"260" + 0.004*"2396" + 0.004*"912"
2024-03-13 19:25:46,289 : INFO : topic #19 (0.020): 0.011*"11" + 0.010*"3646" + 0.010*"1220" + 0.010*"648" + 0.010*"1584" + 0.009*"1250" + 0.008*"1213" + 0.008*"3578" + 0.008*"3793" + 0.007*"296"
2024-03-13 19:25:46,

2024-03-13 19:26:18,917 : INFO : topic #28 (0.020): 0.017*"296" + 0.016*"50" + 0.014*"318" + 0.013*"593" + 0.013*"2858" + 0.013*"1196" + 0.012*"527" + 0.012*"1210" + 0.011*"2571" + 0.011*"260"
2024-03-13 19:26:18,923 : INFO : topic diff=0.123120, rho=0.223607
2024-03-13 19:26:21,608 : INFO : -8.011 per-word bound, 257.9 perplexity estimate based on a held-out corpus of 997 documents with 67731 words
2024-03-13 19:26:21,609 : INFO : PROGRESS: pass 19, at document #997/997
2024-03-13 19:26:23,677 : INFO : topic #5 (0.020): 0.010*"527" + 0.008*"608" + 0.007*"4306" + 0.007*"8961" + 0.007*"4226" + 0.007*"296" + 0.006*"912" + 0.006*"1270" + 0.006*"2571" + 0.006*"356"
2024-03-13 19:26:23,677 : INFO : topic #12 (0.020): 0.016*"1704" + 0.013*"318" + 0.012*"1784" + 0.012*"2858" + 0.011*"593" + 0.010*"2762" + 0.010*"2028" + 0.009*"2324" + 0.009*"527" + 0.008*"1358"
2024-03-13 19:26:23,677 : INFO : topic #7 (0.020): 0.012*"2762" + 0.012*"260" + 0.012*"1198" + 0.010*"1196" + 0.010*"1200" + 0.010*"2

2024-03-13 19:26:56,861 : INFO : topic #13 (0.020): 0.008*"1573" + 0.007*"589" + 0.007*"1527" + 0.006*"2571" + 0.006*"1676" + 0.006*"2028" + 0.006*"2762" + 0.006*"1580" + 0.005*"316" + 0.005*"2058"
2024-03-13 19:26:56,861 : INFO : topic #9 (0.020): 0.010*"3387" + 0.010*"1431" + 0.010*"4675" + 0.010*"511" + 0.009*"5391" + 0.007*"4121" + 0.007*"4066" + 0.007*"3861" + 0.006*"5292" + 0.006*"5650"
2024-03-13 19:26:56,861 : INFO : topic #31 (0.020): 0.017*"265" + 0.017*"307" + 0.017*"232" + 0.016*"36" + 0.016*"194" + 0.015*"272" + 0.013*"342" + 0.013*"475" + 0.013*"296" + 0.012*"535"
2024-03-13 19:26:56,861 : INFO : topic #42 (0.020): 0.011*"2683" + 0.011*"1234" + 0.008*"1269" + 0.008*"1230" + 0.008*"16" + 0.007*"1210" + 0.007*"1213" + 0.007*"608" + 0.007*"1303" + 0.007*"2431"
2024-03-13 19:26:56,877 : INFO : topic diff=0.050780, rho=0.192450
2024-03-13 19:27:00,145 : INFO : -7.967 per-word bound, 250.2 perplexity estimate based on a held-out corpus of 997 documents with 67731 words
2024-03-

In [4]:
# Evaluation: compare the recommender's output with the test split.
# NOTE(review): the first two arguments look like true vs. predicted
# ratings and the next two like true vs. recommended items per user --
# confirm the parameter order against MetricCalculator.calc.
metric_calculator = MetricCalculator()

metrics = metric_calculator.calc(
    movielens.test.rating.tolist(),
    recommend_result.rating.tolist(),
    movielens.test_user2items,
    recommend_result.user2items,
    k=10,
)

print(metrics)

rmse=0.000, Precision@K=0.022, Recall@K=0.071
