***

# __Inférence variationnelle stochastique pour LDA__

In [114]:
import json
import time

import numpy as np
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
from scipy.special import digamma

from src import vectorize_corpus

In [54]:
# Load the Wikipedia articles from the JSON dump.
with open("data/wikipedia_corpus.json", mode="r", encoding="utf-8") as f:
    wiki_articles = json.loads(f.read())

# Collect the text and the title of every article, in the same order.
documents, titles = [], []
for article in wiki_articles.values():
    documents.append(article["content"])
    titles.append(article["title"])

# Vectorize the raw texts; `vocab` is used below as a word -> id mapping.
corpus, vocab = vectorize_corpus(documents)

# Vocabulary size.
V = len(vocab)

In [55]:
def log_dirichlet_expectation(alpha):
    """Return E[log X] for X ~ Dirichlet(alpha), computed elementwise.

    Uses the identity E[log X_i] = digamma(alpha_i) - digamma(sum_j alpha_j).

    Parameters
    ----------
    alpha : array_like
        Dirichlet parameters. The last axis indexes the components of one
        distribution; any leading axes are treated as a batch, so a 1-D
        vector is a single distribution and a 2-D matrix holds one
        distribution per row.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``alpha``.
    """
    alpha = np.asarray(alpha)
    # keepdims lets the per-distribution total broadcast back over the last
    # axis, which covers the 1-D and 2-D cases (and higher ranks) uniformly.
    return digamma(alpha) - digamma(np.sum(alpha, axis=-1, keepdims=True))

In [56]:
def SVI_for_LDA(documents, V, K=10, alpha=1.0, eta=1.0, max_iter=100, tau=64, kappa=0.7,
                e_step_iter=20, verbose=True, doc_labels=None, seed=None):
    """Fit LDA by stochastic variational inference, one sampled document per step.

    Every iteration draws one document uniformly at random, runs a local
    E-step on it (updating ``phi`` and ``gamma``), builds the intermediate
    topic-word parameter ``eta + D * <document sufficient statistics>`` and
    blends it into ``lambd`` with step size ``rho = (t + tau) ** -kappa``.

    Parameters
    ----------
    documents : sequence
        Corpus; each document is a sequence of integer word ids, used as
        column indices into the ``(K, V)`` topic-word matrix.
    V : int
        Vocabulary size (number of columns of ``lambd``).
    K : int
        Number of topics.
    alpha : float
        Added to the summed ``phi`` when updating ``gamma`` (document-topic prior).
    eta : float
        Added to the scaled sufficient statistics (topic-word prior).
    max_iter : int
        Number of stochastic steps, i.e. number of sampled documents.
    tau, kappa : float
        Step-size schedule parameters. ``tau`` should be positive, because the
        first step evaluates ``(0 + tau) ** -kappa``.
    e_step_iter : int
        Number of local E-step iterations per sampled document.
    verbose : bool
        If true, print the duration of every iteration.
    doc_labels : sequence, optional
        Key used in the returned dictionary for each document, aligned with
        ``documents``. If omitted, a module-level ``titles`` sequence is used
        when one exists (the historical behaviour of this notebook);
        otherwise the integer document index is used.
    seed : int, optional
        Seed for a private ``numpy.random.RandomState``. If omitted, the
        global NumPy random state is used, exactly as before.

    Returns
    -------
    lambd : numpy.ndarray
        ``(K, V)`` variational parameters of the topic-word distributions.
    doc_topic_distrib : dict
        Maps the label of each document that was sampled at least once to its
        normalized ``gamma``; a document sampled again overwrites its entry.

    Raises
    ------
    ValueError
        If ``documents`` is empty or ``doc_labels`` has a different length.
    """
    D = len(documents)
    if D == 0:
        raise ValueError("documents must contain at least one document")

    if doc_labels is None:
        # Backward compatibility: earlier versions read the notebook-level
        # `titles` list implicitly. Prefer passing `doc_labels` explicitly.
        doc_labels = globals().get("titles")
        if doc_labels is None:
            doc_labels = range(D)
    elif len(doc_labels) != D:
        raise ValueError("doc_labels must have one entry per document")

    # RandomState exposes the same gamma/randint API as the np.random module,
    # so the unseeded path keeps drawing from the global stream.
    rng = np.random if seed is None else np.random.RandomState(seed)
    lambd = rng.gamma(100., 1./100., size=(K, V))

    doc_topic_distrib = {}

    for t in range(max_iter):
        start_time = time.perf_counter()

        doc_id = rng.randint(0, D)
        # Explicit integer dtype keeps an empty document usable as an index.
        word_ids = np.asarray(documents[doc_id], dtype=np.intp)
        N = len(word_ids)

        gamma = np.ones(K)
        # Only used as-is when e_step_iter == 0; otherwise overwritten below.
        phi = np.full(shape=(N, K), fill_value=1/K)

        E_logbeta = log_dirichlet_expectation(lambd)
        # (N, K): row n holds exp(E[log beta_{k, w_n}]) for every topic k.
        # Only the columns of the words present in the document are needed.
        doc_exp_E_logbeta = np.exp(E_logbeta[:, word_ids]).T

        # Local E-step: alternate the phi and gamma updates for this document.
        for _ in range(e_step_iter):
            exp_E_logtheta = np.exp(log_dirichlet_expectation(gamma))
            phi = exp_E_logtheta * doc_exp_E_logbeta
            phi /= phi.sum(axis=1, keepdims=True)
            gamma = alpha + phi.sum(axis=0)

        # Record the topic mixture once, after the local updates are finished.
        doc_topic_distrib[doc_labels[doc_id]] = gamma / np.sum(gamma)

        # Sufficient statistics: add.at is unbuffered, so a word id that occurs
        # several times in the document is accumulated every time.
        topic_word_contrib = np.zeros(shape=(K, V))
        np.add.at(topic_word_contrib.T, word_ids, phi)

        # Global step: treat the sampled document as if it were the whole corpus.
        lambd_hat = eta + D*topic_word_contrib

        rho = (t + tau) ** -kappa
        lambd = (1 - rho) * lambd + rho * lambd_hat

        iteration_time = time.perf_counter() - start_time
        if verbose:
            print(f"\t - Iteration {t} done in {iteration_time:.2f}s.")

    return lambd, doc_topic_distrib

In [57]:
# Fit the model on the vectorized corpus: 10 topics, 500 stochastic steps.
lambd, doc_topic_distrib = SVI_for_LDA(
    corpus, V,
    K=10, alpha=1.0, eta=1.0,       # number of topics and the two prior values
    max_iter=500, e_step_iter=30,   # outer steps / local E-step iterations
    tau=100, kappa=0.7,             # step-size schedule
)

	 - Iteration 0 done in 0.66s.
	 - Iteration 1 done in 0.46s.
	 - Iteration 2 done in 0.24s.
	 - Iteration 3 done in 0.18s.
	 - Iteration 4 done in 0.03s.
	 - Iteration 5 done in 1.39s.
	 - Iteration 6 done in 1.76s.
	 - Iteration 7 done in 0.04s.
	 - Iteration 8 done in 1.10s.
	 - Iteration 9 done in 0.76s.
	 - Iteration 10 done in 0.19s.
	 - Iteration 11 done in 0.22s.
	 - Iteration 12 done in 0.13s.
	 - Iteration 13 done in 0.25s.
	 - Iteration 14 done in 0.03s.
	 - Iteration 15 done in 0.67s.
	 - Iteration 16 done in 0.09s.
	 - Iteration 17 done in 0.95s.
	 - Iteration 18 done in 0.74s.
	 - Iteration 19 done in 0.50s.
	 - Iteration 20 done in 0.23s.
	 - Iteration 21 done in 0.06s.
	 - Iteration 22 done in 0.03s.
	 - Iteration 23 done in 0.45s.
	 - Iteration 24 done in 0.51s.
	 - Iteration 25 done in 0.13s.
	 - Iteration 26 done in 0.06s.
	 - Iteration 27 done in 0.02s.
	 - Iteration 28 done in 0.61s.
	 - Iteration 29 done in 1.59s.
	 - Iteration 30 done in 0.80s.
	 - Iteration 31 d

In [147]:
n_top_words = 20
# Reverse lookup (word id -> word) built from the word -> id vocabulary.
inv_vocab = dict(zip(vocab.values(), vocab.keys()))

# For every topic, keep the words with the largest lambda weights, best first.
dict_top_words = {}
for k, topic_weights in enumerate(lambd):
    ranked_ids = np.argsort(topic_weights)[::-1]
    dict_top_words[f"Topic {k}"] = [inv_vocab[i] for i in ranked_ids[:n_top_words]]

# One column per topic; the index is the 1-based rank of the word.
df_top_words = pd.DataFrame(dict_top_words)
df_top_words = df_top_words.set_axis(df_top_words.index + 1, axis=0).rename_axis("Rank")
df_top_words

Unnamed: 0_level_0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,culture,business,political,language,war,nature,sports,culture,lodge,internet
2,india,company,state,languages,military,life,women,media,grand,facebook
3,century,also,party,english,spanish,earth,sport,million,church,users
4,also,research,government,words,spain,science,team,people,first,data
5,period,canada,right,used,city,one,league,women,lodges,used
6,early,network,russian,also,gaza,would,men,news,century,explorer
7,modern,service,election,spoken,government,history,football,new,god,use
8,art,billion,president,word,led,philosophy,female,also,one,windows
9,history,united,national,speakers,became,natural,world,would,edinburgh,content
10,western,based,parliament,use,first,first,games,facebook,freemasonry,user


In [185]:
# TODO: add the word-cloud visualisation of the topics here.

In [59]:
# One record per article that was sampled during training: its seed category,
# its dominant topic, and its topic mixture expressed in percent.
rows = []
for title, theta in doc_topic_distrib.items():
    topic_percentages = {
        f'Topic {i} prob': np.round(val*100, 2) for i, val in enumerate(theta)
    }
    rows.append({
        'Article': title,
        'Seed': wiki_articles[title]["seed"],
        'Top topic': np.argmax(theta),
        **topic_percentages,
    })

df_results = pd.DataFrame(rows)

In [75]:
# Display the per-article results table (seed, dominant topic, topic percentages).
df_results

Unnamed: 0,Article,Seed,Top topic,Topic 0 prob,Topic 1 prob,Topic 2 prob,Topic 3 prob,Topic 4 prob,Topic 5 prob,Topic 6 prob,Topic 7 prob,Topic 8 prob,Topic 9 prob
0,Anaximander,nature,5,10.56,8.15,9.97,5.92,10.17,17.24,10.93,8.80,9.87,8.39
1,Language family,language,3,6.40,13.74,5.41,27.67,7.37,1.17,8.26,9.29,6.75,13.96
2,Politics of Edinburgh,politics,8,0.21,4.95,45.74,0.18,1.29,0.16,0.22,0.17,46.89,0.20
3,Culture of Mali,culture,0,63.38,0.59,0.32,0.48,0.39,0.35,9.76,18.44,3.88,2.42
4,Nature Methods,nature,5,1.79,4.03,2.45,2.13,1.83,76.29,3.03,2.90,2.32,3.22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,Russian language,language,3,2.87,1.29,17.59,54.88,5.64,0.15,0.23,2.71,13.43,1.21
400,Chechen language,language,3,6.97,0.74,4.06,81.09,2.88,1.03,0.16,0.51,2.00,0.58
401,Culture of Japan,culture,0,48.68,3.59,0.14,9.33,15.24,6.56,3.71,11.60,0.18,0.95
402,Internet Explorer,internet,9,0.29,3.14,0.73,0.64,3.54,0.27,0.84,4.00,0.24,86.31


In [60]:
# Number of articles per dominant topic, ordered by topic id.
topic_counts = (
    df_results['Top topic']
    .value_counts()
    .sort_index()
    .reset_index()
    .set_axis(['Topic', 'Count'], axis=1)
)
# Turn the numeric topic id into a readable label for the legend.
topic_counts['Topic'] = topic_counts['Topic'].map('Topic {}'.format)

# Donut chart of the dominant-topic shares.
fig = px.pie(
    data_frame=topic_counts,
    values='Count',
    names='Topic',
    hole=0.4,
    title='Répartition des topics dominants dans le corpus',
    color_discrete_sequence=px.colors.qualitative.Safe,
)

fig.show()

In [74]:
# Number of result rows per seed category, most frequent first.
df_results["Seed"].value_counts()

Seed
history     58
language    57
culture     57
nature      50
internet    50
politics    48
business    48
sports      36
Name: count, dtype: int64

In [None]:
# Contingency table: rows are seed categories, columns are dominant topics,
# cells count the articles; combinations that never occur are filled with 0.
df_pivot = (
    df_results
    .pivot_table(index="Seed", columns="Top topic", aggfunc="size", fill_value=0)
    .pipe(lambda counts: counts.set_axis([f"Topic {c}" for c in counts.columns], axis=1))
)
df_pivot

Unnamed: 0_level_0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
Seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
business,0,32,2,0,1,0,2,9,1,1
culture,24,5,0,0,4,6,0,10,3,5
history,13,1,2,1,17,8,3,7,4,2
internet,0,9,0,0,1,0,0,6,0,34
language,4,0,0,43,1,0,0,1,2,6
nature,0,3,0,0,2,30,0,11,4,0
politics,0,0,35,0,4,0,1,3,3,2
sports,0,5,0,0,0,1,26,4,0,0


In [None]:
# Normalize every seed row to proportions, then expose "Seed" as a column again.
df_percent = df_pivot.div(df_pivot.sum(axis=1), axis=0).reset_index()

# Long format: one (Seed, Topic, Proportion) row per bar segment.
df_melted = pd.melt(df_percent, id_vars='Seed', var_name='Topic', value_name='Proportion')

# Stacked bars: one bar per seed, split by dominant topic.
fig = px.bar(
    data_frame=df_melted,
    x="Seed",
    y="Proportion",
    color="Topic",
    barmode="stack",
    title="Répartition des topics par seed (proportions)",
    color_discrete_sequence=px.colors.qualitative.Prism,
).update_layout(
    template="plotly_white",
    xaxis_title="Seed",
    yaxis_title="Proportion",
    yaxis_tickformat=".0%",
    legend_title="Topic",
)

fig.show()

In [113]:
# Display the per-seed proportions of dominant topics (each row sums to 1).
df_percent

Unnamed: 0,Seed,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
0,business,0.0,0.666667,0.041667,0.0,0.020833,0.0,0.041667,0.1875,0.020833,0.020833
1,culture,0.421053,0.087719,0.0,0.0,0.070175,0.105263,0.0,0.175439,0.052632,0.087719
2,history,0.224138,0.017241,0.034483,0.017241,0.293103,0.137931,0.051724,0.12069,0.068966,0.034483
3,internet,0.0,0.18,0.0,0.0,0.02,0.0,0.0,0.12,0.0,0.68
4,language,0.070175,0.0,0.0,0.754386,0.017544,0.0,0.0,0.017544,0.035088,0.105263
5,nature,0.0,0.06,0.0,0.0,0.04,0.6,0.0,0.22,0.08,0.0
6,politics,0.0,0.0,0.729167,0.0,0.083333,0.0,0.020833,0.0625,0.0625,0.041667
7,sports,0.0,0.138889,0.0,0.0,0.0,0.027778,0.722222,0.111111,0.0,0.0
