# XHEC - Session 5-1


-----------------

## Topic Extraction : LDA Implementation  

In this session we will build an LDA from scratch

### Import libraries 

In [28]:
import os
import pandas as pd
import numpy as np
import itertools
import random

### Create documents 

In [29]:
rawdocs = ['eat turkey on turkey day holiday',
           'i like to eat cake on holiday',
           'turkey trot race on thanksgiving holiday',
           'snail race the turtle',
           'time travel space race',
           'movie on thanksgiving',
           'movie at air and space museum is cool movie',
           'aspiring movie star']

rawdocs = list(map(lambda x: x.split(), rawdocs)) #Split by whitespace

In [30]:
rawdocs

[['eat', 'turkey', 'on', 'turkey', 'day', 'holiday'],
 ['i', 'like', 'to', 'eat', 'cake', 'on', 'holiday'],
 ['turkey', 'trot', 'race', 'on', 'thanksgiving', 'holiday'],
 ['snail', 'race', 'the', 'turtle'],
 ['time', 'travel', 'space', 'race'],
 ['movie', 'on', 'thanksgiving'],
 ['movie', 'at', 'air', 'and', 'space', 'museum', 'is', 'cool', 'movie'],
 ['aspiring', 'movie', 'star']]

### Set parameters

In [31]:
K = 2 #Number of topic
alpha = 0.1 #Hyperparameter alpha
eta = 0.1 #Hyperparameter eta
iterationNb = 3 #Number of iterations

### Convert to a numerical problem 

In [32]:
#Create a dictionnary {id: word}
vocab = np.unique(list(itertools.chain.from_iterable(rawdocs)))
vocab = {k: v for v, k in enumerate(vocab)}

In [6]:
vocab

{'air': 0,
 'and': 1,
 'aspiring': 2,
 'at': 3,
 'cake': 4,
 'cool': 5,
 'day': 6,
 'eat': 7,
 'holiday': 8,
 'i': 9,
 'is': 10,
 'like': 11,
 'movie': 12,
 'museum': 13,
 'on': 14,
 'race': 15,
 'snail': 16,
 'space': 17,
 'star': 18,
 'thanksgiving': 19,
 'the': 20,
 'time': 21,
 'to': 22,
 'travel': 23,
 'trot': 24,
 'turkey': 25,
 'turtle': 26}

In [33]:
#Swap word for id in each document
document = [list(map(lambda x: vocab[x], doc)) for doc in rawdocs]

In [34]:
document

[[7, 25, 14, 25, 6, 8],
 [9, 11, 22, 7, 4, 14, 8],
 [25, 24, 15, 14, 19, 8],
 [16, 15, 20, 26],
 [21, 23, 17, 15],
 [12, 14, 19],
 [12, 3, 0, 1, 17, 13, 10, 5, 12],
 [2, 12, 18]]

### Create the topic-word matrix

In [35]:
def initialiseWordTopicMatrix(vocab, document, K):
    #Initialise the word-topic count matrix
    TopicWordMatrix = np.zeros((K, len(vocab)))
    #Randomly assign topic for each word in each document
    topicAssignmentList = [[random.randint(0,K-1) for i in range(len(doc))] for doc in document]

    for iDoc, doc in enumerate(document): #For all document
        for iToken, wordId in enumerate(doc): #For all token
            #Find the topic of the given token
            tokenTopic =  topicAssignmentList[iDoc][iToken]
            #Update the wordTopicMatrix
            TopicWordMatrix[tokenTopic][wordId] += 1
    return TopicWordMatrix, topicAssignmentList

In [36]:
TopicWordMatrix, topicAssignmentList = initialiseWordTopicMatrix(vocab, document, K)

In [37]:
TopicWordMatrix, TopicWordMatrix.shape  

(array([[0., 1., 1., 1., 1., 0., 1., 2., 1., 0., 1., 1., 2., 1., 1., 2.,
         1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [1., 0., 0., 0., 0., 1., 0., 0., 2., 1., 0., 0., 2., 0., 3., 1.,
         0., 2., 1., 2., 1., 0., 0., 0., 0., 2., 0.]]),
 (2, 27))

In [38]:
topicAssignmentList

[[0, 1, 0, 1, 0, 0],
 [1, 0, 0, 0, 0, 1, 1],
 [0, 0, 1, 1, 1, 1],
 [0, 0, 1, 0],
 [0, 0, 1, 0],
 [1, 1, 1],
 [0, 0, 1, 0, 1, 0, 0, 1, 1],
 [0, 0, 1]]

### Create the document-topic matrix 

In [39]:
def initialiseDocumentTopicMatrix(topicAssignmentList, document):
    documentTopicMatrix = np.zeros((len(document), K))
    for iDoc in range(len(document)):
        for iTopic in range(K):
            #Update document matrix topic according to topicAssignmentList
            documentTopicMatrix[iDoc][iTopic] = topicAssignmentList[iDoc].count(iTopic)
    return documentTopicMatrix

In [40]:
documentTopicMatrix = initialiseDocumentTopicMatrix(topicAssignmentList, document)

In [41]:
documentTopicMatrix

array([[4., 2.],
       [4., 3.],
       [2., 4.],
       [3., 1.],
       [3., 1.],
       [0., 3.],
       [5., 4.],
       [2., 1.]])

### LDA iterations 

In [42]:
def ldaModel(K, alpha, eta, iterationNb, document, vocab, TopicWordMatrix, topicAssignmentList, documentTopicMatrix):
    #For each iteration
    for i in range(iterationNb):
        #For each document
        for iDoc, doc in enumerate(document):
            #For each word in the document
            for iToken, wordId in enumerate(doc):
                #Initial topic for the token
                oldTopic = topicAssignmentList[iDoc][iToken]

                #Focus of the i-th Token - decrement in the matrices
                documentTopicMatrix[iDoc][oldTopic] -= 1
                TopicWordMatrix[oldTopic][wordId] -= 1
                
                #Gibbs-Sampling
                weight = []
                for iTopic in range(K):
                    #A term
                    num_a = topicAssignmentList[iDoc].count(iTopic)+alpha
                    denom_a = len(vocab)-1+alpha
                    #B term
                    num_b = TopicWordMatrix[iTopic][wordId] + eta
                    denom_b = TopicWordMatrix.sum(axis=0)[K]+eta
                    #Proba
                    weight.append((num_a/denom_a)*(num_b/denom_b))
                
                #Draw topic - multinomial distribution
                newTopic = random.choices(range(K), weights = weight, k = 1)[0]
                #Re-assign topic
                documentTopicMatrix[iDoc][newTopic] += 1
                TopicWordMatrix[newTopic][wordId] += 1
                topicAssignmentList[iDoc][iToken] = newTopic
    #Normalize matrix
    documentTopicMatrix = ((documentTopicMatrix+alpha).T/(documentTopicMatrix+alpha).sum(axis=1)).T
    TopicWordMatrix = ((TopicWordMatrix+alpha).T/(TopicWordMatrix+alpha).sum(axis=1)).T
    return documentTopicMatrix, TopicWordMatrix, topicAssignmentList


In [43]:
documentTopicMatrixUpdate, TopicWordMatrixUpdate, topicAssignmentListUpdate = ldaModel(K, alpha, eta, iterationNb, document, vocab, TopicWordMatrix, topicAssignmentList, documentTopicMatrix)


In [44]:
documentTopicMatrixUpdate, documentTopicMatrixUpdate.shape  

(array([[0.17741935, 0.82258065],
        [0.01388889, 0.98611111],
        [0.01612903, 0.98387097],
        [0.97619048, 0.02380952],
        [0.02380952, 0.97619048],
        [0.34375   , 0.65625   ],
        [0.88043478, 0.11956522],
        [0.03125   , 0.96875   ]]),
 (8, 2))

In [45]:
TopicWordMatrixUpdate, TopicWordMatrixUpdate.shape 

(array([[0.06586826, 0.06586826, 0.00598802, 0.06586826, 0.00598802,
         0.06586826, 0.00598802, 0.06586826, 0.00598802, 0.00598802,
         0.06586826, 0.00598802, 0.18562874, 0.00598802, 0.00598802,
         0.06586826, 0.06586826, 0.06586826, 0.00598802, 0.00598802,
         0.06586826, 0.00598802, 0.00598802, 0.00598802, 0.00598802,
         0.00598802, 0.06586826],
        [0.00325733, 0.00325733, 0.03583062, 0.00325733, 0.03583062,
         0.00325733, 0.03583062, 0.03583062, 0.1009772 , 0.03583062,
         0.00325733, 0.03583062, 0.03583062, 0.03583062, 0.13355049,
         0.06840391, 0.00325733, 0.03583062, 0.03583062, 0.06840391,
         0.00325733, 0.03583062, 0.03583062, 0.03583062, 0.03583062,
         0.1009772 , 0.00325733]]),
 (2, 27))

In [46]:
topicAssignmentListUpdate

[[0, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1],
 [0, 0, 0, 0],
 [1, 1, 1, 1],
 [0, 1, 1],
 [0, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 1, 1]]

### Show topic 

In [47]:
def displayTopic(TopicWordMatrixUpdate, vocab, nb_word):
    vocab = {v: k for k, v in vocab.items()} #Swap id and value to have a dict {id: "word"}
    for topicNb, wordPerTopic in enumerate(TopicWordMatrixUpdate):
        print(f"\n>>> Topic {topicNb}")
        TopicWordMatrixSeries = pd.Series(wordPerTopic).sort_values(ascending=False) 
        wordIds = TopicWordMatrixSeries.index
        topicToString = []
        for i in range(nb_word):
            topicToString.append(f"{vocab[wordIds[i]]}*{round(TopicWordMatrixSeries[wordIds[i]],2)}")
        print('+'.join(topicToString))

In [48]:
displayTopic(TopicWordMatrixUpdate, vocab, 6)


>>> Topic 0
movie*0.19+air*0.07+is*0.07+the*0.07+space*0.07+snail*0.07

>>> Topic 1
on*0.13+turkey*0.1+holiday*0.1+thanksgiving*0.07+race*0.07+like*0.04
