# XHEC - Session 5-1


-----------------

## Topic Extraction : LDA Implementation  

In this session we will build an LDA from scratch

### Import libraries 

In [4]:
import os
import pandas as pd
import numpy as np
import itertools
import random
from nltk import word_tokenize

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Create documents 

In [5]:
rawdocs = ['eat turkey on turkey day holiday',
           'i like to eat cake on holiday',
           'turkey trot race on thanksgiving holiday',
           'snail race the turtle',
           'time travel space race',
           'movie on thanksgiving',
           'movie at air and space museum is cool movie',
           'aspiring movie star']

rawdocs = [word_tokenize(sentence) for sentence in rawdocs]
rawdocs = [[word for word in sentence] for sentence in rawdocs] # TO DO : Create a list of list of words in each sentence (Split by whitespace)

In [6]:
rawdocs

[['eat', 'turkey', 'on', 'turkey', 'day', 'holiday'],
 ['i', 'like', 'to', 'eat', 'cake', 'on', 'holiday'],
 ['turkey', 'trot', 'race', 'on', 'thanksgiving', 'holiday'],
 ['snail', 'race', 'the', 'turtle'],
 ['time', 'travel', 'space', 'race'],
 ['movie', 'on', 'thanksgiving'],
 ['movie', 'at', 'air', 'and', 'space', 'museum', 'is', 'cool', 'movie'],
 ['aspiring', 'movie', 'star']]

### Set parameters

In [None]:
K = 2 #Number of topic
alpha = 0.1 #Hyperparameter alpha
eta = 0.1 #Hyperparameter eta
iterationNb = 3 #Number of iterations

### Convert to a numerical problem 

In [None]:
# TO DO: Create a dictionnary {id1: word1, id2: word2}
vocab = {}

In [None]:
vocab

{}

In [None]:
#TO DO: Swap word for id in each document
document = []

In [None]:
document

[]

### Create the topic-word matrix

wordTopicMatrix:
- each line is related to a word
- each column is related to a Topic
- The cell (w,t) is related to the number of time that the word w has been assigned to the topic t
   
In order to create wordTopicMatrix we need to assign topic for each word (topicAssignmentList)
 
topicAssignmentList has the same length as document and we randomly assign a topic to each of the word. So the (i, j) element of topicAssignmentList correspond to the topic assign to the j-th word of the i-th document 

In [None]:
def initialiseWordTopicMatrix(vocab, document, K):
    #TO DO: Initialise the topic-word count matrix with 0 - shape (numberOftopic, numberOfword)
    TopicWordMatrix = []
    #Randomly assign topic for each word in each document
    topicAssignmentList = [[random.randint(0,K-1) for i in range(len(doc))] for doc in document]

    for iDoc, doc in enumerate(document): #For all document
        for iToken, wordId in enumerate(doc): #For all token
            #TO DO : Find the topic of the given token
            #TO DO : Update the wordTopicMatrix
    return TopicWordMatrix, topicAssignmentList

In [None]:
TopicWordMatrix, topicAssignmentList = initialiseWordTopicMatrix(vocab, document, K)

In [None]:
TopicWordMatrix

In [None]:
topicAssignmentList

### Create the document-topic matrix 

DocumentTopicMatrix:

- each line is related to a document
- each column is related to a topic
- cell (d, t) is related to the number of word in the document d that hs been assigned to the topic t

In [None]:
def initialiseDocumentTopicMatrix(topicAssignmentList, document):
    #TO DO: Initialise document topic matrix with 0 - shape (number of document, number of topic)
    documentTopicMatrix = []
    for iDoc in range(len(document)):
        for iTopic in range(K):
            #TO DO : Update document matrix topic according to topicAssignmentList
    return documentTopicMatrix

In [None]:
documentTopicMatrix = initialiseDocumentTopicMatrix(topicAssignmentList, document)

In [None]:
documentTopicMatrix

### LDA iterations 

<center><img src="https://drive.google.com/uc?export=view&id=1rREuNnVIZIpPIdvC37hFEWcER6puzmYa"/></center>

- Focus each iteration:
    - For each document:
        - For each word:
            
            - Focus on the current word w -> Find the topic assign to the word w in topicAssignmentList 
            - Now we want to forget the topic initialy assigned to the word w in order to assign a better topic 
                - decrement documentTopicMatrix and TopicWordMatrix 
                - ex: For the i-th word of the j-th document, if the word was assigned to the topic 1, we'll decrement the (j, 1) element of documentTopicMatrix and the (1, word_id) element of TopicWordMatrix because the word is not assigned to the first topic anymore
            - Implement Gibbs sampling algorithm: find the probabilites that the word w will be assign to each topic
            - Build the multinomial law with the previously found probability and simulate it with a weighted random
            - Re-assign the word w to the new topic
                - increment documentTopicMatrix and TopicWordMatrix

In [None]:
def ldaModel(K, alpha, eta, iterationNb, document, vocab, TopicWordMatrix, topicAssignmentList, documentTopicMatrix):
    #For each iteration
        #For each document
            #For each word in the document
                #TO DO: Find the initial topic for the token

                #TO DO: Focus of the i-th Token - decrement the documentTopicMatrix and TopicWordMatrix
                
                #Gibbs-Sampling - For each topic
                    #TO DO: Find the probability
                
                #TO DO: Simulate the multinomial law to find the new topic
                
                #TO DO: Re-assign topic
    #Normalize matrix
    documentTopicMatrix = ((documentTopicMatrix+alpha).T/(documentTopicMatrix+alpha).sum(axis=1)).T
    TopicWordMatrix = ((TopicWordMatrix+alpha).T/(TopicWordMatrix+alpha).sum(axis=1)).T
    return documentTopicMatrix, TopicWordMatrix, topicAssignmentList


In [None]:
documentTopicMatrixUpdate, TopicWordMatrixUpdate, topicAssignmentListUpdate = ldaModel(K, alpha, eta, iterationNb, document, vocab, TopicWordMatrix, topicAssignmentList, documentTopicMatrix)


In [None]:
documentTopicMatrixUpdate

In [None]:
TopicWordMatrixUpdate

In [None]:
topicAssignmentListUpdate

### Show topic 

In [None]:
#Find the most representative word for a topic 
#Form: "word1"*coeff1 + "word2"*coeff2+... 

def displayTopic(TopicWordMatrixUpdate, vocab, nb_word):
    vocab = {v: k for k, v in vocab.items()} #Swap id and value to have a dict {id: "word"}
    for topicNb, wordPerTopic in enumerate(TopicWordMatrixUpdate):
        print(f"\n>>> Topic {topicNb}")
        TopicWordMatrixSeries = pd.Series(wordPerTopic).sort_values(ascending=False) 
        wordIds = TopicWordMatrixSeries.index
        topicToString = []
        for i in range(nb_word):
            topicToString.append(f"{vocab[wordIds[i]]}*{round(TopicWordMatrixSeries[wordIds[i]],2)}")
        print('+'.join(topicToString))

In [None]:
displayTopic(TopicWordMatrixUpdate, vocab, 6)