In [94]:
from lda import guidedlda as glda
import numpy as np
import pandas as pd
import re

## Read in Lemmatized Reviews

In [95]:
number = 5933
lemmatized_data = pd.read_csv(f'../data/reviews/lemmatized_reviews/lemmatized_reviews_{number}.csv', index_col=0)
lemmatized_data

Unnamed: 0,tconst,username,rating,review
0,tt0000574,David-240,10/10,movie be believe to be the world first feature...
1,tt0000574,F Gwynplaine MacIntyre,10/10,this afternoon at the barbican i attend the uk...
2,tt0000574,ackstasis,9/10,movie be undoubtedly one of the cinema most si...
3,tt0000574,Ziggy5446,10/10,movie symbolizes both the birth of the austral...
4,tt0000574,Fella_shibby,8/10,this be the original n the first account of ne...
...,...,...,...,...
11743,tt0018621,JohnHowardReid,8/10,paramount groom superbeautiful actress a a rep...
11744,tt0018621,kidboots,8/10,actress seem to be in every other movie during...
11745,tt0018621,F Gwynplaine MacIntyre,7/10,actress be an attractive and talented actress ...
11746,tt0018638,cliffperriam,8/10,movie exists a a reel silent french mm release...


## Create Fitted Guided LDA Model

In this section, I will be creating a Guided LDA model fitted to the lemmatized reviews.

To do this, I have prepared some basic seed words which will then get expanded with similar/correlated words
by running GLDA on some guide documents. As the guide documents are already split into the appropriate topics,
we can expand the seed words list in this manner with some confidence.

An additional GLDA model will then be created, fitted on the entire corpus of reviews. The expanded topic word
list will be used as the seed words.

In [96]:
from sklearn.feature_extraction.text import CountVectorizer

def get_vocab(cv):
    """
    Retrieve the word2id mapping and vocabulary list.

    Parameters:
    cv (CountVectorizer): sparse matrix representation of word tokens extracted from some corpus of documents.

    Returns:
    word2id: A mapping of terms to feature indices.
    vocab: A list of all unique vocabulary.

   """
    word2id = cv.vocabulary_
    vocab = cv.get_feature_names()
    return word2id, vocab

def fitted_glda_model(X, guide_words, word2id, seed_confidence=1):
    """
    Create a fitted Guided LDA model.

    Parameters:
    X (sparse matrix): document-term matrix
    guide_words (list): 2d array of seed words per topic
    word2id (dict): mapping of terms to feature indices
    seed_confidence (float): a float from [0,1] that enforces a bias toward the seed words. with a seed_confidence of 0.1 you can bias the seeded words by 10% more

    Returns:
    A fitted GLDA model

   """
    model = glda.GuidedLDA(n_topics=len(guide_words), n_iter=100)

    seed_topics = {}
    for t_id, st in enumerate(guide_words):
        for word in st:
            seed_topics[word2id[word]] = t_id

    model.fit(X, seed_topics=seed_topics, seed_confidence=seed_confidence)

    return model

def get_topic_labels(files):
    """
    Gives a list of topic labels given file names of guide documents.
    Assumes filenames are written in the form '<path>\\<topic_label>.txt'

    Parameters:
    files (list): list of filepaths

    Returns:
    list of topic labels

   """
    return [re.findall('\\\\([a-z_]+)', file)[0].replace('_', ' ') for file in files]

def get_guide_words(model, vocab, n, topics):
    """
    Displays and returns top n words of each topic of a fitted GLDA model.

    Parameters:
    model (GuidedLDA): GLDA model
    vocab (list): list of unique vocabulary in corpus
    n (int): number of top words to return
    topics (list): list of topic labels

    Returns:
    list of top n guide words per topic

   """
    guide_words = []
    for i, topic_dist in enumerate(model.topic_word_):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n+1):-1]
        guide_words.append(topic_words)
        print(f'Topic {topics[i]}:\n{" ".join(topic_words)}\n')

    return guide_words

### Retrieve Seed Topics

In [97]:
import glob

filenames = [file for file in glob.glob("../data/topics/lemm_topics/*.txt")]
topic_labels = get_topic_labels(filenames)

guide_cv = CountVectorizer(input='filenames', stop_words='english', max_df=0.9)
# create document term matrix over all guide documents
guide_X = guide_cv.fit_transform(filenames)

# create vocabulary over all guide documents
guide_word2id, guide_vocab = get_vocab(guide_cv)

# retrieve (manual) basic guide words
with open('../data/topics/lemm_guide_words.txt', encoding='utf-8') as f:
    basic_guide_words = [[t for t in topics.split() if t in guide_vocab] for topics in f.read().splitlines()]

# retrieve fitted glda model
guide_model = fitted_glda_model(guide_X, basic_guide_words, guide_word2id)

# retrieve guide words extracted from all guide documents
seed_guide_words = get_guide_words(guide_model, guide_vocab, 10, topic_labels)

INFO:lda:n_documents: 11
INFO:lda:vocab_size: 11
INFO:lda:n_words: 11
INFO:lda:n_topics: 10
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -54
INFO:lda:<10> log likelihood: -56
INFO:lda:<20> log likelihood: -54
INFO:lda:<30> log likelihood: -56
INFO:lda:<40> log likelihood: -61
INFO:lda:<50> log likelihood: -56
INFO:lda:<60> log likelihood: -54
INFO:lda:<70> log likelihood: -54
INFO:lda:<80> log likelihood: -54
INFO:lda:<90> log likelihood: -57
INFO:lda:<99> log likelihood: -56


Topic acting:
sound_music theme_lemm theme plot it_factor editing_effects directing dialogue cinematography attraction

Topic attraction:
acting theme_lemm theme sound_music plot it_factor editing_effects directing dialogue cinematography

Topic cinematography:
plot theme_lemm theme sound_music it_factor editing_effects directing dialogue cinematography attraction

Topic dialogue:
attraction theme_lemm theme sound_music plot it_factor editing_effects directing dialogue cinematography

Topic directing:
theme_lemm theme sound_music plot it_factor editing_effects directing dialogue cinematography attraction

Topic editing effects:
editing_effects theme_lemm theme sound_music plot it_factor directing dialogue cinematography attraction

Topic it factor:
directing dialogue theme_lemm theme sound_music plot it_factor editing_effects cinematography attraction

Topic plot:
theme theme_lemm sound_music plot it_factor editing_effects directing dialogue cinematography attraction

Topic sound music

In [98]:
guide_vocab

['acting',
 'attraction',
 'cinematography',
 'dialogue',
 'directing',
 'editing_effects',
 'it_factor',
 'plot',
 'sound_music',
 'theme',
 'theme_lemm']

### Generate Corpus Vocabulary and Seed Topics

In [None]:
glda_cv = CountVectorizer(stop_words='english', min_df=30, max_df=0.7)
# create document term matrix over all guide documents
glda_X = guide_cv.fit_transform(lemmatized_data.review)

glda_word2id, glda_vocab = get_vocab(glda_cv)

glda_guide_words = [[t for t in topics if t in glda_vocab] for topics in seed_guide_words]

### Fit Model and Display Top Topic Words

In [101]:
# TODO: cv
glda_model = fitted_glda_model(glda_X, glda_guide_words, glda_word2id, seed_confidence=0.8)
top_words = get_guide_words(glda_model, glda_vocab, 15, topic_labels)

import pickle
with open('../pickles/glda.pickle', 'wb') as f:
    pickle.dump([glda_model, topic_labels], f)

INFO:lda:n_documents: 11748
INFO:lda:vocab_size: 73608
INFO:lda:n_words: 1591076
INFO:lda:n_topics: 10
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -17841553
INFO:lda:<10> log likelihood: -15532555
INFO:lda:<20> log likelihood: -14927697
INFO:lda:<30> log likelihood: -14696768
INFO:lda:<40> log likelihood: -14565792
INFO:lda:<50> log likelihood: -14475953
INFO:lda:<60> log likelihood: -14405670
INFO:lda:<70> log likelihood: -14352188
INFO:lda:<80> log likelihood: -14308258
INFO:lda:<90> log likelihood: -14275873
INFO:lda:<99> log likelihood: -14243286


IndexError: index 46300 is out of bounds for axis 0 with size 5242