In [50]:
from lda import guidedlda as glda
import numpy as np
import pandas as pd
import re

## Read in Lemmatized Reviews

In [51]:
# Identifier embedded in the lemmatized-reviews file name.
number = 5933
reviews_csv = f'../data/reviews/lemmatized_reviews/lemmatized_reviews_{number}.csv'

# The first CSV column is used as the DataFrame index.
lemmatized_data = pd.read_csv(reviews_csv, index_col=0)
lemmatized_data

Unnamed: 0,tconst,username,rating,review
0,tt0000574,David-240,10/10,movie be believe to be the world first feature...
1,tt0000574,F Gwynplaine MacIntyre,10/10,this afternoon at the barbican i attend the uk...
2,tt0000574,ackstasis,9/10,movie be undoubtedly one of the cinema most si...
3,tt0000574,Ziggy5446,10/10,movie symbolizes both the birth of the austral...
4,tt0000574,Fella_shibby,8/10,this be the original n the first account of ne...
...,...,...,...,...
11743,tt0018621,JohnHowardReid,8/10,paramount groom superbeautiful actress a a rep...
11744,tt0018621,kidboots,8/10,actress seem to be in every other movie during...
11745,tt0018621,F Gwynplaine MacIntyre,7/10,actress be an attractive and talented actress ...
11746,tt0018638,cliffperriam,8/10,movie exists a a reel silent french mm release...


## Create Fitted Guided LDA Model

In [52]:
from sklearn.feature_extraction.text import CountVectorizer

def create_doc_term_matrix(data, inp='content', min_df=1, max_df=1.0):
    """Fit a CountVectorizer on ``data`` and return it with the resulting matrix.

    Parameters
    ----------
    data : iterable
        Documents handed to ``CountVectorizer.fit_transform``; how each item
        is interpreted is controlled by ``inp``.
    inp : str
        Forwarded as the vectorizer's ``input`` argument.
    min_df, max_df
        Forwarded unchanged as the vectorizer's document-frequency bounds.

    Returns
    -------
    tuple
        ``(vectorizer, matrix)`` where ``vectorizer`` is the fitted
        CountVectorizer and ``matrix`` is what ``fit_transform`` returned.
    """
    vectorizer_options = {
        'input': inp,
        'stop_words': 'english',
        'min_df': min_df,
        'max_df': max_df,
    }
    vectorizer = CountVectorizer(**vectorizer_options)
    doc_term_matrix = vectorizer.fit_transform(data)
    return vectorizer, doc_term_matrix

def get_vocab(cv):
    """Return the word-to-column mapping and the ordered vocabulary of ``cv``.

    Parameters
    ----------
    cv
        A fitted vectorizer exposing ``vocabulary_`` and either
        ``get_feature_names_out()`` or the older ``get_feature_names()``.

    Returns
    -------
    tuple
        ``(word2id, vocab)``: ``word2id`` is ``cv.vocabulary_`` and ``vocab``
        is a list of the feature names in column order.
    """
    word2id = cv.vocabulary_
    # ``get_feature_names`` was removed from recent scikit-learn releases in
    # favour of ``get_feature_names_out``; prefer the new method and fall back
    # to the old one so the notebook runs on either version.
    if hasattr(cv, 'get_feature_names_out'):
        # The new method returns an array; keep returning a list as before.
        vocab = list(cv.get_feature_names_out())
    else:
        vocab = cv.get_feature_names()
    return word2id, vocab

def fitted_glda_model(X, guide_words, word2id, seed_confidence=1, n_iter=100, random_state=None):
    """Fit a GuidedLDA model with one topic per entry of ``guide_words``.

    Parameters
    ----------
    X
        Document-term matrix passed to ``model.fit``.
    guide_words : sequence of iterables of str
        ``guide_words[t]`` holds the seed words for topic ``t``; the number
        of topics equals ``len(guide_words)``.
    word2id : mapping
        Maps each seed word to its column id in ``X``.
    seed_confidence
        Forwarded unchanged to ``model.fit``.
    n_iter : int
        Number of sampling iterations given to the model (default 100).
    random_state
        Optional seed for reproducible fits. Only forwarded to the model
        constructor when it is not ``None``, so the default call constructs
        the model with ``n_topics`` and ``n_iter`` only.

    Returns
    -------
    The fitted model.

    Raises
    ------
    KeyError
        If a seed word is missing from ``word2id``.
    """
    model_kwargs = {'n_topics': len(guide_words), 'n_iter': n_iter}
    if random_state is not None:
        model_kwargs['random_state'] = random_state
    model = glda.GuidedLDA(**model_kwargs)

    # Map word id -> topic id. A word listed under several topics ends up
    # assigned to the last topic that lists it.
    seed_topics = {
        word2id[word]: topic_id
        for topic_id, topic_seeds in enumerate(guide_words)
        for word in topic_seeds
    }

    model.fit(X, seed_topics=seed_topics, seed_confidence=seed_confidence)

    return model

def _topic_label_from_path(path):
    """Return the file name of ``path`` without directories or extension.

    Both forward slashes and backslashes are treated as separators, so the
    label does not depend on which platform produced the path.
    """
    file_name = re.split(r'[\\/]', path)[-1]
    stem, _, _ = file_name.rpartition('.')
    return stem or file_name


def get_guide_words(model, vocab, n, topic_labels=None):
    """Print and return the ``n`` highest-weighted words of every topic.

    Parameters
    ----------
    model
        Fitted topic model exposing ``topic_word_`` (one row of word weights
        per topic).
    vocab : sequence of str
        Vocabulary aligned with the columns of ``model.topic_word_``.
    n : int
        Number of top words to keep per topic.
    topic_labels : sequence of str, optional
        Label printed for each topic. When omitted, labels are derived from
        the notebook-level ``filenames`` list if it exists (file name without
        extension). A topic with no available label is printed with its
        index instead.

    Returns
    -------
    list
        One array of the top ``n`` words per topic, highest weight first.
    """
    if topic_labels is None:
        # Backward compatible fallback to the notebook-level ``filenames``,
        # without failing when that global has not been defined yet.
        topic_labels = [
            _topic_label_from_path(path)
            for path in globals().get('filenames', [])
        ]

    # Convert once instead of on every loop iteration.
    vocab_arr = np.asarray(vocab)
    guide_words = []

    for i, topic_dist in enumerate(model.topic_word_):
        # argsort is ascending, so walk the last n entries backwards.
        topic_words = vocab_arr[np.argsort(topic_dist)][:-(n+1):-1]
        guide_words.append(topic_words)
        topic_label = topic_labels[i] if i < len(topic_labels) else str(i)
        print(f'Topic {topic_label}:\n{" ".join(topic_words)}\n')

    return guide_words

### Retrieve Seed Topics

In [55]:
import glob

# glob returns paths in an unspecified order; sort so that the position of
# each file (used later as filenames[i] to label topic i) is deterministic.
filenames = sorted(glob.glob("../data/topics/lemm_topics/*.txt"))

# create document term matrix over all guide documents
guide_cv, guideX = create_doc_term_matrix(filenames, inp='filename', max_df=0.9)

# create vocabulary over all guide documents
guide_word2id, guide_vocab = get_vocab(guide_cv)

# retrieve (manual) basic guide words: one whitespace-separated line per topic,
# keeping only words that are in the guide vocabulary (dict membership on
# guide_word2id avoids scanning the vocabulary list for every word)
with open('../data/topics/lemm_guide_words.txt', encoding='utf-8') as f:
    basic_guide_words = [
        [t for t in topics.split() if t in guide_word2id]
        for topics in f.read().splitlines()
    ]

# retrieve fitted glda model
guide_model = fitted_glda_model(guideX, basic_guide_words, guide_word2id)

# retrieve guide words extracted from all guide documents
seed_guide_words = get_guide_words(guide_model, guide_vocab, 10)

INFO:lda:n_documents: 11
INFO:lda:vocab_size: 571
INFO:lda:n_words: 1110
INFO:lda:n_topics: 10
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -11065
INFO:lda:<10> log likelihood: -8617
INFO:lda:<20> log likelihood: -8499
INFO:lda:<30> log likelihood: -8384
INFO:lda:<40> log likelihood: -8249
INFO:lda:<50> log likelihood: -8202
INFO:lda:<60> log likelihood: -8169
INFO:lda:<70> log likelihood: -8153
INFO:lda:<80> log likelihood: -8125
INFO:lda:<90> log likelihood: -8117
INFO:lda:<99> log likelihood: -8119


Topic acting:
character actor act performance tom people forrest hank run thats

Topic attraction:
arc set far structure watch interested base conform approach reason

Topic cinematography:
just set make language light cinematography element oneofakind act best

Topic dialogue:
film dialogue story help make time context use universe thing

Topic directing:
vision tell director way execute potential mean told creative onscreen

Topic editing_effects:
effect tone edit visual special seamlessly shark explosion building thriller

Topic it_factor:
premise value main entertainment entertain good instead iconic matter sequence

Topic plot:
story plot follow create character example work attention audience different

Topic sound_music:
sound music feel transcendent experience design authentic maybe aspect harmony

Topic theme:
theme intrigue gladiator able identify story form away special think



### Generate Corpus Vocabulary and Seed Topics

In [56]:
# Build the corpus document-term matrix and vocabulary from the review text.
corpus_reviews = lemmatized_data.review
cv, X = create_doc_term_matrix(corpus_reviews, min_df=30, max_df=0.7)
word2id, vocab = get_vocab(cv)

# Keep only the seed words that are present in the corpus vocabulary.
guide_words = [
    [word for word in topic_words if word in vocab]
    for topic_words in seed_guide_words
]

### Fit Model and Display Top Topic Words

In [58]:
# TODO: cv
# NOTE(review): the TODO above does not say what "cv" refers to — confirm with the author.
model = fitted_glda_model(
    X=X,
    guide_words=guide_words,
    word2id=word2id,
    seed_confidence=0.8,
)
top_words = get_guide_words(model=model, vocab=vocab, n=15)




INFO:lda:n_documents: 11748
INFO:lda:vocab_size: 5242
INFO:lda:n_words: 1294393
INFO:lda:n_topics: 10
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -13093049
INFO:lda:<10> log likelihood: -11181366
INFO:lda:<20> log likelihood: -10833806
INFO:lda:<30> log likelihood: -10715815
INFO:lda:<40> log likelihood: -10649948
INFO:lda:<50> log likelihood: -10606758
INFO:lda:<60> log likelihood: -10570344
INFO:lda:<70> log likelihood: -10543753
INFO:lda:<80> log likelihood: -10523625
INFO:lda:<90> log likelihood: -10506936
INFO:lda:<99> log likelihood: -10490294


Topic acting:
actor actress director play make star love role young girl scene man year father come

Topic attraction:
actor watch silent war black like make train white great time just comedy scene say

Topic cinematography:
actor make silent director story great horror set time like best just act look german

Topic dialogue:
make time actor like just watch silent really scene great story good way thing look

Topic directing:
director german time story make herr actress cinema great silent war year work way life

Topic editing_effects:
city worker actress ship scene world shot life people man war set time work class

Topic it_factor:
actress make like woman man girl good just love end wife come old look young

Topic plot:
actor director character actress scene make shot story plot work comedy time act like use

Topic sound_music:
version silent director print score dvd actress actor make original release time minute year sound

Topic theme:
actor story actress love like make silent ve