In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize

%pylab inline
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

from gensim import corpora, models, similarities
from nltk.corpus import stopwords
import nltk

import snowballstemmer
import gensim

from string import ascii_lowercase

import itertools

from collections import defaultdict
import collections

from IPython.display import clear_output

Populating the interactive namespace from numpy and matplotlib




In [2]:
#start by loading the stemmed wine notes
# Later cells read the 'Stemmed' and 'Id' columns of this frame.
# NOTE(review): read_pickle can execute arbitrary code on load -- only use a trusted file.
wine_notes = pd.read_pickle('wine_notes.pickle')

In [4]:
#keywords for data parameters
# data_key: filename prefix used for every artefact this notebook saves
data_key = 'wine_notes'
# data_save: when True, the cells below write their objects to disk as they are built
data_save = False
# data_order: how the documents are ordered before the corpus is built
data_order = 'shuffle' #'reverse', 'original'

# n_top is passed to LdaModel as num_topics (and to show_topics)
n_top = 20
# kappa is passed to LdaModel as decay
kappa = 1
# n_pass is passed to LdaModel as passes
n_pass = 1

In [5]:
#prepare document df and document lists
# Order the documents according to `data_order`. The branches are mutually
# exclusive, and the explicit else makes a mistyped option fail loudly here
# instead of leaving `document_df` undefined (or silently reusing a stale one
# left in the kernel by an earlier run).
if data_order == 'original':
    document_df = wine_notes.copy()
elif data_order == 'shuffle':
    # frac=1 without replacement draws every row exactly once, in random order
    document_df = wine_notes.sample(frac=1, replace=False).reset_index(drop=True)
elif data_order == 'reverse':
    document_df = wine_notes.sort_index(ascending=False).reset_index(drop=True)
else:
    raise ValueError(
        "data_order must be 'original', 'shuffle' or 'reverse', got %r" % (data_order,)
    )

# one entry per document, taken from the 'Stemmed' column; this list is what
# the dictionary and bag-of-words corpus are built from
document_list = document_df['Stemmed'].tolist()

In [None]:
# Persist the ordered document frame when saving is enabled.
if data_save:
    document_df.to_pickle(data_key+'_ids'+'.pickle')

In [6]:
#create dictionary of all stemmed words
# Built from every document in document_list.
dictionary = corpora.Dictionary(document_list)

# Write the dictionary next to the other artefacts when saving is enabled.
if data_save:
    dictionary.save(data_key+'.dict')
    
#check out some dictionary methods for filtering/refining the dictionary, i.e. removing frequent/infrequent words

#pretty sure this only has to be done once, since the words in the documents won't change, just the order of the documents

In [13]:
#1. create "bow" corpus from documents using the dictionary
# doc2bow is applied to each document in turn, preserving document order.
corpus = list(map(dictionary.doc2bow, document_list))

# Serialize the bag-of-words corpus in Matrix Market format when saving is enabled.
if data_save:
    corpora.MmCorpus.serialize(data_key+'.mm', corpus)
    
#randomize/shuffle the corpus based on the "data_order" input parameter

In [14]:
#2. convert to tf-idf model
# Fit the tf-idf weighting on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

#keep the corpus model in memory for future use
corpus_tfidf = tfidf[corpus]

# Both the fitted model and the transformed corpus are written out together
# when saving is enabled.
if data_save:
    tfidf.save(data_key+'.tfidf_model')
    corpora.MmCorpus.serialize(data_key+'_tfidf'+'.mm', corpus_tfidf)

In [15]:
#3. make lda model out of tfidf model
# Mostly default settings; num_topics, decay and passes come from the
# parameter cell so their effect on the corpus_tfidf input can be explored.
lda = models.LdaModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=n_top,
    decay=kappa,
    passes=n_pass,
    minimum_probability=0,
)

#keep wrapped corpus model in memory
corpus_lda = lda[corpus_tfidf]

# Write the trained model and the LDA-transformed corpus when saving is enabled.
if data_save:
    lda.save(data_key+'.lda')
    corpora.MmCorpus.serialize(data_key+'_lda'+'.mm', corpus_lda)

# Last expression of the cell: display the word weights of every topic.
lda.show_topics(num_topics=n_top, formatted=False)

[(0,
  [('water', 0.0039502452244033828),
   ('midpal', 0.0037897879629662109),
   ('molass', 0.0034864697162264439),
   ('enrich', 0.003234074388779954),
   ('barbequ', 0.0029789892960046661),
   ('brais', 0.0029625050417803598),
   ('delicaci', 0.0028243061863772851),
   ('slate', 0.0028098283973876596),
   ('spice', 0.0027529810818336655),
   ('fruit', 0.0026527279172847916)]),
 (1,
  [('blackcurr', 0.0051355537324252664),
   ('greet', 0.0035874999484536737),
   ('red', 0.0034455074300971966),
   ('nose', 0.0032122216530690406),
   ('sour', 0.0031040221870004088),
   ('cherri', 0.0030294924481424866),
   ('black', 0.0030247910888716589),
   ('brule', 0.0029719555583118044),
   ('spice', 0.0029354180583759703),
   ('palat', 0.0028946092905639434)]),
 (2,
  [('tempranillo', 0.007781958418136543),
   ('santa', 0.0057155823640221886),
   ('garnacha', 0.0047202734991962899),
   ('satisfi', 0.0044349479280628252),
   ('pinot', 0.004014857243904997),
   ('graciano', 0.0038761120227190945),

In [None]:
# Inspect a single document: which topic ranks `topic`-th for it, what words
# make up that topic, and what the document's own tokens are.
wine_search = 10002   # position of the document in corpus_tfidf / document_list
topic = 0             # rank of the topic to inspect (0 = highest probability)

wine_topics = lda.get_document_topics(corpus_tfidf[wine_search])

# (topic_id, probability) pairs ordered by descending probability, ties broken
# by descending topic id
topics_wine = sorted(wine_topics, key=lambda pair: (pair[1], pair[0]), reverse=True)

selected = topics_wine[topic]
pprint(selected)
pprint(lda.show_topic(selected[0], topn=30))
print(document_list[wine_search])

In [18]:
# One further training round: re-permute the documents, rebuild the corpora in
# the new order, and feed them to the existing LDA model.

#1. reshuffle the documents
document_df = (
    wine_notes
    .sample(frac=1, replace=False)
    .reset_index(drop=True)
)
document_list = document_df['Stemmed'].tolist()

#2. recreate "bow" corpus from documents using the dictionary
corpus = list(map(dictionary.doc2bow, document_list))

#3. convert to tf-idf model
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

#4. update lda model
lda.update(corpus_tfidf)
corpus_lda = lda[corpus_tfidf]

# Last expression of the cell: display the topics after the update.
lda.show_topics(num_topics=n_top, formatted=False)

[(0,
  [('water', 0.0065239193851581575),
   ('midpal', 0.0056755215540746995),
   ('molass', 0.0052119628522940189),
   ('enrich', 0.0050671728486812815),
   ('barbequ', 0.0047202162960725026),
   ('brais', 0.0044593762885999123),
   ('composit', 0.0039816030675668369),
   ('delicaci', 0.003947507364589895),
   ('pallet', 0.0038795043023163557),
   ('slate', 0.0037299873748366089)]),
 (1,
  [('blackcurr', 0.0078680800169473077),
   ('greet', 0.0049722267516115011),
   ('opaqu', 0.0047105873267610163),
   ('sour', 0.0046213306562212101),
   ('brule', 0.0041716372880352863),
   ('tasti', 0.003978109943106314),
   ('front', 0.003774490658870372),
   ('sandalwood', 0.0037424010335295526),
   ('creme', 0.0035240751798004907),
   ('streak', 0.003502042705506581)]),
 (2,
  [('tempranillo', 0.012691349991836035),
   ('santa', 0.008310980625931965),
   ('garnacha', 0.0074679908471471925),
   ('satisfi', 0.0071137242027039457),
   ('graciano', 0.006606951391455505),
   ('marmalad', 0.0056390785

In [None]:
#easy way to get lda results into a dataframe...
# inference() returns a pair; only its first element is needed here.
theta = lda.inference(corpus_tfidf)[0]
# Divide every row by its own sum so each row adds up to 1.
theta = theta / theta.sum(axis=1, keepdims=True)

In [None]:
#secondary data_save
# Saving is switched on unconditionally here, so every artefact currently in
# memory is written out under the data_key prefix.
data_save = True
if data_save:
    # the ordered document frame
    document_df.to_pickle(data_key+'_ids'+'.pickle')

    # objects that provide their own .save()
    for model, suffix in ((dictionary, '.dict'), (tfidf, '.tfidf_model'), (lda, '.lda')):
        model.save(data_key+suffix)

    # the three corpora, each serialized in Matrix Market format
    for corp, suffix in ((corpus, '_bow'), (corpus_tfidf, '_tfidf'), (corpus_lda, '_lda')):
        corpora.MmCorpus.serialize(data_key+suffix+'.mm', corp)

In [None]:
# Assemble one row per document: its Id, its 'Stemmed' notes, and the
# corresponding row of theta spread across numbered columns.
# The `join_axes` argument of pd.concat was removed in pandas 1.0; reindexing
# the concatenated frame on document_df.index is the documented replacement
# and gives the same row alignment.
topic_probs = pd.concat(
    [document_df[['Id', 'Stemmed']], pd.DataFrame(theta)],
    axis=1,
).reindex(document_df.index)
topic_probs.to_pickle('wine_notes_probs.pickle')

In [None]:
# Tabulate the model's topics, 25 words per topic, as a DataFrame for display.
pd.DataFrame(lda.print_topics(num_topics=-1, num_words=25))

In [None]:
#list of tuples (wine_lda) into a dataframe
#hard way to get lda results into a dataframe...
def create_lda_df(corp_lda, n_top, verbose=True):
    """Convert a sparse per-document topic corpus into a dense DataFrame.

    Parameters
    ----------
    corp_lda : sized iterable
        One entry per document; each entry is a sequence of
        ``(topic_id, probability)`` pairs. Topics missing from an entry are
        reported as 0.
    n_top : int
        Number of topics, i.e. the number of output columns. Every
        ``topic_id`` must be a valid column position for that width.
    verbose : bool, default True
        When true, print a percentage progress readout about every 1% of the
        documents and clear it again with ``clear_output``.

    Returns
    -------
    pandas.DataFrame
        Float frame of shape ``(len(corp_lda), n_top)``; row ``d`` holds the
        topic probabilities of document ``d``, columns are labelled
        ``0 .. n_top-1``.

    Raises
    ------
    IndexError
        If a ``topic_id`` falls outside the ``n_top`` columns.
    """
    docs = len(corp_lda)

    # Fill one preallocated array instead of appending a row per document:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    probs = np.zeros((docs, n_top), dtype=float)

    # Report about every 1% of documents; the floor of 1 keeps the modulo
    # below well-defined for corpora with fewer than 100 documents.
    step = max(1, docs // 100)

    # Each document is fetched exactly once. Indexing corp_lda[d] repeatedly
    # (as a lazily transformed corpus would be) repeats the lookup every time.
    for d, doc_topics in enumerate(corp_lda):
        for topic_id, prob in doc_topics:
            probs[d, topic_id] = prob

        if verbose and d % step == 0:
            print('processing:')
            print(round(d/docs*100,0),'%')
            clear_output(wait=True)

    return pd.DataFrame(probs)

In [None]:
# Densify the LDA-transformed corpus into a documents x topics DataFrame.
lda_probs_df = create_lda_df(corpus_lda,n_top)