In [6]:
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import pickle
import nltk #a python natual language toolkit. Has pruning resources
import pandas as pd
import operator
import unicodedata

import itertools
import numpy as np

from multiprocessing import Pool

In [7]:
def output_topic_model(ldamodel,output_fname='',num_words=10):
    ''' For outputting the results of a topic model 
    into a more human-readable format
    '''
    
    #Format final output
    mat = ldamodel.print_topics(num_topics=10, num_words=num_words)
    final_output = []
    
    for i in mat:
        topic = i[1]
        weights_words = topic.split('+')
        words = [w.split('*')[1] for w in weights_words] #Get rid of weights
        words = [unicodedata.normalize('NFKD', word).encode('ascii','ignore') for word in words]
        final_output.append(words)
    
    output_frame = pd.DataFrame(final_output)

    # If a filename has been specified, save the topic model
    if output_fname:
        output_frame.to_csv(output_fname,encoding='utf-8')
   
    return output_frame

In [8]:
# Load the results of topic modelling with additional computational passes
bigram_mods = pickle.load(open('tms_bigrams_mp.p','rb'))
unigram_mods = pickle.load(open('tms_unigrams_mp.p','rb'))

In [None]:
#Uncomment this when the _all_ topic models, using the 2010-2015 corpuses, have finished generating:

"""
unigram_all = pickle.load(open('tms_all_unigram_mp.p','rb'))
bigram_all = pickle.load(open('tms_all_bigram_mp.p','rb'))

#Gentrified and ungentrified bigram topic model representations
g_all = output_topic_model(mod_gent,'tm_all_mp_bigram_10w_gent.csv',num_words)
ung_all = output_topic_model(mod_ungent,'tm_all_mp_bigram_10w_ungent.csv',num_words)

#Gentrified and ungentrified unigram topic model representations
g_all_u = output_topic_model(mod_gent,'tm_all_mp_unigram_10w_gent.csv',num_words)
ung_all_u = output_topic_model(mod_ungent,'tm_all_mp_unigram_10w_ungent.csv',num_words)

g_all
"""

In [14]:
#Retrieve the topic models themselves from the pickle dumps

#bigram topic models
mod_g2015 = bigram_mods[0] # bigram topic model for 2015, gentrified-area reviews
mod_g2010 = bigram_mods[1] 
mod_u2015 = bigram_mods[2]
mod_u2010 = bigram_mods[3]

#unigrams topic models
mod_g2015u = unigram_mods[0] # unigram topic model for 2015, gentrified-area reviews
mod_g2010u = unigram_mods[1]
mod_u2015u = unigram_mods[2]
mod_u2010u = unigram_mods[3]

#Get human-readable representations of the topic models as panda dataframes
#When not given a filename, the function will not write out a file
num_words = 10

#bigrams
g2015 = output_topic_model(mod_g2015,num_words=num_words) # Pandas dataframe representation of topic model
g2010 = output_topic_model(mod_g2010,num_words=num_words)
u2015 = output_topic_model(mod_u2015,num_words=num_words)
u2010 = output_topic_model(mod_u2010,num_words=num_words)

#unigrams
g2015u = output_topic_model(mod_g2015u,num_words=num_words) 
g2010u = output_topic_model(mod_g2010u,num_words=num_words)
u2015u = output_topic_model(mod_u2015u,num_words=num_words)
u2010u = output_topic_model(mod_u2010u,num_words=num_words)

# Throw away the individual topics, get unordered lists of the 100 most weighted words
g15 = set(g2015.values.flatten()) # all 100 of the most weighted words for gentrified, 2015
g10 = set(g2010.values.flatten())
u15 = set(u2015.values.flatten())
u10 = set(u2010.values.flatten())
g15u = set(g2015u.values.flatten())
g10u = set(g2010u.values.flatten())
u15u = set(u2015u.values.flatten())
u10u = set(u2010u.values.flatten())

In [15]:
g2015

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,first_time,mac_chees,great_place,next_time,food_truck,realli_good,fri_pickl,come_back,shrimp_taco,love_love
1,come_back,love_place,craft_beer,food_truck,go_back,great_place,realli_good,live_music,coffe_shop,staff_friendli
2,food_truck,beer_select,go_back,realli_good,come_back,fri_chicken,custom_servic,will_definit,5_star,one_favorit
3,fish_taco,go_back,salt_caramel,realli_good,first_time,come_back,servic_excel,mac_chees,pretti_good,pork_belli
4,fish_taco,pretti_good,fri_pickl,food_truck,cabo_fish,ve_ever,can_get,pimento_chees,tater_tot,fri_chicken
5,ve_ever,salt_caramel,caramel_browni,fri_chicken,south_end,food_truck,beer_select,select_beer,fish_taco,come_back
6,first_time,beer_chees,come_back,fish_taco,fri_chicken,m_sure,go_back,mac_chees,can_get,great_place
7,go_back,chunki_monkey,monkey_chunki,food_truck,love_place,great_food,great_beer,great_place,beer_select,can_wait
8,go_back,food_truck,caramel_browni,pretti_good,salt_caramel,great_place,food_good,can_wait,great_food,place_go
9,love_place,go_back,mac_chees,great_place,beer_select,will_back,fri_pickl,pretti_good,caramel_browni,salt_caramel


In [23]:
#Make some difference lists, easily formatted for copy-pasting
print ', '.join(g15.difference(u15))
print ', '.join(g10.difference(u10))

beer_chees , can_get , select_beer , south_end , will_definit , craft_beer , fish_taco , one_favorit, live_music , beer_select , salt_caramel, food_truck , love_love, cabo_fish , pork_belli, place_go, love_place , chunki_monkey , tater_tot , great_place, pimento_chees , will_back , fri_chicken, ve_ever , realli_good , fri_pickl , salt_caramel , fri_chicken , shrimp_taco , great_beer , coffe_shop , can_wait, staff_friendli, 5_star , servic_excel , monkey_chunki , m_sure , caramel_browni , come_back
don_know , ameli_without , burger_compani , good_servic, will_definit , breakfast_sandwich , chicken_sandwich , crepe_cellar, smelli_cat , dessert_crepe, fish_taco , chicken_coop , french_onion , 24_hour , beer_select , peanut_butter , mix_green, close_monday , queen_citi , hot_dog, eat_ameli , last_night , price_chicken , without_mind , time_went, galleri_crawl , first_time , 5_star, across_street , realli_great , don_like, tater_tot , best_way , littl_bit , place_go , coffe_shop , next_time