In [1]:
import pandas as pd
import numpy as np
import pickle
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')
from text_cleaning import lemmatize_and_stem, preprocess

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaylischulz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
countries_df = pickle.load(open('countries.pkl', 'rb'))
countries_df.head()

Unnamed: 0,country,country_summary
0,Austria,"Small, landlocked Austria offers alpine scener..."
1,Belgium,Belgium falls through the cracks. Wedged betwe...
2,Bosnia-Herzegovina,Apart from the tragic way it separated from Yu...
3,Bulgaria,"Endearing, surprising Bulgaria is a rewarding ..."
4,Croatia,With thousands of miles of seafront and more t...


In [3]:
cities_df = pickle.load(open('cities.pkl', 'rb'))
cities_df.head()

Unnamed: 0,city,city_summary,country
0,Danube Valley,The Danube is at its romantic best just west o...,Austria
1,Hallstatt,Lovable Hallstatt is a tiny town bullied onto ...,Austria
2,Salzburg,"Thanks to its charmingly preserved old town, s...",Austria
3,Tirol,Mountainous Tirol — in Austria's western panha...,Austria
4,Vienna,"Vienna is the capital of Austria, the cradle o...",Austria


In [4]:
test_sample = countries_df.iloc[0]['country_summary']
test_sample

"Small, landlocked Austria offers alpine scenery, world-class museums, cobbled quaintness, and Wiener schnitzel. Unlike Germany, its industrious neighbor to the northwest, Austria is content to bask in its good living and elegant, opulent past as the former head of one of Europe's grandest empires. Austrians tend to be relaxed, gregarious people who love the outdoors as much as a good cup of coffee in a café."

In [5]:
print('Original:')
words = test_sample.split(' ')
print(words)
print('\n\n Tokenized and Lemmatized:')
print(preprocess(test_sample))

Original:
['Small,', 'landlocked', 'Austria', 'offers', 'alpine', 'scenery,', 'world-class', 'museums,', 'cobbled', 'quaintness,', 'and', 'Wiener', 'schnitzel.', 'Unlike', 'Germany,', 'its', 'industrious', 'neighbor', 'to', 'the', 'northwest,', 'Austria', 'is', 'content', 'to', 'bask', 'in', 'its', 'good', 'living', 'and', 'elegant,', 'opulent', 'past', 'as', 'the', 'former', 'head', 'of', 'one', 'of', "Europe's", 'grandest', 'empires.', 'Austrians', 'tend', 'to', 'be', 'relaxed,', 'gregarious', 'people', 'who', 'love', 'the', 'outdoors', 'as', 'much', 'as', 'a', 'good', 'cup', 'of', 'coffee', 'in', 'a', 'café.']


 Tokenized and Lemmatized:
['small', 'landlock', 'austria', 'offer', 'alpin', 'sceneri', 'world', 'class', 'museum', 'cobbl', 'quaint', 'wiener', 'schnitzel', 'unlik', 'germani', 'industri', 'neighbor', 'northwest', 'austria', 'content', 'bask', 'good', 'live', 'eleg', 'opul', 'past', 'head', 'europ', 'grandest', 'empir', 'austrian', 'tend', 'relax', 'gregari', 'peopl', 'lov

In [6]:
processed_countries = cities_df['city_summary'].map(preprocess)
processed_countries[:10]

0     [danub, romant, best, west, vienna, mix, cruis...
1     [lovabl, hallstatt, tini, town, bulli, ledg, s...
2     [thank, charmingli, preserv, old, town, splend...
3     [mountain, tirol, austria, western, panhandl, ...
4     [vienna, capit, austria, cradl, classic, music...
5     [antwerp, antwerpen, dutch, anver, french, bel...
12    [pointi, gild, architectur, stay, café, vivid,...
13    [year, ago, brussel, nice, place, stop, buy, w...
27    [ghent, ooz, cobbl, charm, like, rival, bruge,...
28    [despit, scar, war, mostar, stun, straddl, ban...
Name: city_summary, dtype: object

In [7]:
dictionary_bow = gensim.corpora.Dictionary(processed_countries)

In [8]:
dictionary_bow.filter_extremes(no_below=20, no_above=0.15)

In [9]:
bow_corpus = [dictionary_bow.doc2bow(country) for country in processed_countries]

In [10]:
tfidf = gensim.models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [11]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10,
                                       id2word=dictionary_bow, passes=20,
                                       workers=1, alpha=.5)

In [12]:
for ind, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(ind, topic))

Topic: 0 
Words: 0.303*"place" + 0.277*"mediev" + 0.186*"café" + 0.058*"center" + 0.046*"fun" + 0.034*"line" + 0.032*"travel" + 0.012*"year" + 0.009*"experi" + 0.006*"free"
Topic: 1 
Words: 0.252*"coast" + 0.217*"beach" + 0.202*"franc" + 0.115*"mountain" + 0.063*"travel" + 0.055*"mile" + 0.046*"beauti" + 0.006*"ll" + 0.005*"north" + 0.005*"feel"
Topic: 2 
Words: 0.304*"visit" + 0.230*"ruin" + 0.156*"line" + 0.147*"mile" + 0.066*"experi" + 0.037*"explor" + 0.010*"year" + 0.006*"beauti" + 0.005*"north" + 0.004*"sit"
Topic: 3 
Words: 0.296*"peopl" + 0.258*"live" + 0.202*"shop" + 0.071*"experi" + 0.057*"life" + 0.035*"line" + 0.024*"café" + 0.010*"travel" + 0.006*"build" + 0.005*"fun"
Topic: 4 
Words: 0.191*"today" + 0.177*"cathedr" + 0.152*"build" + 0.135*"year" + 0.107*"fun" + 0.080*"sit" + 0.060*"travel" + 0.018*"live" + 0.017*"surround" + 0.016*"feel"
Topic: 5 
Words: 0.261*"feel" + 0.260*"visitor" + 0.190*"romant" + 0.161*"set" + 0.071*"beauti" + 0.015*"today" + 0.009*"experi" + 0.003

In [13]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary_bow, passes=20, workers=1)

In [14]:
for ind, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(ind, topic))

Topic: 0 
Words: 0.159*"mediev" + 0.138*"sight" + 0.130*"shop" + 0.063*"surround" + 0.059*"center" + 0.057*"set" + 0.053*"local" + 0.040*"place" + 0.035*"explor" + 0.034*"today"
Topic: 1 
Words: 0.153*"café" + 0.126*"live" + 0.110*"peopl" + 0.066*"line" + 0.056*"build" + 0.050*"free" + 0.050*"shop" + 0.049*"fun" + 0.046*"local" + 0.046*"life"
Topic: 2 
Words: 0.164*"feel" + 0.111*"experi" + 0.104*"surround" + 0.078*"build" + 0.073*"visitor" + 0.068*"church" + 0.058*"travel" + 0.043*"today" + 0.037*"shop" + 0.037*"mediev"
Topic: 3 
Words: 0.164*"countri" + 0.134*"ll" + 0.126*"peopl" + 0.083*"north" + 0.078*"local" + 0.071*"fun" + 0.070*"today" + 0.053*"visitor" + 0.040*"visit" + 0.033*"year"
Topic: 4 
Words: 0.170*"place" + 0.169*"year" + 0.128*"today" + 0.083*"roman" + 0.053*"set" + 0.047*"franc" + 0.038*"sit" + 0.037*"visitor" + 0.036*"travel" + 0.035*"feel"
Topic: 5 
Words: 0.196*"church" + 0.128*"life" + 0.087*"ruin" + 0.079*"sight" + 0.070*"beauti" + 0.067*"sit" + 0.039*"year" + 0.

In [20]:
cities_df.iloc[8]['city_summary']

"Ghent doesn't ooze with cobbles and charm, like its rival Bruges — this is a living city, with an urban grittiness and a welcome splash of creative hipster funkiness. Explore its historic quarter, ogle the Van Eyck altarpiece in its massive cathedral, tour its impressive art and design museums, stroll its picturesque embankments, bask in its finely decorated historic gables, and prowl its newly revitalized Patershol restaurant quarter. Ghent is the kind of town that you visit for a few hours…and find yourself wishing you had a few days."

In [15]:
for ind, score in sorted(lda_model[bow_corpus[8]], key=lambda tup: -1*tup[1]):
    print('\nScore: {}\t \nTopic: {}'.format(score, lda_model.print_topic(ind, 10)))


Score: 0.18688815832138062	 
Topic: 0.304*"visit" + 0.230*"ruin" + 0.156*"line" + 0.147*"mile" + 0.066*"experi" + 0.037*"explor" + 0.010*"year" + 0.006*"beauti" + 0.005*"north" + 0.004*"sit"

Score: 0.1737857460975647	 
Topic: 0.191*"today" + 0.177*"cathedr" + 0.152*"build" + 0.135*"year" + 0.107*"fun" + 0.080*"sit" + 0.060*"travel" + 0.018*"live" + 0.017*"surround" + 0.016*"feel"

Score: 0.159812793135643	 
Topic: 0.296*"peopl" + 0.258*"live" + 0.202*"shop" + 0.071*"experi" + 0.057*"life" + 0.035*"line" + 0.024*"café" + 0.010*"travel" + 0.006*"build" + 0.005*"fun"

Score: 0.14609326422214508	 
Topic: 0.407*"island" + 0.208*"explor" + 0.162*"surround" + 0.087*"sit" + 0.085*"beach" + 0.010*"set" + 0.004*"build" + 0.001*"mountain" + 0.001*"place" + 0.001*"local"

Score: 0.055608831346035004	 
Topic: 0.239*"local" + 0.194*"roman" + 0.191*"countri" + 0.183*"north" + 0.097*"ll" + 0.035*"travel" + 0.019*"year" + 0.005*"ruin" + 0.004*"visit" + 0.004*"peopl"

Score: 0.05558742582798004	 
Topi

In [16]:
for ind, score in sorted(lda_model_tfidf[bow_corpus[8]], key=lambda tup: -1*tup[1]):
    print('\nScore: {}\t \nTopic: {}'.format(score, lda_model_tfidf.print_topic(ind, 10)))


Score: 0.8199813365936279	 
Topic: 0.106*"cathedr" + 0.083*"franc" + 0.083*"mile" + 0.073*"free" + 0.063*"north" + 0.062*"center" + 0.062*"travel" + 0.048*"explor" + 0.048*"visit" + 0.039*"build"

Score: 0.020008347928524017	 
Topic: 0.153*"café" + 0.126*"live" + 0.110*"peopl" + 0.066*"line" + 0.056*"build" + 0.050*"free" + 0.050*"shop" + 0.049*"fun" + 0.046*"local" + 0.046*"life"

Score: 0.020003924146294594	 
Topic: 0.311*"mountain" + 0.099*"visit" + 0.086*"set" + 0.056*"romant" + 0.050*"franc" + 0.045*"beauti" + 0.042*"sight" + 0.041*"line" + 0.040*"ll" + 0.036*"ruin"

Score: 0.02000175043940544	 
Topic: 0.164*"countri" + 0.134*"ll" + 0.126*"peopl" + 0.083*"north" + 0.078*"local" + 0.071*"fun" + 0.070*"today" + 0.053*"visitor" + 0.040*"visit" + 0.033*"year"

Score: 0.020001566037535667	 
Topic: 0.231*"island" + 0.182*"coast" + 0.096*"roman" + 0.077*"beach" + 0.065*"ruin" + 0.040*"mediev" + 0.031*"center" + 0.030*"explor" + 0.027*"countri" + 0.025*"cathedr"

Score: 0.020001366734504

In [17]:
train_vecs = []
for i in range(len(bow_corpus)):
    top_topics = lda_model.get_document_topics(bow_corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[i][1] for i in range(10)]
    train_vecs.append(topic_vec)

In [22]:
train_vecs[210]

[0.25020847,
 0.08957886,
 0.06252645,
 0.063820735,
 0.08779522,
 0.18664914,
 0.06306091,
 0.06254703,
 0.07131286,
 0.06250032]