In [1]:
import pandas as pd
import numpy as np
import pickle
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')
from text_cleaning import lemmatize_and_stem, preprocess, get_aggregate_score, replace_periods
import time
import urllib.request
from collections import Counter
from selenium.webdriver import Chrome

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaylischulz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
# Load the two pickled city tables used throughout the notebook.
# The context managers make sure each file handle is closed after reading.
# NOTE: pickle.load can execute arbitrary code; only open files this project produced.
with open('data/combined_cities.pkl', 'rb') as combined_file:
    combined_descriptions_df = pickle.load(combined_file)
with open('data/cities.pkl', 'rb') as cities_file:
    cities_df = pickle.load(cities_file)

In [3]:
# Preview the first rows to check which columns were loaded.
combined_descriptions_df.head()

Unnamed: 0,city,city_summary,city_url,country,text
0,Danube Valley,The Danube is at its romantic best just west o...,https://www.ricksteves.com/europe/austria/danu...,Austria,\nThe Danube (/ˈdæn.juːb/ DAN-yoob; known by v...
1,Danube Valley,The Danube is at its romantic best just west o...,https://www.ricksteves.com/europe/austria/danu...,Austria,\nThe Danube (/ˈdæn.juːb/ DAN-yoob; known by v...
2,Hallstatt,Lovable Hallstatt is a tiny town bullied onto ...,https://www.ricksteves.com/europe/austria/hall...,Austria,Hallstatt (German: [ˈhalʃtat]; Central Bavaria...
3,Salzburg,"Thanks to its charmingly preserved old town, s...",https://www.ricksteves.com/europe/austria/salz...,Austria,Salzburg (German: [ˈzaltsbʊɐ̯k] (listen);[note...
4,Tirol,Mountainous Tirol — in Austria's western panha...,https://www.ricksteves.com/europe/austria/tirol,Austria,"\nTyrol (/tɪˈroʊl, taɪ-, ˈtaɪroʊl/;[1] histori..."


In [4]:
# Tokenise both text columns into one flat list of documents: every
# 'city_summary' entry first, followed by every 'text' entry, so the list
# holds two documents per row of combined_descriptions_df.
processed_countries = [
    tokens
    for column in ('city_summary', 'text')
    for tokens in combined_descriptions_df[column].map(preprocess)
]

In [5]:
# Number of documents in the corpus (summary documents plus text documents).
len(processed_countries)

432

In [6]:
# Build the gensim vocabulary (token <-> integer id mapping) from the tokenised documents.
dictionary_bow = gensim.corpora.Dictionary(processed_countries)

In [7]:
# Prune the vocabulary in place: keep only tokens that occur in at least 5
# documents and in no more than 30% of all documents.
dictionary_bow.filter_extremes(no_below=5, no_above=0.30)

In [8]:
# Convert every tokenised document into its bag-of-words representation,
# keeping the same order as processed_countries.
bow_corpus = list(map(dictionary_bow.doc2bow, processed_countries))

In [None]:
# Train a 5-topic LDA model on the bag-of-words corpus (20 passes, 1 worker).
# random_state pins the random initialisation so that re-running this cell
# gives comparable topics instead of a different model each time.
# NOTE(review): a scalar alpha is used as a symmetric document-topic prior, and
# 20 is a very strong one; the per-document topic weights shown later in this
# notebook all sit close to 0.2. Consider a smaller alpha if sharper topic
# assignments are wanted.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5,
                                       id2word=dictionary_bow, passes=20,
                                       workers=1, alpha=20,
                                       random_state=42)

In [95]:
# Print each topic id together with its highest-weighted words.
for topic_id, top_words in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, top_words))

Topic: 0 
Words: 0.009*"forest" + 0.007*"valley" + 0.004*"natur" + 0.004*"mountain" + 0.004*"bc" + 0.004*"rout" + 0.003*"abbey" + 0.003*"monument" + 0.003*"univers" + 0.003*"landscap"
Topic: 1 
Words: 0.006*"palac" + 0.006*"saint" + 0.004*"million" + 0.004*"host" + 0.004*"univers" + 0.004*"urban" + 0.004*"council" + 0.003*"mayor" + 0.003*"metropolitan" + 0.003*"institut"
Topic: 2 
Words: 0.016*"island" + 0.005*"bc" + 0.005*"lake" + 0.004*"empir" + 0.004*"mountain" + 0.004*"rule" + 0.004*"pope" + 0.003*"di" + 0.003*"provinc" + 0.003*"averag"
Topic: 3 
Words: 0.006*"camp" + 0.005*"prison" + 0.005*"speak" + 0.005*"unit" + 0.004*"beach" + 0.004*"cathol" + 0.004*"land" + 0.004*"alli" + 0.004*"island" + 0.003*"armi"
Topic: 4 
Words: 0.008*"univers" + 0.007*"school" + 0.006*"council" + 0.006*"club" + 0.005*"theatr" + 0.005*"festiv" + 0.004*"music" + 0.004*"compani" + 0.004*"bridg" + 0.004*"colleg"


In [9]:
# Load the previously pickled LDA model; this replaces any lda_model trained
# earlier in the session. To refresh the file after retraining, run:
#     with open('models/lda_model.pkl', 'wb') as model_file:
#         pickle.dump(lda_model, model_file)
# NOTE: pickle.load can execute arbitrary code; only open files this project produced.
with open('models/lda_model.pkl', 'rb') as model_file:
    lda_model = pickle.load(model_file)

In [10]:
# Print the topics of the loaded model to confirm it matches the trained one.
for topic_id, top_words in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, top_words))

Topic: 0 
Words: 0.009*"forest" + 0.007*"valley" + 0.004*"natur" + 0.004*"mountain" + 0.004*"bc" + 0.004*"rout" + 0.003*"abbey" + 0.003*"monument" + 0.003*"univers" + 0.003*"landscap"
Topic: 1 
Words: 0.006*"palac" + 0.006*"saint" + 0.004*"million" + 0.004*"host" + 0.004*"univers" + 0.004*"urban" + 0.004*"council" + 0.003*"mayor" + 0.003*"metropolitan" + 0.003*"institut"
Topic: 2 
Words: 0.016*"island" + 0.005*"bc" + 0.005*"lake" + 0.004*"empir" + 0.004*"mountain" + 0.004*"rule" + 0.004*"pope" + 0.003*"di" + 0.003*"provinc" + 0.003*"averag"
Topic: 3 
Words: 0.006*"camp" + 0.005*"prison" + 0.005*"speak" + 0.005*"unit" + 0.004*"beach" + 0.004*"cathol" + 0.004*"land" + 0.004*"alli" + 0.004*"island" + 0.003*"armi"
Topic: 4 
Words: 0.008*"univers" + 0.007*"school" + 0.006*"council" + 0.006*"club" + 0.005*"theatr" + 0.005*"festiv" + 0.004*"music" + 0.004*"compani" + 0.004*"bridg" + 0.004*"colleg"


In [11]:
# Spot-check: topic mixture the model infers for one document (corpus index 228).
lda_model[bow_corpus[228]]

[(0, 0.23005863),
 (1, 0.25213996),
 (2, 0.20869474),
 (3, 0.17230529),
 (4, 0.13680139)]

In [12]:
# Collect one aggregate topic-score dict per index 0..212 via get_aggregate_score.
# NOTE(review): 213 is hard-coded, while the corpus built above reports 432
# documents; confirm this count matches the rows get_aggregate_score expects.
aggregate_scores = [
    get_aggregate_score(lda_model, bow_corpus, city_idx)
    for city_idx in range(213)
]
aggregate_scores

[{0: 0.21785069, 1: 0.19399925, 2: 0.19662169, 3: 0.1867668, 4: 0.2047616},
 {0: 0.21564415, 1: 0.19121051, 2: 0.19510484, 3: 0.20759174, 4: 0.19044876},
 {0: 0.21522658, 1: 0.19213042, 2: 0.20830831, 3: 0.19436201, 4: 0.18997267},
 {0: 0.19681884, 1: 0.20391665, 2: 0.21273163, 3: 0.22386087, 4: 0.16267201},
 {0: 0.20095268, 1: 0.20560408, 2: 0.20945421, 3: 0.22582702, 4: 0.158162},
 {0: 0.22288862, 1: 0.20816001, 2: 0.20624828, 3: 0.1991868, 4: 0.1635163},
 {0: 0.18666238, 1: 0.19640136, 2: 0.2007449, 3: 0.22384101, 4: 0.19235036},
 {0: 0.20014264, 1: 0.21776018, 2: 0.19772941, 3: 0.21114002, 4: 0.17322776},
 {0: 0.16731112, 1: 0.25815248, 2: 0.18966483, 3: 0.19173233, 4: 0.19313926},
 {0: 0.17098513, 1: 0.23929466, 2: 0.19791326, 3: 0.19396421, 4: 0.19784272},
 {0: 0.17835915, 1: 0.24742168, 2: 0.18593034, 3: 0.19784585, 4: 0.190443},
 {0: 0.18829668, 1: 0.24528176, 2: 0.18300137, 3: 0.19117056, 4: 0.19224966},
 {0: 0.17541955, 1: 0.2486769, 2: 0.20963004, 3: 0.18856423, 4: 0.1777093

In [13]:
# One row per aggregate-score dict, one column per topic id.
topics_df = pd.DataFrame(aggregate_scores)

In [14]:
# Preview the raw topic-score columns (still labelled by topic id).
topics_df.head()

Unnamed: 0,0,1,2,3,4
0,0.217851,0.193999,0.196622,0.186767,0.204762
1,0.215644,0.191211,0.195105,0.207592,0.190449
2,0.215227,0.19213,0.208308,0.194362,0.189973
3,0.196819,0.203917,0.212732,0.223861,0.162672
4,0.200953,0.205604,0.209454,0.225827,0.158162


In [15]:
# Human-readable labels for topics 0-4, chosen from the top words printed above.
col_names = ['forest_mountain', 'palaces', 'island_water', 'historical_ww2', 'urban']
# Rename by topic id rather than by column position, so each label stays
# attached to its topic even if the columns are not ordered 0..4.
topics_df = topics_df.rename(columns=dict(enumerate(col_names)))
# Fail loudly if a topic column is missing or unexpected instead of mislabelling.
assert set(topics_df.columns) == set(col_names), 'unexpected topic columns'
topics_df.head()

Unnamed: 0,forest_mountain,palaces,island_water,historical_ww2,urban
0,0.217851,0.193999,0.196622,0.186767,0.204762
1,0.215644,0.191211,0.195105,0.207592,0.190449
2,0.215227,0.19213,0.208308,0.194362,0.189973
3,0.196819,0.203917,0.212732,0.223861,0.162672
4,0.200953,0.205604,0.209454,0.225827,0.158162


In [18]:
# Attach the topic scores to the city table by matching row index labels.
# NOTE(review): this assumes score row i belongs to cities_df row i. The preview
# of combined_descriptions_df shows "Danube Valley" at both index 0 and 1, while
# the merged result has Hallstatt at index 1 - confirm the rows are aligned.
cities_and_topics = pd.merge(cities_df, topics_df, left_index=True, right_index=True)

In [19]:
# Preview the merged table: city metadata followed by the five topic scores.
cities_and_topics.head(10)

Unnamed: 0,city,city_summary,city_url,country,forest_mountain,palaces,island_water,historical_ww2,urban
0,Danube Valley,The Danube is at its romantic best just west o...,https://www.ricksteves.com/europe/austria/danu...,Austria,0.217851,0.193999,0.196622,0.186767,0.204762
1,Hallstatt,Lovable Hallstatt is a tiny town bullied onto ...,https://www.ricksteves.com/europe/austria/hall...,Austria,0.215644,0.191211,0.195105,0.207592,0.190449
2,Salzburg,"Thanks to its charmingly preserved old town, s...",https://www.ricksteves.com/europe/austria/salz...,Austria,0.215227,0.19213,0.208308,0.194362,0.189973
3,Tirol,Mountainous Tirol — in Austria's western panha...,https://www.ricksteves.com/europe/austria/tirol,Austria,0.196819,0.203917,0.212732,0.223861,0.162672
4,Vienna,"Vienna is the capital of Austria, the cradle o...",https://www.ricksteves.com/europe/austria/vienna,Austria,0.200953,0.205604,0.209454,0.225827,0.158162
5,Antwerp,"Antwerp (Antwerpen in Dutch, Anvers in French)...",https://www.ricksteves.com/europe/belgium/antwerp,Belgium,0.222889,0.20816,0.206248,0.199187,0.163516
6,Bruges,"With pointy gilded architecture, stay-a-while ...",https://www.ricksteves.com/europe/belgium/bruges,Belgium,0.186662,0.196401,0.200745,0.223841,0.19235
7,Brussels,"Six hundred years ago, Brussels was just a nic...",https://www.ricksteves.com/europe/belgium/brus...,Belgium,0.200143,0.21776,0.197729,0.21114,0.173228
8,Ghent,"Ghent doesn't ooze with cobbles and charm, lik...",https://www.ricksteves.com/europe/belgium/ghent,Belgium,0.167311,0.258152,0.189665,0.191732,0.193139
9,Mostar,"Despite the scars of war, Mostar is still stun...",https://www.ricksteves.com/europe/bosnia-herze...,Bosnia-Herzegovina,0.170985,0.239295,0.197913,0.193964,0.197843


In [20]:
# Run every city name through replace_periods; the lookups below show
# "St. Andrews" no longer matches while "St Andrews" does.
cleaned_city_names = cities_and_topics['city'].map(replace_periods)
cities_and_topics = cities_and_topics.assign(city=cleaned_city_names)

In [21]:
# Sanity check: the dotted spelling should no longer match any row.
cities_and_topics[cities_and_topics['city'].eq('St. Andrews')]

Unnamed: 0,city,city_summary,city_url,country,forest_mountain,palaces,island_water,historical_ww2,urban


In [23]:
# Sanity check: the period-free spelling should match exactly one city.
cities_and_topics[cities_and_topics['city'].eq('St Andrews')]

Unnamed: 0,city,city_summary,city_url,country,forest_mountain,palaces,island_water,historical_ww2,urban
179,St Andrews,"St. Andrews may be synonymous with golf, but t...",https://www.ricksteves.com/europe/scotland/st-...,Scotland,0.197317,0.184111,0.225344,0.204712,0.188517


In [22]:
# Summary statistics of the numeric topic-score columns.
cities_and_topics.describe()

Unnamed: 0,forest_mountain,palaces,island_water,historical_ww2,urban
count,213.0,213.0,213.0,213.0,213.0
mean,0.201108,0.200944,0.20727,0.196203,0.194475
std,0.01502,0.016711,0.016226,0.013738,0.01568
min,0.164735,0.154976,0.172057,0.164026,0.152617
25%,0.193024,0.19039,0.195961,0.187509,0.185979
50%,0.199937,0.199408,0.205314,0.195469,0.195182
75%,0.209939,0.20927,0.215729,0.203686,0.20155
max,0.244448,0.258152,0.268752,0.249102,0.250471


In [24]:
# Saving the merged table is currently disabled; uncomment the line below to
# write it to data/cities_with_topic_scores.pkl.
# pickle.dump(cities_and_topics, open('data/cities_with_topic_scores.pkl', 'wb'))