# Collection of Data from ricksteves.com

## Import Necessary Libraries

In [1]:
import pickle
import time
import urllib.request
from collections import Counter

import numpy as np
import pandas as pd
import pymongo
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

from scrape import collect_urls, collect_all_data, collect_city_data, get_wiki_description

### Instantiate MongoDB Databases

In [2]:
# Connect to MongoDB with pymongo's default connection settings (no URI is passed).
mc = pymongo.MongoClient()

In [3]:
# Handle to the 'city_database' database; every collection in this notebook lives in it.
city_db = mc['city_database']

In [4]:
# One collection per kind of scraped record.
# city_collection and country_collection are handed to collect_all_data;
# wiki_collection is handed to get_wiki_description.
city_collection = city_db['city_collection']
country_collection = city_db['country_collection']
wiki_collection = city_db['wiki_collection']

### Web Scrape

In [5]:
# Start a Selenium-driven Chrome session and load the site's home page.
browser = Chrome()
url = "https://www.ricksteves.com/"
browser.get(url)

In [6]:
# Click the link inside the second item of the top navigation list (id="nav").
# find_element(By.XPATH, ...) is used because the find_element_by_* helpers were
# deprecated and later removed in Selenium 4; this form works on both.
# NOTE(review): a positional XPath breaks if the site reorders its navigation.
browser.find_element(By.XPATH, '//*[@id="nav"]/ul/li[2]/a').click()

In [7]:
# Gather the country page links from the page the browser is currently on.
# NOTE(review): '/europe/' is presumably a path filter for the links to keep —
# confirm against scrape.collect_urls.
country_urls = collect_urls(browser, '/europe/')

In [8]:
country_urls

['https://www.ricksteves.com/europe/austria',
 'https://www.ricksteves.com/europe/belgium',
 'https://www.ricksteves.com/europe/bosnia-herzegovina',
 'https://www.ricksteves.com/europe/bulgaria',
 'https://www.ricksteves.com/europe/croatia',
 'https://www.ricksteves.com/europe/czech-republic',
 'https://www.ricksteves.com/europe/denmark',
 'https://www.ricksteves.com/europe/england',
 'https://www.ricksteves.com/europe/estonia',
 'https://www.ricksteves.com/europe/finland',
 'https://www.ricksteves.com/europe/france',
 'https://www.ricksteves.com/europe/germany',
 'https://www.ricksteves.com/europe/greece',
 'https://www.ricksteves.com/europe/hungary',
 'https://www.ricksteves.com/europe/iceland',
 'https://www.ricksteves.com/europe/ireland',
 'https://www.ricksteves.com/europe/italy',
 'https://www.ricksteves.com/europe/montenegro',
 'https://www.ricksteves.com/europe/netherlands',
 'https://www.ricksteves.com/europe/norway',
 'https://www.ricksteves.com/europe/poland',
 'https://www.

In [9]:
# Drop the last two collected links.
# Caution: this rebinds country_urls, so running the cell again trims two more.
country_urls = country_urls[:-2]

In [10]:
country_urls

['https://www.ricksteves.com/europe/austria',
 'https://www.ricksteves.com/europe/belgium',
 'https://www.ricksteves.com/europe/bosnia-herzegovina',
 'https://www.ricksteves.com/europe/bulgaria',
 'https://www.ricksteves.com/europe/croatia',
 'https://www.ricksteves.com/europe/czech-republic',
 'https://www.ricksteves.com/europe/denmark',
 'https://www.ricksteves.com/europe/england',
 'https://www.ricksteves.com/europe/estonia',
 'https://www.ricksteves.com/europe/finland',
 'https://www.ricksteves.com/europe/france',
 'https://www.ricksteves.com/europe/germany',
 'https://www.ricksteves.com/europe/greece',
 'https://www.ricksteves.com/europe/hungary',
 'https://www.ricksteves.com/europe/iceland',
 'https://www.ricksteves.com/europe/ireland',
 'https://www.ricksteves.com/europe/italy',
 'https://www.ricksteves.com/europe/montenegro',
 'https://www.ricksteves.com/europe/netherlands',
 'https://www.ricksteves.com/europe/norway',
 'https://www.ricksteves.com/europe/poland',
 'https://www.

Now that we have the URLs for all the countries, let's get the URLs for each city. Let's start collecting some data. First go to the country's page.

In [11]:
# Scrape each country page in turn. collect_all_data receives both the country
# and the city collection and logs every insert it makes.
for country_url in country_urls:
    collect_all_data(browser, country_url, country_collection, city_collection)

Inserted Austria into country collection
Inserted Danube Valley, Austria into city collection
Inserted Hallstatt, Austria into city collection
Inserted Salzburg, Austria into city collection
Inserted Tirol, Austria into city collection
Inserted Vienna, Austria into city collection
Completed scrapping Austria
Inserted Belgium into country collection
Inserted Antwerp, Belgium into city collection
Inserted Bruges, Belgium into city collection
Inserted Brussels, Belgium into city collection
Inserted Ghent, Belgium into city collection
Completed scrapping Belgium
Inserted Bosnia-Herzegovina into country collection
Inserted Mostar, Bosnia-Herzegovina into city collection
Inserted Sarajevo, Bosnia-Herzegovina into city collection
Completed scrapping Bosnia-Herzegovina
Inserted Bulgaria into country collection
Completed scrapping Bulgaria
Inserted Croatia into country collection
Inserted Dalmatian Coast, Croatia into city collection
Inserted Dubrovnik, Croatia into city collection
Inserted Hva

Inserted Pisa, Italy into city collection
Inserted Pompeii & Herculaneum, Italy into city collection
Inserted Ravenna, Italy into city collection
Inserted Rome, Italy into city collection
Inserted Sicily, Italy into city collection
Inserted Siena, Italy into city collection
Inserted Sorrento, Italy into city collection
Inserted Tuscan Hill Towns, Italy into city collection
Inserted Tuscany, Italy into city collection
Inserted Venice, Italy into city collection
Completed scrapping Italy
Inserted Montenegro into country collection
Completed scrapping Montenegro
Inserted Netherlands into country collection
Inserted Amsterdam, Netherlands into city collection
Inserted Delft, Netherlands into city collection
Inserted Edam, Netherlands into city collection
Inserted Haarlem, Netherlands into city collection
Inserted The Hague, Netherlands into city collection
Completed scrapping Netherlands
Inserted Norway into country collection
Inserted Bergen, Norway into city collection
Inserted Norwegian

Take the MongoDB collection and turn it into a list of dictionaries for the countries.

In [14]:
# Read every document in the country collection into a plain list.
country_dicts = list(country_collection.find())
country_dicts

[{'_id': ObjectId('5d1106db2f3f9bdaae0a50ca'),
  'country': 'Austria',
  'country_summary': "Small, landlocked Austria offers alpine scenery, world-class museums, cobbled quaintness, and Wiener schnitzel. Unlike Germany, its industrious neighbor to the northwest, Austria is content to bask in its good living and elegant, opulent past as the former head of one of Europe's grandest empires. Austrians tend to be relaxed, gregarious people who love the outdoors as much as a good cup of coffee in a café."},
 {'_id': ObjectId('5d1106dd2f3f9bdaae0a50cb'),
  'country': 'Belgium',
  'country_summary': "Belgium falls through the cracks. Wedged between Germany, France, and the Netherlands, and famous for waffles, Smurfs, and a statue of a little boy peeing, it's no wonder it can get lost in the mix. But Belgium rewards with richer sights than you might expect — and fewer tourist crowds. You'll encounter some of Europe's finest cuisine, including the best beer, creamiest chocolates, and tastiest F

Because I scraped the same countries a few times while I figured out the bugs in my system, I want to make sure that I have each country only once. For example, I knew that there were only 31 countries, but I have 55 entries in the `country_collection`. After cleaning, I have 31 as expected.

In [21]:
len(country_dicts)

55

In [25]:
# Build the country table, remove the '_id' column, then drop rows that are
# exact repeats (the same country scraped more than once).
country_df = (
    pd.DataFrame(country_dicts)
    .drop(columns='_id')
    .drop_duplicates()
)
country_df.head()

Unnamed: 0,country,country_summary
0,Austria,"Small, landlocked Austria offers alpine scener..."
1,Belgium,Belgium falls through the cracks. Wedged betwe...
2,Bosnia-Herzegovina,Apart from the tragic way it separated from Yu...
3,Bulgaria,"Endearing, surprising Bulgaria is a rewarding ..."
4,Croatia,With thousands of miles of seafront and more t...


In [26]:
len(country_df)

31

Saving the dataframe to a pickle file for use in other notebooks.

In [29]:
#pickle.dump(country_df, open('data/countries.pkl', 'wb'))

Similar to the countries, convert `city_collection` to a list of dictionaries, drop the duplicate cities (from 232 to 213) and save to a pickle file for other notebooks.

In [15]:
# Read every document in the city collection into a plain list.
city_dicts = list(city_collection.find())
city_dicts

[{'_id': ObjectId('5d110b165c4fbe2cc8624dfd'),
  'city': 'Danube Valley',
  'country': 'Austria',
  'city_summary': "The Danube is at its romantic best just west of Vienna. Mix a cruise with a bike ride through the Danube's Wachau Valley, lined with ruined castles, beautiful abbeys (including the glorious Melk Abbey), small towns, and vineyard upon vineyard. Much of the valley has a warm fairy-tale glow, but a trip here isn't complete without the chilling contrast of a visit to the Mauthausen concentration camp memorial."},
 {'_id': ObjectId('5d110b225c4fbe2cc8624dfe'),
  'city': 'Hallstatt',
  'country': 'Austria',
  'city_summary': 'Lovable Hallstatt is a tiny town bullied onto a ledge between a selfish mountain and a swan-ruled lake, with a waterfall ripping furiously through its middle. The big draws of Hallstatt are its village and its lakeside setting. Come here to relax, nibble, wander, and paddle. Beyond lies the Salzkammergut region, a gentle land of lakes, forested mountains,

In [16]:
len(city_dicts)

232

In [27]:
# Build the city table, remove the '_id' column, then drop rows that are
# exact repeats (the same city scraped more than once).
city_df = (
    pd.DataFrame(city_dicts)
    .drop(columns='_id')
    .drop_duplicates()
)
city_df.head()

Unnamed: 0,city,city_summary,country
0,Danube Valley,The Danube is at its romantic best just west o...,Austria
1,Hallstatt,Lovable Hallstatt is a tiny town bullied onto ...,Austria
2,Salzburg,"Thanks to its charmingly preserved old town, s...",Austria
3,Tirol,Mountainous Tirol — in Austria's western panha...,Austria
4,Vienna,"Vienna is the capital of Austria, the cradle o...",Austria


In [28]:
len(city_df)

213

In [None]:
#pickle.dump(city_df, open('data/cities.pkl', 'wb'))

In [7]:
# Load the saved city table from disk. The context manager guarantees the file
# handle is closed once the table has been read.
# NOTE(review): this reads 'cities.pkl', while the commented-out dump cell in
# this notebook targets 'data/cities.pkl' — confirm which location is intended.
# Only unpickle files you produced yourself: pickle.load can run arbitrary code.
with open('cities.pkl', 'rb') as cities_file:
    city_df = pickle.load(cities_file)

# (city, country) pairs in table order.
city_list = list(zip(city_df['city'], city_df['country']))

In [8]:
city_list

[('Danube Valley', 'Austria'),
 ('Hallstatt', 'Austria'),
 ('Salzburg', 'Austria'),
 ('Tirol', 'Austria'),
 ('Vienna', 'Austria'),
 ('Antwerp', 'Belgium'),
 ('Bruges', 'Belgium'),
 ('Brussels', 'Belgium'),
 ('Ghent', 'Belgium'),
 ('Mostar', 'Bosnia-Herzegovina'),
 ('Sarajevo', 'Bosnia-Herzegovina'),
 ('Dalmatian Coast', 'Croatia'),
 ('Dubrovnik', 'Croatia'),
 ('Hvar', 'Croatia'),
 ('Istria', 'Croatia'),
 ('Korčula', 'Croatia'),
 ('Split', 'Croatia'),
 ('Zagreb', 'Croatia'),
 ('Aarhus', 'Denmark'),
 ('Copenhagen', 'Denmark'),
 ('Ærø', 'Denmark'),
 ('Bath', 'England'),
 ('Blackpool', 'England'),
 ('Brighton', 'England'),
 ('Cambridge', 'England'),
 ('Canterbury', 'England'),
 ('Cornwall', 'England'),
 ('Cotswolds', 'England'),
 ('Dover', 'England'),
 ('Durham', 'England'),
 ('Glastonbury & Wells', 'England'),
 ('Ironbridge Gorge', 'England'),
 ('Lake District', 'England'),
 ('Liverpool', 'England'),
 ('London', 'England'),
 ('Oxford', 'England'),
 ('Portsmouth', 'England'),
 ('Stonehenge

In [12]:
# Fetch a Wikipedia description for each city that is not stored yet.
# Cities already present in wiki_collection are skipped so that re-running the
# cell neither inserts duplicate documents nor repeats the long pauses.
# Assumes get_wiki_description stores the name it is given under the 'city'
# key, as the sample document from wiki_collection.find_one() shows.
for city, _country in city_list:
    if wiki_collection.find_one({'city': city}) is not None:
        continue
    get_wiki_description(browser, city, wiki_collection)
    # Pause 20-119 s between requests (randint's upper bound is exclusive).
    time.sleep(np.random.randint(20, 120))

In [14]:
wiki_collection.find_one()

{'_id': ObjectId('5d1195dc0d668bb1db943136'),
 'city': 'Vienna',
 'text': '\nVienna (/viˈɛnə/ (listen);[11][12] German: Wien [viːn] (listen)) is the federal capital, largest city and one of nine states of Austria. Vienna is Austria\'s primate city, with a population of about 1.9 million[3] (2.6 million within the metropolitan area,[6] nearly one third of the country\'s population), and its cultural, economic, and political centre. It is the 7th-largest city by population within city limits in the European Union. Until the beginning of the 20th century, it was the largest German-speaking city in the world, and before the splitting of the Austro-Hungarian Empire in World War I, the city had 2 million inhabitants.[13] Today, it has the second largest number of German speakers after Berlin.[14][15] Vienna is host to many major international organizations, including the United Nations and OPEC. The city is located in the eastern part of Austria and is close to the borders of the Czech Repub

In [15]:
# Load every stored Wikipedia document into a DataFrame ('_id' is kept).
wiki_df = pd.DataFrame(list(wiki_collection.find()))

In [16]:
wiki_df.head()

Unnamed: 0,_id,city,text
0,5d1195dc0d668bb1db943136,Vienna,\nVienna (/viˈɛnə/ (listen);[11][12] German: W...
1,5d11970b6e7463927710e427,Vienna,\nVienna (/viˈɛnə/ (listen);[11][12] German: W...
2,5d11974d6e7463927710e428,Danube Valley,\nThe Danube (/ˈdæn.juːb/ DAN-yoob; known by v...
3,5d1197626e7463927710e429,Danube Valley,\nThe Danube (/ˈdæn.juːb/ DAN-yoob; known by v...
4,5d1197936e7463927710e42a,Hallstatt,Hallstatt (German: [ˈhalʃtat]; Central Bavaria...


In [17]:
#pickle.dump(wiki_df, open('data/wiki_data.pkl', 'wb'))