In [1]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

import util
from index import preprocess, Index, TfIdfIndex
from parse import parse_htmls
from service import PlaceService, SearchEngine
from util import read_place_desc, read_htmls_in, write_places_to_tsv, read_places

# Data collection
##  1.1. Get the list of places





In [None]:
# Walk the first 300 "most liked" listing pages of Atlas Obscura and write the
# absolute URL of every place card found, one per line, to
# 'most_popular_places.txt'.
with open('most_popular_places.txt', 'w') as file:
    for page_number in tqdm(range(1, 301)):
        url = 'https://www.atlasobscura.com/places?page=' + str(page_number) + '&sort=likes_count'
        # A timeout keeps one stalled connection from hanging the whole crawl.
        list_page = requests.get(url, timeout=30)
        # Fail loudly on an HTTP error: an error page would most likely contain
        # no place cards, so the listing page would otherwise be lost silently.
        list_page.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and bs4 does not warn).
        list_soup = BeautifulSoup(list_page.text, 'html.parser')
        list_places = [x['href'] for x in list_soup.find_all('a', {'class': "content-card content-card-place"})]
        # No inner progress bar: one per listing page would flood the output.
        for place in list_places:
            # The hrefs are site-relative, so prefix the host.
            file.write('https://www.atlasobscura.com' + str(place))
            file.write('\n')

## 1.2. Crawl places

In [None]:
# Download every place page listed in 'most_popular_places.txt' and store the
# raw HTML as '<dir>/<n>.html', where n is the 1-based line number of the URL.
# The `with` block closes the URL list even if a request fails part-way.
with open('most_popular_places.txt', 'r') as url_file:
    for j, url in enumerate(url_file):
        # Group the downloads 18 per directory (page1/ holds 1.html-18.html,
        # page2/ the next 18, ...) - presumably the size of one listing page.
        dir_path = f'page{j//18+1}'
        os.makedirs(dir_path, exist_ok=True)

        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url.strip(), timeout=30)
        # Write UTF-8 explicitly: the pages contain non-ASCII text, and the
        # platform default encoding (e.g. cp1252) may be unable to encode it.
        # NOTE(review): confirm that the reader of these files also uses UTF-8.
        with open(f'{dir_path}/{j+1}.html', 'w', encoding='utf-8') as file:
            file.write(response.text)

        # Throttle to roughly one request per second.
        time.sleep(1)

## 1.3. Parse pages

In [None]:
#all_htmls = read_htmls_in('./pages')

In [None]:
#%store -r htmls

In [None]:
#all_places = parse_htmls(htmls)

In [None]:
#tsv_path = write_places_to_tsv('./', all_places)

In [None]:
#%store htmls

In [2]:
# Both loaders below read the same TSV of parsed places, so name the path once.
places_tsv = './places.tsv'

# `ids` and `descriptions` are the inputs used to build the indexes further down.
ids, descriptions = read_place_desc(places_tsv)

# The place service is handed to the search engines together with an index.
place_service = PlaceService()
place_service.load(places_tsv)

# Create Conjunctive Index

## Create from saved `.tsv` file.

In [3]:
# The next cell rebinds `index` from ./resources/index.pickle, so under
# "Run All" an index built here would be discarded immediately. Reuse the saved
# index when it exists and only build from the descriptions when it does not.
# Delete the pickle to force a rebuild (e.g. after places.tsv has changed).
index_pickle = './resources/index.pickle'
if os.path.exists(index_pickle):
    index = Index.load_from(index_pickle)
else:
    index = Index.create_from(ids, descriptions)

## Load saved index

In [4]:
# Rebind `index` to the copy saved at ./resources/index.pickle, replacing
# whatever the previous cell assigned.
# NOTE(review): the .pickle suffix suggests the file is unpickled; unpickling
# can execute arbitrary code, so only load a file you created yourself -
# confirm in Index.load_from.
index = Index.load_from('./resources/index.pickle')

# Create Search Engine

In [5]:
# Combine the conjunctive index with the place service into a search engine.
search_engine = SearchEngine(index, place_service)

# Run Search

In [6]:
# Query the conjunctive engine; as the last expression of the cell, the
# returned table is rendered as the cell output.
# NOTE(review): presumably only places matching every query term are
# returned - confirm in SearchEngine.query.
search_engine.query('american museum')

Unnamed: 0,name,desc,url
1804,Uncommon Objects,Like an elegant antiques mall gone horribly wr...,https://www.atlasobscura.com/places/uncommon-o...
2458,Tamástslikt Cultural Institute,"The Tamástslikt Cultural Institute, situated o...",https://www.atlasobscura.com/places/tamastslik...
349,Mitsitam Native Foods Cafe,"A visit to the National Mall in Washington, D....",https://www.atlasobscura.com/places/mitsitam-n...
3701,Museum of Chinese in America,The Museum of Chinese in America is nestled—al...,https://www.atlasobscura.com/places/museum-of-...
1087,Museum of Mourning Art,Mourning and personal response to death are un...,https://www.atlasobscura.com/places/museum-of-...
...,...,...,...
6473,Museum of the American Cocktail,They say that New Orleans is the home of the f...,https://www.atlasobscura.com/places/museum-ame...
1934,Unto These Hills Cherokee Theatre,"Since 1950, members of the local Cherokee trib...",https://www.atlasobscura.com/places/unto-these...
984,Theodore Roosevelt Birthplace Museum,Behind an otherwise innocuous (if immaculately...,https://www.atlasobscura.com/places/theodore-r...
620,Canyons of the Ancients,Ripe for quiet reflection and simply awe-inspi...,https://www.atlasobscura.com/places/canyons-of...


# Create TF-IDF Index

## Create from saved `.tsv` file

In [7]:
# The next cell rebinds `tf_idf_index` from ./resources/tf_idf_index.pickle, so
# under "Run All" an index built here would be discarded immediately. Reuse the
# saved index when it exists and only build from the descriptions when it does
# not. Delete the pickle to force a rebuild (e.g. after places.tsv has changed).
tf_idf_index_pickle = './resources/tf_idf_index.pickle'
if os.path.exists(tf_idf_index_pickle):
    tf_idf_index = TfIdfIndex.load_from(tf_idf_index_pickle)
else:
    tf_idf_index = TfIdfIndex.create_from(ids, descriptions)

## Load saved index

In [8]:
# Rebind `tf_idf_index` to the copy saved at ./resources/tf_idf_index.pickle,
# replacing whatever the previous cell assigned.
# NOTE(review): the .pickle suffix suggests the file is unpickled; unpickling
# can execute arbitrary code, so only load a file you created yourself -
# confirm in TfIdfIndex.load_from.
tf_idf_index = TfIdfIndex.load_from('./resources/tf_idf_index.pickle')

---

# Create TF-IDF Search Engine

In [9]:
# Same SearchEngine class as before, now backed by the tf-idf index; the
# place service is shared with the conjunctive engine.
tf_idf_search_engine = SearchEngine(tf_idf_index, place_service)

In [10]:
# Ask the tf-idf engine for the 10 best matches; the rendered table carries a
# `similarity` column alongside each place's coordinates.
tf_idf_search_engine.query_top_k("american museum", 10)

Unnamed: 0,name,desc,url,similarity,lat,lon
3926,Smithsonian Sushi Collection,The American History Museum has collected an a...,https://www.atlasobscura.com/places/smithsonia...,0.999944,38.890729,-77.030003
6489,Mercer Museum and Fonthill Castle,"Henry Chapman Mercer, a renowned archaeologist...",https://www.atlasobscura.com/places/fonthill,0.998837,40.306903,-75.128278
2458,Tamástslikt Cultural Institute,"The Tamástslikt Cultural Institute, situated o...",https://www.atlasobscura.com/places/tamastslik...,0.998837,45.65371,-118.663375
4697,Zippo/Case Museum,Invented in and still proudly manufactured in ...,https://www.atlasobscura.com/places/zippo-case...,0.998837,41.943399,-78.650932
238,Off the Rez Cafe,The U.S. government’s forced relocation of Nat...,https://www.atlasobscura.com/places/off-the-re...,0.998837,47.660263,-122.311556
6238,Oak Ridge &quot;The Secret City&quot;,The city of Oak Ridge was established by the U...,https://www.atlasobscura.com/places/the-secret...,0.998837,36.00888,-84.257149
5429,Old Time Wooden Nickel Company,"The adage goes, “don’t take any wooden nickels...",https://www.atlasobscura.com/places/old-time-w...,0.994973,29.456517,-98.456341
5068,Self-Taught Genius Gallery,"In 2017, the American Folk Art Museum in Manha...",https://www.atlasobscura.com/places/self-taugh...,0.99231,40.740928,-73.93338
5517,Niles Essanay Silent Film Museum,It was Spring in San Francisco. One quiet Apri...,https://www.atlasobscura.com/places/niles-essa...,0.988467,37.577097,-121.980132
343,Gillette Castle State Park,"High above the Connecticut River, Gillette Cas...",https://www.atlasobscura.com/places/gillettes-...,0.988467,41.422983,-72.428414


## Own Score
For our own score, we decided to give the users three ways to rank the places:

1. Proximity
1. Popularity
1. Proximity & Popularity

### 1. Proximity
The proximity score is based on the user's current location. Places that are closer to the user's location are ranked higher than places that are further away. The user's location is obtained by fetching the currently used IP address and looking up the latitude and longitude associated with it. Although this is not exact and is prone to manipulation (a VPN could be used to "change" the location), it spares us from having to clean and parse additional user input.

The proximity score for a place is calculated by subtracting the distance between the place and the user's location from the maximum possible distance, and normalising the result by the maximum possible distance. Given a distance function $dist(p_1, p_2)$ that returns the distance between two positions on the surface of the earth, the proximity score is defined as:

$$proximity(place) = \frac{max\_distance - dist(place, user)}{max\_distance}$$

The $max\_distance$ is simply the earth's circumference divided by two, as this is the maximum possible distance between any two points on the surface of the earth.

The reason for subtracting the distance between the user and the place from the maximum distance is so that scores closer to 1 correspond to a higher similarity and scores closer to 0 to lower similarity. More formally:

$$\lim_{dist(place, user) \to 0} proximity(place) = 1$$

$$\lim_{dist(place, user) \to max\_distance} proximity(place) = 0$$

In [22]:
# Rank the matches for 'museum' by proximity only and keep just the columns
# needed for display. Per the section text, the score is derived from the
# location of the current IP address, so the output depends on where the
# notebook is run.
search_engine.query_custom('museum', top_k=10, proximity=True, popularity=False)[["name","desc","address","similarity"]]

Unnamed: 0,name,desc,address,similarity
5427,Palazzo Naiadi Roman Baths,Amid the construction of the luxury Palazzo Na...,"Piazza della Repubblica, 47, Rome, Italy",0.999905
5783,Mirabilia Gallery,"This gallery, founded in 2016, is nestled betw...","14 Via di S. Teodoro, Rome, 00186, Italy",0.999884
2093,Mamertine Prison,While jails and prisons were not a common feat...,"Via Clivo Argentario, 1, Rome, 00186, Italy",0.999883
4982,Rome&#39;s Gladiator School,Tucked away on a side-street of the Appian Way...,"Via Appia Antica 18, Rome, 00179, Italy",0.999873
2781,Vicus Caprarius,The Trevi Fountain is one of Rome’s most notab...,"25 Vicolo del Puttarello, Rome, 00187, Italy",0.99987
2640,Keats-Shelley Memorial House,“This Grave / contains all that was Mortal / o...,"Piazza di Spagna 26, Roma, Rome, 00187, Italy",0.999866
2592,Mussolini&#39;s Balcony,"Benito Mussolini, known for being a powerful o...","Via del Plebiscito, Rome, Italy",0.999864
281,Museum of the Holy Souls in Purgatory,Located in the back of the Chiesa del Sacro Cu...,"Lungotevere Prati 12, Rome, 00193, Italy",0.999818
5296,Centrale Montemartini,Industry is the backdrop for divinity at the C...,"Via Ostiense 106, Rome, 00154, Italy",0.999811
3692,The Criminology Museum,Operated by Italy’s federal Prison Administrat...,"Via del Gonfalone 29, Rome, 00186, Italy",0.999806


### 2. Popularity
The popularity score ranks places by popularity, with more popular places being above less popular places. The popularity of a place is calculated using the number of people that have visited it and the number of people that want to go. We chose these two variables because the very meaning of popularity is that many people are interested in a place. If many people want to go to a place or have visited it, it therefore means that it is popular.

For each place, we take the number of people who have visited it as a share of the total number of visits across all places, and the number of people who want to visit it as a share of the total number of "want to go" marks across all places. The popularity score is the equally weighted average of these two shares:

$$popularity = \frac{1}{2} \times \left(\frac{num\_people\_went}{total\_people\_went} + \frac{num\_people\_want}{total\_people\_want}\right)$$



In [21]:
# Rank the matches for "museum" by popularity only (proximity switched off)
# and keep just the columns needed for display.
search_engine.query_custom("museum", top_k=10, proximity=False, popularity=True)[["name","desc","address","similarity"]]

Unnamed: 0,name,desc,address,similarity
2000,Mütter Museum,Located inside the headquarters of the College...,"19 South 22nd Street, Philadelphia, Pennsylvan...",0.005018
6820,Museum of Pop Culture,"In Seattle, where art seems to spring from the...","325 5th Avenue North, Seattle, Washington, 981...",0.004714
0,City Hall Station,The first New York City subway was built and o...,"31 Centre St, New York, New York, 10007, Unite...",0.004609
614,Natural History Museum of London,"Established in 1881, the Natural History Museu...","Cromwell Road, London, England, SW7 2DD, Unite...",0.0045
14,The Evolution Store,Evolution stands out among the clothing stores...,"687 Broadway, New York, New York, 10012, Unite...",0.004283
5999,The Witch House of Salem,The Salem witchcraft trials took place between...,"310 1/2 Essex Street, Salem, Massachusetts, 01...",0.003955
825,Casa Batlló,"One of Gaudí’s most iconic works, Casa Batlló ...","43 Passeig de Gràcia, Barcelona, 08007, Spain",0.003873
5011,Park Güell,"At Park Güell, stone, tile, plants, and Medite...","s/n Carrer d'Olot, Barcelona, 08024, Spain",0.003774
4411,Centre Pompidou,"Located in Paris’ 4th arrondissement, Centre G...","Centre Georges Pompidou, Paris, 75004, France",0.003709
1010,La Brea Tar Pits Dragonfly Fossils,The landmarked La Brea Tar Pits and Museum is ...,"La Brea Tar Pits and Museum, 5801 Wilshire Bou...",0.003677


### 3. Proximity & Popularity
For the combination of proximity and popularity, the two scores are simply multiplied together.

$$proximity \times popularity$$

In [20]:
# Rank the matches for "museum" with both criteria enabled; per the section
# text the proximity and popularity scores are multiplied together.
search_engine.query_custom("museum" , top_k=10, proximity=True, popularity=True)[["name","desc","address","similarity"]]

Unnamed: 0,name,desc,address,similarity
614,Natural History Museum of London,"Established in 1881, the Natural History Museu...","Cromwell Road, London, England, SW7 2DD, Unite...",0.004177
825,Casa Batlló,"One of Gaudí’s most iconic works, Casa Batlló ...","43 Passeig de Gràcia, Barcelona, 08007, Spain",0.003707
5011,Park Güell,"At Park Güell, stone, tile, plants, and Medite...","s/n Carrer d'Olot, Barcelona, 08024, Spain",0.003611
4411,Centre Pompidou,"Located in Paris’ 4th arrondissement, Centre G...","Centre Georges Pompidou, Paris, 75004, France",0.003504
2000,Mütter Museum,Located inside the headquarters of the College...,"19 South 22nd Street, Philadelphia, Pennsylvan...",0.003255
0,City Hall Station,The first New York City subway was built and o...,"31 Centre St, New York, New York, 10007, Unite...",0.00302
14,The Evolution Store,Evolution stands out among the clothing stores...,"687 Broadway, New York, New York, 10012, Unite...",0.002807
5999,The Witch House of Salem,The Salem witchcraft trials took place between...,"310 1/2 Essex Street, Salem, Massachusetts, 01...",0.002655
7008,221b Baker Street,Beeton’s Christmas Annual was a hugely popular...,"237 Baker Street, Devon, London, England, NW1 ...",0.002629
1016,Pergamon Museum,"Situated on Berlin’s Museum Island, the Pergam...","Berlin, 10178, Germany",0.00262


In [25]:
import plotly.express as px

# All four rankings use the same query and cut-off so that the maps drawn
# from them are directly comparable.
query_text = "american museum"
top_k = 10

# tf-idf ranking. The name `d_cusin` is kept because a later cell plots it.
d_cusin = tf_idf_search_engine.query_top_k(query_text, top_k)

# Custom rankings, in call order: proximity only, popularity only, both.
d_proximity, d_popularity, d_combination = [
    search_engine.query_custom(query_text, top_k=top_k, proximity=use_proximity, popularity=use_popularity)
    for use_proximity, use_popularity in [(True, False), (False, True), (True, True)]
]


In [30]:
# Map of the tf-idf results: one marker per place, with colour and size both
# encoding the `similarity` score.
fig = px.scatter_mapbox(
    d_cusin,  # tf-idf results for the query
    lat="lat",
    lon="lon",
    center={"lat": 40.77, "lon": -73.96},  # where the map will be centered
    width=1000,
    height=600,
    color="similarity",
    size="similarity",
    zoom=0.5,
    hover_data=["name", "url"],  # shown when hovering over a marker
)
# NOTE(review): the Stamen-hosted tiles behind "stamen-toner" moved to Stadia
# Maps in 2023; if the basemap renders blank, try "open-street-map" or
# "carto-positron".
fig.update_layout(
    title={
        # The title previously read "cousin similarity"; the metric is cosine.
        'text': "Maps visualization of the most relevant place according to cosine similarity",
        'y': 0.95,
        'x': 0.45,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    mapbox_style="stamen-toner",
)
fig.show()

In [32]:
# Map of the popularity-ranked results: colour and size both encode the
# `similarity` column.
map_center = dict(lat=40.77, lon=-73.96)
title_layout = dict(
    text="Maps visualization of the most relevant place according to popularity",
    y=0.95,
    x=0.45,
    xanchor='center',
    yanchor='top',
)

fig = px.scatter_mapbox(
    d_popularity,
    lat="lat", lon="lon",
    center=map_center,
    width=1000, height=600,
    color="similarity", size="similarity",
    zoom=0.5,
    hover_data=["name"],  # shown when hovering over a marker
)
# Title and basemap are set in a single layout update.
fig.update_layout(title=title_layout, mapbox_style="stamen-toner")
fig.show()

In [34]:
# Map of the proximity-ranked results: colour and size both encode the
# `similarity` column.
map_center = dict(lat=40.77, lon=-73.96)
title_layout = dict(
    text="Maps visualization of the most relevant place according to proximity",
    y=0.95,
    x=0.45,
    xanchor='center',
    yanchor='top',
)

fig = px.scatter_mapbox(
    d_proximity,
    lat="lat", lon="lon",
    center=map_center,
    width=1000, height=600,
    color="similarity", size="similarity",
    zoom=0.5,
    hover_data=["name"],  # shown when hovering over a marker
)
# Title and basemap are set in a single layout update.
fig.update_layout(title=title_layout, mapbox_style="stamen-toner")
fig.show()

In [37]:
# Map of the results ranked by proximity and popularity combined: colour and
# size both encode the `similarity` column.
map_center = dict(lat=40.77, lon=-73.96)
title_layout = dict(
    text="Maps visualization of the most relevant place according to proximity and popularity ",
    y=0.95,
    x=0.45,
    xanchor='center',
    yanchor='top',
)

fig = px.scatter_mapbox(
    d_combination,
    lat="lat", lon="lon",
    center=map_center,
    width=1000, height=600,
    color="similarity", size="similarity",
    zoom=0.5,
    hover_data=["name"],  # shown when hovering over a marker
)
# Title and basemap are set in a single layout update.
fig.update_layout(title=title_layout, mapbox_style="stamen-toner")
fig.show()