In [1]:
import pandas as pd
from parse import parse_htmls
from util import read_place_desc, read_htmls_in, write_places_to_tsv, read_places
from service import PlaceService, SearchEngine
from index import preprocess, Index, TfIdfIndex

import requests
from tqdm import tqdm
import os
import time
import util

# Data collection
## 1.1. Get the list of places





In [None]:
# Collect place URLs from the Atlas Obscura listing (sorted by likes_count)
# and write them to most_popular_places.txt, one absolute URL per line.
# NOTE(review): BeautifulSoup is not imported in the visible import cell — add
# `from bs4 import BeautifulSoup` there unless it is provided elsewhere.
with open('most_popular_places.txt', 'w') as file:
    # The listing is paginated and page numbers are 1-based: pages 1..300.
    for page_number in tqdm(range(1, 301)):
        url = f'https://www.atlasobscura.com/places?page={page_number}&sort=likes_count'
        # A timeout keeps one stalled connection from hanging the whole run.
        list_page = requests.get(url, timeout=30)
        # Name the parser explicitly so the parse does not depend on which
        # optional parsers happen to be installed in the environment.
        list_soup = BeautifulSoup(list_page.text, 'html.parser')
        # Each place card is an <a> whose class attribute is exactly this string;
        # its href is a site-relative path.
        list_places = [x['href'] for x in list_soup.find_all('a', {'class': "content-card content-card-place"})]
        # The outer progress bar is enough; a nested bar per page only floods
        # the cell output with hundreds of finished bars.
        for place in list_places:
            file.write(f'https://www.atlasobscura.com{place}\n')

## 1.2. Crawl places

In [None]:
# Download every URL listed in most_popular_places.txt.
# Each page is saved as <n>.html, where n is the 1-based line number, and the
# files are grouped into directories of 18: page1/, page2/, ...
# (presumably 18 is the number of places per listing page — confirm).
# The `with` block closes the URL list even if a request fails midway.
with open('most_popular_places.txt', 'r') as f:
    for j, url in enumerate(f):

        # Every 18th URL starts a new output directory.
        if j % 18 == 0:
            dir_path = f'page{j//18+1}'
            # exist_ok=True makes the cell safe to re-run over an existing crawl.
            os.makedirs(dir_path, exist_ok=True)

        # Lines read from the file keep their trailing newline; strip it first.
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url.strip(), timeout=30)
        with open(f'{dir_path}/{j+1}.html', 'w') as file:
            file.write(response.text)

        # Throttle: wait one second after each download.
        time.sleep(1)

## 1.3. Parse pages

In [3]:
#all_htmls = read_htmls_in('./pages')

In [2]:
#%store -r htmls

In [4]:
#all_places = parse_htmls(htmls)

In [5]:
#tsv_path = write_places_to_tsv('./', all_places)

In [263]:
#%store htmls

Stored 'htmls' (list)


In [2]:
# Both loaders below read the same TSV file, so the path is named once.
places_tsv = './places.tsv'

# Unpack the two values returned by read_place_desc for that file.
ids, descriptions = read_place_desc(places_tsv)

# Create the place service, then load it from the same file.
place_service = PlaceService()
place_service.load(places_tsv)

# Create Conjunctive Index

In [3]:
# Build the index from the ids and descriptions read from places.tsv.
# The section heading calls this the conjunctive index; Index itself comes
# from the `index` module and is not shown in this notebook.
index = Index.create_from(ids, descriptions)

# Create Search Engine

In [4]:
# Combine the index with the place service into one search engine object.
# SearchEngine comes from the `service` module and is not shown here.
search_engine = SearchEngine(index, place_service)

# Run Search

In [5]:
# Run a query against the engine built on the plain Index.
# As the last expression in the cell, the returned result is rendered as the
# cell output (a table with name, desc and url columns).
search_engine.query('american museum')

Unnamed: 0,name,desc,url
6638,Thorne Miniature Rooms,In the depths of the Museum of the Art Institu...,https://www.atlasobscura.com/places/thorne-min...
2920,Grace Hopper&#39;s Bug,"On September 9, 1947, Harvard’s Mark II Aiken ...",https://www.atlasobscura.com/places/grace-hopp...
6306,Biosphere of Montreal,As their contribution to Montreal’s 1967 World...,https://www.atlasobscura.com/places/biosphere-...
5999,The Witch House of Salem,The Salem witchcraft trials took place between...,https://www.atlasobscura.com/places/witch-hous...
4745,Susanna Dickinson Museum,According to a plaque outside her last survivi...,https://www.atlasobscura.com/places/susanna-di...
...,...,...,...
576,Truth or Consequences,Located along the Rio Grande in middle of the ...,https://www.atlasobscura.com/places/truth-or-c...
718,"Basilica of Saint Lawrence, Asheville",The turn of the 20th century was a time of pro...,https://www.atlasobscura.com/places/basilica-o...
359,Stained Glass at Navy Pier,The very first American exhibit dedicated sole...,https://www.atlasobscura.com/places/stained-gl...
516,Abandoned Comet Diner,Just down the street from Mark Twain’s house a...,https://www.atlasobscura.com/places/abandoned-...


# Create TF-IDF Index

In [9]:
# Build a TfIdfIndex from the same ids and descriptions used for the plain Index.
# TfIdfIndex comes from the `index` module and is not shown in this notebook.
tf_idf_index = TfIdfIndex.create_from(ids, descriptions)

# Create TF-IDF Search Engine

In [10]:
# Second engine: same place service, but backed by the TF-IDF index.
tf_idf_search_engine = SearchEngine(tf_idf_index, place_service)

In [12]:
# Ranked query on the TF-IDF engine; the rendered table adds a similarity column.
# NOTE(review): 100 is presumably k, the number of results to return — confirm
# against SearchEngine.query_top_k.
tf_idf_search_engine.query_top_k("american museum", 100)

Unnamed: 0,name,desc,url,similarity
343,Gillette Castle State Park,"High above the Connecticut River, Gillette Cas...",https://www.atlasobscura.com/places/gillettes-...,1.000000
456,Mildred E. Mathias Botanical Garden,"Founded in 1929, the botanical garden at Unive...",https://www.atlasobscura.com/places/mildred-e-...,1.000000
7152,Governors Island,Governors Island has quite a lot of history to...,https://www.atlasobscura.com/places/governor-s...,1.000000
2940,Dyckman Farmhouse,"Built sometime around 1785, the Dyckman Farmho...",https://www.atlasobscura.com/places/dyckman-fa...,1.000000
3661,Newberry Library Postcard Collection,"Among its many treasures, Chicago’s independen...",https://www.atlasobscura.com/places/newberry-l...,1.000000
...,...,...,...,...
5517,Niles Essanay Silent Film Museum,It was Spring in San Francisco. One quiet Apri...,https://www.atlasobscura.com/places/niles-essa...,0.960638
3788,Hidden Cave at Grimes Point,"There is a cave near Fallon, Nevada that’s bas...",https://www.atlasobscura.com/places/hidden-cav...,0.960638
3328,Navajo Code Talkers Tribute,The secret communications work of the Marine C...,https://www.atlasobscura.com/places/navajo-cod...,0.960638
1616,The Natural Bridge,Often cited as having a place among the great ...,https://www.atlasobscura.com/places/the-natura...,0.960638


## Custom Score

In [10]:
# Query with the custom scoring method, on the engine built from the plain Index
# (not the TF-IDF one). The rendered table has address and similarity columns.
# NOTE(review): the meaning of the positional arguments 10 and True is not
# visible here — presumably a result count and a flag; confirm against
# SearchEngine.query_custom and consider passing them by keyword.
search_engine.query_custom('museum', 10, True)

Unnamed: 0,name,desc,address,similarity
5427,Palazzo Naiadi Roman Baths,Amid the construction of the luxury Palazzo Na...,"Piazza della Repubblica, 47, Rome, Italy",-1.901117
5783,Mirabilia Gallery,"This gallery, founded in 2016, is nestled betw...","14 Via di S. Teodoro, Rome, 00186, Italy",-2.328042
2093,Mamertine Prison,While jails and prisons were not a common feat...,"Via Clivo Argentario, 1, Rome, 00186, Italy",-2.347713
4982,Rome&#39;s Gladiator School,Tucked away on a side-street of the Appian Way...,"Via Appia Antica 18, Rome, 00179, Italy",-2.535834
2781,Vicus Caprarius,The Trevi Fountain is one of Rome’s most notab...,"25 Vicolo del Puttarello, Rome, 00187, Italy",-2.614343
2640,Keats-Shelley Memorial House,“This Grave / contains all that was Mortal / o...,"Piazza di Spagna 26, Roma, Rome, 00187, Italy",-2.69438
2592,Mussolini&#39;s Balcony,"Benito Mussolini, known for being a powerful o...","Via del Plebiscito, Rome, Italy",-2.716135
281,Museum of the Holy Souls in Purgatory,Located in the back of the Chiesa del Sacro Cu...,"Lungotevere Prati 12, Rome, 00193, Italy",-3.650568
5296,Centrale Montemartini,Industry is the backdrop for divinity at the C...,"Via Ostiense 106, Rome, 00154, Italy",-3.792795
3692,The Criminology Museum,Operated by Italy’s federal Prison Administrat...,"Via del Gonfalone 29, Rome, 00186, Italy",-3.897203
