In [1]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

import util
from index import preprocess, Index, TfIdfIndex
from parse import parse_htmls
from service import PlaceService, SearchEngine
from util import read_place_desc, read_htmls_in, write_places_to_tsv, read_places

# 1. Data collection
## 1.1. Get the list of places





In [None]:
# Collect the URLs of the most-liked places from the paginated Atlas Obscura
# listing and save them, one per line, to most_popular_places.txt.
# BeautifulSoup comes from the bs4 package.
BASE_URL = 'https://www.atlasobscura.com'
N_LIST_PAGES = 300       # number of listing pages to walk
REQUEST_TIMEOUT = 30     # seconds; keeps a stalled connection from hanging the crawl

with open('most_popular_places.txt', 'w', encoding='utf-8') as file:
    for page_number in tqdm(range(1, N_LIST_PAGES + 1), desc='listing pages'):
        url = f'{BASE_URL}/places?page={page_number}&sort=likes_count'
        list_page = requests.get(url, timeout=REQUEST_TIMEOUT)
        if not list_page.ok:
            # Report the failure instead of silently contributing no URLs.
            tqdm.write(f'Skipping listing page {page_number}: HTTP {list_page.status_code}')
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        list_soup = BeautifulSoup(list_page.text, 'html.parser')
        place_links = list_soup.find_all('a', {'class': 'content-card content-card-place'})

        # A plain loop: a nested progress bar per listing page floods the output.
        for link in place_links:
            file.write(f"{BASE_URL}{link['href']}\n")

## 1.2. Crawl places

In [None]:
# Download every place page listed in most_popular_places.txt.
# Each page is saved as page<N>/<k>.html, where k is the 1-based position of
# the URL in the list and every page<N> directory holds 18 consecutive places.
PLACES_PER_DIR = 18      # presumably the number of places per listing page - TODO confirm
REQUEST_TIMEOUT = 30     # seconds; keeps a stalled connection from hanging the crawl
CRAWL_DELAY = 1          # seconds to pause between requests

# Read the list inside a `with` block so the file handle is always closed.
# Blank lines are dropped because requests cannot fetch an empty URL.
with open('most_popular_places.txt', 'r', encoding='utf-8') as url_file:
    place_urls = [line.strip() for line in url_file if line.strip()]

for j, url in enumerate(tqdm(place_urls, desc='places')):
    dir_path = f'page{j // PLACES_PER_DIR + 1}'
    os.makedirs(dir_path, exist_ok=True)

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as error:
        # Keep crawling, but report the failure so the page can be retried
        # rather than storing an error page as if it were a place.
        tqdm.write(f'Failed to fetch {url}: {error}')
    else:
        # Explicit UTF-8: the platform default encoding may not be able to
        # represent every character in the downloaded HTML.
        with open(f'{dir_path}/{j+1}.html', 'w', encoding='utf-8') as file:
            file.write(response.text)

    time.sleep(CRAWL_DELAY)

## 1.3. Parse pages

In [3]:
#all_htmls = read_htmls_in('./pages')

In [2]:
#%store -r htmls

In [4]:
#all_places = parse_htmls(htmls)

In [5]:
#tsv_path = write_places_to_tsv('./', all_places)

In [263]:
#%store htmls

Stored 'htmls' (list)


In [2]:
# Read the place ids and descriptions, then load the same TSV file into the
# place service that the search engines use.
PLACES_TSV = './places.tsv'

ids, descriptions = read_place_desc(PLACES_TSV)

place_service = PlaceService()
place_service.load(PLACES_TSV)

# Create Conjunctive Index

## Create from saved `.tsv` file.

In [None]:
# Build the conjunctive index from the ids and descriptions read from places.tsv.
# NOTE(review): the next code cell reassigns `index` from a saved file, so on a
# full top-to-bottom run this result is overwritten.
index = Index.create_from(ids, descriptions)

## Load saved index

In [4]:
# Load a previously saved index instead of rebuilding it.
# NOTE(review): the path suggests a pickle file - only load pickles that come
# from a trusted source, since unpickling can execute arbitrary code.
index = Index.load_from('./resources/index.pickle')

# Create Search Engine

In [5]:
# Wire the index and the place service into a search engine.
search_engine = SearchEngine(index, place_service)

# Run Search

In [10]:
# Query the engine built on `index`; the cell output is the table of matching places.
search_engine.query('american museum')

Unnamed: 0,name,desc,url
1804,Uncommon Objects,Like an elegant antiques mall gone horribly wr...,https://www.atlasobscura.com/places/uncommon-o...
2458,Tamástslikt Cultural Institute,"The Tamástslikt Cultural Institute, situated o...",https://www.atlasobscura.com/places/tamastslik...
349,Mitsitam Native Foods Cafe,"A visit to the National Mall in Washington, D....",https://www.atlasobscura.com/places/mitsitam-n...
3701,Museum of Chinese in America,The Museum of Chinese in America is nestled—al...,https://www.atlasobscura.com/places/museum-of-...
1087,Museum of Mourning Art,Mourning and personal response to death are un...,https://www.atlasobscura.com/places/museum-of-...
...,...,...,...
6473,Museum of the American Cocktail,They say that New Orleans is the home of the f...,https://www.atlasobscura.com/places/museum-ame...
1934,Unto These Hills Cherokee Theatre,"Since 1950, members of the local Cherokee trib...",https://www.atlasobscura.com/places/unto-these...
984,Theodore Roosevelt Birthplace Museum,Behind an otherwise innocuous (if immaculately...,https://www.atlasobscura.com/places/theodore-r...
620,Canyons of the Ancients,Ripe for quiet reflection and simply awe-inspi...,https://www.atlasobscura.com/places/canyons-of...


# Create TF-IDF Index

## Create from saved `.tsv` file

In [None]:
# Build the TF-IDF index from the same ids and descriptions.
# NOTE(review): the next code cell reassigns `tf_idf_index` from a saved file,
# so on a full top-to-bottom run this result is overwritten.
tf_idf_index = TfIdfIndex.create_from(ids, descriptions)

## Load saved index

In [12]:
# Load a previously saved TF-IDF index instead of rebuilding it.
# NOTE(review): the path suggests a pickle file - only load pickles that come
# from a trusted source, since unpickling can execute arbitrary code.
tf_idf_index = TfIdfIndex.load_from('./resources/tf_idf_index.pickle')

---

# Create TF-IDF Search Engine

In [13]:
# Same SearchEngine class as the conjunctive engine, here backed by the TF-IDF index.
tf_idf_search_engine = SearchEngine(tf_idf_index, place_service)

In [14]:
# Ranked query on the TF-IDF engine; the output carries a `similarity` column.
# NOTE(review): the second argument (10) presumably is k, the number of results
# to return - it matches the 10 rows shown in the output.
tf_idf_search_engine.query_top_k("american museum", 10)

Unnamed: 0,name,desc,url,similarity
3926,Smithsonian Sushi Collection,The American History Museum has collected an a...,https://www.atlasobscura.com/places/smithsonia...,0.999944
6489,Mercer Museum and Fonthill Castle,"Henry Chapman Mercer, a renowned archaeologist...",https://www.atlasobscura.com/places/fonthill,0.998837
2458,Tamástslikt Cultural Institute,"The Tamástslikt Cultural Institute, situated o...",https://www.atlasobscura.com/places/tamastslik...,0.998837
4697,Zippo/Case Museum,Invented in and still proudly manufactured in ...,https://www.atlasobscura.com/places/zippo-case...,0.998837
238,Off the Rez Cafe,The U.S. government’s forced relocation of Nat...,https://www.atlasobscura.com/places/off-the-re...,0.998837
6238,Oak Ridge &quot;The Secret City&quot;,The city of Oak Ridge was established by the U...,https://www.atlasobscura.com/places/the-secret...,0.998837
5429,Old Time Wooden Nickel Company,"The adage goes, “don’t take any wooden nickels...",https://www.atlasobscura.com/places/old-time-w...,0.994973
5068,Self-Taught Genius Gallery,"In 2017, the American Folk Art Museum in Manha...",https://www.atlasobscura.com/places/self-taugh...,0.99231
5517,Niles Essanay Silent Film Museum,It was Spring in San Francisco. One quiet Apri...,https://www.atlasobscura.com/places/niles-essa...,0.988467
343,Gillette Castle State Park,"High above the Connecticut River, Gillette Cas...",https://www.atlasobscura.com/places/gillettes-...,0.988467


## Custom Score

In [7]:
# Custom-score query. Note that this runs on `search_engine` (built on `index`),
# not on the TF-IDF engine.
# NOTE(review): 10 presumably caps the number of results (10 rows are shown in
# the output); the meaning of the trailing True is not visible in this
# notebook - TODO confirm against SearchEngine.query_custom.
search_engine.query_custom('museum', 10, True)

Unnamed: 0,name,desc,address,similarity
5427,Palazzo Naiadi Roman Baths,Amid the construction of the luxury Palazzo Na...,"Piazza della Repubblica, 47, Rome, Italy",0.999905
5783,Mirabilia Gallery,"This gallery, founded in 2016, is nestled betw...","14 Via di S. Teodoro, Rome, 00186, Italy",0.999884
2093,Mamertine Prison,While jails and prisons were not a common feat...,"Via Clivo Argentario, 1, Rome, 00186, Italy",0.999883
4982,Rome&#39;s Gladiator School,Tucked away on a side-street of the Appian Way...,"Via Appia Antica 18, Rome, 00179, Italy",0.999873
2781,Vicus Caprarius,The Trevi Fountain is one of Rome’s most notab...,"25 Vicolo del Puttarello, Rome, 00187, Italy",0.99987
2640,Keats-Shelley Memorial House,“This Grave / contains all that was Mortal / o...,"Piazza di Spagna 26, Roma, Rome, 00187, Italy",0.999866
2592,Mussolini&#39;s Balcony,"Benito Mussolini, known for being a powerful o...","Via del Plebiscito, Rome, Italy",0.999864
281,Museum of the Holy Souls in Purgatory,Located in the back of the Chiesa del Sacro Cu...,"Lungotevere Prati 12, Rome, 00193, Italy",0.999818
5296,Centrale Montemartini,Industry is the backdrop for divinity at the C...,"Via Ostiense 106, Rome, 00154, Italy",0.999811
3692,The Criminology Museum,Operated by Italy’s federal Prison Administrat...,"Via del Gonfalone 29, Rome, 00186, Italy",0.999806
