In [2]:
import nltk
import io
from nltk.stem.porter import *
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter, defaultdict
import re
from scipy.spatial.distance import cdist
import pickle

regions = ['africa',
           'antarctica',
           'australasia',
           'caribbean',
           'central_america',
           'central_asia',
           'europe',
           'indian_subcontinent',
           'middle_east',
           'north_america',
           'north_east asia',
           'pacific',
           'south_america',
           'south_east_asia',
           'north_east_asia']

In [3]:
import sys
print(sys.executable)

/Users/NoahKaplan/miniconda3/bin/python


In [4]:
data = []
with io.open('data/FlatCorpus.txt', encoding='utf-8-sig') as f:
    for line in f:
        title, text = line.split(':  ')
        region = ''
        for r in regions:
            if title.startswith(r):
                region = ' '.join(map(lambda x: x.capitalize(), r.split('_')))
                break
        split_title = title[len(region) + 1:].split('_')
        entry_type = split_title[-1].split('.')[0]
        city = ' '.join(map(lambda x: x.capitalize(), split_title[:-1]))
        row = [region, city, entry_type, text]
        data.append(row)

In [5]:
print(data[1165])

['North America', u'Atlantic City', u'activities', u"Activities  Atlantic City is not the place to visit if you're into the outdoors. The most burning of calories you'll achive will be getting out of bed and slouching in front of a slot machine. The city's rules and regulations conspire to keep it this way. While the Boardwalk is a good spot, in theory, for cycling, bikes are only allowed between the hours of 6-10 am.  If you want some exercise, you're better off leaving Atlantic City and heading for the peaceful Pine Barrens, where there's no shortage of hiking in the huge pine forest. In Egg Harbor, you can rent equipment to canoe and kayak through the Pines. Wildwood's coast has some decent beaches from which people parasail. Whalewatching trips run from North Wildwood and Cape May throughout the summer.  \n"]


In [6]:
print(len(data))

1801


In [7]:
countries = [row[0] for row in data]
cities = [row[1] for row in data]
types = [row[2] for row in data]

In [8]:
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/NoahKaplan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def tokenize(sent):
    return re.findall('[a-zA-Z]+', sent)

In [10]:
descriptions = [[w.lower() for w in tokenize(row[3])] for row in data]

In [11]:
print(descriptions[1])

[u'attractions', u'officially', u'the', u'capital', u'the', u'government', u'long', u'ago', u'moved', u'itself', u'and', u'most', u'of', u'its', u'business', u'km', u'mi', u'west', u'to', u'cotonou', u'nevertheless', u'this', u'town', u'of', u'some', u'people', u'remains', u'a', u'beautiful', u'and', u'historical', u'place', u'its', u'proximity', u'to', u'the', u'nigerian', u'border', u'gives', u'the', u'appearance', u'that', u'more', u'is', u'going', u'on', u'than', u'actually', u'is', u'though', u'there', u'are', u'still', u'some', u'hot', u'spots', u'such', u'as', u'the', u'grand', u'marche', u'd', u'adjara', u'where', u'you', u'can', u'buy', u'drums', u'cloth', u'baskets', u'and', u'the', u'best', u'pottery', u'in', u'benin', u'the', u'musee', u'ethnographique', u'de', u'porto', u'novo', u'has', u'a', u'great', u'collection', u'of', u'yoruba', u'artefacts', u'you', u'can', u'also', u'visit', u'the', u'ornate', u'brazilian', u'style', u'church', u'now', u'a', u'mosque']


In [12]:
stemmer = PorterStemmer()

In [13]:
stems = [[stemmer.stem(w.lower()) for w in sent] for sent in descriptions]

In [14]:
print(stems[1])

[u'attract', u'offici', u'the', u'capit', u'the', u'govern', u'long', u'ago', u'move', u'itself', u'and', u'most', u'of', u'it', u'busi', u'km', u'mi', u'west', u'to', u'coton', u'nevertheless', u'thi', u'town', u'of', u'some', u'peopl', u'remain', u'a', u'beauti', u'and', u'histor', u'place', u'it', u'proxim', u'to', u'the', u'nigerian', u'border', u'give', u'the', u'appear', u'that', u'more', u'is', u'go', u'on', u'than', u'actual', u'is', u'though', u'there', u'are', u'still', u'some', u'hot', u'spot', u'such', u'as', u'the', u'grand', u'march', u'd', u'adjara', u'where', u'you', u'can', u'buy', u'drum', u'cloth', u'basket', u'and', u'the', u'best', u'potteri', u'in', u'benin', u'the', u'muse', u'ethnographiqu', u'de', u'porto', u'novo', u'ha', u'a', u'great', u'collect', u'of', u'yoruba', u'artefact', u'you', u'can', u'also', u'visit', u'the', u'ornat', u'brazilian', u'style', u'church', u'now', u'a', u'mosqu']


In [15]:
inv_idx = defaultdict(set)
for i, row in enumerate(stems):
    for w in row:
        inv_idx[w].add(i)

In [16]:
min_df = 10
max_df = 0.8
nd = len(data)
vocab = list(filter(lambda x: min_df <= len(inv_idx[x]) <= nd * max_df and x not in stopwords, inv_idx.keys()))
vocab_idx = {w: i for i, w in enumerate(vocab)}

In [17]:
print(len(vocab_idx))

3068


In [18]:
idf = {}
filt_inv_idx = {}
for w in vocab:
    idf[w] = np.log((nd) / (1 + len(inv_idx[w])) + 1)
    filt_inv_idx[w] = inv_idx[w]

In [19]:
doc_mat = np.zeros((nd, len(vocab)))

for i, row in enumerate(stems):
    counter = Counter(row)
    for w, count in counter.items():
        if w in idf:
            doc_mat[i, vocab_idx[w]] = idf[w] * count
norm = np.linalg.norm(doc_mat, axis=1)[:, np.newaxis] + 1e-8
doc_mat = doc_mat / norm

In [20]:
raw_query = tokenize('tropical beach')
query = [stemmer.stem(w) for w in raw_query]
print(query)

[u'tropic', 'beach']


In [43]:
accum = np.zeros(len(data))
for q in query:
    if q in vocab_idx:
        for doc in inv_idx[q]:
            accum[doc] += doc_mat[doc, vocab_idx[q]]
ranking = accum.argsort()[::-1]
regions = []

for r in ranking:
    if len(regions) >= 20:
        break
    regions.append(data[r][1])
    
s = set()
for r in ranking:
    s.add(data[r][1])
print("Number of unique regions")
print(len(s))

Number of unique regions
392


In [22]:
print(regions)

[u'South Korea', u'Aruba', u'Barbados', u'Molokai', u'Santa Barbara', u'Antigua And Barbuda', u'Alicante', u'Los Angeles', u'Grenada', u'Sint Eustatius', u'Miami', u'Saint Martin', u'Melbourne', u'Hawaii', u'Cook Islands', u'Sint Maarten', u'Rio De Janeiro', u'Trinidad And Tobago', u'Honolulu', u'Guadeloupe']


In [23]:
print(ranking)

[1489  318  328 ...  970  971  900]


In [44]:
# Dump raw LP text data
#with open('data/LP_raw.pickle', 'wb') as f:
    #pickle.dump(data, f, protocol=2)

# Dump pickled inverted index
#with open('data/inv_idx.pickle', 'wb') as f:
    #pickle.dump(inv_idx, f, protocol=2)

# Dump pickled TF-IDF matrix
#with open('data/tfidf_mat.pickle', 'wb') as f:
    #pickle.dump(doc_mat, f, protocol=2)
    
# Dump pickled vocab index matrix
#with open('data/vocab_idx.pickle', 'wb') as f:
    #pickle.dump(vocab_idx, f, protocol=2)

In [25]:
google_place_pickle = open("data/google_place.pickle","rb")
google_places = pickle.load(google_place_pickle)
print(google_places["los angeles"])

[['The Hollywood Museum', 5.0, '1660 N Highland Ave, Hollywood, CA 90028, USA'], ['Madame Tussauds Hollywood', 5.0, '6933 Hollywood Blvd, Hollywood, CA 90028, USA'], ['OUE Skyspace LA', 4.0, '633 W 5th St #840, Los Angeles, CA 90071, USA'], ['Hollywood Wax Museum', 5.0, '6767 Hollywood Blvd, Los Angeles, CA 90028, USA'], ["Ripley's Believe It or Not!", 5.0, '6780 Hollywood Blvd, Hollywood, CA 90028, USA'], ['L.A. Hood Life Tours', 5.0, '6326 Hollywood Blvd, Los Angeles, CA 90028, USA'], ['Little Tokyo', 5.0, '319 E 2nd St #202, Los Angeles, CA 90013, USA'], ['Arlene Dahl Star', 4.0, '1668, 1658 Vine St, Los Angeles, CA 90028, USA'], ['Robert F. Kennedy Inspiration Park', 5.0, '3384 Wilshire Blvd, Los Angeles, CA 90010, USA'], ['Edm Hall', 4.0, '123 Astronaut E S Onizuka St, Los Angeles, CA 90012, USA'], ['Mural "La Brea"', 4.0, '181 South La Brea Ave, Los Angeles, CA 90036, USA'], ['Korean Pavilion', 5.0, '1000 Normandie Ave, Los Angeles, CA 90006, USA'], ['Adam 12 Police Station', 5.0

In [26]:
def getTopPlacesInRegion(region):
    topPlaces = []
    
    sortedPlaces = sorted(google_places[region], key = lambda x: x[1], reverse = True)
    for place in sortedPlaces:
        topPlaces.append((place[0], place[2]))
    
    return topPlaces

In [27]:
getTopPlacesInRegion("cincinnati")

[('Cincinnati Food Tours', '1801 Race St, Cincinnati, OH 45202, USA'),
 ('Gibbon Islands', 'Dury Ave, Cincinnati, OH 45220, USA'),
 ('Children\xe2\x80\x99s Zoo', 'Forest Ave, Cincinnati, OH 45220, USA'),
 ('Statue of James A. Garfield', '801-811 Vine St, Cincinnati, OH 45202, USA'),
 ('Cheetah Encounter', 'Dury Ave, Cincinnati, OH 45220, USA'),
 ('World Peace Bell Center', '425 York St, Newport, KY 41071, USA'),
 ('American Legacy Tours', '1332 Vine St, Cincinnati, OH 45202, USA'),
 ('Newport Aquarium', '1 Dave Cowens Dr, Newport, KY 41071, USA'),
 ('Fox Preserve', '5801 McCray Ct, Cincinnati, OH 45224, USA'),
 ('Cincinnati USA Regional Tourism Network',
  '50 E Rivercenter Blvd #1100, Covington, KY 41011, USA'),
 ('Roadtrippers', '131 E McMicken Ave, Cincinnati, OH 45202, USA'),
 ('New Riff Distilling', '24 Distillery Way, Newport, KY 41073, USA'),
 ('Findlay Market', '1801 Race St, Cincinnati, OH 45202, USA'),
 ('Krohn Conservatory', '1501 Eden Park Dr, Cincinnati, OH 45202, USA'),
 

In [28]:
import requests
import json
from math import sin, cos, sqrt, atan2, radians

In [29]:
def getUsersLatLong():
    send_url = 'http://freegeoip.net/json'
    r = requests.get(send_url)
    j = json.loads(r.text)
    lat = j['latitude']
    lon = j['longitude']
    return lat, lon

In [31]:
with open('data/destination_geocode.json') as f:
	geocode = json.load(f)

In [34]:
def distBetweenLatLongKM(lat1, lon1, lat2, lon2):
    lat1 = radians(lat1)
    lon1 = radians(lon1)
    lat2 = radians(lat2)
    lon2 = radians(lon2)

    R = 6373.0

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    
    return distance

In [35]:
print(distBetweenLatLongKM(52.2296756, 21.0122287, 52.406374, 16.9251681))

278.545589351


In [36]:
def filterRegionsWithinDistance(maxDistanceKM, regions):
    userLat, userLong = getUsersLatLong()
    filteredRegions = []
    
    for region in regions:
        lat = geocode[region.lower()]['results'][0]['geometry']['location']['lat']
        lon = geocode[region.lower()]['results'][0]['geometry']['location']['lng']
        if distBetweenLatLongKM(userLat, userLong, lat, lon) <= maxDistanceKM:
            filteredRegions.append(region)
        
    return filteredRegions

In [37]:
print(filterRegionsWithinDistance(3000, regions))

[u'Miami', u'Saint Martin', u'Sint Maarten']
