In [1]:
import nltk
from nltk.stem.porter import *
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter, defaultdict
import re
from scipy.spatial.distance import cdist
import pickle

regions = ['africa',
           'antarctica',
           'australasia',
           'caribbean',
           'central_america',
           'central_asia',
           'europe',
           'indian_subcontinent',
           'middle_east',
           'north_america',
           'north_east asia',
           'pacific',
           'south_america',
           'south_east_asia',
           'north_east_asia']

In [2]:
data = []
with open('data/FlatCorpus.txt', encoding='utf-8-sig') as f:
    for line in f:
        title, text = line.split(':  ')
        region = ''
        for r in regions:
            if title.startswith(r):
                region = ' '.join(map(lambda x: x.capitalize(), r.split('_')))
                break
        split_title = title[len(region) + 1:].split('_')
        entry_type = split_title[-1].split('.')[0]
        city = ' '.join(map(lambda x: x.capitalize(), split_title[:-1]))
        row = [region, city, entry_type, text]
        data.append(row)

In [3]:
print(data[1165])

['North America', 'Atlantic City', 'activities', "Activities  Atlantic City is not the place to visit if you're into the outdoors. The most burning of calories you'll achive will be getting out of bed and slouching in front of a slot machine. The city's rules and regulations conspire to keep it this way. While the Boardwalk is a good spot, in theory, for cycling, bikes are only allowed between the hours of 6-10 am.  If you want some exercise, you're better off leaving Atlantic City and heading for the peaceful Pine Barrens, where there's no shortage of hiking in the huge pine forest. In Egg Harbor, you can rent equipment to canoe and kayak through the Pines. Wildwood's coast has some decent beaches from which people parasail. Whalewatching trips run from North Wildwood and Cape May throughout the summer.  \n"]


In [4]:
print(len(data))

1801


In [5]:
# Dump raw LP text data
# with open('data/LP_raw.pickle', 'wb') as f:
#     pickle.dump(data, f)

In [6]:
countries = [row[0] for row in data]
cities = [row[1] for row in data]
types = [row[2] for row in data]

In [7]:
stopwords = set(nltk.corpus.stopwords.words('english'))

In [8]:
def tokenize(sent):
    return re.findall('[a-zA-Z]+', sent)

In [9]:
descriptions = [[w.lower() for w in tokenize(row[3])] for row in data]

In [10]:
print(descriptions[1])

['attractions', 'officially', 'the', 'capital', 'the', 'government', 'long', 'ago', 'moved', 'itself', 'and', 'most', 'of', 'its', 'business', 'km', 'mi', 'west', 'to', 'cotonou', 'nevertheless', 'this', 'town', 'of', 'some', 'people', 'remains', 'a', 'beautiful', 'and', 'historical', 'place', 'its', 'proximity', 'to', 'the', 'nigerian', 'border', 'gives', 'the', 'appearance', 'that', 'more', 'is', 'going', 'on', 'than', 'actually', 'is', 'though', 'there', 'are', 'still', 'some', 'hot', 'spots', 'such', 'as', 'the', 'grand', 'marche', 'd', 'adjara', 'where', 'you', 'can', 'buy', 'drums', 'cloth', 'baskets', 'and', 'the', 'best', 'pottery', 'in', 'benin', 'the', 'musee', 'ethnographique', 'de', 'porto', 'novo', 'has', 'a', 'great', 'collection', 'of', 'yoruba', 'artefacts', 'you', 'can', 'also', 'visit', 'the', 'ornate', 'brazilian', 'style', 'church', 'now', 'a', 'mosque']


In [11]:
stemmer = PorterStemmer()

In [12]:
stems = [[stemmer.stem(w.lower()) for w in sent] for sent in descriptions]

In [13]:
print(stems[1])

['attract', 'offici', 'the', 'capit', 'the', 'govern', 'long', 'ago', 'move', 'itself', 'and', 'most', 'of', 'it', 'busi', 'km', 'mi', 'west', 'to', 'coton', 'nevertheless', 'thi', 'town', 'of', 'some', 'peopl', 'remain', 'a', 'beauti', 'and', 'histor', 'place', 'it', 'proxim', 'to', 'the', 'nigerian', 'border', 'give', 'the', 'appear', 'that', 'more', 'is', 'go', 'on', 'than', 'actual', 'is', 'though', 'there', 'are', 'still', 'some', 'hot', 'spot', 'such', 'as', 'the', 'grand', 'march', 'd', 'adjara', 'where', 'you', 'can', 'buy', 'drum', 'cloth', 'basket', 'and', 'the', 'best', 'potteri', 'in', 'benin', 'the', 'muse', 'ethnographiqu', 'de', 'porto', 'novo', 'ha', 'a', 'great', 'collect', 'of', 'yoruba', 'artefact', 'you', 'can', 'also', 'visit', 'the', 'ornat', 'brazilian', 'style', 'church', 'now', 'a', 'mosqu']


In [14]:
inv_idx = defaultdict(set)
for i, row in enumerate(stems):
    for w in row:
        inv_idx[w].add(i)

In [15]:
min_df = 10
max_df = 0.8
nd = len(data)
vocab = list(filter(lambda x: min_df <= len(inv_idx[x]) <= nd * max_df and x not in stopwords, inv_idx.keys()))
vocab_idx = {w: i for i, w in enumerate(vocab)}

In [16]:
print(len(vocab_idx))

3068


In [17]:
idf = {}
filt_inv_idx = {}
for w in vocab:
    idf[w] = np.log((nd) / (1 + len(inv_idx[w])) + 1)
    filt_inv_idx[w] = inv_idx[w]

In [18]:
# Dump pickled inverted index
# with open('inv_idx.pickle', 'wb') as f:
#     pickle.dump(filt_inv_idx, f)

In [19]:
doc_mat = np.zeros((nd, len(vocab)))

for i, row in enumerate(stems):
    counter = Counter(row)
    for w, count in counter.items():
        if w in idf:
            doc_mat[i, vocab_idx[w]] = idf[w] * count
norm = np.linalg.norm(doc_mat, axis=1)[:, np.newaxis] + 1e-8
doc_mat = doc_mat / norm

In [20]:
# Dump pickled TF-IDF matrix
# with open('data/tfidf_mat.pickle', 'wb') as f:
#     pickle.dump(doc_mat, f)

In [21]:
raw_query = tokenize('urban city shopping')
query = [stemmer.stem(w) for w in raw_query]
print(query)

['urban', 'citi', 'shop']


In [22]:
accum = np.zeros(len(data))
for q in query:
    if q in idf:
        for doc in inv_idx[q]:
            accum[doc] += doc_mat[doc, vocab_idx[q]]
ranking = accum.argsort()[::-1]

In [23]:
for r in ranking[:3]:
    print(data[r])

['Africa', 'Cape Verde', 'attractions', "Attractions  S o Tiago is the main island and home to the capital, Praia. The city isn't the most beautiful of the archipelago's two cities (this distinction belongs to Mindelo), but it's a pleasant place, with its centre perched on a rocky plateau known as Plat. This central area is surrounded by urban sprawl in three directions. The city's two beaches, Praia Mar and Quebra-Canela, are west of the centre.  For a half-day trip out of Praia, go to the Cidade Velha (Old City), the first town built by the Portuguese on the islands. There are great views of the village on the climb up to Fort Real de S o Felipe. The Old City is about 10km (6mi) west of Praia. Some 20km (12mi) inland from Praia, the village of S o Domingos is the closest green agricultural valley to the capital. There are one or two shops selling handicrafts. At the northern end of S o Tiago is the island's second largest settlement, Tarrafal, which is famous for its beaches. It can 