# Step 1 & 2

In [312]:
# imports
import csv
import heapq
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# globals
base_dir = './data/'
tsv_dir = base_dir + 'tsv/'
dataset = base_dir + 'Airbnb_Texas_Rentals.csv'

In [313]:
stopwords = set(stopwords.words('english'))

# cleaning stuff
def preprocessing_nltk(e):
    ps = PorterStemmer()
    
    e = e.lower().replace('\n', '')
    e = word_tokenize(e)
    e = [w for w in e if w.isalpha()]
    e = [w for w in e if not w in stopwords]
    e = [ps.stem(w) for w in e]
    return e

In [434]:
# the list of words contained in a document
# it simply is the document content
docid2words = {}

# vocabulary
word2id = {}

# documents that contain a precise word
word2docid = {}

# a collection of city, coords and document id
geo2coords = {}

# document id related to the city name
docid2geo = {}

# processing the main .csv and creating a no. of .tsv
with open(dataset, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for i, elems in enumerate(csvreader): # line
    
        # skip the first line as we're dealing with a .csv
        if i == 0: continue
        
        # file index
        index = elems[0]

        # geo
        coords = (elems[6], elems[7])
        city = elems[3]
        
        # docid2geo
        docid2geo[index] = city
        
        if not city in geo2coords.keys():
            geo2coords[city] = coords
        else:
            lat, lng = geo2coords[city]
            if lat == 'NA' or lng == 'NA':
                geo2coords[city] = coords
        
        # preprocessing
        descr = preprocessing_nltk(elems[5])
        title = preprocessing_nltk(elems[8])
        # elems[5] = ' '.join(descr)
        # elems[8] = ' '.join(title)
        
        # docid2words
        docid2words[index] = []
        docid2words[index].extend(descr)
        docid2words[index].extend(title)
        
        # word2id
        for word in docid2words[index]:
            if not word in word2id.keys(): 
                word2id[word] = len(word2id.keys())
        
        # word2docid
        for word in docid2words[index]:
            if not word in word2docid.keys():
                word2docid[word] = set(index)
            else:
                word2docid[word].add(index)
        
        # put .tsv files into 'tsv' folder (that already has to exist)
        with open(tsv_dir + 'doc_' + index + '.tsv', 'w') as doc_out:
            doc_out.write('\t'.join(elems[1:]))

In [435]:
import math 

# inverse index
word2docid_tfidf = {}

# docid2word_tfidf
docid2word_tfidf = {}

# create inverse index with tfidf
for w, docs in word2docid.items():
    
    # skip in case we have it already
    if w in word2docid_tfidf.keys(): continue
    
    # empty list (of future tuples)
    word2docid_tfidf[w] = []
    
    # for each document that contain w
    for d in docs:
        
        if not d in docid2word_tfidf.keys():
            docid2word_tfidf[d] = {}
        
        # get document words (content)
        if not d in docid2words.keys(): continue
        content = docid2words[d]
        
        # compute tfidf
        # todo: export to function
        tf = content.count(w) / len(content)
        idf = math.log10(len(docid2words.keys()) / len(docs))
        tfidf = tf * idf
        
        # fill it!
        word2docid_tfidf[w].append((d, tfidf))
        docid2word_tfidf[d][w] = tfidf

In [436]:
# fix not matching results
print(len(set(docid2word_tfidf.keys())))
print(len(set(docid2words.keys())))
print(docid2word_tfidf['10'])

18252
18259
{'privat': 0.18738656253333066, 'cozi': 0.13566814294526974}


In [437]:
# computing document vectors
docid2vec = {}

# todo: use numpy
for d, doc_words in docid2words.items():
    
    # skip if we have it already
    if d in docid2vec.keys(): continue
    
    # empty vector
    docid2vec[d] = []
    
    for w in word2id.keys():
        
        # if that word is the selected doc
        if w in doc_words:
            
            # todo: fix
            # skip if missing
            if not d in docid2word_tfidf.keys():
                continue
            if not w in docid2word_tfidf[d]:
                docid2vec[d].append(0.0)
                continue
            docid2vec[d].append(docid2word_tfidf[d][w])
            
        # else, fill with zeros    
        else: 
            docid2vec[d].append(0.0)

In [438]:
print(len(docid2vec['999']))

10127


# Step 3

In [439]:
# ask user
query = input()
query = preprocessing_nltk(query)

print(query)

beautiful and decorated house on the beach
['beauti', 'decor', 'hous', 'beach']


In [441]:
# build user input query vector
uiq_vec = []

for w in word2docid:
    if not w in query:
        uiq_vec.append(0.0)
    else:
        # tfidf
        tf = query.count(w) / len(query)
        idf = math.log10(len(docid2words.keys()) / len(word2docid[w]))
        tfidf = tf * idf
        uiq_vec.append(tfidf)

In [442]:
print(len(uiq_vec))
print(len(word2docid.keys()))

10127
10127


In [502]:
# here we are taking a list of documents numbers for each word in user query and
# intersect these lists to receive only the numbers of documents which contain all of these query words.
sets = []

for q in query:
    sets.append(word2docid.get(q) or set())
matching_docs = list(set.intersection(*sets))

print(len(matching_docs))
print(matching_docs)

7
['10846', '17594', '12313', '13544', '13294', '7960', '16502']


In [503]:
# todo: show matching_docs through pandas
for md in matching_docs:
    df = pd.read_csv(tsv_dir + 'doc_' + md + '.tsv', sep='\t', usecols=[2, 4, 5, 6])
    print(md)
    print(df.columns[0])
    print(df.columns[1])
    print(df.columns[2])
    print(df.columns[3] + '\n')

10846
Bolivar Peninsula
Bethel Blue is a cozy 2 bedroom 1.5 bath house about 5 blocks from the beautiful beach. It is decorated nicely and equipped with everything your family needs for a wonderful beach getaway. Military and Senior citizens receive a 10% discount!
29.4733202013917
-94.5974652808875

17594
Bolivar Peninsula
Bethel Blue is a cozy 2 bedroom 1.5 bath house about 5 blocks from the beautiful beach. It is decorated nicely and equipped with everything your family needs for a wonderful beach getaway. Military and Senior citizens receive a 10% discount!
29.4733202013917
-94.5974652808875

12313
Kingsland
Nicely decorated 4 bedroom, two bath house with an extra lot for additional parking. Beautiful view of packsaddle mountain and gorgeous sunsets. Sandy bottom lake frontage with sandy beach area for the kids. Kayaks included!\n\nPricing is usually available 6 months in advance. Local and state taxes are 10% and are not reflected in the pricing.
30.6439855257378
-98.4772249136191

In [446]:
from scipy import spatial

def cosine_similarity(vec_src, vec_tgt):
    return 1 - spatial.distance.cosine(vec_src, vec_tgt)

In [504]:
matching_docs_cos = []

# to test it all over the docs -> docid2vec.keys()
for d in matching_docs:
    if len(uiq_vec) == len(docid2vec[d]):
        matching_docs_cos.append((cosine_similarity(uiq_vec, docid2vec[d]), d))

In [505]:
# creating heap structure
heapq.heapify(matching_docs_cos)

# showing the top-k where k = 10
topk_cos = heapq.nlargest(10, matching_docs_cos)
print(topk_cos)

[(0.3441485802044202, '13294'), (0.34341842971434833, '16502'), (0.33463006224016045, '13544'), (0.24303374182502657, '17594'), (0.24303374182502657, '10846'), (0.19780816569422444, '7960'), (0.17957073272176072, '12313')]


In [506]:
# todo: show top-k through pandas
for md in topk_cos:
    df = pd.read_csv(tsv_dir + 'doc_' + md[1] + '.tsv', sep='\t', usecols=[2, 4, 5, 6])
    print(md)
    print(df.columns[0])
    print(df.columns[1])
    print(df.columns[2])
    print(df.columns[3] + '\n')

(0.3441485802044202, '13294')
Surfside Beach
Comfortably sleeps 10 people. Open floor plan, beach themed decorations, over sized balconies with beautiful views of the ocean. The beach is just steps from the house and very safe without having to cross any roads.
28.9680774430393
-95.2679315250019

(0.34341842971434833, '16502')
Surfside Beach
My house is close to the beach, great views, and restaurants and dining. You’ll love it because of the views, the location, the ambiance, and the people. My place is good for couples, solo adventurers, business travelers, families (with kids), big groups, and furry friends (pets).\n\nComfortably sleeps 10 people. Open floor plan, beach themed decorations, over sized balconies with beautiful views of the ocean. The beach is just steps from the house and very safe without having to cross any roads.
28.9691337883619
-95.268177794608

(0.33463006224016045, '13544')
Galveston
Down by the Sea is luxury at the beach! It is spacious, close to the beach, an

# Step 4

In [486]:
# todo: geo2coords has to be preprocessed

# ask user's position
u_pos = input()

Kingsland


In [487]:
import sys

# todo: loop input until a valid city has been provided
# get coords
if not u_pos in geo2coords.keys():
    print('This city is not supported!')
    sys.exit(1)

coords = geo2coords[u_pos]

In [488]:
print(coords)

('30.6508725280312', '-98.4403411806001')


In [507]:
import geopy.distance

# the variables of the function takes 2 values for each location
def distance_function(first_location, second_location):
    return (geopy.distance.geodesic(first_location, second_location).km) # this is better because considers the Earth as an ellipse

In [490]:
distances = []

for m in matching_docs:
    d_city = docid2geo[m]
    d_coords = geo2coords[d_city]
    
    dist = distance_function(coords, d_coords)
    distances.append((dist, m, d_city))

In [491]:
print(distances)
print(matching_docs)
print(len(matching_docs))

[(386.25038793584775, '10846', 'Bolivar Peninsula'), (386.25038793584775, '17594', 'Bolivar Peninsula'), (0.0, '12313', 'Kingsland'), (383.57928350819566, '13544', 'Galveston'), (358.9508977632399, '13294', 'Surfside Beach'), (0.0, '7960', 'Kingsland'), (358.9508977632399, '16502', 'Surfside Beach')]
['10846', '17594', '12313', '13544', '13294', '7960', '16502']
7


In [493]:
import numpy as np
from sklearn.preprocessing import minmax_scale

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2==0] = 1
    return a / np.expand_dims(l2, axis)

In [508]:
distance_km = [x[0] for x in distances]
distance_km_norm = normalized(np.asarray(distance_km, dtype=np.float32))
distance_km_norm_inverted = 1.0 - distance_km_norm

print(distances)
print(distance_km)
print(distance_km_norm)
print(distance_km_norm_inverted)
print(len(distance_km_norm))

[(386.25038793584775, '10846', 'Bolivar Peninsula'), (386.25038793584775, '17594', 'Bolivar Peninsula'), (0.0, '12313', 'Kingsland'), (383.57928350819566, '13544', 'Galveston'), (358.9508977632399, '13294', 'Surfside Beach'), (0.0, '7960', 'Kingsland'), (358.9508977632399, '16502', 'Surfside Beach')]
[386.25038793584775, 386.25038793584775, 0.0, 383.57928350819566, 358.9508977632399, 0.0, 358.9508977632399]
[[0.46060482 0.46060482 0.         0.4574195  0.42805007 0.
  0.42805007]]
[[0.5393952  0.5393952  1.         0.5425805  0.57194996 1.
  0.57194996]]
1


In [495]:
# merge cosine_similarity and distance
matching_docs_ni = []

# todo: score and heap structure

for i in range(len(matching_docs_cos)):
    m_doc = matching_docs_cos[i]
    m_doc_cos = m_doc[0]
    m_doc_id = m_doc[1]
    dist_i = distance_km_norm[0][i]
    
    # weighted mean value
    # giving more weight to the distance...
    mean_v = ((m_doc_cos * 0.2) + (dist_i * 0.8)) / 1.0
    # mean_v = (m_doc_cos + dist_i)
    matching_docs_ni.append((mean_v, m_doc_id))

In [496]:
print(matching_docs_ni)
print(len(matching_docs_ni))

[(0.4043979998845011, '12313'), (0.4170906017051543, '17594'), (0.03956163313884489, '7960'), (0.43286162417288565, '13544'), (0.4112697728417264, '13294'), (0.048606748365005316, '10846'), (0.411123742743712, '16502')]
7


In [497]:
# creating heap structure
heapq.heapify(matching_docs_ni)

# showing the top-k where k = 10
topk = heapq.nlargest(10, matching_docs_ni)
print(topk)

[(0.43286162417288565, '13544'), (0.4170906017051543, '17594'), (0.4112697728417264, '13294'), (0.411123742743712, '16502'), (0.4043979998845011, '12313'), (0.048606748365005316, '10846'), (0.03956163313884489, '7960')]


In [498]:
# todo: visualize data with pandas
for md in matching_docs:
    df = pd.read_csv(tsv_dir + 'doc_' + md + '.tsv', sep='\t', usecols=[2, 4, 5, 6])
    print(md)
    print(df.columns[0])
    print(df.columns[1])
    print(df.columns[2])
    print(df.columns[3] + '\n')

10846
Bolivar Peninsula
Bethel Blue is a cozy 2 bedroom 1.5 bath house about 5 blocks from the beautiful beach. It is decorated nicely and equipped with everything your family needs for a wonderful beach getaway. Military and Senior citizens receive a 10% discount!
29.4733202013917
-94.5974652808875

17594
Bolivar Peninsula
Bethel Blue is a cozy 2 bedroom 1.5 bath house about 5 blocks from the beautiful beach. It is decorated nicely and equipped with everything your family needs for a wonderful beach getaway. Military and Senior citizens receive a 10% discount!
29.4733202013917
-94.5974652808875

12313
Kingsland
Nicely decorated 4 bedroom, two bath house with an extra lot for additional parking. Beautiful view of packsaddle mountain and gorgeous sunsets. Sandy bottom lake frontage with sandy beach area for the kids. Kayaks included!\n\nPricing is usually available 6 months in advance. Local and state taxes are 10% and are not reflected in the pricing.
30.6439855257378
-98.4772249136191