# Step 1 & 2

In [22]:
# imports
import csv
import heapq
import pandas as pd
import numpy as np
import math

# utils
from utils import normalized
from utils import preprocessing_nltk
from utils import compute_tfidf
from utils import cosine_similarity
from utils import distance_function

# globals
base_dir = './data/'
tsv_dir = base_dir + 'tsv/'
dataset = base_dir + 'Airbnb_Texas_Rentals.csv'

In [4]:
# the list of words contained in a document
# it simply is the document content
docid2words = {}

# vocabulary
word2id = {}

# documents that contain a precise word
word2docid = {}

# a collection of city, coords and document id
geo2coords = {}

# document id related to the city name
docid2geo = {}

# processing the main .csv and creating a no. of .tsv
with open(dataset, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    
    # extracting data from each line
    for i, elems in enumerate(csvreader): # line
    
        # skip the first line as we're dealing with a .csv
        if i == 0: continue
        
        # file index
        index = elems[0]

        # geo (coords and city)
        coords = (elems[6], elems[7])
        city = elems[3]
        
        # docid2geo
        docid2geo[index] = city
        
        if not city in geo2coords.keys():
            geo2coords[city] = coords
        else:
            lat, lng = geo2coords[city]
            if lat == 'NA' or lng == 'NA':
                geo2coords[city] = coords
        
        # preprocessing
        descr = preprocessing_nltk(elems[5])
        title = preprocessing_nltk(elems[8])
        
        # docid2words
        docid2words[index] = []
        docid2words[index].extend(descr)
        docid2words[index].extend(title)
        
        # word2id
        for word in docid2words[index]:
            if not word in word2id.keys(): 
                word2id[word] = len(word2id.keys())
        
        # word2docid
        for word in docid2words[index]:
            if not word in word2docid.keys():
                word2docid[word] = set(index)
            else:
                word2docid[word].add(index)
        
        # produce tsv
        # put .tsv files into 'tsv' folder (that already has to exist)
        with open(tsv_dir + 'doc_' + index + '.tsv', 'w') as doc_out:
            doc_out.write('\t'.join(elems[1:]))

In [5]:
# inverse index
word2docid_tfidf = {}

# docid2word_tfidf
docid2word_tfidf = {}

# create inverse index with tfidf
for w, docs in word2docid.items():
    
    # skip in case we have it already
    if w in word2docid_tfidf.keys(): continue
    
    # empty list (of future tuples)
    word2docid_tfidf[w] = []
    
    # for each document that contains w
    for d in docs:
        
        # create an empty structure if it's the first match
        if not d in docid2word_tfidf.keys():
            docid2word_tfidf[d] = {}
        
        # get document words (content)
        if not d in docid2words.keys(): continue
        content = docid2words[d]
        
        # used for tfidf
        word_freq = content.count(w)
        n_words_doc = len(content)
        n_docs = len(docid2words.keys())
        n_docs_with_word = len(docs)
        
        # compute tfidf
        tfidf = compute_tfidf(word_freq, n_words_doc, n_docs, n_docs_with_word)
        
        # fill the vector
        word2docid_tfidf[w].append((d, tfidf))
        docid2word_tfidf[d][w] = tfidf

In [6]:
# fix not matching results -> 18252 - 18259
print(len(set(docid2word_tfidf.keys())))
print(len(set(docid2words.keys())))
print(docid2word_tfidf['10'])

18252
18259
{'privat': 0.18738656253333066, 'cozi': 0.13566814294526974}


In [7]:
# computing document vectors
docid2vec = {}

# todo: use numpy
for d, doc_words in docid2words.items():
    
    # skip if we have it already
    if d in docid2vec.keys(): continue
    
    # empty vector
    docid2vec[d] = []
    
    for w in word2id.keys():
        
        # if that word is the selected doc
        if w in doc_words:
            
            # todo: fix
            # skip if missing
            if not d in docid2word_tfidf.keys():
                continue
            if not w in docid2word_tfidf[d]:
                docid2vec[d].append(0.0)
                continue
            docid2vec[d].append(docid2word_tfidf[d][w])
            
        # else, fill with zeros    
        else: 
            docid2vec[d].append(0.0)

In [8]:
print(len(docid2vec['999']))

10127


# Step 3

In [9]:
# ask user
query = input()
query = preprocessing_nltk(query)

print(query)

a luxurious apartment downtown with pool
['luxuri', 'apart', 'downtown', 'pool']


In [11]:
# build user input query vector
uiq_vec = []

for w in word2docid:
    if not w in query:
        uiq_vec.append(0.0)
    else:
        # tfidf
        # tf = query.count(w) / len(query)
        # idf = math.log10(len(docid2words.keys()) / len(word2docid[w]))
        tfidf = compute_tfidf(query.count(w), len(query), len(docid2words.keys()), len(word2docid[w]))
        uiq_vec.append(tfidf)

In [12]:
print(len(uiq_vec))
print(len(word2docid.keys()))

10127
10127


In [13]:
# here we are taking a list of documents numbers for each word in user query and
# intersect these lists to receive only the numbers of documents which contain all of these query words.
sets = []

for q in query:
    sets.append(word2docid.get(q) or set())
matching_docs = list(set.intersection(*sets))

print(len(matching_docs))
print(matching_docs)

31
['13864', '13503', '8379', '17952', '1474', '9168', '14033', '13200', '11837', '8447', '14760', '14159', '7393', '10842', '13605', '13438', '3775', '9894', '14689', '8959', '8757', '637', '16102', '18203', '1066', '5435', '10989', '17933', '4584', '11764', '2552']


In [14]:
# todo: show matching_docs through pandas
for md in matching_docs:
    df = pd.read_csv(tsv_dir + 'doc_' + md + '.tsv', sep='\t', usecols=[2, 4, 5, 6])
    print(md)
    print(df.columns[0])
    print(df.columns[1])
    print(df.columns[2])
    print(df.columns[3] + '\n')

13864
Austin
Quiet relax atmosphere just 10 minutes from downtown. 1 bed room 1 bath with washer dryer included,queen bed everything all white, balcony, 17ft tall ceilings, garden tub separate from shower. Privacy and quite neighbors, own desires parking spot. Pets welcome, gym open anytime with lap pool, tv outside and bar b q pit.
30.2589482442465
-97.8628891451995

13503
Dallas
Luxury 1 bedroom/1 bath apartment in the heart of Downtown Dallas. Comes with full kitchen, wash &amp; dryer(in unit), queen bed, Wifi, flat screen TV's with cable. Walking distance to shopping, dining, pick-up orders and convenience stores. Affordable Uber rates to nightlife an tourist attractions.\n\nBuilding Amenities\n*Rooftop Pool(6am-Midight)\n*Community Area(6am-Midnight)\n*4th Floor Atrium(6am-Midnight)\n*Parking Garage(1 assigned spot)\n*Elevators\n*Mexican Restaurant\n*Luxury Nail Spa(coming soon)
32.7823823267092
-96.7994883089144

8379
Dallas
Click on my profile to check out my 2 other Downtown Da

In [15]:
matching_docs_cos = []

# to test it all over the docs -> docid2vec.keys()
for d in matching_docs:
    if len(uiq_vec) == len(docid2vec[d]):
        matching_docs_cos.append((cosine_similarity(uiq_vec, docid2vec[d]), d))

In [16]:
# creating heap structure
heapq.heapify(matching_docs_cos)

# showing the top-k where k = 10
topk_cos = heapq.nlargest(10, matching_docs_cos)
print(topk_cos)

[(0.4217916073233887, '2552'), (0.39311661489020255, '7393'), (0.39311661489020255, '13438'), (0.3279227638513016, '8757'), (0.2965393840458528, '8447'), (0.2965393840458528, '8379'), (0.2965393840458528, '14689'), (0.2965393840458528, '14159'), (0.2874108167252869, '17933'), (0.2874108167252869, '13200')]


In [17]:
# todo: show top-k through pandas
for md in topk_cos:
    df = pd.read_csv(tsv_dir + 'doc_' + md[1] + '.tsv', sep='\t', usecols=[2, 4, 5, 6])
    print(md)
    print(df.columns[0])
    print(df.columns[1])
    print(df.columns[2])
    print(df.columns[3] + '\n')

(0.4217916073233887, '2552')
Austin
Spacious apartment in a safe and friendly North Austin community. 15mins from downtown &amp; 6th street with plenty of surrounding restaurants, stores, and attractions. Pool and 24 hour-fitness center access
30.3812545540728
-97.6502337849287

(0.39311661489020255, '7393')
Little Elm
Beautiful 3 bedroom 2 bath Luxury Apartment Located 30 minutes from downtown Dallas and very close to dining and shopping. This is a gated community and the amenities include; Washer and dryer, Full size jacuzzi tub, 24 Gym access, Infiniti swimming pool, private garage access, patio, and full size closets in each room.
33.2167456954796
-96.908854561541

(0.39311661489020255, '13438')
Little Elm
Beautiful 3 bedroom 2 bath Luxury Apartment Located 30 minutes from downtown Dallas and very close to dining and shopping. This is a gated community and the amenities include; Washer and dryer, Full size jacuzzi tub, 24 Gym access, Infiniti swimming pool, private garage access, p

# Step 4

In [35]:
# todo: geo2coords has to be preprocessed

# ask user's position to show him/her
# the better place nearby
u_pos = input()

Dallas


In [36]:
import sys

# todo: loop input until a valid city has been provided
# get coords
if not u_pos in geo2coords.keys():
    print('This city is not supported!')
    sys.exit(1)

coords = geo2coords[u_pos]

In [37]:
print(coords)

('32.7866364570642', '-96.8746686879646')


In [38]:
distances = []

for m in matching_docs:
    d_city = docid2geo[m]
    d_coords = geo2coords[d_city]
    
    dist = distance_function(coords, d_coords)
    distances.append((dist, m, d_city))

In [39]:
print(distances)
print(matching_docs)
print(len(matching_docs))

[(286.45849488555126, '13864', 'Austin'), (0.0, '13503', 'Dallas'), (0.0, '8379', 'Dallas'), (369.64077265826234, '17952', 'Houston'), (38.83006355298282, '1474', 'Fort Worth'), (38.83006355298282, '9168', 'Fort Worth'), (0.0, '14033', 'Dallas'), (0.0, '13200', 'Dallas'), (0.0, '11837', 'Dallas'), (0.0, '8447', 'Dallas'), (393.731317503192, '14760', 'San Antonio'), (0.0, '14159', 'Dallas'), (41.532945819077916, '7393', 'Little Elm'), (393.731317503192, '10842', 'San Antonio'), (0.0, '13605', 'Dallas'), (41.532945819077916, '13438', 'Little Elm'), (38.83006355298282, '3775', 'Fort Worth'), (0.0, '9894', 'Dallas'), (0.0, '14689', 'Dallas'), (393.731317503192, '8959', 'San Antonio'), (393.731317503192, '8757', 'San Antonio'), (286.45849488555126, '637', 'Austin'), (393.731317503192, '16102', 'San Antonio'), (0.0, '18203', 'Dallas'), (369.64077265826234, '1066', 'Houston'), (0.0, '5435', 'Dallas'), (0.0, '10989', 'Dallas'), (0.0, '17933', 'Dallas'), (393.731317503192, '4584', 'San Antonio'

In [44]:
distance_km = [x[0] for x in distances]
distance_km_norm = normalized(np.asarray(distance_km, dtype=np.float32))

# we need to invert the index as there's an inverse correlation
# between the way we deal with our index from 0 to 1 and the distance 
distance_km_norm_inverted = 1.0 - distance_km_norm

print(distance_km_norm)
print(distance_km_norm_inverted)
print(len(distance_km_norm))

[[0.23715019 0.         0.         0.30601424 0.03214622 0.03214622
  0.         0.         0.         0.         0.32595807 0.
  0.03438385 0.32595807 0.         0.03438385 0.03214622 0.
  0.         0.32595807 0.32595807 0.23715019 0.32595807 0.
  0.30601424 0.         0.         0.         0.32595807 0.03214622
  0.23715019]]
[[0.7628498  1.         1.         0.69398576 0.9678538  0.9678538
  1.         1.         1.         1.         0.6740419  1.
  0.96561617 0.6740419  1.         0.96561617 0.9678538  1.
  1.         0.6740419  0.6740419  0.7628498  0.6740419  1.
  0.69398576 1.         1.         1.         0.6740419  0.9678538
  0.7628498 ]]
1


In [41]:
# merge cosine_similarity and distance
matching_docs_ni = []

# combining cosine_similarity and the new index
for i in range(len(matching_docs_cos)):
    m_doc = matching_docs_cos[i]
    m_doc_cos = m_doc[0]
    m_doc_id = m_doc[1]
    dist_i = distance_km_norm[0][i]
    
    # weighted mean value
    # giving more weight to the distance...
    # ((w1 * x1) + (w2 * x2)) / w1 + w2 
    mean_v = ((m_doc_cos * 0.2) + (dist_i * 0.8))
    matching_docs_ni.append((mean_v, m_doc_id))

In [42]:
# creating heap structure
heapq.heapify(matching_docs_ni)

# showing the top-k where k = 10
topk = heapq.nlargest(10, matching_docs_ni)
print(topk)

[(0.32635101128161287, '8757'), (0.3200743353205231, '8447'), (0.3149585797200743, '14033'), (0.3119898836136226, '4584'), (0.3119898836136226, '16102'), (0.3041192686396149, '14159'), (0.2985206645243008, '637'), (0.2758538904107008, '17952'), (0.2740784752732715, '2552'), (0.2439122750173155, '13503')]


In [43]:
# todo: visualize data with pandas
for md in topk:
    df = pd.read_csv(tsv_dir + 'doc_' + md[1] + '.tsv', sep='\t', usecols=[2, 4, 5, 6])
    print(md)
    print(df.columns[0])
    print(df.columns[1])
    print(df.columns[2])
    print(df.columns[3] + '\n')

(0.32635101128161287, '8757')
San Antonio
This is a Luxury apartment located in a growing and popular area of San Antonio adjacent to Fiesta Texas, the Shops at La Cantera, across the highway from the RIM and Just a short 15 minute (straight shot) drive to downtown. This is a 1 bedroom 1 bath with resort style and infinity pools. The view from my balcony is absolutely stunning: hill-country plus pool view. It is also walking distance to all amenities.
29.6206575542356
-98.6172513177697

(0.3200743353205231, '8447')
Dallas
Click on my profile to check out my 2 other Downtown Dallas apartments: Another 1-bedroom apartment &amp; a 2-bedrooms apartment\n\nYou are looking at the beautiful high-rise suite right at the heart of Downtown Dallas. You can certainly walk to almost everything you need from high-end restaurants to parks, shopping, and tourist destinations. It offers a fully equipped kitchen, stainless steel appliances, hardwood flooring, in-home washer and dryer, beautiful pool, fi