# Step 1 & 2

In [68]:
# imports
import csv
import heapq
import pandas as pd
import numpy as np
import math

# utils
from utils import normalized
from utils import preprocessing_nltk
from utils import compute_tfidf
from utils import cosine_similarity
from utils import distance_function

# globals
base_dir = './data/'
tsv_dir = base_dir + 'tsv/'
dataset = base_dir + 'Airbnb_Texas_Rentals.csv'

In [99]:
# the list of words contained in a document
# it simply is the document content
docid2words = {}

# vocabulary
word2id = {}

# documents that contain a precise word
word2docid = {}

# a collection of city, coords and document id
geo2coords = {}

# document id related to the city name
docid2geo = {}

# processing the main .csv and creating a no. of .tsv
with open(dataset, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    
    # extracting data from each line
    for i, elems in enumerate(csvreader): # line
    
        # skip header
        if i == 0: continue
        
        # file index
        index = elems[0]
        
        # geo (coords and city)
        coords = (elems[6], elems[7])
        city = elems[3]
        
        # preprocessing content (words)
        descr = preprocessing_nltk(elems[5])
        title = preprocessing_nltk(elems[8])
        
        # discarding docs with no words...
        if len(descr) == 0 and len(title) == 0: continue
        
        # we don't want to deal with no location (needed in step 4)
        if all(isinstance(c, float) for c in coords): continue  
        
        # add city and its coords (no duplicates)
        if not city in geo2coords.keys():
            geo2coords[city] = coords       

        # docid2geo
        docid2geo[index] = city
        
        # docid2words
        docid2words[index] = []
        docid2words[index].extend(descr)
        docid2words[index].extend(title)
        
        # working with words to fill dictionaries
        for word in docid2words[index]:
            
            # word2id
            if not word in word2id.keys(): 
                word2id[word] = len(word2id.keys())
            
            # word2docid
            if not word in word2docid.keys():
                word2docid[word] = set([index])
            else:
                word2docid[word].add(index)
        
        # produce tsv
        # put .tsv files into 'tsv' folder (that already has to exist)
        with open(tsv_dir + 'doc_' + index + '.tsv', 'w') as doc_out:
            doc_out.write('\t'.join(elems[1:]))

Creating the inverse index with tf-idf

In [106]:
# inverse index
word2docid_tfidf = {}

# docid2words_tfidf
docid2words_tfidf = {}

# create inverse index with tfidf
for w, docs in word2docid.items():
    
    # skip in case we already have a word in the vocabulary
    if w in word2docid_tfidf.keys(): continue
    
    # empty list (of future tuples)
    word2docid_tfidf[w] = []
    
    # for each document that contains w
    for d in docs:
        
        # create an empty structure if it's the first match
        if not d in docid2words_tfidf.keys():
            docid2words_tfidf[d] = {}
        
        # get document words (all its words)
        content = docid2words[d]
        
        # compute tfidf
        tfidf = compute_tfidf(content.count(w), len(content), len(docid2words.keys()), len(docs))
        
        # fill the vector
        word2docid_tfidf[w].append((d, tfidf))
        docid2words_tfidf[d][w] = tfidf

In [107]:
# sample of content for doc no. 10
print(docid2words['10'])
print(docid2words_tfidf['10'])

['privat', 'entranc', 'cozi', 'histor', 'privat', 'studio']
{'privat': 0.18737070484602142, 'cozi': 0.13566021410161513, 'entranc': 0.23998598119244557, 'histor': 0.19322314834126833, 'studio': 0.2496669036660849}


In [108]:
# computing document vectors
docid2vec = {}

# todo: use numpy
for d, doc_words in docid2words.items():
    
    # skip if we have it already
    if d in docid2vec.keys(): continue
    
    # empty vector
    docid2vec[d] = []
    
    for w in word2id.keys():
        
        # if that word is the selected doc
        if w in doc_words:
            
            # todo: fix
            # skip if missing
            if not d in docid2words_tfidf.keys():
                continue
            if not w in docid2words_tfidf[d]:
                docid2vec[d].append(0.0)
                continue
            docid2vec[d].append(docid2words_tfidf[d][w])
            
        # else, fill with zeros    
        else: 
            docid2vec[d].append(0.0)

In [111]:
# length of a document vector
# it's the same length of the vocabulary
print(len(docid2vec['999']))
print(len(word2id.keys()))

10127
10127


# Step 3

In [180]:
# ask user
query = input()
query = preprocessing_nltk(query)

print(query)

beautiful apartment downtown balcony
['beauti', 'apart', 'downtown', 'balconi']


In [181]:
# build user input query vector
uiq_vec = []

# for each word in vocabulary
for w in word2docid.keys():
    
    # default is 0.0
    to_push = 0.0
    
    # if word is contained in user's query
    # we need to push tfidf value for that word
    if w in query:
        to_push = compute_tfidf(query.count(w), len(query), len(docid2words.keys()), len(word2docid[w]))
    
    # add value in position 
    uiq_vec.append(to_push)

In [182]:
# here we are taking a list of documents numbers for each word in user query and
# intersect these lists to receive only the numbers of documents which contain all of these query words.
sets = []

for q in query:
    sets.append(word2docid.get(q) or set())
matching_docs = list(set.intersection(*sets))

print(len(matching_docs))
print(matching_docs)

9
['15417', '17058', '16178', '1243', '7172', '8543', '4451', '14105', '5324']


In [183]:
# todo: show matching_docs through pandas
for md in matching_docs:
    df = pd.read_csv(tsv_dir + 'doc_' + md + '.tsv', sep='\t', usecols=[2, 4, 5, 6])
    print(md)
    print(df.columns[0])
    print(df.columns[1])
    print(df.columns[2])
    print(df.columns[3] + '\n')

15417
Austin
Large 1-bedroom apartment in the heart of South Austin. Beautiful view of downtown from balcony! Bus stop right across the street that will take you Dowtown/Zilker/Barton Springs. Bars and restaurants within walking distance. Quiet Complex w/parking.
30.2361541000197
-97.7829581256141

17058
Dallas
In the heart of historic Old East Dallas, this is a beautiful 1930’s Victorian apartment house that’s as classic as it is comfortable. Relax on the big, shared upstairs balcony with a cup of coffee or glass of wine overlooking sleepy Junius Street. Find original character everywhere with modern amenities like highspeed WiFi, Amazon Fire TV w/ Netflix and Hulu. Prime location near Downtown, Deep Ellum and Lower Greenville. Cafes, shopping, nightlife and restaurants are conveniently close.
32.795928386191
-96.7716865515425

16178
Dallas
My apartment is a cozy artist get away only a few minutes from nightlife, shopping and dinning in Deep Ellum, Lower Greenville, and downtown Dalla

In [184]:
matching_docs_cos = []

# to test it all over the docs -> docid2vec.keys()
for d in matching_docs:
    if len(uiq_vec) == len(docid2vec[d]):
        matching_docs_cos.append((cosine_similarity(uiq_vec, docid2vec[d]), d))

In [185]:
# creating heap structure
heapq.heapify(matching_docs_cos)

# showing the top-k where k = 10
topk_cos = heapq.nlargest(10, matching_docs_cos)
print(topk_cos)

[(0.3946997927850433, '7172'), (0.29868038710081823, '8543'), (0.28160131275452094, '15417'), (0.21705066786852723, '5324'), (0.18638229010465457, '14105'), (0.16080989170544646, '17058'), (0.15479289383844108, '1243'), (0.15090774604746593, '4451'), (0.15090774604746593, '16178')]


In [186]:
# todo: show top-k through pandas
for md in topk_cos:
    df = pd.read_csv(tsv_dir + 'doc_' + md[1] + '.tsv', sep='\t', usecols=[2, 4, 5, 6])
    print(md)
    print(df.columns[0])
    print(df.columns[1])
    print(df.columns[2])
    print(df.columns[3] + '\n')

(0.3946997927850433, '7172')
Austin
New contemporary apartment less than a mile from beautiful Lake Austin and a 15-20 minute drive to downtown Austin. Apartment features a living area, private bedroom with full bed, new queen sofa bed, private entrance and balcony.
30.3455405518117
-97.8594403179482

(0.29868038710081823, '8543')
Austin
Beautiful 1 bed 1 bath apartment with balcony view AND garage parking. In downtown Austin! 5 min drive from 6th Street/ Town Lake, &amp; Rainey Street. Average cost for a RideShare if you don't have a car to downtown nightlife is $6.
30.2861570183293
-97.7475284766327

(0.28160131275452094, '15417')
Austin
Large 1-bedroom apartment in the heart of South Austin. Beautiful view of downtown from balcony! Bus stop right across the street that will take you Dowtown/Zilker/Barton Springs. Bars and restaurants within walking distance. Quiet Complex w/parking.
30.2361541000197
-97.7829581256141

(0.21705066786852723, '5324')
Houston
Contemporary styled apartme

# Step 4

This is an example of how another feature of interest (Location in this case) could be combined to our previous "ranking" method. In fact, we could consider document cosine similarity as a good starting point to be enhanced with a new normalized index (to maintain consistency and still being able to compare the old and new ranks).

Hence, we are going to ask user's preferred geographical position (city name) and, after computing the distances between user's position and each room ads. Considering that there's a inverse correlation between distance and the way we deal with ranking. On one hand, the higher the rank (cosine) is, the better it is. On the other hand, the lower the distance is, the better it is. For this reason, we're going to invert the normalized index obtained through distance calculations.

The combination between this two normalized values (cosine similarity and distance) will be achieved through a weighted mean (https://en.wikipedia.org/wiki/Weighted_arithmetic_mean)

Moreover, we decided to give more weight to distances in Step 4. Lastly, this could be considered like a small example of how much a feature could contribute to a decent ranking system, as an alternative to cut off a significant proportion of data (filtering). 

In [187]:
# ask user's position to show him/her
# the better place nearby
u_pos = input()

Houston


In [188]:
# get coords
if not u_pos in geo2coords.keys():
    print('This city is not supported!')

coords = geo2coords[u_pos]

In [189]:
print(coords)

('29.8293522272149', '-95.0815494887563')


In [190]:
distances = []

# for each matching document
for m in matching_docs:
    
    # get city and its coords
    d_city = docid2geo[m]
    d_coords = geo2coords[d_city]
    
    # computing distance between user's location
    # and each document's one
    dist = distance_function(coords, d_coords)
    distances.append((dist, m, d_city))

In [191]:
print(distances)
print(len(matching_docs))

[(261.00404694199005, '15417', 'Austin'), (369.64077265826234, '17058', 'Dallas'), (369.64077265826234, '16178', 'Dallas'), (369.64077265826234, '1243', 'Dallas'), (261.00404694199005, '7172', 'Austin'), (261.00404694199005, '8543', 'Austin'), (369.64077265826234, '4451', 'Dallas'), (369.64077265826234, '14105', 'Dallas'), (0.0, '5324', 'Houston')]
9


In [198]:
distance_km = [x[0] for x in distances]

# we need to invert the index as there's an inverse correlation
# between the way we deal with our index from 0 to 1 and the distance 
distance_km_norm = 1.0 - normalized(np.asarray(distance_km, dtype=np.float32))

print(distance_km_norm)
print(len(distance_km_norm))

[[0.722953200340271, 0.6076391339302063, 0.6076391339302063, 0.6076391339302063, 0.722953200340271, 0.722953200340271, 0.6076391339302063, 0.6076391339302063, 1.0]]
1


In [193]:
# merge cosine_similarity and distance
matching_docs_ni = []

# combining cosine_similarity and the new index
for i in range(len(matching_docs_cos)):
    m_doc = matching_docs_cos[i]
    m_doc_id = m_doc[1]
    
    # cosine_similarity and normalized distance
    m_doc_cos = m_doc[0]
    dist_i = distance_km_norm[0][i]
    
    # weighted mean value
    # giving more weight to the distance...
    # ((w1 * x1) + (w2 * x2)) / (w1 + w2) 
    mean_v = ((m_doc_cos * 0.2) + (dist_i * 0.8))
    matching_docs_ni.append((mean_v, m_doc_id))

In [194]:
# creating heap structure
heapq.heapify(matching_docs_ni)

# showing the top-k where k = 10
topk = heapq.nlargest(10, matching_docs_ni)
print(topk)

[(0.8434101335737055, '5324'), (0.6573025188292254, '7172'), (0.6380986376923804, '8543'), (0.60854410948171, '16178'), (0.5424315696950692, '15417'), (0.523387765165096, '14105'), (0.5182732854852543, '17058'), (0.5170698859118533, '1243'), (0.5162928563536583, '4451')]


In [195]:
# todo: visualize data with pandas
for md in topk:
    df = pd.read_csv(tsv_dir + 'doc_' + md[1] + '.tsv', sep='\t', usecols=[2, 4, 5, 6])
    print(md)
    print(df.columns[0])
    print(df.columns[1])
    print(df.columns[2])
    print(df.columns[3] + '\n')

(0.8434101335737055, '5324')
Houston
Contemporary styled apartment with a beautiful color scheme attached. Consists of 1 Bedroom (Master), Dining area, 2 balconies, Kitchen (Fully equipped), Living Room, and Bathroom with bath/shower combo and two sinks included. \nLocated 24 miles from NRG Stadium; approximately 35 minutes away. \nLocated 22 miles from Downtown Houston; approximately 32 minutes away.
29.7320840939491
-95.6537604044484

(0.6573025188292254, '7172')
Austin
New contemporary apartment less than a mile from beautiful Lake Austin and a 15-20 minute drive to downtown Austin. Apartment features a living area, private bedroom with full bed, new queen sofa bed, private entrance and balcony.
30.3455405518117
-97.8594403179482

(0.6380986376923804, '8543')
Austin
Beautiful 1 bed 1 bath apartment with balcony view AND garage parking. In downtown Austin! 5 min drive from 6th Street/ Town Lake, &amp; Rainey Street. Average cost for a RideShare if you don't have a car to downtown nig