# Step 1 & 2

In [68]:
# imports
import csv
import heapq
import math
import os

import numpy as np
import pandas as pd

# utils
from utils import normalized
from utils import preprocessing_nltk
from utils import compute_tfidf
from utils import cosine_similarity
from utils import distance_function

# globals
# root folder for the input dataset and the generated files
base_dir = './data/'
# folder that receives one .tsv file per indexed listing
tsv_dir = base_dir + 'tsv/'
# source CSV read by the indexing cell
dataset = base_dir + 'Airbnb_Texas_Rentals.csv'

In [99]:
# docid -> list of preprocessed words of the document
# (description words followed by title words)
docid2words = {}

# vocabulary: word -> integer id, assigned in order of first appearance
word2id = {}

# word -> set of ids of the documents that contain it
word2docid = {}

# city name -> coordinates of the first listing seen for that city
geo2coords = {}

# docid -> city name
docid2geo = {}

# tabs and line breaks inside a field would break the one-line .tsv layout,
# so they are replaced by spaces when a document is written out
tsv_unsafe = str.maketrans('\t\r\n', '   ')


def has_location(lat_lon):
    """Return True when every coordinate parses as a finite number.

    csv.reader yields strings only, so a missing coordinate shows up as text
    (empty, 'NA', 'nan', ...) and has to be detected by parsing it.
    """
    try:
        return all(math.isfinite(float(value)) for value in lat_lon)
    except ValueError:
        return False


# make sure the output folder exists so the cell also runs on a fresh checkout
os.makedirs(tsv_dir, exist_ok=True)

# processing the main .csv and creating one .tsv per listing
# newline='' is what the csv module asks for when it is handed a file object
# NOTE(review): assumes the dataset is UTF-8 encoded - confirm
with open(dataset, 'r', newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')

    # skip header
    next(csvreader, None)

    # extracting data from each line
    for elems in csvreader:

        # file index
        index = elems[0]

        # geo (coords and city); coordinates are kept as the raw CSV strings
        coords = (elems[6], elems[7])
        city = elems[3]

        # preprocessing content (words)
        descr = preprocessing_nltk(elems[5])
        title = preprocessing_nltk(elems[8])

        # discarding docs with no words...
        if len(descr) == 0 and len(title) == 0:
            continue

        # we don't want to deal with no location (needed in step 4)
        if not has_location(coords):
            continue

        # add city and its coords (no duplicates): the first listing wins
        geo2coords.setdefault(city, coords)

        # docid2geo
        docid2geo[index] = city

        # docid2words
        docid2words[index] = []
        docid2words[index].extend(descr)
        docid2words[index].extend(title)

        # working with words to fill dictionaries
        for word in docid2words[index]:

            # word2id: a new word gets the next free id
            word2id.setdefault(word, len(word2id))

            # word2docid
            word2docid.setdefault(word, set()).add(index)

        # produce tsv: every column except the index, on a single line
        # utf-8 matches the default encoding pandas uses to read the file back
        with open(tsv_dir + 'doc_' + index + '.tsv', 'w', encoding='utf-8') as doc_out:
            doc_out.write('\t'.join(field.translate(tsv_unsafe) for field in elems[1:]))

Creating the inverted index with tf-idf

In [106]:
# inverted index: word -> list of (docid, tfidf) tuples
word2docid_tfidf = {}

# per-document weights: docid -> {word: tfidf}
docid2words_tfidf = {}

# the number of documents is the same for every word, so read it once
n_docs = len(docid2words)

for term, postings in word2docid.items():

    # how many documents contain this word
    doc_freq = len(postings)
    scored = []

    for doc_id in postings:

        # all the words of the document, repetitions included
        words = docid2words[doc_id]
        weight = compute_tfidf(words.count(term), len(words), n_docs, doc_freq)

        # the same weight goes into both structures
        scored.append((doc_id, weight))
        docid2words_tfidf.setdefault(doc_id, {})[term] = weight

    word2docid_tfidf[term] = scored

In [107]:
# peek at document '10': first its word list, then its word -> tf-idf mapping
sample_id = '10'
print(docid2words[sample_id])
print(docid2words_tfidf[sample_id])

['privat', 'entranc', 'cozi', 'histor', 'privat', 'studio']
{'privat': 0.18737070484602142, 'cozi': 0.13566021410161513, 'entranc': 0.23998598119244557, 'histor': 0.19322314834126833, 'studio': 0.2496669036660849}


In [108]:
# computing document vectors
# docid -> list of tf-idf weights with one slot per vocabulary word,
# laid out in word2id iteration order
docid2vec = {}

# todo: use numpy
for d in docid2words:

    # tf-idf weights of the words occurring in this document; falling back to
    # an empty mapping keeps the vector full-length even for a document that
    # was never scored, instead of silently producing a shorter vector
    weights = docid2words_tfidf.get(d, {})

    # exactly one slot per vocabulary word, 0.0 when the word is not in the
    # document, so every vector has len(word2id) entries
    docid2vec[d] = [weights.get(w, 0.0) for w in word2id]

In [111]:
# a document vector holds one entry per vocabulary word,
# so the two numbers printed below are expected to be equal
vec_len = len(docid2vec['999'])
vocab_len = len(word2id)
print(vec_len)
print(vocab_len)

10127
10127


# Step 3

In [129]:
# read the free-text query and run it through the same
# preprocessing that was applied to the documents
query = preprocessing_nltk(input())

print(query)

a luxurious apartment downtown with pool quiet
['luxuri', 'apart', 'downtown', 'pool', 'quiet']


In [130]:
# build user input query vector
# one slot per word2docid key: the tf-idf of the word inside the query
# when the query contains it, 0.0 otherwise
query_len = len(query)
n_docs = len(docid2words)

uiq_vec = [
    compute_tfidf(query.count(term), query_len, n_docs, len(postings)) if term in query else 0.0
    for term, postings in word2docid.items()
]

In [131]:
# For every query word take the set of documents that contain it, then
# intersect the sets: only documents containing ALL the query words remain.
# A word missing from the index contributes an empty set, so nothing matches.
sets = [word2docid.get(q, set()) for q in query]

# set.intersection() needs at least one set, so an empty query matches nothing.
# Sorting pins the order, which would otherwise depend on string hashing and
# change from one kernel session to the next.
matching_docs = sorted(set.intersection(*sets)) if sets else []

print(len(matching_docs))
print(matching_docs)

2
['13864', '637']


In [132]:
# todo: show matching_docs through pandas
# print city, description, latitude and longitude of every matching document
for md in matching_docs:
    # A doc_*.tsv file holds one data line and no header. header=None reads that
    # line as data; otherwise pandas turns the values into column names and
    # renames empty or repeated ones. dtype=str and keep_default_na=False
    # return every field as the text stored in the file.
    df = pd.read_csv(tsv_dir + 'doc_' + md + '.tsv', sep='\t', header=None,
                     usecols=[2, 4, 5, 6], dtype=str, keep_default_na=False)
    city, description, latitude, longitude = df.iloc[0]
    print(md)
    print(city)
    print(description)
    print(latitude)
    print(longitude + '\n')

13864
Austin
Quiet relax atmosphere just 10 minutes from downtown. 1 bed room 1 bath with washer dryer included,queen bed everything all white, balcony, 17ft tall ceilings, garden tub separate from shower. Privacy and quite neighbors, own desires parking spot. Pets welcome, gym open anytime with lap pool, tv outside and bar b q pit.
30.2589482442465
-97.8628891451995

637
Austin
Quiet relax atmosphere just 10 minutes from downtown. 1 bed room 1 bath with washer dryer included,queen bed everything all white, balcony, 17ft tall ceilings, garden tub separate from shower. Privacy and quite neighbors, own desires parking spot. Pets welcome, gym open anytime with lap pool, tv outside and bar b q pit.
30.2589482442465
-97.8628891451995



In [133]:
# cosine similarity between the query vector and every matching document,
# stored as (score, docid) tuples; a document whose vector length differs
# from the query vector is left out
# to test it all over the docs -> docid2vec.keys()
query_dim = len(uiq_vec)
matching_docs_cos = [
    (cosine_similarity(uiq_vec, docid2vec[doc_id]), doc_id)
    for doc_id in matching_docs
    if len(docid2vec[doc_id]) == query_dim
]

In [134]:
# showing the top-k where k = 10
# heapq.nlargest works on any iterable and returns a new list sorted from the
# best score down, so no heapify is needed. Not calling heapify also leaves
# matching_docs_cos in its original order for any later cell that relies on it.
topk_cos = heapq.nlargest(10, matching_docs_cos)
print(topk_cos)

[(0.20318214484274322, '637'), (0.20318214484274322, '13864')]


In [135]:
# todo: show top-k through pandas
# print city, description, latitude and longitude of the best-scoring documents
for md in topk_cos:
    # md is a (score, docid) tuple. A doc_*.tsv file holds one data line and no
    # header. header=None reads that line as data; otherwise pandas turns the
    # values into column names and renames empty or repeated ones. dtype=str
    # and keep_default_na=False return every field as the text stored in the file.
    df = pd.read_csv(tsv_dir + 'doc_' + md[1] + '.tsv', sep='\t', header=None,
                     usecols=[2, 4, 5, 6], dtype=str, keep_default_na=False)
    city, description, latitude, longitude = df.iloc[0]
    print(md)
    print(city)
    print(description)
    print(latitude)
    print(longitude + '\n')

(0.20318214484274322, '637')
Austin
Quiet relax atmosphere just 10 minutes from downtown. 1 bed room 1 bath with washer dryer included,queen bed everything all white, balcony, 17ft tall ceilings, garden tub separate from shower. Privacy and quite neighbors, own desires parking spot. Pets welcome, gym open anytime with lap pool, tv outside and bar b q pit.
30.2589482442465
-97.8628891451995

(0.20318214484274322, '13864')
Austin
Quiet relax atmosphere just 10 minutes from downtown. 1 bed room 1 bath with washer dryer included,queen bed everything all white, balcony, 17ft tall ceilings, garden tub separate from shower. Privacy and quite neighbors, own desires parking spot. Pets welcome, gym open anytime with lap pool, tv outside and bar b q pit.
30.2589482442465
-97.8628891451995



# Step 4

In [136]:
# todo: geo2coords has to be preprocessed

# ask the user's position so that places nearby can be ranked higher;
# the text typed here is used as a key of geo2coords (a city name)
u_pos = input()

Austin


In [138]:
# get coords
# stop here with a readable message when the city is unknown, instead of
# printing the message and then failing on the lookup with a bare KeyError
if u_pos not in geo2coords:
    raise ValueError('This city is not supported!')

coords = geo2coords[u_pos]

In [139]:
# coordinates stored in geo2coords for the user's city
print(coords)

('30.3095220042058', '-97.731710471095')


In [156]:
# pair every matching document with its city; the distance is city-level,
# because each document uses the coordinates stored for its city in geo2coords
doc_cities = [(doc_id, docid2geo[doc_id]) for doc_id in matching_docs]

# (distance from the user's location, docid, city), in matching_docs order
distances = [
    (distance_function(coords, geo2coords[doc_city]), doc_id, doc_city)
    for doc_id, doc_city in doc_cities
]

In [157]:
# (distance, docid, city) tuples, followed by the number of matching documents
print(distances)
print(len(matching_docs))

[(0.0, '13864', 'Austin'), (0.0, '637', 'Austin')]
2


In [168]:
# keep only the distance from every (distance, docid, city) tuple
distance_km = [dist for dist, _, _ in distances]
distance_arr = np.asarray(distance_km, dtype=np.float32)

# a smaller distance has to give a higher score, so the normalized
# distance is flipped by subtracting it from 1.0
distance_km_norm = 1.0 - normalized(distance_arr)

print(distance_km_norm)
print(len(distance_km_norm[0]))

[[1. 1.]]
2


In [178]:
# merge cosine_similarity and distance

# weights of the two components, giving more weight to the distance...
# they sum to 1, so the weighted mean ((w1 * x1) + (w2 * x2)) / (w1 + w2)
# needs no explicit division
COS_WEIGHT = 0.2
DIST_WEIGHT = 0.8

# distances and distance_km_norm[0] are in the same order, so zipping them
# gives docid -> normalized distance score. Looking the score up by document id
# keeps each cosine value paired with the right distance even when
# matching_docs_cos has been reordered or is missing some documents.
dist_score_by_doc = {
    doc_id: score
    for (_, doc_id, _), score in zip(distances, distance_km_norm[0])
}

# combining cosine_similarity and the new index into (combined score, docid)
matching_docs_ni = [
    ((m_doc_cos * COS_WEIGHT) + (dist_score_by_doc[m_doc_id] * DIST_WEIGHT), m_doc_id)
    for m_doc_cos, m_doc_id in matching_docs_cos
]

In [177]:
# showing the top-k where k = 10
# heapq.nlargest works on any iterable and returns a new list sorted from the
# best score down, so matching_docs_ni does not have to be heapified (and
# reordered in place) first
topk = heapq.nlargest(10, matching_docs_ni)
print(topk)

[(0.8406364289685487, '637'), (0.8406364289685487, '13864')]


In [179]:
# todo: visualize data with pandas
# print city, description, latitude and longitude of the final top-k documents
for md in topk:
    # md is a (score, docid) tuple. A doc_*.tsv file holds one data line and no
    # header. header=None reads that line as data; otherwise pandas turns the
    # values into column names and renames empty or repeated ones. dtype=str
    # and keep_default_na=False return every field as the text stored in the file.
    df = pd.read_csv(tsv_dir + 'doc_' + md[1] + '.tsv', sep='\t', header=None,
                     usecols=[2, 4, 5, 6], dtype=str, keep_default_na=False)
    city, description, latitude, longitude = df.iloc[0]
    print(md)
    print(city)
    print(description)
    print(latitude)
    print(longitude + '\n')

(0.8406364289685487, '637')
Austin
Quiet relax atmosphere just 10 minutes from downtown. 1 bed room 1 bath with washer dryer included,queen bed everything all white, balcony, 17ft tall ceilings, garden tub separate from shower. Privacy and quite neighbors, own desires parking spot. Pets welcome, gym open anytime with lap pool, tv outside and bar b q pit.
30.2589482442465
-97.8628891451995

(0.8406364289685487, '13864')
Austin
Quiet relax atmosphere just 10 minutes from downtown. 1 bed room 1 bath with washer dryer included,queen bed everything all white, balcony, 17ft tall ceilings, garden tub separate from shower. Privacy and quite neighbors, own desires parking spot. Pets welcome, gym open anytime with lap pool, tv outside and bar b q pit.
30.2589482442465
-97.8628891451995

