**TSV**

In [144]:
# imports
import csv
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# globals
base_dir = './data/'
tsv_dir = base_dir + 'tsv/'
dataset = base_dir + 'Airbnb_Texas_Rentals.csv'

In [145]:
stopwords = set(stopwords.words('english'))

# cleaning stuff
def preprocessing_nltk(e):
    ps = PorterStemmer()
    
    e = e.lower().replace('\n', '')
    e = word_tokenize(e)
    e = [w for w in e if w.isalpha()]
    e = [w for w in e if not w in stopwords]
    e = [ps.stem(w) for w in e]
    return e

In [149]:
docid2words = {}
word2id = {}
word2docid = {}
geo2coords = {}

with open(dataset, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for i, elems in enumerate(csvreader): # line
    
        # skip the first line as we're dealing with a .csv
        if i == 0: continue
        
        # parsing data and taking the index that has to be part of filenames
        #print(len(line.rstrip('\n').split(',')))
        #elems = list(map(lambda x: x.replace('"', ''), line.rstrip('\n').split(',')))
        index = elems[0]

        # geo
        coords = (elems[6], elems[7])
        city = elems[3]
        
        if not city in geo2coords.keys():
            geo2coords[city] = coords
        
        # preprocessing
        descr = preprocessing_nltk(elems[5])
        title = preprocessing_nltk(elems[8])
        elems[5] = ' '.join(descr)
        elems[8] = ' '.join(title)
        
        # docid2words
        docid2words[index] = []
        docid2words[index].extend(descr)
        docid2words[index].extend(title)
        
        # word2id
        for word in docid2words[index]:
            if not word in word2id.keys(): 
                word2id[word] = len(word2id.keys())
        
        # word2docid
        for word in docid2words[index]:
            if not word in word2docid.keys():
                word2docid[word] = set(index)
            else:
                word2docid[word].add(index)
        
        # put .tsv files into 'tsv' folder (that already has to exist)
        with open(tsv_dir + 'doc_' + index + '.tsv', 'w') as doc_out:
            doc_out.write('\t'.join(elems[1:]))

In [150]:
print(word2id['marilyn'])
print(geo2coords)

3190
{'Humble': ('30.0201379199512', '-95.2939960042513'), 'San Antonio': ('29.5030676756061', '-98.4476879378504'), 'Houston': ('29.8293522272149', '-95.0815494887563'), 'Bryan': ('30.6373042787676', '-96.3378459729631'), 'Fort Worth': ('32.7470973543511', '-97.2864343970125'), 'Conroe': ('30.370455202614', '-95.3853190358678'), 'Cedar Creek': ('30.109838308143', '-97.4734169128682'), 'Rockport': ('28.1037257224157', '-97.025832094037'), 'Irving': ('32.8621160084978', '-97.0026429124416'), 'Euless': ('32.8653483241832', '-97.0810343348168'), 'Round Mountain': ('30.3520048470848', '-98.146242154868'), 'Kerrville': ('30.0736670299709', '-99.1613652179931'), 'New Braunfels': ('29.6428109748231', '-98.1144006594656'), 'Austin': ('30.3095220042058', '-97.731710471095'), 'Port Aransas': ('27.8171755178407', '-97.0680901451661'), 'Frisco': ('33.1557762184237', '-96.9304385019675'), 'Katy': ('29.8238021572358', '-95.7306365899502'), 'College Station': ('30.5467038425849', '-96.2828530854182')

In [151]:
import math 

# inverse index
word2docid_tfidf = {}

# docid2word_tfidf
docid2word_tfidf = {}

# create inverse index with tfidf
for w, docs in word2docid.items():
    
    # skip in case we have it already
    if w in word2docid_tfidf.keys(): continue
    
    # empty list (of future tuples)
    word2docid_tfidf[w] = []
    
    # for each document that contain w
    for d in docs:
        
        if not d in docid2word_tfidf.keys():
            docid2word_tfidf[d] = {}
        
        # get document words (content)
        if not d in docid2words.keys(): continue
        content = docid2words[d]
        
        # compute tfidf
        tf = content.count(w) / len(content)
        idf = math.log10(len(docid2words.keys()) / len(docs))
        tfidf = tf * idf
        
        # fill it!
        word2docid_tfidf[w].append((d, tfidf))
        docid2word_tfidf[d][w] = tfidf

In [152]:
# fix not matching results
print(len(set(docid2word_tfidf.keys())))
print(len(set(docid2words.keys())))
print(docid2word_tfidf['10'])

18252
18259
{'privat': 0.18738656253333066, 'cozi': 0.13566814294526974}


In [153]:
# computing document vectors
docid2vec = {}

for d, doc_words in docid2words.items():
    
    # skip if we have it already
    if d in docid2vec.keys(): continue
    
    # empty vector
    docid2vec[d] = []
    
    for w in word2id.keys():
        
        # if that word is the selected doc
        if w in doc_words:
            
            # todo: fix
            # skip if missing
            if not d in docid2word_tfidf.keys():
                continue
            if not w in docid2word_tfidf[d]:
                docid2vec[d].append(0.0)
                continue
            docid2vec[d].append(docid2word_tfidf[d][w])
            
        # else, fill with zeros    
        else: 
            docid2vec[d].append(0.0)

In [154]:
# ask user
query = input()
query = preprocessing_nltk(query)

print(query)

a beautiful room at the beach
['beauti', 'room', 'beach']


In [155]:
# todo: maybe it's wrong... it should consider only stuff within the intersection.
# here we are taking a list of documents numbers for each word in user query and
# intersect these lists to receive only the numbers of documents which contain all of these query words.
sets = []

for q in query:
    sets.append(word2docid.get(q) or set())
matching_docs = list(set.intersection(*sets))

In [156]:
print(len(matching_docs))
print(matching_docs)

75
['2649', '2577', '2114', '12867', '2572', '1238', '2093', '15661', '13263', '15094', '14523', '2166', '1057', '12752', '3368', '288', '2670', '537', '11273', '17413', '8949', '10451', '17606', '4364', '16613', '2636', '16112', '1239', '8523', '18147', '11885', '12025', '2045', '5831', '16853', '12946', '2582', '1760', '16353', '7124', '2592', '15428', '16144', '15885', '10747', '2335', '13662', '14106', '507', '8', '16514', '1564', '240', '4056', '3652', '8751', '4953', '2408', '952', '1604', '8477', '2328', '2020', '2249', '11245', '2355', '14685', '482', '379', '2193', '18164', '2252', '251', '13019', '5562']


In [157]:
print(len(docid2vec.keys()))
# print(len(docid2vec['9999']))
# print(len(docid2vec['89']))
# print(docid2words['9999'])
# print(docid2words['89'])

18259


In [158]:
from scipy import spatial

def cosine_similarity(vec_src, vec_tgt):
    return 1 - spatial.distance.cosine(vec_src, vec_tgt)

In [159]:
# print(cosine_similarity(docid2vec['9999'], docid2vec['89']))

# pointer
i = 0

while i < len(matching_docs) - 1:
    src_id = matching_docs[i]
    tgt_id = matching_docs[i + 1]
    
    print('similarity between ' + src_id + ' and ' + (tgt_id))
    print(docid2words[src_id])
    print(docid2words[tgt_id])
    print(cosine_similarity(docid2vec[src_id], docid2vec[tgt_id]))
    
    i += 1

similarity between 2649 and 2577
['quiet', 'clean', 'beach', 'save', 'time', 'money', 'vacat', 'trip', 'central', 'locat', 'condo', 'great', 'spot', 'kick', 'back', 'enjoy', 'hard', 'earn', 'time', 'away', 'kitchen', 'stock', 'dish', 'cookwar', 'flatwar', 'come', 'equip', 'electr', 'rang', 'oven', 'refriger', 'coffe', 'maker', 'microwav', 'laundri', 'room', 'coin', 'oper', 'laundri', 'floor', 'remodel', 'fulli', 'furnish', 'great', 'view', 'beach', 'beauti', 'corpu', 'christi', 'beach', 'condo']
['exquisit', 'stun', 'home', 'unparallel', 'eleg', 'locat', 'pharoah', 'valley', 'complet', 'remodel', 'sq', 'ft', 'pure', 'luxuri', 'amp', 'attent', 'detail', 'evid', 'everi', 'room', 'beauti', 'room', 'privat', 'bathroom', 'across', 'hall', 'come', 'take', 'walk', 'tree', 'line', 'street', 'glass', 'wine', 'back', 'porch', 'minut', 'bay', 'minut', 'beach', 'padr', 'inland', 'come', 'enjoy', 'spark', 'citi', 'sea', 'ami', 'place', 'room', 'book', 'look']
0.07353539441849077
similarity between 

In [134]:
print(cosine_similarity(docid2vec['9138'], docid2vec['8131']))

0.13497472174093772


In [160]:
# step 4
print(geo2coords)

{'Humble': ('30.0201379199512', '-95.2939960042513'), 'San Antonio': ('29.5030676756061', '-98.4476879378504'), 'Houston': ('29.8293522272149', '-95.0815494887563'), 'Bryan': ('30.6373042787676', '-96.3378459729631'), 'Fort Worth': ('32.7470973543511', '-97.2864343970125'), 'Conroe': ('30.370455202614', '-95.3853190358678'), 'Cedar Creek': ('30.109838308143', '-97.4734169128682'), 'Rockport': ('28.1037257224157', '-97.025832094037'), 'Irving': ('32.8621160084978', '-97.0026429124416'), 'Euless': ('32.8653483241832', '-97.0810343348168'), 'Round Mountain': ('30.3520048470848', '-98.146242154868'), 'Kerrville': ('30.0736670299709', '-99.1613652179931'), 'New Braunfels': ('29.6428109748231', '-98.1144006594656'), 'Austin': ('30.3095220042058', '-97.731710471095'), 'Port Aransas': ('27.8171755178407', '-97.0680901451661'), 'Frisco': ('33.1557762184237', '-96.9304385019675'), 'Katy': ('29.8238021572358', '-95.7306365899502'), 'College Station': ('30.5467038425849', '-96.2828530854182'), 'De

In [162]:
# step 4

import geopy.distance

# the variables of the function takes 2 values for each location
def distance_function (first_location, second_location):
    return round((geopy.distance.geodesic(first_location, second_location).km), 5)#this is better because considers the Earth as an ellipse

house_18257 = geo2coords['Washington']
house_18258 = geo2coords['Dallas']

distance_function (house_18257, house_18258)#this is better because considers the Earth as an ellipse

280.18022