# Homework 3 ADM
### group №15 composed by Francesco Romeo, Katsiaryna Zavadskaya, Leandro B. Gentili. 

# Step 1 & 2: tsv documents' creation

In [1]:
# imports
import csv
import heapq
import pandas as pd
import numpy as np
import math

# utils
from csv2dict import CSV2Dict
from utils import normalized
from utils import preprocessing_nltk
from utils import compute_tfidf
from utils import cosine_similarity
from utils import distance_function

# globals: every path is derived from the single data root
base_dir = './data/'
tsv_dir = '{}tsv/'.format(base_dir)
dataset = '{}Airbnb_Texas_Rentals.csv'.format(base_dir)

# folder holding the pickled store
pickle_location = '{}store/'.format(base_dir)

#### As described in the README file, here we load the tsv files that were created beforehand using the CSV2Dict class. The code for creating the tsv files is provided in the README and was run only once, at the very beginning. Here we simply load those previously created tsv files.

In [2]:
# wrapper for csv processing
#
# Arguments handed to CSV2Dict:
#   dataset               -> the dataset we're using
#   delimiter / quotechar -> csv reader preferences
#   splittsv              -> folder that receives one .tsv per .csv line;
#                            presumably None disables the .tsv split (confirm in CSV2Dict)
#   topickle              -> (method, folder): method is 'load' or 'save', folder is
#                            where the pickle file is read from / stored to
csv2dict = CSV2Dict(
    dataset,
    topickle=('load', pickle_location),
    splittsv=tsv_dir,
    quotechar='"',
    delimiter=',',
)

# init() returns the dictionary of data structures used by the following cells
data = csv2dict.init()

Loading from pickle: ./data/store/store.pkl


#### docid2words is a dictionary. Each key is the id of a tsv document and each value is the list of all words from that document's title and description. The value list is preprocessed by the preprocessing_nltk function from utils.py. The number of entries in docid2words is the same as the number of tsv documents.

#### word2id is simply a vocabulary of all the unique words from description and title of all the tsv docs.

#### word2docid is in fact an inverted index: each key of the dictionary is a word, and its value is the collection of ids of the documents that contain this specific word.

#### geo2coords is also created using class csv2dict and it contains the name of the city and coordinates from the document.

#### docid2geo keeps the id of each document and the name of the city from this document.

In [3]:
# Pull the individual structures out of the loaded store in one go:
#   docid2words -> the list of words contained in a document (the document content)
#   word2id     -> vocabulary
#   word2docid  -> documents that contain a precise word
#   geo2coords  -> a collection of city, coords and document id
#   docid2geo   -> document id related to the city name
docid2words, word2id, word2docid, geo2coords, docid2geo = (
    data[name]
    for name in ('docid2words', 'word2id', 'word2docid', 'geo2coords', 'docid2geo')
)

#### Next, using our inverted index word2docid, for each key (a unique word from the titles and descriptions) we attach the tf-idf value of that word in each document that contains it. A sample is shown below.

In [4]:
# tf-idf weights computed from the inverted index and the document contents
tfidf_ii = csv2dict.tfidf_inverse_index(word2docid, docid2words)

# the result carries two views: the tf-idf inverted index and the per-document tf-idf
word2docid_tfidf, docid2words_tfidf = (
    tfidf_ii['word2docid_tfidf'],
    tfidf_ii['docid2words_tfidf'],
)

In [5]:
# sample of content for doc №10: its words first, then their tf-idf weights
print(docid2words['10'], docid2words_tfidf['10'], sep='\n')

['privat', 'entranc', 'cozi', 'histor', 'privat', 'studio']
{'privat': 0.18737070484602142, 'cozi': 0.13566021410161513, 'entranc': 0.23998598119244557, 'histor': 0.19322314834126833, 'studio': 0.2496669036660849}


#### Our next step is to create the document vectors: for each document id d, every word of document d contributes its tf-idf value. As a result, each document is represented by a numeric vector as long as the vocabulary, holding the tf-idf value at the position of each word it contains and 0 everywhere else.

In [6]:
# computing document vectors
# docid2vec maps each document id to a vector as long as the vocabulary,
# holding the tf-idf of every word the document contains and 0.0 elsewhere
docid2vec = {}

# vocabulary in vector order and the all-zero template vector
words_voc = list(word2id.keys())
empty_vec = [0.0 for w in words_voc]

# word -> position lookup, built once: calling words_voc.index(w) for every
# word of every document is a linear scan each time
word_pos = {w: i for i, w in enumerate(words_voc)}

for d, doc_words in docid2words.items():

    # shallow copy through slicing
    vec = empty_vec[:]
    doc_tfidf = docid2words_tfidf[d]

    # putting tfidf values related to words contained
    # in document in their position of the vector
    for w in doc_words:
        vec[word_pos[w]] = doc_tfidf[w]

    docid2vec[d] = vec

In [7]:
# a document vector must be exactly as long as the vocabulary:
# the two printed numbers are expected to match
print(len(docid2vec['999']), len(word2id), sep='\n')

10127
10127


# Step 3 : Search Engine

#### Asking user to put the query and preprocessing this query in the same way as title and description previously.

In [14]:
# read the user's query and run it through the same preprocessing
# that was applied to titles and descriptions
query = preprocessing_nltk(input())

print(query)

beautiful apartment downtown balcony
['beauti', 'apart', 'downtown', 'balconi']


#### Next we are creating query vector. We treat user query as a document from our collection, so for each word in the query we put tfidf value. This is our query vector.

In [20]:
# build user input query vector
# the query is treated like one more document: every query word gets its tf-idf
# at that word's position in the vocabulary, every other position stays 0.0

# vocabulary in vector order and a word -> position lookup built once
words_voc = list(word2id.keys())
word_pos = {w: i for i, w in enumerate(words_voc)}

# empty vector
uiq_vec = [0.0 for w in words_voc]

# number of documents in the collection
n_docs = len(docid2words)

# for each word in query
for w in query:

    # a word that is not in the vocabulary (or appears in no document) has no
    # slot in the vector and no document frequency: skip it instead of crashing
    docs_with_w = word2docid.get(w)
    if w not in word_pos or not docs_with_w:
        continue

    # push the tfidf value for that word
    uiq_vec[word_pos[w]] = compute_tfidf(query.count(w), len(query), n_docs, len(docs_with_w))

#### Now when we have our query and word2docid which contains all the documents for each word, we find this list of id-documents for each word from query and intersect these lists. matching_docs is list of documents id which contain all the words from query. matching_docs is the result of our search engine.

In [23]:
# here we are taking the set of document ids for each word in user query and
# intersect these sets to receive only the ids of documents which contain ALL of these query words.

# for each word in user's query:
# get document ids containing that word or empty set()
sets = [word2docid.get(q) or set() for q in query]

# conjunctive query (AND)
# set.intersection() needs at least one set: an empty query (for example when
# preprocessing removed every word) matches nothing instead of raising TypeError
matching_docs = list(set.intersection(*sets)) if sets else []

# info
print(len(matching_docs))
print(matching_docs)

9
['5324', '1243', '14105', '15417', '7172', '4451', '16178', '8543', '17058']


#### Here we visualize each document which appeared in the result of our search engine.

In [24]:
# show city, description, latitude and longitude of every matching document
for md in matching_docs:
    # each .tsv holds one record and no header row, so read it as data
    # (header=None); letting pandas take it as a header would rename duplicate
    # values ("x.1") and turn blank fields into "Unnamed: n".
    # dtype=str + keep_default_na=False keep every field exactly as written.
    df = pd.read_csv(tsv_dir + 'doc_' + md + '.tsv', sep='\t', header=None,
                     usecols=[2, 4, 5, 6], dtype=str, keep_default_na=False)
    city, description, latitude, longitude = df.iloc[0]
    print(md)
    print(city)
    print(description)
    print(latitude)
    print(longitude + '\n')

5324
Houston
Contemporary styled apartment with a beautiful color scheme attached. Consists of 1 Bedroom (Master), Dining area, 2 balconies, Kitchen (Fully equipped), Living Room, and Bathroom with bath/shower combo and two sinks included. \nLocated 24 miles from NRG Stadium; approximately 35 minutes away. \nLocated 22 miles from Downtown Houston; approximately 32 minutes away.
29.7320840939491
-95.6537604044484

1243
Dallas
In the heart of historic Old East Dallas, this is a beautiful 1930’s Victorian apartment house that’s as classic as it is comfortable. Relax on the big, shared upstairs balcony with a cup of coffee or glass of wine overlooking sleepy Junius Street. Find original character everywhere with modern amenities like highspeed WiFi and Amazon Fire TV w/ Netflix and Hulu. Prime location near Downtown, Deep Ellum and Lower Greenville. Cafes, shopping, nightlife and restaurants are conveniently close.
32.7963935721254
-96.7717690173793

14105
Dallas
Our apartment is a cute ar

#### Next we are calculating cosine similarity between each document from matching_docs and query vector. To do that we use scipy package.

In [38]:
# (cosine similarity with the query vector, document id) for every matching document
matching_docs_cos = [
    (cosine_similarity(uiq_vec, docid2vec[d]), d)
    for d in matching_docs
]

#### Then we create a heap structure from the list of (cosine similarity, document id) pairs. After that we take the top 10 documents according to cosine similarity.

In [40]:
# creating heap structure
# heapify reorders matching_docs_cos in place into min-heap order;
# the tuples compare by cosine similarity first and by document id on ties
heapq.heapify(matching_docs_cos)

# showing the top-k where k = 10
# nlargest returns the entries sorted from the highest similarity down;
# when fewer than 10 documents matched, all of them are returned
topk_cos = heapq.nlargest(10, matching_docs_cos)
print(topk_cos)

[(0.3946997927850433, '7172'), (0.29868038710081823, '8543'), (0.28160131275452094, '15417'), (0.21705066786852723, '5324'), (0.18638229010465457, '14105'), (0.16080989170544646, '17058'), (0.15479289383844108, '1243'), (0.15090774604746593, '4451'), (0.15090774604746593, '16178')]


In [41]:
# show city, description, latitude and longitude of the top documents by cosine similarity
# each md is a (cosine similarity, document id) tuple
for md in topk_cos:
    # each .tsv holds one record and no header row, so read it as data
    # (header=None); letting pandas take it as a header would rename duplicate
    # values ("x.1") and turn blank fields into "Unnamed: n".
    # dtype=str + keep_default_na=False keep every field exactly as written.
    df = pd.read_csv(tsv_dir + 'doc_' + md[1] + '.tsv', sep='\t', header=None,
                     usecols=[2, 4, 5, 6], dtype=str, keep_default_na=False)
    city, description, latitude, longitude = df.iloc[0]
    print(md)
    print(city)
    print(description)
    print(latitude)
    print(longitude + '\n')

(0.3946997927850433, '7172')
Austin
New contemporary apartment less than a mile from beautiful Lake Austin and a 15-20 minute drive to downtown Austin. Apartment features a living area, private bedroom with full bed, new queen sofa bed, private entrance and balcony.
30.3455405518117
-97.8594403179482

(0.29868038710081823, '8543')
Austin
Beautiful 1 bed 1 bath apartment with balcony view AND garage parking. In downtown Austin! 5 min drive from 6th Street/ Town Lake, &amp; Rainey Street. Average cost for a RideShare if you don't have a car to downtown nightlife is $6.
30.2861570183293
-97.7475284766327

(0.28160131275452094, '15417')
Austin
Large 1-bedroom apartment in the heart of South Austin. Beautiful view of downtown from balcony! Bus stop right across the street that will take you Dowtown/Zilker/Barton Springs. Bars and restaurants within walking distance. Quiet Complex w/parking.
30.2361541000197
-97.7829581256141

(0.21705066786852723, '5324')
Houston
Contemporary styled apartme

# Step 4

#### This is an example of how another feature of interest (Location in this case) could be combined to our previous "ranking" method. In fact, we could consider document cosine similarity as a good starting point to be enhanced with a new normalized index (to maintain consistency and still being able to compare the old and new ranks).

#### Hence, we ask the user for a preferred geographical position (city name) and then compute the distance between the user's position and each room ad. Note that there is an inverse relation between distance and the way we deal with ranking: on one hand, the higher the rank (cosine) is, the better it is; on the other hand, the lower the distance is, the better it is. For this reason, we invert the normalized index obtained through the distance calculations.

#### The combination between this two normalized values (cosine similarity and distance) will be achieved through a weighted mean (https://en.wikipedia.org/wiki/Weighted_arithmetic_mean)

#### Moreover, we decided to give more weight to distances in Step 4. Lastly, this could be considered like a small example of how much a feature could contribute to a decent ranking system, as an alternative to cut off a significant proportion of data (filtering). 

In [28]:
# ask user's position to show him/her
# the better place nearby
# the raw text is kept as typed and later used as a key of geo2coords (a city name)
u_pos = input()

Houston


In [29]:
# get coords of users input
# geo2coords is keyed by city name; stop here with a clear error when the city
# is unknown instead of printing a warning and then failing on the lookup below
if u_pos not in geo2coords:
    raise ValueError('This city is not supported!')

coords = geo2coords[u_pos]

In [30]:
# show the coordinates stored in geo2coords for the user's city
print(coords)

('29.8293522272149', '-95.0815494887563')


#### Here we calculate distances between each document from matching_docs and user's city.

In [31]:
# one (distance, document id, city) entry per matching document:
# the document's city comes from docid2geo, the city's coords from geo2coords,
# and the distance is measured from the user's coords
distances = [
    (distance_function(coords, geo2coords[docid2geo[m]]), m, docid2geo[m])
    for m in matching_docs
]

In [37]:
# the distances, then a size check: one distance entry per matching document
print(distances, len(distances), len(matching_docs), sep='\n')

[(0.0, '5324', 'Houston'), (369.64077265826234, '1243', 'Dallas'), (369.64077265826234, '14105', 'Dallas'), (261.00404694199005, '15417', 'Austin'), (261.00404694199005, '7172', 'Austin'), (369.64077265826234, '4451', 'Dallas'), (369.64077265826234, '16178', 'Dallas'), (261.00404694199005, '8543', 'Austin'), (369.64077265826234, '17058', 'Dallas')]
9
9


In [50]:
# raw distances, in the same order as `distances`
distance_km = [dist for dist, _, _ in distances]

# a smaller distance must give a higher score, while the normalized value grows
# with the distance: flip it by subtracting from 1.0
distance_km_norm = 1.0 - normalized(np.asarray(distance_km, dtype=np.float32))

# (inverted normalized distance, document id, city); the scores are read from
# row 0 of the normalized result
matching_docs_dist = [
    (distance_km_norm[0][i], doc_id, city)
    for i, (_, doc_id, city) in enumerate(distances)
]

# document ids alone, in the same order as matching_docs_dist
matching_docs_dist_i = [entry[1] for entry in matching_docs_dist]

print(matching_docs_dist, matching_docs_dist_i, len(matching_docs_dist), sep='\n')

[(1.0, '5324', 'Houston'), (0.60763913, '1243', 'Dallas'), (0.60763913, '14105', 'Dallas'), (0.7229532, '15417', 'Austin'), (0.7229532, '7172', 'Austin'), (0.60763913, '4451', 'Dallas'), (0.60763913, '16178', 'Dallas'), (0.7229532, '8543', 'Austin'), (0.60763913, '17058', 'Dallas')]
['5324', '1243', '14105', '15417', '7172', '4451', '16178', '8543', '17058']
9


#### Next we are merging together our two rankings with weights 20% to cosine and 80% to distance.

In [54]:
# merge cosine_similarity and distance

# weights of the weighted mean (more importance to distances)
COSINE_WEIGHT = 0.2
DISTANCE_WEIGHT = 0.8

# document id -> inverted normalized distance, built once so that every lookup
# below is a dictionary access instead of a list scan; setdefault keeps the
# first entry for an id, like list.index() would
dist_by_doc = {}
for dist_val, doc_id, _city in matching_docs_dist:
    dist_by_doc.setdefault(doc_id, dist_val)

# combining cosine_similarity and the new index:
# one (weighted mean, document id) entry per (cosine similarity, document id)
matching_docs_ni = [
    ((m_cos_val * COSINE_WEIGHT) + (dist_by_doc[m_cos_id] * DISTANCE_WEIGHT), m_cos_id)
    for m_cos_val, m_cos_id in matching_docs_cos
]

#### Similarly to cosine similarity, we create a heap structure and retrieve the top 10 documents according to our new ranking.

In [57]:
# creating heap structure
# heapify reorders matching_docs_ni in place into min-heap order;
# the tuples compare by combined score first and by document id on ties
heapq.heapify(matching_docs_ni)

# showing the top-k where k = 10
# nlargest returns the entries sorted from the highest combined score down;
# when fewer than 10 documents matched, all of them are returned
topk = heapq.nlargest(10, matching_docs_ni)

# info
print(topk)

[(0.8434101335737055, '5324'), (0.6573025188292254, '7172'), (0.6380986376923804, '8543'), (0.6346828228231209, '15417'), (0.523387765165096, '14105'), (0.5182732854852543, '17058'), (0.5170698859118533, '1243'), (0.5162928563536583, '4451'), (0.5162928563536583, '16178')]


#### Visualisation of the top 10 documents according to our new ranking:

In [58]:
# show city, description, latitude and longitude of the top documents by combined score
# each md is a (combined score, document id) tuple
for md in topk:
    # each .tsv holds one record and no header row, so read it as data
    # (header=None); letting pandas take it as a header would rename duplicate
    # values ("x.1") and turn blank fields into "Unnamed: n".
    # dtype=str + keep_default_na=False keep every field exactly as written.
    df = pd.read_csv(tsv_dir + 'doc_' + md[1] + '.tsv', sep='\t', header=None,
                     usecols=[2, 4, 5, 6], dtype=str, keep_default_na=False)
    city, description, latitude, longitude = df.iloc[0]
    print(md)
    print(city)
    print(description)
    print(latitude)
    print(longitude + '\n')

(0.8434101335737055, '5324')
Houston
Contemporary styled apartment with a beautiful color scheme attached. Consists of 1 Bedroom (Master), Dining area, 2 balconies, Kitchen (Fully equipped), Living Room, and Bathroom with bath/shower combo and two sinks included. \nLocated 24 miles from NRG Stadium; approximately 35 minutes away. \nLocated 22 miles from Downtown Houston; approximately 32 minutes away.
29.7320840939491
-95.6537604044484

(0.6573025188292254, '7172')
Austin
New contemporary apartment less than a mile from beautiful Lake Austin and a 15-20 minute drive to downtown Austin. Apartment features a living area, private bedroom with full bed, new queen sofa bed, private entrance and balcony.
30.3455405518117
-97.8594403179482

(0.6380986376923804, '8543')
Austin
Beautiful 1 bed 1 bath apartment with balcony view AND garage parking. In downtown Austin! 5 min drive from 6th Street/ Town Lake, &amp; Rainey Street. Average cost for a RideShare if you don't have a car to downtown nig