### Import packages

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 100)

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import pairwise_distances

In [3]:
import warnings
warnings.filterwarnings('ignore')

### Functions

### Load Data

In [4]:
data = pd.read_pickle('../data/jabama_gilan_products.pickle')
data.head()

Unnamed: 0,product_id,score,area,capacity,room,floor,stair,price,lat,lng,iranian_toilet,western_toilet,bathroom,exclusive,breakfast
0,villa-503599,,98,7,2,0,15.0,1000000,37.328673,49.128113,True,True,True,False,False
1,ecotourism-597882,3.0,80,7,1,0,2.0,660000,37.096812,49.65271,True,True,True,False,False
2,villa-361255,4.5,60,3,1,0,,467000,37.18439,49.169312,True,False,True,False,False
3,complex-376108,4.0,28,3,0,0,,500000,36.804887,49.413757,True,False,True,False,False
4,villa-323476,3.6,120,6,2,0,4.0,1500000,36.96306,49.578552,True,False,True,True,False


## Preprocessing

In [5]:
# fill none stair and score by zero
data.loc[data.score.isna(), 'score'] = 0
data.loc[data.stair.isna(), 'stair'] = 0

In [6]:
data.isna().sum()

product_id        0
score             0
area              0
capacity          0
room              0
floor             0
stair             0
price             0
lat               0
lng               0
iranian_toilet    0
western_toilet    0
bathroom          0
exclusive         0
breakfast         0
dtype: int64

In [7]:
data.iranian_toilet = data.iranian_toilet.apply(lambda x: 1 if x else 0)
data.western_toilet = data.western_toilet.apply(lambda x: 1 if x else 0)
data.bathroom = data.bathroom.apply(lambda x: 1 if x else 0)
data.exclusive = data.exclusive.apply(lambda x: 1 if x else 0)
data.breakfast = data.breakfast.apply(lambda x: 1 if x else 0)

In [62]:
data['type'] = data.product_id.apply(lambda x: x.split('-')[0])

## Numerical features

In [63]:
num_product = data.drop(['product_id', 'lat', 'lng'], axis=1)
num_columns = num_product.columns

In [65]:
num_product

Unnamed: 0,score,area,capacity,room,floor,stair,price,iranian_toilet,western_toilet,bathroom,exclusive,breakfast,type
0,0,98,7,2,0,15,1000000,1,1,1,0,0,villa
1,3,80,7,1,0,2,660000,1,1,1,0,0,ecotourism
2,4.5,60,3,1,0,0,467000,1,0,1,0,0,villa
3,4,28,3,0,0,0,500000,1,0,1,0,0,complex
4,3.6,120,6,2,0,4,1500000,1,0,1,1,0,villa
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4698,4.8,40,4,0,0,0,814000,1,0,1,0,0,complex
4699,4.5,70,6,2,0,0,1000000,1,1,1,0,0,villa
4700,4.7,150,15,3,2,20,2000000,1,1,1,1,0,apartment
4701,4.7,90,6,1,0,3,553000,1,0,1,0,0,villa


In [66]:
from sklearn.preprocessing import OneHotEncoder

In [67]:
enc = OneHotEncoder()

In [71]:
enc.fit_transform(num_product[['type']])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4703 stored elements and shape (4703, 9)>

In [64]:
num_scaler = MinMaxScaler()
scaled_num = num_scaler.fit_transform(num_product)

pd.DataFrame(scaled_num, index=data.product_id, columns=num_columns).head() # just for show result

ValueError: could not convert string to float: 'villa'

In [37]:
num_weights = {
    "score": 2,
    "area": 1,
    "capacity": 4,
    "room": 3,
    "floor": 2,
    "stair": 1,
    "price": 10,
    "iranian_toilet": 0.5,
    "western_toilet": 1,
    "bathroom": 1,
    "exclusive": 1,
    "breakfast": 1
}
num_weights_df = pd.DataFrame(num_weights, columns=num_columns, index=num_product.index) # convert to df

In [39]:
num_products_vector = pd.DataFrame(scaled_num, columns=num_columns).mul(num_weights_df)
num_products_vector.describe().T.sort_values('mean', ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
score,4703.0,1.444329,0.686545,0.0,1.4,1.76,1.88,2.0
capacity,4703.0,0.750824,0.400715,0.0,0.5,0.625,0.875,4.0
price,4703.0,0.666129,0.57513,0.0,0.35,0.5,0.75,10.0
western_toilet,4703.0,0.588773,0.492109,0.0,0.0,1.0,1.0,1.0
room,4703.0,0.553769,0.337437,0.0,0.375,0.75,0.75,3.0
iranian_toilet,4703.0,0.448969,0.151381,0.0,0.5,0.5,0.5,0.5
floor,4703.0,0.429159,0.224736,0.0,0.333333,0.333333,0.333333,2.0
exclusive,4703.0,0.305337,0.460599,0.0,0.0,0.0,1.0,1.0
area,4703.0,0.143796,0.086295,0.0,0.092308,0.130769,0.169231,1.0
stair,4703.0,0.066222,0.100763,0.0,0.0,0.042857,0.071429,1.0


### Calculate rooms distance

In [41]:
num_distance_matrix = pairwise_distances(num_products_vector.values, metric='euclidean')
pd.DataFrame(num_distance_matrix, index=data.product_id, columns=data.product_id) # just for show result

product_id,villa-503599,ecotourism-597882,villa-361255,complex-376108,villa-323476,complex-380288,villa-501818,complex-655221,villa-545746,apartment-508766,...,inn-389108,villa-402641,villa-639276,cottage-475354,cottage-611772,complex-370677,villa-379164,apartment-491137,villa-487515,villa-399760
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
villa-503599,0.000000,1.282490,2.179675,2.119556,2.043909,2.065832,1.580433,1.425173,2.277216,1.890498,...,2.015340,1.786333,0.196104,2.585176,1.882600,2.334963,1.817526,2.526058,2.184065,2.364748
ecotourism-597882,1.282490,0.000000,1.273215,1.250696,1.547531,1.194774,0.521527,1.768358,1.321070,1.233295,...,0.792763,0.947030,1.277851,2.372509,1.229803,1.345430,0.739055,1.998991,1.217098,1.377567
villa-361255,2.179675,1.273215,0.000000,0.428160,1.299772,0.464114,1.268166,2.025723,0.586616,0.637697,...,1.118775,1.561485,2.204308,2.655845,1.815793,0.449109,1.162974,2.439605,0.390949,1.008845
complex-376108,2.119556,1.250696,0.428160,0.000000,1.414929,0.134037,1.145224,1.810680,0.780371,0.871661,...,1.078583,1.683505,2.146008,2.692694,1.995500,0.378173,1.345288,2.591878,0.609333,0.988560
villa-323476,2.043909,1.547531,1.299772,1.414929,0.000000,1.391183,1.776716,2.053794,1.216837,1.147397,...,1.684800,1.533423,2.049156,1.836541,1.668361,1.411192,1.483672,1.777968,1.249304,1.463071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
complex-370677,2.334963,1.345430,0.449109,0.378173,1.411192,0.403464,1.281295,2.068934,0.534241,0.933578,...,1.012597,1.576230,2.351983,2.607733,1.892950,0.000000,1.284593,2.462563,0.479071,0.762765
villa-379164,1.817526,0.739055,1.162974,1.345288,1.483672,1.327074,1.041507,2.308572,1.102621,1.137334,...,0.763075,0.685859,1.821022,2.446903,0.940893,1.284593,0.000000,1.789937,1.095336,1.356083
apartment-491137,2.526058,1.998991,2.439605,2.591878,1.777968,2.533112,2.325902,2.891906,2.116946,2.156604,...,2.162268,1.442927,2.486355,1.606295,1.302374,2.462563,1.789937,0.000000,2.205339,2.103794
villa-487515,2.184065,1.217098,0.390949,0.609333,1.249304,0.563182,1.256194,2.053671,0.361716,0.689823,...,1.091452,1.372832,2.189082,2.369447,1.598713,0.479071,1.095336,2.205339,0.000000,0.678860


## Geographical features

In [43]:
geo_product = data[['lat', 'lng']]

### Calculate products geo distance

In [44]:
geo_product_radians = np.radians(geo_product.values)
geo_distance_matrix = pairwise_distances(geo_product_radians, metric='haversine')
geo_distance_matrix_km = geo_distance_matrix * 6371

In [45]:
s_min = 0
s_max = 1
geo_distance_matrix = (geo_distance_matrix - geo_distance_matrix.min()) / (geo_distance_matrix.max() - geo_distance_matrix.min())
geo_distance_matrix = geo_distance_matrix * (s_max - s_min) + s_min
pd.DataFrame(geo_distance_matrix, index=data.product_id, columns=data.product_id) # just for show result

product_id,villa-503599,ecotourism-597882,villa-361255,complex-376108,villa-323476,complex-380288,villa-501818,complex-655221,villa-545746,apartment-508766,...,inn-389108,villa-402641,villa-639276,cottage-475354,cottage-611772,complex-370677,villa-379164,apartment-491137,villa-487515,villa-399760
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
villa-503599,0.000000,0.223483,0.069205,0.267175,0.239674,0.085923,0.286804,0.129414,0.034690,0.183104,...,0.121854,0.182083,0.388930,0.142767,0.169810,0.146992,0.546289,0.114834,0.057458,0.146543
ecotourism-597882,0.223483,0.000000,0.184829,0.163157,0.068413,0.137810,0.179441,0.204763,0.190237,0.100008,...,0.146563,0.100139,0.235762,0.160256,0.176833,0.182691,0.330759,0.143570,0.264861,0.105793
villa-361255,0.069205,0.184829,0.000000,0.199613,0.184498,0.061193,0.291545,0.168896,0.062101,0.177550,...,0.055510,0.176584,0.384732,0.162853,0.193722,0.175783,0.515358,0.050704,0.126662,0.088919
complex-376108,0.267175,0.163157,0.199613,0.000000,0.096302,0.201846,0.342391,0.320497,0.247746,0.248011,...,0.145322,0.247695,0.392058,0.287470,0.313492,0.309410,0.423942,0.152708,0.323715,0.130405
villa-323476,0.239674,0.068413,0.184498,0.096302,0.000000,0.157186,0.247636,0.254288,0.210979,0.162537,...,0.133011,0.162444,0.296497,0.214054,0.235523,0.236849,0.354162,0.134714,0.289864,0.095623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
complex-370677,0.146992,0.182691,0.175783,0.309410,0.236849,0.124785,0.150701,0.030122,0.122096,0.088054,...,0.193854,0.087478,0.257679,0.022808,0.024493,0.000000,0.449313,0.184213,0.144982,0.183975
villa-379164,0.546289,0.330759,0.515358,0.423942,0.354162,0.463956,0.318129,0.479277,0.511665,0.371618,...,0.475632,0.372549,0.231673,0.434893,0.428846,0.449313,0.000000,0.473623,0.576610,0.434436
apartment-491137,0.114834,0.143570,0.050704,0.152708,0.134714,0.059908,0.275610,0.185579,0.095910,0.160682,...,0.009720,0.159849,0.360263,0.166399,0.197068,0.184213,0.473623,0.000000,0.171023,0.040184
villa-487515,0.057458,0.264861,0.126662,0.323715,0.289864,0.132733,0.294888,0.118842,0.079032,0.205631,...,0.178526,0.204663,0.401048,0.149717,0.169475,0.144982,0.576610,0.171023,0.000000,0.199882


## Total rooms distance

In [53]:
geo_weight = 100

total_rooms_distance = 1 * num_distance_matrix + geo_weight * geo_distance_matrix
result_df = pd.DataFrame(total_rooms_distance, index=data.product_id, columns=data.product_id)

In [58]:
product_id = 'suite-518283'

for i in result_df.loc[product_id].iloc[:].sort_values().head(10).index:
    print(f'https://www.jabama.com/stay/{i}')

https://www.jabama.com/stay/suite-518283
https://www.jabama.com/stay/apartment-104932
https://www.jabama.com/stay/apartment-103834
https://www.jabama.com/stay/apartment-353414
https://www.jabama.com/stay/suite-327771
https://www.jabama.com/stay/villa-108434
https://www.jabama.com/stay/villa-527782
https://www.jabama.com/stay/villa-656682
https://www.jabama.com/stay/apartment-361071
https://www.jabama.com/stay/apartment-602107
