## Import packages

In [1]:
import numpy as np
import matplotlib
import pandas as pd
# Let pandas display up to 100 rows before it truncates a printed frame/series.
pd.options.display.max_rows = 100

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import pairwise_distances

In [3]:
import warnings
warnings.filterwarnings('ignore')

### Load Data

In [4]:
# Load the product table from a local pickle (Jabama / Gilan listings, judging by the file name).
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
data = pd.read_pickle('../data/jabama_gilan_products.pickle')
# Preview the first rows.
data.head()

Unnamed: 0,product_id,score,area,capacity,room,floor,stair,price,lat,lng,iranian_toilet,western_toilet,bathroom,exclusive,breakfast
0,villa-503599,,98,7,2,0,15.0,1000000,37.328673,49.128113,True,True,True,False,False
1,ecotourism-597882,3.0,80,7,1,0,2.0,660000,37.096812,49.65271,True,True,True,False,False
2,villa-361255,4.5,60,3,1,0,,467000,37.18439,49.169312,True,False,True,False,False
3,complex-376108,4.0,28,3,0,0,,500000,36.804887,49.413757,True,False,True,False,False
4,villa-323476,3.6,120,6,2,0,4.0,1500000,36.96306,49.578552,True,False,True,True,False


## Preprocessing

In [5]:
# Replace missing `score` and `stair` values with zero.
for column in ('score', 'stair'):
    data.loc[data[column].isna(), column] = 0

In [6]:
# Sanity check: count the missing values left in each column.
data.isna().sum()

product_id        0
score             0
area              0
capacity          0
room              0
floor             0
stair             0
price             0
lat               0
lng               0
iranian_toilet    0
western_toilet    0
bathroom          0
exclusive         0
breakfast         0
dtype: int64

In [7]:
# Recode the amenity flags as 0/1 integers (any truthy value becomes 1, anything else 0).
for flag in ('iranian_toilet', 'western_toilet', 'bathroom', 'exclusive', 'breakfast'):
    data[flag] = data[flag].apply(lambda x: 1 if x else 0)

In [8]:
# The accommodation type is the prefix of the product id (e.g. 'villa-503599' -> 'villa').
data['type'] = data.product_id.apply(lambda x: x.split('-')[0])

# One-hot encode the type into dense `type_<name>` indicator columns.
encoder = OneHotEncoder(sparse_output=False)
encoded_array = encoder.fit_transform(data[['type']])
# Reuse `data`'s index: concat(axis=1) aligns rows on index labels, so a frame with a
# default RangeIndex would be misaligned (NaN-padded) whenever `data` is indexed differently.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(['type']),
    index=data.index,
)
data = pd.concat([data, encoded_df], axis=1)

# The helper column is no longer needed once it has been encoded.
del data['type']

## Numerical features

In [9]:
# Every column except the identifier and the coordinates is used as a numerical feature.
num_product = data.drop(columns=['product_id', 'lat', 'lng'])
num_columns = num_product.columns

In [10]:
# Min-max scale every feature column so that differently sized features become comparable.
num_scaler = MinMaxScaler()
num_scaler.fit(num_product)
scaled_num = num_scaler.transform(num_product)

# Display only: preview the scaled features labelled by product id.
pd.DataFrame(scaled_num, columns=num_columns, index=data.product_id).head()

Unnamed: 0_level_0,score,area,capacity,room,floor,stair,price,iranian_toilet,western_toilet,bathroom,...,breakfast,type_apartment,type_complex,type_cottage,type_ecotourism,type_hostel,type_inn,type_suite,type_traditional,type_villa
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
villa-503599,0.0,0.150769,0.1875,0.25,0.166667,0.214286,0.05,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
ecotourism-597882,0.6,0.123077,0.1875,0.125,0.166667,0.028571,0.033,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
villa-361255,0.9,0.092308,0.0625,0.125,0.166667,0.0,0.02335,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
complex-376108,0.8,0.043077,0.0625,0.0,0.166667,0.0,0.025,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
villa-323476,0.72,0.184615,0.15625,0.25,0.166667,0.057143,0.075,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Per-feature multipliers applied to the scaled features.
# A weight of 0 (e.g. `bathroom`) removes that feature from the distance altogether.
num_weights = {
    "score": 2,
    "area": 1,
    "capacity": 10,
    "room": 5,
    "floor": 2,
    "stair": 1,
    "price": 15,
    "iranian_toilet": 1,
    "western_toilet": 1,
    "bathroom": 0,
    "exclusive": 1,
    "breakfast": 1,
    "type_apartment": 10,
    "type_complex": 10,
    "type_cottage": 15,
    "type_ecotourism": 15,
    "type_hostel": 10,
    "type_inn": 10,
    "type_suite": 10,
    "type_traditional": 15,
    "type_villa": 10
}

# Fail early with a clear message if a feature column has no weight; otherwise the
# DataFrame constructor below silently fills that column with NaN.
missing_weights = [column for column in num_columns if column not in num_weights]
if missing_weights:
    raise ValueError(f'No weight defined for feature column(s): {missing_weights}')

# Broadcast the weights to one row per product so they can be multiplied element-wise.
num_weights_df = pd.DataFrame(num_weights, columns=num_columns, index=num_product.index) # convert to df

In [12]:
# Weight the scaled features. The frame is given the same index as `num_weights_df`
# because `.mul` aligns on row labels; a default RangeIndex would produce NaN rows
# whenever `num_product` is not indexed 0..n-1.
# Note: the weights multiply the features before a Euclidean distance is taken, so a
# weight w scales that feature's squared contribution by w**2.
num_products_vector = pd.DataFrame(
    scaled_num,
    columns=num_columns,
    index=num_product.index,
).mul(num_weights_df)

# Summary statistics of the weighted features, largest mean first.
num_products_vector.describe().T.sort_values('mean', ascending=False)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
type_villa,4703.0,5.473102,4.978096,0.0,0.0,10.0,10.0,10.0
type_cottage,4703.0,1.92324,5.015486,0.0,0.0,0.0,0.0,15.0
capacity,4703.0,1.87706,1.001788,0.0,1.25,1.5625,2.1875,10.0
score,4703.0,1.444329,0.686545,0.0,1.4,1.76,1.88,2.0
type_suite,4703.0,1.078035,3.101651,0.0,0.0,0.0,0.0,10.0
price,4703.0,0.999194,0.862695,0.0,0.525,0.75,1.125,15.0
room,4703.0,0.922948,0.562395,0.0,0.625,1.25,1.25,5.0
type_apartment,4703.0,0.910057,2.876479,0.0,0.0,0.0,0.0,10.0
iranian_toilet,4703.0,0.897937,0.302763,0.0,1.0,1.0,1.0,1.0
type_ecotourism,4703.0,0.896236,3.555699,0.0,0.0,0.0,0.0,15.0


### Calculate product distance (numerical features)

In [13]:
# Pairwise Euclidean distance between the weighted feature vectors of all products.
num_distance_matrix = pairwise_distances(num_products_vector.to_numpy(), metric='euclidean')

# Display only: the distance matrix labelled by product id on both axes.
pd.DataFrame(num_distance_matrix, columns=data.product_id, index=data.product_id)

product_id,villa-503599,ecotourism-597882,villa-361255,complex-376108,villa-323476,complex-380288,villa-501818,complex-655221,villa-545746,apartment-508766,...,inn-389108,villa-402641,villa-639276,cottage-475354,cottage-611772,complex-370677,villa-379164,apartment-491137,villa-487515,villa-399760
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
villa-503599,0.000000,18.081231,2.530269,14.383433,2.082720,14.356200,1.972891,14.251815,2.351647,14.294221,...,14.331533,2.045090,0.347114,18.607782,18.194641,14.394483,1.839954,14.566862,2.272579,2.707600
ecotourism-597882,18.081231,0.000000,18.109258,18.114490,18.109318,18.095014,18.048273,18.124122,18.080884,18.097387,...,18.061689,18.093249,18.083171,21.697928,21.330010,18.105410,18.053098,18.325040,18.071163,18.108934
villa-361255,2.530269,18.109258,0.000000,14.157460,1.735268,14.161480,1.632385,14.323479,1.118169,14.169451,...,14.208743,2.710993,2.692310,18.902437,18.343852,14.162322,1.558707,14.815484,0.945216,2.322394
complex-376108,14.383433,18.114490,14.157460,0.000000,14.284705,0.317456,14.217171,2.019312,14.203886,14.208095,...,14.196530,14.438359,14.412989,18.926374,18.395779,0.505823,14.269765,14.882320,14.190153,14.321009
villa-323476,2.082720,18.109318,1.735268,14.284705,0.000000,14.269129,2.204506,14.328842,1.326204,14.205554,...,14.284015,1.927145,2.145994,18.567808,18.193809,14.264174,1.509771,14.495993,1.446034,2.125173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
complex-370677,14.394483,18.105410,14.162322,0.505823,14.264174,0.451872,14.218892,2.147942,14.174276,14.208154,...,14.181315,14.380758,14.417182,18.833178,18.340858,0.000000,14.247425,14.787818,14.171412,14.267177
villa-379164,1.839954,18.053098,1.558707,14.269765,1.509771,14.254255,1.548301,14.364212,1.215842,14.199794,...,14.200966,1.426894,1.908991,18.648554,18.146306,14.247425,0.000000,14.505547,1.229716,2.037705
apartment-491137,14.566862,18.325040,14.815484,14.882320,14.495993,14.810240,14.681769,14.751546,14.571410,3.908545,...,14.680145,14.296361,14.517713,18.164776,18.095496,14.787818,14.505547,0.000000,14.600081,14.457135
villa-487515,2.272579,18.071163,0.945216,14.190153,1.446034,14.173871,1.386935,14.300860,0.511521,14.180023,...,14.197215,2.003540,2.330804,18.658163,18.220729,14.171412,1.229716,14.600081,0.000000,1.449324


## Geographical features

In [14]:
# Coordinates only, latitude first then longitude.
geo_product = data.loc[:, ['lat', 'lng']]

### Calculate products geo distance

In [15]:
# The haversine metric works on radians, so convert the degree coordinates first.
geo_product_radians = np.radians(geo_product.to_numpy())
geo_distance_matrix = pairwise_distances(geo_product_radians, metric='haversine')
# Scale by 6371 (Earth's mean radius in km) to express the distances in kilometres.
geo_distance_matrix_km = 6371 * geo_distance_matrix

In [16]:
# Target range for the rescaled geographical distances.
s_min, s_max = 0, 10

# Min-max normalise the whole matrix to [0, 1] ...
geo_low = geo_distance_matrix.min()
geo_span = geo_distance_matrix.max() - geo_low
geo_distance_matrix = (geo_distance_matrix - geo_low) / geo_span
# ... then stretch it to [s_min, s_max].
geo_distance_matrix = geo_distance_matrix * (s_max - s_min) + s_min

# Display only: the rescaled matrix labelled by product id on both axes.
pd.DataFrame(geo_distance_matrix, columns=data.product_id, index=data.product_id)

product_id,villa-503599,ecotourism-597882,villa-361255,complex-376108,villa-323476,complex-380288,villa-501818,complex-655221,villa-545746,apartment-508766,...,inn-389108,villa-402641,villa-639276,cottage-475354,cottage-611772,complex-370677,villa-379164,apartment-491137,villa-487515,villa-399760
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
villa-503599,0.000000,2.234825,0.692047,2.671751,2.396743,0.859231,2.868036,1.294143,0.346900,1.831037,...,1.218544,1.820834,3.889295,1.427666,1.698101,1.469920,5.462890,1.148345,0.574584,1.465433
ecotourism-597882,2.234825,0.000000,1.848295,1.631575,0.684127,1.378103,1.794411,2.047629,1.902369,1.000079,...,1.465630,1.001387,2.357624,1.602558,1.768332,1.826914,3.307589,1.435697,2.648614,1.057927
villa-361255,0.692047,1.848295,0.000000,1.996135,1.844979,0.611926,2.915447,1.688961,0.621014,1.775496,...,0.555095,1.765840,3.847316,1.628525,1.937224,1.757828,5.153575,0.507044,1.266618,0.889190
complex-376108,2.671751,1.631575,1.996135,0.000000,0.963020,2.018464,3.423910,3.204973,2.477462,2.480105,...,1.453222,2.476946,3.920582,2.874697,3.134917,3.094097,4.239424,1.527076,3.237147,1.304054
villa-323476,2.396743,0.684127,1.844979,0.963020,0.000000,1.571855,2.476362,2.542883,2.109789,1.625374,...,1.330113,1.624438,2.964973,2.140540,2.355225,2.368487,3.541619,1.347145,2.898638,0.956226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
complex-370677,1.469920,1.826914,1.757828,3.094097,2.368487,1.247849,1.507013,0.301222,1.220964,0.880541,...,1.938544,0.874783,2.576791,0.228076,0.244927,0.000000,4.493129,1.842130,1.449823,1.839753
villa-379164,5.462890,3.307589,5.153575,4.239424,3.541619,4.639564,3.181289,4.792774,5.116652,3.716179,...,4.756319,3.725493,2.316729,4.348928,4.288458,4.493129,0.000000,4.736229,5.766100,4.344361
apartment-491137,1.148345,1.435697,0.507044,1.527076,1.347145,0.599078,2.756102,1.855789,0.959103,1.606819,...,0.097202,1.598491,3.602633,1.663989,1.970678,1.842130,4.736229,0.000000,1.710234,0.401845
villa-487515,0.574584,2.648614,1.266618,3.237147,2.898638,1.327328,2.948885,1.188419,0.790317,2.056309,...,1.785259,2.046632,4.010483,1.497172,1.694748,1.449823,5.766100,1.710234,0.000000,1.998817


## Total product distance

In [17]:
# Relative importance of geographical distance; the numerical distance has an implicit weight of 1.
geo_weight = 50

# Combined distance: numerical part plus the weighted geographical part.
total_rooms_distance = num_distance_matrix + geo_distance_matrix * geo_weight
# Label both axes with product ids so rows can be looked up by id.
result_df = pd.DataFrame(total_rooms_distance, columns=data.product_id, index=data.product_id)

## Test

In [18]:
# Query parameters: how many neighbours to list, and the product to find neighbours for.
n_top, product_id = 10, 'cottage-475984'

In [19]:
# Ids of the `n_top` products with the smallest combined distance to `product_id`.
# The query product is in the matrix too, so it is one of the candidates.
nearest_ids = result_df.loc[product_id].sort_values().head(n_top).index

# Show their feature rows. `isin` is a membership filter, so the rows appear in
# `data` order rather than in order of similarity.
data[data.product_id.isin(nearest_ids)]

Unnamed: 0,product_id,score,area,capacity,room,floor,stair,price,lat,lng,...,breakfast,type_apartment,type_complex,type_cottage,type_ecotourism,type_hostel,type_inn,type_suite,type_traditional,type_villa
149,cottage-475984,4.5,65,7,2,0,11,1064000,37.234702,49.913635,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1018,cottage-519883,4.6,40,4,1,0,4,998000,37.210644,49.957581,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1129,cottage-473168,4.8,40,4,0,0,2,790000,37.232515,49.86969,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1731,cottage-558171,5.0,70,10,2,0,12,1200000,37.180014,49.924622,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1771,cottage-102331,4.6,50,5,0,0,0,500000,37.263124,49.864197,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1859,cottage-373942,4.5,75,7,2,0,0,760000,37.276238,49.976807,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2870,cottage-339011,4.2,80,5,2,0,0,2200000,37.182202,49.866943,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3094,cottage-349312,4.7,50,4,1,0,0,1100000,37.186579,49.902649,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3952,cottage-400166,4.5,40,3,1,0,0,2220000,37.186579,49.902649,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4499,cottage-382617,4.7,100,9,3,1,8,1800000,37.271867,49.971313,...,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Print the listing URL of each of the `n_top` closest products, nearest first.
top_matches = result_df.loc[product_id].sort_values().head(n_top)
for i in top_matches.index:
    print('https://www.jabama.com/stay/' + str(i))

https://www.jabama.com/stay/cottage-475984
https://www.jabama.com/stay/cottage-473168
https://www.jabama.com/stay/cottage-519883
https://www.jabama.com/stay/cottage-349312
https://www.jabama.com/stay/cottage-400166
https://www.jabama.com/stay/cottage-102331
https://www.jabama.com/stay/cottage-558171
https://www.jabama.com/stay/cottage-382617
https://www.jabama.com/stay/cottage-339011
https://www.jabama.com/stay/cottage-373942
