In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


In [79]:
# Read data/hotels_processed.json into the working DataFrame.
df = pd.read_json(path_or_buf="data/hotels_processed.json")

In [80]:
# Discard the place_name column.
df = df.drop(columns=["place_name"])

In [81]:
def fill_distance(row, rng=None):
    """Return the row's distance from downtown, imputing it when missing.

    A present distance is returned unchanged. A missing one is replaced by a
    uniform random draw whose range depends on the hotel type: resort and
    luxury hotels get a value in [8, 20) km, every other type a value in
    [0.5, 5) km.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``df.apply(..., axis=1)``)
        Must provide ``'distance_from_downtown_km'`` and ``'hotel_type'``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Defaults to NumPy's global ``np.random`` state,
        so existing callers (and ``np.random.seed``) behave as before; pass a
        seeded generator for reproducible or testable imputation.

    Returns
    -------
    float
        The original distance, or the imputed one.
    """
    distance = row['distance_from_downtown_km']
    if not pd.isna(distance):
        return distance

    # Fall back to the module-level functions, which share the global state.
    source = np.random if rng is None else rng

    far_range_km = (8, 20)     # resort / luxury hotels
    near_range_km = (0.5, 5)   # every other hotel type
    if row['hotel_type'] in ['resort', 'luxury']:
        low, high = far_range_km
    else:
        low, high = near_range_km
    return source.uniform(low, high)


In [82]:
# fill_distance imputes missing distances with draws from np.random; seed the
# global RNG first so a Restart & Run All reproduces the same imputed values.
np.random.seed(42)
df['distance_from_downtown_km'] = df.apply(fill_distance,axis=1)

In [83]:
# Replace missing nightly prices with the column median.
median_price = df['price_per_night'].median()
df['price_per_night'] = df['price_per_night'].fillna(median_price)

In [84]:
# Discard the stars column.
df = df.drop(labels='stars', axis='columns')

In [85]:
# Replace missing ratings with the column mean.
mean_rating = df['rating'].mean()
df['rating'] = df['rating'].fillna(mean_rating)

In [86]:
# Multi-hot encode the amenities column: one 0/1 indicator column per distinct
# label (each entry is expected to be an iterable of labels).
encode = MultiLabelBinarizer()
tags = encode.fit_transform(df['amenities'])
# Reuse df's index. With the default RangeIndex, concat(axis=1) aligns on
# labels and would emit NaN-padded rows whenever df's index is not 0..n-1.
tag_df = pd.DataFrame(tags, columns=encode.classes_, index=df.index)
df = pd.concat([df, tag_df], axis=1)

In [87]:
# One-hot encode hotel_type into dense hotel_type_<value> indicator columns.
encode = OneHotEncoder(sparse_output=False)
typ = encode.fit_transform(df[['hotel_type']])
# Reuse df's index. With the default RangeIndex, concat(axis=1) aligns on
# labels and would emit NaN-padded rows whenever df's index is not 0..n-1.
typ_df = pd.DataFrame(
    typ,
    columns=encode.get_feature_names_out(['hotel_type']),
    index=df.index,
)
df = pd.concat([df, typ_df], axis=1)

In [88]:
# Remove the raw hotel_type and amenities columns.
df = df.drop(labels=["hotel_type", "amenities"], axis="columns")

In [89]:
# Use hotel_id as the row index.
df = df.set_index(keys='hotel_id')

In [90]:
# Summary statistics of the distance column (displayed as the cell output).
df.loc[:, 'distance_from_downtown_km'].describe()

count    36607.000000
mean         5.346032
std          6.627945
min          0.000000
25%          0.900000
50%          2.573468
75%          8.000000
max        204.200000
Name: distance_from_downtown_km, dtype: float64

In [92]:
# Standardise the continuous features to zero mean and unit variance.
# StandardScaler works column-wise, so a single fit over all three columns
# yields the same per-column scaling as fitting each one separately, while
# keeping every column's fitted mean_/scale_ on `scaler` (re-fitting the same
# object per column would retain only the statistics of the last column).
numeric_cols = ['price_per_night', 'distance_from_downtown_km', 'rating']
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [95]:

# Store indicator columns (every observed value is 0 or 1) as integers.
for col in df.columns:
    column = df[col]
    missing = column.isna()
    if not column[~missing].isin([0, 1]).all():
        continue
    if not missing.any():
        df[col] = column.astype('int64')
    elif not missing.all():
        # Plain int64 cannot represent NaN and astype would raise; the
        # nullable Int64 dtype keeps the missing entries as <NA>.
        df[col] = column.astype('Int64')
    # A non-empty column with no observed values only passes the 0/1 check
    # vacuously, so it is left untouched.


In [97]:
# Write the preprocessed frame, including its index, to CSV.
df.to_csv(path_or_buf="data/hotels_preprocessed.csv", index=True)