In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib
import folium
import branca

In [None]:
# Read the housing CSV and parse `date_sold` into datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (format inference is the default there) and only emits a warning.
perth = pd.read_csv('PerthHousing.csv', parse_dates=['date_sold'])

# Split the row labels 80/20 with a fixed seed so the partition is reproducible.
train_indices, test_indices = train_test_split(np.array(perth.index), test_size=0.2, random_state=0)

# Independent copies, so adding columns below never touches `perth`.
train = perth.loc[train_indices].copy()
test = perth.loc[test_indices].copy()

# Log-transformed price; only the training frame receives this column.
train["log_price"] = np.log(train["price"])

In [None]:
# Independent copy of the first 10,000 training rows (positional order).
sample_train = train.head(10_000).copy()

In [None]:
def get_quantile(val, quantiles):
    """Return the 1-based position of the first entry of `quantiles` that is
    strictly greater than `val`.

    When no entry exceeds `val` (including when `quantiles` is empty), the
    number of entries, `len(quantiles)`, is returned instead.
    """
    first_above = next(
        (rank for rank, upper in enumerate(quantiles, start=1) if val < upper),
        None,
    )
    if first_above is None:
        # Nothing was larger than `val`: fall back to the last position.
        return len(quantiles)
    return first_above

In [None]:
# Log-price column of the training frame; the quantile sweep computes its
# quantile edges from this series.
values = train.loc[:, "log_price"]

In [None]:
# Location is the only feature set used by the classifier.
x_train = train.loc[:, ['longitude', 'latitude']]

# Read the target straight from `train` rather than the notebook-global
# `values`: the map cells further down rebind `values` to a single row inside
# their loops, so re-running this cell after them would bin the wrong data.
log_price = train["log_price"]

scores = []
for num_quantiles in range(2, 20):
    # Upper edges of `num_quantiles` equal-frequency bins; the last edge is the maximum.
    quantiles = np.quantile(log_price, [i / num_quantiles for i in range(1, num_quantiles + 1)])

    # Fixed seed (same value as the train/test split) so the scores are reproducible.
    model = RandomForestClassifier(max_leaf_nodes=30, random_state=0)

    # Class label = 1-based quantile bin of each log price.
    y_train = log_price.apply(lambda x: get_quantile(x, quantiles))
    model.fit(x_train, y_train)

    # NOTE: this is accuracy on the data the model was fitted on, not a held-out score.
    scores.append(model.score(x_train, y_train))
    print("Finished", num_quantiles)

In [None]:
# Training accuracy per number of quantile classes. The sweep starts at 2
# classes; the x-axis length follows `scores` so the two cannot drift apart.
plt.plot(np.arange(2, 2 + len(scores)), scores, "--o")
plt.xlabel("Number of quantiles")
plt.ylabel("Training accuracy")
plt.title("Random forest accuracy vs. number of price quantiles");

In [None]:
# Final model: bin log prices into 7 quantile classes and fit on location.
num_quantiles = 7
quantiles = np.quantile(train["log_price"], [i / num_quantiles for i in range(1, num_quantiles + 1)])

# Fixed seed so the predicted labels (and every map/plot derived from them)
# are the same on each run.
model = RandomForestClassifier(max_leaf_nodes=30, random_state=0)

# Class label = 1-based quantile bin of each log price.
y_train = train["log_price"].apply(lambda x: get_quantile(x, quantiles))
# `x_train` (longitude/latitude of `train`) comes from the quantile-sweep cell.
model.fit(x_train, y_train)

In [None]:
# Predicted quantile class for each sampled row, from its coordinates only.
predicted_quantiles = model.predict(sample_train[["longitude", "latitude"]])

In [None]:
# Colour scale running from 0 up to the largest predicted quantile label.
colormap = branca.colormap.linear.YlOrRd_09.scale(0, predicted_quantiles.max())

folium_map = folium.Map(location=(-31.96, 115.87), zoom_start=9)

# Walk the coordinate columns and the predictions in lockstep. This avoids
# building a row Series per marker, and the loop names deliberately do not
# rebind the notebook-level `values`/`price` variables used by other cells.
marker_rows = zip(sample_train["latitude"], sample_train["longitude"], predicted_quantiles)
for lat, lon, label in marker_rows:
    # One small dot per sale, coloured (and labelled in the popup) by its predicted class.
    folium.CircleMarker([lat, lon], radius=0.01, color=colormap(label),
                        popup=label,
                        fill=True, fill_opacity=0.7).add_to(folium_map)

# Attach the colour legend, then display the map as the cell output.
folium_map.add_child(colormap)
folium_map

In [None]:
# Plotting frame on the training index: price, its log, the model's predicted
# quantile label for each row's coordinates, and the bedroom count.
df = train[["price"]].assign(
    log_price=lambda frame: np.log(frame["price"]),
    labels=model.predict(train[["longitude", "latitude"]]),
    bedrooms=train["bedrooms"],
)

In [None]:
# Price distribution within each predicted quantile label.
df.boxplot(column="price", by="labels")

In [None]:
# Linear fit of log price on bedroom count, one regression line per predicted label.
sns.lmplot(data=df, x="bedrooms", y="log_price", hue="labels")

In [None]:
# Rebuild `df` as an independent frame holding just the observed log price and
# the coordinates of every training row.
df = train[["log_price", "longitude", "latitude"]].copy()

In [None]:
# Colour scale spanning the full range of observed log prices in `df`.
colormap = branca.colormap.linear.YlOrRd_09.scale(df["log_price"].min(), df["log_price"].max())

folium_map = folium.Map(location=(-31.96, 115.87), zoom_start=9)

# Plot at most the first 10,000 rows. `head` never runs past the end of the
# frame, unlike a fixed `range(10_000)` with positional indexing, which raises
# IndexError on a shorter frame.
map_rows = df.head(10_000)

# Iterate the columns in lockstep; the loop names deliberately do not rebind
# the notebook-level `values`/`price` variables used by other cells.
for lat, lon, row_log_price in zip(map_rows["latitude"], map_rows["longitude"], map_rows["log_price"]):
    # One small dot per sale, coloured by its observed log price.
    folium.CircleMarker([lat, lon], radius=0.01, color=colormap(row_log_price),
                        fill=True, fill_opacity=0.7).add_to(folium_map)

# Attach the colour legend, then display the map as the cell output.
folium_map.add_child(colormap)
folium_map