# This notebook contains your standard data exploration

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import re
from datetime import datetime
from sklearn.linear_model import LinearRegression
import numpy as np

In [None]:
# Load the scraped Helsinki listings (scrape file dated 2021-04-19) into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../scrapers/asuntojen-hintatiedot/scraped_data/2021-04-19_Helsinki.csv'
)

In [None]:
# Display the listings frame exactly as loaded, before any derived columns are added.
data

## Data wrangling happens here

In [None]:
def get_number_of_rooms(data):
    """Estimate the number of rooms from a listing's ``room_arrangement`` text.

    Parameters
    ----------
    data : mapping-like row
        Anything indexable by ``"room_arrangement"`` (e.g. the row Series
        passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int or float
        * ``0`` when the value is not a string (e.g. a missing value) or no
          room count can be recognised.
        * ``0.5`` for ``AH`` (asuinhuoneisto: a single room without
          toilet/kitchen/etc.), used as a special marker value.
        * the midpoint of a range such as ``"2-3h"`` (-> ``2.5``).
        * the plain count for patterns such as ``"3h+k+s"`` (-> ``3``).
    """
    room_arrangement = data["room_arrangement"]
    if not isinstance(room_arrangement, str):
        return 0
    # Compare case-insensitively and ignore padding, consistent with the
    # case-insensitive regexes below.
    if room_arrangement.strip().upper() == 'AH':
        # AH = asuinhuoneisto = single room, no toilet/kitchen/etc, use special value
        return 0.5
    # Raw strings keep ``\s`` a regex escape rather than an invalid Python
    # string escape; ``[0-9]+`` lets counts of 10 or more parse correctly
    # instead of matching only their last digit.
    variable_rooms = re.search(r'([0-9]+)-([0-9]+)\s?h', room_arrangement, re.IGNORECASE)
    if variable_rooms:
        low, high = (int(bound) for bound in variable_rooms.groups())
        return (low + high) / 2
    simple_rooms = re.search(r'([0-9]+)\s?h', room_arrangement, re.IGNORECASE)
    if simple_rooms:
        return int(simple_rooms.group(1))
    return 0

In [None]:
# Scratch preview: shows the has_elevator column multiplied by 1; the result is only displayed, not stored.
data["has_elevator"] * 1

In [None]:
# Add the derived model features to `data` in place.
# Comparisons are multiplied by 1 so the flag columns hold 0/1 rather than False/True.
data["number_of_rooms"] = data.apply(get_number_of_rooms, axis="columns")
data["shape_is_good"] = data["shape"].eq('good').mul(1)
data["has_elevator"] = data["has_elevator"].mul(1)
# NOTE(review): age is measured from the year the notebook is run, not the scrape year — confirm this is intended.
data["age"] = datetime.today().year - data["built_in"]
data["is_apartment"] = data["house_type"].eq("apartment").mul(1)
data["is_rowhouse"] = data["house_type"].eq("rowhouse").mul(1)
data["is_townhouse"] = data["house_type"].eq("townhouse").mul(1)
data["lot_is_owned"] = data["lot"].eq("owned").mul(1)
data

## Visualize data

In [None]:
# Scatter of listing size against price, one colour per house-type flag column.
fig, ax = plt.subplots(figsize=(20, 10))
house_type_colors = {
    'is_apartment': 'purple',
    'is_rowhouse': 'cyan',
    'is_townhouse': 'blue'
}
# Dict iteration follows insertion order: apartment, rowhouse, townhouse.
for type_flag, type_color in house_type_colors.items():
    flagged = data[data[type_flag] == 1]
    ax.scatter(
        flagged["square_meters"],
        flagged["price_including_loans"],
        color=type_color,
        label=type_flag,
    )

ax.set_title("Apartment size in square meters vs price")
ax.set_ylabel("Price, m€")
ax.set_xlabel("Size, m^2")
ax.legend()

plt.show()

### Thoughts on square meters vs price
- Well, unsurprisingly, there seems to be a strong dependency between the size and the price of the apartment.
- townhouses tend to be bigger than rowhouses and rowhouses tend to be bigger than apartments
- the bigger the house the more variance there is in the price

In [None]:
# Parsed room count against floor area for every listing.
fig, ax = plt.subplots(figsize=(20, 10))

ax.scatter(x=data["square_meters"], y=data["number_of_rooms"])

ax.set_title("Apartment size square meters vs room numbers")
ax.set_ylabel("Number of rooms")
ax.set_xlabel("Size, m^2")

plt.show()

### Thoughts on apartment size vs room numbers
- quite a lot of 0s in there, might need some extra cleaning on the wrangling part?
- looks like there is a pretty linear dependency between number of rooms and size, which is again unsurprising


In [None]:
# Scatter of building age against price, one colour per house-type flag column.
fig, ax = plt.subplots(figsize=(20, 10))
house_type_colors = {
    'is_apartment': 'purple',
    'is_rowhouse': 'cyan',
    'is_townhouse': 'blue'
}
# Dict iteration follows insertion order: apartment, rowhouse, townhouse.
for type_flag, type_color in house_type_colors.items():
    flagged = data[data[type_flag] == 1]
    ax.scatter(
        flagged["age"],
        flagged["price_including_loans"],
        color=type_color,
        label=type_flag,
    )

ax.set_title("House age vs price")
ax.set_ylabel("Price, m€")
ax.set_xlabel("Age, years")
ax.legend()

plt.show()

### Thoughts on age vs price
- well, rather surprisingly, apartment age does not seem to correlate with its price

In [None]:
# Age against price again, with marker area encoding condition:
# 100 where shape_is_good is 1, 25 otherwise.
fig, ax = plt.subplots(figsize=(20, 10))
house_type_colors = {
    'is_apartment': 'purple',
    'is_rowhouse': 'cyan',
    'is_townhouse': 'blue'
}

# Dict iteration follows insertion order: apartment, rowhouse, townhouse.
for type_flag, type_color in house_type_colors.items():
    flagged = data[data[type_flag] == 1]
    marker_sizes = flagged["shape_is_good"] * 75 + 25
    ax.scatter(
        flagged["age"],
        flagged["price_including_loans"],
        color=type_color,
        label=type_flag,
        s=marker_sizes,
    )

ax.set_title("House age vs price")
ax.set_ylabel("Price, m€")
ax.set_xlabel("Age, years")
ax.legend()

plt.show()

In [None]:
# Scratch preview: builds a string from the first row's lot_is_owned flag ('C0' or 'C1'),
# presumably to check it is usable as a matplotlib cycle-colour name — confirm.
f'C{data["lot_is_owned"].iloc[0]}'

In [None]:
# Size against price, split by whether the lot is owned (1) or rented (0).
fig, ax = plt.subplots(figsize=(20, 10))

for lot_owned in [1, 0]:
    lot_data = data[data["lot_is_owned"] == lot_owned]
    # Every row in the subset carries the same flag as the loop value, so the
    # colour is taken from the loop value directly. Reading it back with
    # .iloc[0] raised IndexError whenever a group had no rows.
    ax.scatter(
        lot_data["square_meters"],
        lot_data["price_including_loans"],
        color=f'C{lot_owned}',
        label=lot_owned,
    )

plt.title("Apartment size vs price by lot owned(1) or rented(0)")
plt.ylabel("Price, m€")
plt.xlabel("Size in square meters")
plt.legend()

plt.show()

## Do regression!

In [None]:
# List the available columns before choosing regression features.
data.columns

In [None]:
# Feature matrix (column order matters for positional candidate vectors) and price target.
x = data.loc[
    :,
    [
        'square_meters',
        'has_elevator',
        'shape_is_good',
        'age',
        'is_apartment',
        'is_rowhouse',
        'is_townhouse',
        'lot_is_owned',
    ],
]
y = data.loc[:, "price_including_loans"]


In [None]:

# Fit a linear regression of price on all selected features.
model = LinearRegression().fit(X=x, y=y)

In [None]:
# Fitted coefficients, in the same order as the columns of x.
model.coef_

In [None]:
# Describe the candidate by feature name instead of an unlabeled positional array:
# the values are self-documenting, and the frame carries the same column names the
# model was fitted with (avoids scikit-learn's feature-name mismatch warning).
# `columns=x.columns` fixes the column order to the training order.
candidate_apartment = pd.DataFrame(
    [{
        'square_meters': 50,
        'has_elevator': 1,
        'shape_is_good': 1,
        'age': 30,
        'is_apartment': 0,
        'is_rowhouse': 1,
        'is_townhouse': 0,
        'lot_is_owned': 0,
    }],
    columns=x.columns,
)
model.predict(candidate_apartment)

## Only rowhouses!

In [None]:
# Restrict to rowhouses; the house-type flags are dropped as features since they are constant here.
rowhouse_data = data.loc[data["is_rowhouse"].eq(1)]
rw_x = rowhouse_data.loc[:, ['square_meters', 'shape_is_good', 'age', 'lot_is_owned']]
rw_y = rowhouse_data.loc[:, "price_including_loans"]

In [None]:
# Fit a linear regression of price on the rowhouse-only features.
rowhouse_model = LinearRegression().fit(X=rw_x, y=rw_y)

In [None]:
# Fitted coefficients, in the same order as the columns of rw_x.
rowhouse_model.coef_

In [None]:
# Describe the candidate by feature name instead of an unlabeled positional array:
# the values are self-documenting, and the frame carries the same column names the
# model was fitted with (avoids scikit-learn's feature-name mismatch warning).
# `columns=rw_x.columns` fixes the column order to the training order.
candidate_rowhouse = pd.DataFrame(
    [{
        'square_meters': 80,
        'shape_is_good': 1,
        'age': 30,
        'lot_is_owned': 1,
    }],
    columns=rw_x.columns,
)
rowhouse_model.predict(candidate_rowhouse)

## Plot candidate

In [None]:
# Single-feature model: rowhouse price as a linear function of floor area only.
rowhouse_data = data.loc[data["is_rowhouse"] == 1]
# The estimator expects a 2-D feature matrix, hence the (n_rows, 1) reshape.
rw_sq_x = rowhouse_data['square_meters'].to_numpy().reshape(-1, 1)
rw_sq_y = rowhouse_data["price_including_loans"]
rowhouse_square_meter_model = LinearRegression().fit(X=rw_sq_x, y=rw_sq_y)

In [None]:
# Keep the fitted slope (coef_) and offset (intercept_) for drawing the regression line.
rw_coef, rw_intercept = (
    rowhouse_square_meter_model.coef_,
    rowhouse_square_meter_model.intercept_,
)

In [None]:
# Rowhouse size against price, with the fitted square-meter line and the candidate listing.
fig, ax = plt.subplots(figsize=(20, 10))

# The slope comes from a one-feature fit and is stored as a length-1 array.
# Take the scalar so both line endpoints are plain numbers: mixing a scalar with
# an array produced a ragged sequence, which recent NumPy refuses to convert.
rw_slope = float(np.ravel(rw_coef)[0])
line_x = (0, 250)
line_y = (rw_intercept, rw_slope * 250 + rw_intercept)

ax.scatter(rowhouse_data["square_meters"], rowhouse_data["price_including_loans"])
# NOTE(review): the candidate's size and prices are hard-coded here rather than
# computed from a model in this notebook — confirm where they come from.
ax.scatter(110, 409414, label="Candidate predicted price", s=100)
ax.scatter(110, 498367, label="Candidate asking price", s=100)
ax.plot(line_x, line_y, c='m', label="Rowhouse square meter estimated price")

plt.title("Rowhouse size vs price")
plt.ylabel("Price, m€")
plt.xlabel("Size in square meters")
plt.grid(True)
plt.legend()

plt.show()