In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [167]:
# Load the training listings from the JSON file (path is relative to the notebook's directory).
data = pd.read_json("../Dataset/train.json")

In [168]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,id,province,district,subdistrict,address,property_type,total_units,bedrooms,baths,floor_area,...,longitude,nearby_stations,nearby_station_distance,nearby_bus_stops,nearby_supermarkets,nearby_shops,year_built,month_built,facilities,price
0,8448321,Bangkok,Watthana,Phra Khanong Nuea,"36 Soi Sukhumvit 63, Ekamai Road",Condo,273.0,2.0,2.0,66,...,100.58485,2,"[[E7 Ekkamai BTS, 270], [E6 Thong Lo BTS, 800]]",,16.0,20,2011,June,"[Car Park, Community Garden, CCTV, Fitness cor...",8500000
1,10936325,Bangkok,Watthana,Khlong Toei Nuea,31 สุขุมวิท,Condo,74.0,1.0,1.0,49,...,100.566949,3,"[[BL22 Sukhumvit MRT, 720], [BL21 Phetchaburi ...",,11.0,20,2012,September,"[CCTV, Fitness corner, 24 hours security, Swim...",5900000
2,10927931,Bangkok,Khlong Toei,Khlong Tan,"68 Sukhumvit 24 Alley, Khong Tan",Condo,940.0,1.0,1.0,34,...,100.56566,2,"[[E5 Phrom Phong BTS, 650], [BL23 Queen Siriki...",,20.0,20,2017,January,"[Car Park, Clubhouse, Community Garden, CCTV, ...",6290000
3,11004792,Nonthaburi,Bang Kruai,Bang Khun Kong,Bang Khun Kong,Detached House,,3.0,3.0,170,...,100.428438,0,,,2.0,4,0,,"[Covered car park, Playground, 24 hours securi...",8900000
4,10757452,Nonthaburi,Mueang Nonthaburi,Bang Phai,พระราม5-นครอินทร์,Townhouse,,3.0,2.0,120,...,100.494129,1,"[[PP09 Yaek Nonthaburi 1 MRT, 10]]",,6.0,15,0,,"[Covered car park, 24 hours security]",2390000


In [169]:
# Keep only listings where both room counts are known.
has_room_counts = data['bedrooms'].notna() & data['baths'].notna()
data = data[has_room_counts]

# Drop listings whose combined bedroom + bath count exceeds total_units.
# The negated ">" is deliberate: a missing total_units makes the comparison False,
# so those rows are KEPT. Rewriting this as "<=" would silently discard them.
# NOTE(review): this compares a per-listing room count against total_units — confirm
# that this is the intended sanity check.
rooms_exceed_units = (data['baths'] + data['bedrooms']) > data['total_units']

# .copy() gives later cells an independent frame, so adding/dropping columns on `data`
# does not operate on a slice of the originally loaded frame (SettingWithCopyWarning).
data = data[~rooms_exceed_units].copy()

In [170]:
def get_closest_station(station_info_list):
    """Return the name and distance of the nearest station for one listing.

    Parameters
    ----------
    station_info_list : list or None
        Entries of the form ``[label, distance]``, e.g.
        ``[["E7 Ekkamai BTS", 270], ["E6 Thong Lo BTS", 800]]``. Missing values
        (``None``, ``0``, ``NaN``, or anything that is not a list/tuple) are accepted.

    Returns
    -------
    tuple
        ``(station_name, station_distance)``. ``('-', 0)`` is returned when there is
        no usable station entry.

    Notes
    -----
    The label's first space-separated token (a code such as ``"E7"``) is dropped, and
    the name stops at the first ``BTS`` token, or else the first ``MRT`` token. The
    substrings ``SRT``, ``Airport Link`` and ``BRT`` are then removed from the name.
    """
    # Anything that is not a non-empty sequence is treated as "no station nearby".
    # This also covers NaN, which previously fell through to len() and raised TypeError.
    if not isinstance(station_info_list, (list, tuple)) or len(station_info_list) == 0:
        return '-', 0

    # Keep only well-formed [label, distance] entries; only the part of the label
    # before the first comma is used.
    candidates = [
        (item[0].split(',')[0], item[1])
        for item in station_info_list
        if isinstance(item, list) and len(item) >= 2
    ]
    # Without this guard min() raises ValueError on an empty sequence.
    if not candidates:
        return '-', 0

    # One pass yields both the label and the distance (ties resolve to the first entry).
    closest_label, station_distance = min(candidates, key=lambda pair: pair[1])

    tokens = closest_label.split(' ')
    if 'BTS' in tokens:
        name_end = tokens.index('BTS')
    elif 'MRT' in tokens:
        name_end = tokens.index('MRT')
    else:
        name_end = len(tokens)

    # tokens[0] is the station code, so the name starts at index 1.
    station_name = (
        ' '.join(tokens[1:name_end])
        .replace('SRT', '')
        .replace('Airport Link', '')
        .replace('BRT', '')
        .strip()
    )

    return station_name, station_distance

# Resolve the closest station for every listing; each (name, distance) tuple is
# expanded into a two-element Series so the result has one column per value.
closest_station = data['nearby_station_distance'].apply(
    lambda stations: pd.Series(get_closest_station(stations))
)
data[['station_name', 'station_distance']] = closest_station

# The raw nested station list is no longer needed once both columns exist.
data.drop(columns=['nearby_station_distance'], inplace=True)


In [171]:
# Number of facilities per listing; entries that are not lists count as zero.
data['num_facilities'] = data['facilities'].apply(
    lambda facilities: len(facilities) if isinstance(facilities, list) else 0
)

In [172]:
def price_per_unit(row):
    """Return a listing's price divided by its relevant area.

    Condos are divided by ``floor_area``; every other property type is divided by
    ``land_area``. An area of exactly 0 yields 0 instead of a division error.

    Parameters
    ----------
    row : mapping-like
        Must provide ``property_type``, ``price`` and the area column that applies
        to its property type.
    """
    area_column = 'floor_area' if row['property_type'] == 'Condo' else 'land_area'
    area = row[area_column]

    if area != 0:
        return row['price'] / area
    return 0

# Row-wise (axis=1) so price_per_unit can pick the area column per property type.
data['price/m^2'] = data.apply(price_per_unit, axis=1)

---

In [174]:
# List the columns available after the feature-engineering cells above.
data.columns

Index(['id', 'province', 'district', 'subdistrict', 'address', 'property_type',
       'total_units', 'bedrooms', 'baths', 'floor_area', 'floor_level',
       'land_area', 'latitude', 'longitude', 'nearby_stations',
       'nearby_bus_stops', 'nearby_supermarkets', 'nearby_shops', 'year_built',
       'month_built', 'facilities', 'price', 'station_name',
       'station_distance', 'num_facilities', 'price/m^2'],
      dtype='object')

In [175]:
# Distinct property types remaining after filtering.
data['property_type'].unique()

array(['Condo', 'Detached House', 'Townhouse'], dtype=object)

In [176]:
# Ten districts with the highest mean 'price/m^2' across all property types.
(
    data.groupby('district')['price/m^2']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

district
Bang Rak        166494.521700
Pathum Wan      165336.314322
Ratchathewi     151199.814948
Watthana        144127.087561
Khlong Toei     141090.690854
Sathon          126715.930420
Phaya Thai      124444.236328
Khlong San      107202.855107
Chatuchak       104540.253004
Phra Khanong    104405.376104
Name: price/m^2, dtype: float64

In [206]:
# Restrict to condos, then rank districts by mean listing price (top ten).
is_condo = data['property_type'] == 'Condo'
condo = data[is_condo]
pd.DataFrame(
    condo.groupby('district')['price']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)


Unnamed: 0_level_0,price
district,Unnamed: 1_level_1
Bang Rak,6821040.0
Pathum Wan,6724390.0
Ratchathewi,6034428.0
Watthana,6017544.0
Khlong Toei,5713927.0
Sathon,5597031.0
Bang Kho Laem,5548908.0
Phaya Thai,5188392.0
Khlong San,5068262.0
Phra Khanong,4299965.0


In [178]:
# Ten (district, closest station) pairs with the highest mean listing price.
# Listings with no nearby station are grouped under the '-' placeholder name.
pd.DataFrame(
    data.groupby(['district', 'station_name'])['price']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,price
district,station_name,Unnamed: 2_level_1
Bangkok Yai,-,9870000.0
Sathon,Sala Daeng,9000000.0
Bang Sue,Sanam Pao,8900000.0
Bang Kho Laem,Rama IX Bridge,8746364.0
Ratchathewi,Ploen Chit,8650000.0
Bang Rak,Charoen Nakhon,8649667.0
Sathon,Sathorn,8276667.0
Bang Rak,Saphan Taksin,8250000.0
Khlong San,Saphan Taksin,8200000.0
Wang Thonglang,Ratchadaphisek,8150000.0


In [179]:
# Ten districts with the highest mean 'price/m^2', shown as a DataFrame.
pd.DataFrame(
    data.groupby('district')['price/m^2']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

Unnamed: 0_level_0,price/m^2
district,Unnamed: 1_level_1
Bang Rak,166494.5217
Pathum Wan,165336.314322
Ratchathewi,151199.814948
Watthana,144127.087561
Khlong Toei,141090.690854
Sathon,126715.93042
Phaya Thai,124444.236328
Khlong San,107202.855107
Chatuchak,104540.253004
Phra Khanong,104405.376104
