In [None]:
import numpy as np
import pandas as pd
import json
import csv
import os
from haversine import haversine

In [None]:
# Load the master table that already contains the nearby-facility columns
addrDF = pd.read_csv(filepath_or_buffer = "./combine/presale_location_spot.csv")

In [None]:
# Build the report's key table: one row per distinct address, address column only
tempDF = addrDF[["土地區段位置建物區段門牌"]].drop_duplicates()

# MRT

In [None]:
# Load the MRT station data
dfMRT = pd.read_csv(filepath_or_buffer = "./data/northern-taiwan_新北.csv")

In [None]:
# Find the shortest distance between each property and an MRT station
# Buildings: distinct address / district / coordinate rows from the master table
df1 = addrDF[["土地區段位置建物區段門牌", "鄉鎮市區", "y_h", "x_h"]].drop_duplicates()

# MRT stations: keep only the columns used below
# NOTE(review): the original comment said "dropna", but no rows are dropped here
df2 = dfMRT[["station_name_tw", "line_name", "address", "lat", "lon"]]

# Cartesian product: pair every building row with every station row via a constant join key
df3 = pd.merge(df1.assign(key = 1), df2.assign(key = 1), on = "key").drop(columns = "key")

# Distance between the two coordinate pairs held in one row
def distance(row):
    """Return the haversine distance between a building and an MRT station.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``y_h`` / ``x_h`` (building latitude / longitude) and
        ``lat`` / ``lon`` (station latitude / longitude).

    Returns
    -------
    The value returned by ``haversine(building, station)``.
    """
    building = (row["y_h"], row["x_h"])
    station = (row["lat"], row["lon"])
    return haversine(building, station)

# Compute the haversine distance for every building/station pair (row by row)
df3["building_station_distance"] = df3.apply(distance, axis = 1)

# Keep, for each address, the row with the smallest distance.
# idxmin() returns index *labels*, so the rows must be selected with .loc;
# .iloc (positional) only gave the same rows while df3 happened to have a default RangeIndex.
df4 = df3.loc[df3.groupby("土地區段位置建物區段門牌")["building_station_distance"].idxmin().tolist(), :]

# Attach the nearest station to the one-row-per-address key table
outputDf = pd.merge(tempDF, df4, left_on = "土地區段位置建物區段門牌", right_on = "土地區段位置建物區段門牌", how = "left")

# delete temporary objects
del df1, df2, df3, df4

In [None]:
# The following MRT lines / station are treated as having elevated sections
a1 = outputDf["line_name"] == "環狀線"
a2 = outputDf["line_name"] == "機場線"
a3 = outputDf["station_name_tw"] == "小碧潭"

# Elevated = 1
outputDf.loc[(a1 | a2 | a3), "高架"] = 1

# Not elevated = 0.
# Fill only the flag column: a blanket fillna(0) would also overwrite missing
# values in unrelated columns (station names, addresses, coordinates, distances) with 0.
tmpDF01 = outputDf.fillna(value = {"高架": 0})

In [None]:
# 0.1 = 100 m: list the properties that sit very close to an elevated station
# (distance < 100 m and "高架" = 1)
tmpDF01.loc[tmpDF01["building_station_distance"].lt(0.1) & tmpDF01["高架"].eq(1)]

## 分類: 與捷運站的距離
<p> 5分鐘內: x < 375m => 1 </p>
<p> 5-10分鐘內: 375m < x < 750m => 2 </p>
<p> 超過10分鐘: x > 750m => 3 </p>
<p> 隔壁就高架: < 100m & "高架" = 1(為高架) => 4 </p>

In [None]:
def MRT_distance_classify(row):
    """Classify a property by its distance to the nearest MRT station.

    Parameters
    ----------
    row : mapping-like (DataFrame row or dict)
        Must provide ``building_station_distance`` (0.375 corresponds to 375 m).
        May provide ``"高架"`` (1 when the nearest station is elevated); a missing
        flag is treated as not elevated.

    Returns
    -------
    int or NaN
        4 -- right next to an elevated station (< 100 m and ``"高架"`` == 1)
        1 -- within 5 minutes (< 375 m)
        2 -- 5 to 10 minutes (375 m up to, but not including, 750 m)
        3 -- more than 10 minutes (750 m or more)
        NaN when the distance itself is missing, so an unknown distance is not
        reported as one of the real classes.
    """
    dist = row["building_station_distance"]

    # An unknown distance cannot be classified
    if pd.isna(dist):
        return np.nan

    # Class 4 takes priority; it used to be reachable only through the
    # fall-through ``else`` (exact boundary values), never through this rule.
    if row.get("高架", 0) == 1 and dist < 0.1:
        return 4

    # Half-open intervals so that exactly 0.375 / 0.75 land in a real class
    if dist < 0.375:
        return 1
    if dist < 0.75:
        return 2
    return 3
    
# Label every address with its MRT distance class
tmpDF01["class_MRT"] = tmpDF01.apply(MRT_distance_classify, axis = 1)

# Drop the columns of the MRT/address table that are not wanted in the merge
tmpDF01_1 = tmpDF01.drop(columns = ["鄉鎮市區", "y_h", "x_h"])

# Merge back into the master table
comDF_MRT = addrDF.merge(tmpDF01_1, how = "left", on = "土地區段位置建物區段門牌")

# Bus

In [None]:
# Read the BUS data
busPath = "./data/bus/"

# os.listdir() returns entries in arbitrary order and also lists hidden files and
# sub-directories (e.g. .ipynb_checkpoints, .DS_Store). Keep only visible regular
# files and sort them so the combined frame is reproducible between runs.
fileList = sorted(
    f for f in os.listdir(busPath)
    if not f.startswith(".") and os.path.isfile(os.path.join(busPath, f))
)

# Combine all bus files into one frame with a fresh, unique index
busList = [pd.read_csv(os.path.join(busPath, f)) for f in fileList]
busDF = pd.concat(busList, ignore_index = True)

In [None]:
# Find the shortest distance between each property and a bus stop
# Buildings: distinct address / district / coordinate rows from the master table
df1 = addrDF[["土地區段位置建物區段門牌", "鄉鎮市區", "y_h", "x_h"]].drop_duplicates()

# Bus stops: keep only the columns used below
# NOTE(review): the original comment said "dropna", but no rows are dropped here
df2 = busDF[["nameZh", "address", "latitude", "longitude"]]

# Cartesian product: pair every building row with every bus-stop row via a constant join key
df3 = pd.merge(df1.assign(key = 1), df2.assign(key = 1), on = "key").drop(columns = "key")

# Distance between the two coordinate pairs held in one row
def distance(row):
    """Return the haversine distance between a building and a bus stop.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``y_h`` / ``x_h`` (building latitude / longitude) and
        ``latitude`` / ``longitude`` (bus-stop coordinates).

    Returns
    -------
    The value returned by ``haversine(building, busstop)``.
    """
    building = (row["y_h"], row["x_h"])
    busstop = (row["latitude"], row["longitude"])
    return haversine(building, busstop)

# Compute the haversine distance for every building/bus-stop pair (row by row)
df3["building_busstop_distance"] = df3.apply(distance, axis = 1)

# Keep, for each address, the row with the smallest distance.
# idxmin() returns index *labels*, so the rows must be selected with .loc;
# .iloc (positional) only gave the same rows while df3 happened to have a default RangeIndex.
df4 = df3.loc[df3.groupby("土地區段位置建物區段門牌")["building_busstop_distance"].idxmin().tolist(), :]

# Attach the nearest bus stop to the one-row-per-address key table
outputDf1 = pd.merge(tempDF, df4, left_on = "土地區段位置建物區段門牌", right_on = "土地區段位置建物區段門牌", how = "left")

# delete temporary objects
del df1, df2, df3, df4

## 分類: 與公車站的距離
<p> 3分鐘內: x < 225m => 1 </p>
<p> 3-6分鐘內: 225m < x < 450m => 2 </p>
<p> 6-10分鐘內: 450m < x < 750m => 3 </p>
<p> 超過10分鐘: x > 750m => 4 </p>

In [None]:
def bus_distance_classify(row):
    """Classify a property by its distance to the nearest bus stop.

    Parameters
    ----------
    row : mapping-like (DataFrame row or dict)
        Must provide ``building_busstop_distance`` (0.225 corresponds to 225 m).

    Returns
    -------
    int or NaN
        1 -- within 3 minutes (< 225 m)
        2 -- 3 to 6 minutes (225 m up to, but not including, 450 m)
        3 -- 6 to 10 minutes (450 m up to, but not including, 750 m)
        4 -- more than 10 minutes (750 m or more)
        NaN when the distance itself is missing, so an unknown distance is not
        reported as "more than 10 minutes".
    """
    dist = row["building_busstop_distance"]

    # An unknown distance cannot be classified
    if pd.isna(dist):
        return np.nan

    # Half-open intervals: with the former strict "<" / ">" pairs a distance of
    # exactly 0.225 or 0.45 matched no branch and fell through to class 4.
    if dist < 0.225:
        return 1
    if dist < 0.45:
        return 2
    if dist < 0.75:
        return 3
    return 4
    
# Label every address with its bus-stop distance class
outputDf1["class_bus"] = outputDf1.apply(bus_distance_classify, axis = 1)

# Drop the columns of the bus/address table that are not wanted in the merge
outputDf1_1 = outputDf1.drop(columns = ["鄉鎮市區", "y_h", "x_h"])

# Merge back into the master table that already carries the MRT columns (comDF_MRT)
comDF_MRT_bus = comDF_MRT.merge(outputDf1_1, how = "left", on = "土地區段位置建物區段門牌")

In [None]:
# Persist the combined table (master + MRT + bus columns) without the index column
comDF_MRT_bus.to_csv(path_or_buf = "./combine/presale_location_spot_MRT_bus.csv", index = False)