In [1]:
import pandas as pd
import numpy as np
import yaml

In [2]:
#不含車位_各屋齡合計棟數
#不含車位_各屋齡合計面積
#不含車位_各屋齡整體中位數房價
#不含車位_店面(店舖)棟數
#不含車位_店面(店舖)面積
#不含車位_辦公商業大樓棟數
#不含車位_辦公商業大樓面積
#不含車位_店面(店舖)中位數房價
#不含車位_辦公商業大樓中位數房價

In [3]:
## load yaml
with open('../tbl/sum_mean.yaml', 'r') as f:
    cols = yaml.safe_load(f)
    sum_cols = cols['sum']
    mean_cols = cols['mean']

In [4]:
data = pd.read_csv(f'../data/109_combine.csv')
data = data.drop(columns=['土地面積總數', '商業', '純住宅', '混合使用住宅'])

for year in range(104, 110):
    print(year)
    tmp = pd.read_csv(f'../data/{year}_combine.csv')
    tmp[f'house_price_per'] = tmp['不含車位_各屋齡整體中位數房價'] * tmp['不含車位_各屋齡合計棟數'] / (tmp['不含車位_各屋齡合計面積'] + 1e-7)
    tmp[f'shop_price_per'] = tmp['不含車位_店面(店舖)中位數房價'] * tmp['不含車位_店面(店舖)棟數'] / (tmp['不含車位_店面(店舖)面積'] + 1e-7)
    tmp[f'office_price_per'] = tmp['不含車位_辦公商業大樓中位數房價'] * tmp['不含車位_辦公商業大樓棟數'] / (tmp['不含車位_辦公商業大樓面積'] + 1e-7)
    ###
    data[f'house_price_unit_{year}'] = tmp['house_price_per'].replace(0, np.nan)
    data[f'shop_price_unit_{year}'] = tmp['shop_price_per'].replace(0, np.nan)
    data[f'office_price_unit_{year}'] = tmp['office_price_per'].replace(0, np.nan)
    ## Get 土地面積
    if year == 108:
        #二級發布區代碼
        #一級發布區代碼
        #最小統計區代碼
        land_use108 = tmp[['最小統計區代碼', '土地面積總數', '商業', '純住宅', '混合使用住宅']]

104
105
106
107
108
109


In [5]:
data = data.merge(land_use108, on='最小統計區代碼')

In [6]:
## TOWN, 二級發布區代碼, 一級發布區代碼	
test = 'TOWN'
sum_data = data[sum_cols+[test]].groupby(test).sum()
mean_data = data[mean_cols+[test]].groupby(test).mean()
out_data = sum_data.merge(mean_data, left_index=True, right_index=True)

In [7]:
## output to final data folder
out_data.to_csv('../data/final_output/district_features.csv')

## Test simple embedding

In [8]:
## Get dictionary

if test == 'TOWN':
    dicts = dict([(i, i) for i in list(mean_data.index)])
else:
    dicts = data[[test, 'TOWN']].drop_duplicates([test]).set_index(test).to_dict()['TOWN']

#print(dicts)
town2idx = dict([(j, i) for i, j in enumerate(set(dicts.values()))])


In [9]:
mean_data = mean_data.drop(columns=['不含車位_各屋齡整體中位數房價', '不含車位_各屋齡合計棟數', '不含車位_各屋齡合計面積',
                                                                                      '不含車位_店面(店舖)中位數房價', '不含車位_店面(店舖)棟數', '不含車位_店面(店舖)面積',
                                                                                      '不含車位_辦公商業大樓中位數房價', '不含車位_辦公商業大樓棟數', '不含車位_辦公商業大樓面積'])

KeyError: "['不含車位_各屋齡整體中位數房價', '不含車位_各屋齡合計棟數', '不含車位_各屋齡合計面積', '不含車位_店面(店舖)中位數房價', '不含車位_店面(店舖)棟數', '不含車位_店面(店舖)面積', '不含車位_辦公商業大樓中位數房價', '不含車位_辦公商業大樓棟數', '不含車位_辦公商業大樓面積'] not found in axis"

In [None]:
embedding = mean_data.iloc[:, 6:]
## min/max normalization
#embedding = (embedding - embedding.min()) / (embedding.max() - embedding.min())
## standardization
embedding = (embedding - embedding.mean()) / embedding.std()

## fill na
embedding = embedding.fillna(0)
## Test
embedding = embedding.apply(lambda x: x / np.sqrt(np.sum(x**2)), axis=1)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
scores = np.squeeze(cosine_similarity(embedding, embedding.loc[['內湖區']]))
index = np.argsort(scores)[::-1]

In [None]:
print(scores[index])
embedding.iloc[index]

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus']=False

In [None]:
tsne_results = TSNE(n_components=2, learning_rate='auto', init='pca', random_state=10).fit_transform(embedding)
pca_results = PCA(n_components=2).fit_transform(embedding)

In [None]:
colors = embedding.index.map(lambda x: town2idx[dicts[x]])

In [None]:
plt.style.use('ggplot')
plt.figure(figsize=(10, 8))

sc = plt.scatter(pca_results[:, 0], pca_results[:, 1], alpha=0.8, s=30, c=colors)#, cmap='rainbow')
for index, pt in enumerate(pca_results):
    plt.text(pt[0], pt[1], list(embedding.index)[index])
    
plt.legend(sc.legend_elements()[0], town2idx.keys(), title="DISTRICT")
plt.show()