In [1]:
import pandas as pd
from itertools import chain
from collections import Counter
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# Plot configuration: put SimHei first in the sans-serif font list and turn
# off the Unicode minus glyph for axis tick labels.
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [2]:
def check_(x):
    """Parse a thousands-separated numeric string into a float.

    Parameters
    ----------
    x : object
        Usually a string such as ``'12,000'``; any other value is accepted.

    Returns
    -------
    float or object
        ``float`` of ``x`` with every ``','`` removed. When ``x`` cannot be
        cleaned or parsed (not a string, or not numeric text) ``x`` itself is
        returned unchanged.
    """
    try:
        return float(x.replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # Best-effort parsing: values without a usable ``replace``
        # (AttributeError/TypeError) and non-numeric text (ValueError) pass
        # through untouched.  A bare ``except:`` would also swallow
        # KeyboardInterrupt and SystemExit, so only these are caught.
        return x

In [3]:
# Load the tab-separated shop feature table into a DataFrame.
data = pd.read_csv('../data/591_shop_features.tsv', sep='\t')

In [4]:
# Inspect the column names of the loaded shop table.
data.columns

Index(['user_index', 'town', 'name', 'rental_price', 'size', 'floor',
       'address', '小巷0/大路1', 'longitude', 'latitude', 'type', 'house_age',
       'feature', 'MRT_within_lkm', 'Bus_within_lkm', 'MRT_name',
       'MRT_distance', '管理費', '車位', '裝潢', 'shop_within_lkm', '餐廳餐館', '其他綜合零售',
       '便利商店', '美容美髮服務', '日常用品零售', '飲料店業', '其他餐飲業'],
      dtype='object')

In [5]:
## area feature
# Load the district feature table (read with read_csv's default comma separator).
area_data = pd.read_csv('../data/final_output/district_features.csv')

# Feature processing

In [6]:
# Tokenize the free-text `feature` column and count how often each token
# occurs.  Every entry is split on the full-width semicolon '；' and each
# piece is split again on the full-width colon '：'; all resulting tokens
# (both sides of a colon) are counted.  Tokens are not stripped, so
# surrounding whitespace is part of the token.
all_features = [
    token
    for fea in data['feature']
    for part in fea.split('；')
    for token in part.split('：')
]

cnt = Counter(all_features)

In [7]:
# Show the 15 most frequent feature tokens with their counts.
cnt.most_common(15)

[('生活機能', 461),
 ('近便利商店', 433),
 ('公園綠地', 345),
 ('傳統市場', 314),
 ('學校', 306),
 ('百貨公司', 217),
 ('醫療機構', 200),
 ('夜市附近交通', 194),
 ('附近交通', 65),
 ('醫療機構附近交通', 59),
 ('夜市', 54),
 ('學校附近交通', 54),
 ('NAN', 51),
 (' 台北火車站', 25),
 (' 松山火車站', 16)]

In [8]:
# Display the column names of `data` again before the per-shop features are built.
data.columns

Index(['user_index', 'town', 'name', 'rental_price', 'size', 'floor',
       'address', '小巷0/大路1', 'longitude', 'latitude', 'type', 'house_age',
       'feature', 'MRT_within_lkm', 'Bus_within_lkm', 'MRT_name',
       'MRT_distance', '管理費', '車位', '裝潢', 'shop_within_lkm', '餐廳餐館', '其他綜合零售',
       '便利商店', '美容美髮服務', '日常用品零售', '飲料店業', '其他餐飲業'],
      dtype='object')

In [9]:
## Type
# `shop_features` is bound to the very same DataFrame object as `data`
# (no copy is made here).
shop_features = data
# Alternative, currently disabled: keep only a hand-picked subset of columns.
#shop_features = data[['town', 'rental_price', 'size', 'floor', '小巷0/大路1', 'longitude', 'latitude', 'house_age', 'MRT_within_lkm',  'type', '裝潢',
#                        '餐廳餐館', '其他綜合零售', '便利商店', '美容美髮服務', '日常用品零售', '飲料店業']]

In [10]:
# One-hot encode `type`: append one indicator column per distinct value,
# then remove the original categorical column.
one_hot_type = pd.get_dummies(shop_features['type'])
shop_features = shop_features.join(one_hot_type).drop(columns='type')

In [11]:
# Replace each decoration label in `裝潢` by its integer code from `ss`.
# A label that is not a key of `ss` raises KeyError.
ss = {
    '尚未裝潢': 0,
    '簡易裝潢': 1,
    '中檔裝潢': 2,
    '高檔裝潢': 3,
}
shop_features['裝潢'] = shop_features['裝潢'].map(ss.__getitem__)

In [12]:
# One-hot encode the parking kind, taken as the text before the first
# full-width comma '，' of `車位`.  The original column is dropped and the
# '無' indicator is renamed to '無車位'.
parking_kind = shop_features['車位'].map(lambda desc: desc.partition('，')[0])
one_hot_type = pd.get_dummies(parking_kind)
shop_features = (
    shop_features
    .join(one_hot_type)
    .drop(columns='車位')
    .rename(columns={'無': '無車位'})
)

In [13]:
# Parse rental prices with `check_` (commas removed, converted to float).
# Values `check_` cannot convert are left as they were, so the column may
# still contain non-numeric entries afterwards.
shop_features['rental_price'] = shop_features['rental_price'].map(check_)

In [14]:
# Inspect the columns after the one-hot and ordinal encoding steps.
shop_features.columns

Index(['user_index', 'town', 'name', 'rental_price', 'size', 'floor',
       'address', '小巷0/大路1', 'longitude', 'latitude', 'house_age', 'feature',
       'MRT_within_lkm', 'Bus_within_lkm', 'MRT_name', 'MRT_distance', '管理費',
       '裝潢', 'shop_within_lkm', '餐廳餐館', '其他綜合零售', '便利商店', '美容美髮服務', '日常用品零售',
       '飲料店業', '其他餐飲業', '公寓', '別墅', '店面（店鋪）', '透天厝', '電梯大樓', '平面式停車位',
       '機械式停車位', '無車位'],
      dtype='object')

In [15]:
# Inspect the column names of the district table.
area_data.columns

Index(['TOWN', '人口數', 'AREA', 'bus_stops', 'MRT_stops', '0-14歲人口數',
       '15-64歲人口數', '65歲以上人口數', '土地面積總數', '商業', '純住宅', '混合使用住宅', '批發及零售業',
       '住宿及餐飲業', '藝術、娛樂及休閒服務業', 'X', 'Y', 'lon', 'lat', '人口密度', 'NIGHT_WORK',
       'DAY_WORK(7:00~13:00)', 'DAY_WORK(13:00~19:00)', 'NIGHT_WEEKEND',
       'DAY_WEEKEND(7:00~13:00)', 'DAY_WEEKEND(13:00~19:00)', 'MORNING_WORK',
       'MIDDAY_WORK', 'AFTERNOON_WORK', 'EVENING_WORK', 'MORNING_WEEKEND',
       'MIDDAY_WEEKEND', 'AFTERNOON_WEEKEND', 'EVENING_WEEKEND',
       'house_price_unit_104', 'house_price_unit_105', 'house_price_unit_106',
       'house_price_unit_107', 'house_price_unit_108', 'house_price_unit_109',
       'shop_price_unit_104', 'shop_price_unit_105', 'shop_price_unit_106',
       'shop_price_unit_107', 'shop_price_unit_108', 'shop_price_unit_109',
       'office_price_unit_104', 'office_price_unit_105',
       'office_price_unit_106', 'office_price_unit_107',
       'office_price_unit_108', 'office_price_unit_109'],
      

In [16]:
# Start from the full district table ...
area_features = area_data
# Alternative, currently disabled: keep only a subset of district columns.
#area_features = area_data[['TOWN', '0-14歲人口數', '15-64歲人口數', '65歲以上人口數', 
#              'DAY_WORK(7:00~13:00)', 'DAY_WORK(13:00~19:00)', 'NIGHT_WORK', 
#              'DAY_WEEKEND(7:00~13:00)', 'DAY_WEEKEND(13:00~19:00)', 'NIGHT_WEEKEND']]
# ... and index it by `TOWN` so rows can be selected with `.loc[town]`.
area_features = area_features.set_index('TOWN')

In [17]:
# Attach the district-level features of each shop's town to the shop row,
# then drop the `town` and `floor` columns.
#
# A single vectorized left join on `town` is used rather than a per-row
# loop: it is far faster and keeps the dtype of every column.

# A left join would silently fill unknown towns with NaN; fail loudly
# instead, as a direct `.loc[town]` lookup would.
unknown_towns = set(shop_features['town']) - set(area_features.index)
if unknown_towns:
    raise KeyError(
        f"towns missing from area_features: {sorted(map(str, unknown_towns))}"
    )

# Duplicate index labels would make the join emit several rows per shop.
if not area_features.index.is_unique:
    raise ValueError("area_features has duplicate index labels")

new_features = (
    shop_features
    .join(area_features, on='town')
    .drop(columns=['town', 'floor'])
)

In [18]:
#new_features = new_features.applymap(check_)

In [19]:
# Write the merged table as a tab-separated file without the row index.
# `index=False` is the documented boolean form of this flag.
new_features.to_csv("591shop_area.tsv", sep="\t", index=False)

In [None]:
# Replace `rental_price` and `size` by their natural logarithm.
# The columns are overwritten in place, so running this cell a second time
# applies the logarithm again.
for column in ('rental_price', 'size'):
    new_features[column] = np.log(new_features[column])

In [None]:
# Standardize each column to zero mean and unit standard deviation (z-score).
# Only numeric and boolean columns are used: `new_features` also carries
# columns that are never dropped (e.g. `name`, `address`, `feature`,
# `MRT_name`), and text cannot be averaged or subtracted.
# Casting to float lets indicator (boolean) columns take part in the
# arithmetic as well.
# NOTE(review): a column with zero variance becomes all-NaN here (0 / 0);
# drop such columns before distance or projection steps if any exist.
numeric_features = new_features.select_dtypes(include=['number', 'bool']).astype(float)
embedding = (numeric_features - numeric_features.mean()) / numeric_features.std()
# Alternative, currently disabled: keep only a hand-picked subset of columns.
#embedding = embedding[['rental_price', 'size', 'MRT_within_lkm', 'house_age', '小巷0/大路1', '0-14歲人口數', '15-64歲人口數', '65歲以上人口數']]

In [None]:
# Inspect which columns ended up in the standardized table.
embedding.columns

In [None]:
# Draw one histogram per column of the standardized table.
embedding.hist(figsize=(15, 12))
plt.show()

# Testing

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Rank all rows by cosine similarity to the first row (position 0),
# most similar first, and display them in that order.
query_row = embedding.iloc[[0]]  # list indexer keeps the row as a 2-D frame
scores = np.squeeze(cosine_similarity(embedding, query_row))
index = np.argsort(scores)[::-1]
embedding.iloc[index]

# Visualization

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [None]:
# Project the standardized table to two dimensions with PCA and with t-SNE.
# The t-SNE run is seeded (random_state=20) and initialised from PCA.
pca_model = PCA(n_components=2)
pca_results = pca_model.fit_transform(embedding)

tsne_model = TSNE(
    n_components=2,
    learning_rate='auto',
    init='pca',
    random_state=20,
)
tsne_results = tsne_model.fit_transform(embedding)

In [None]:
# Side-by-side scatter plots of the two 2-D projections (PCA left,
# t-SNE right).  The rows listed in HIGHLIGHT_ROWS are drawn again in blue
# on top of each panel so they can be located in both projections.
# NOTE(review): why row 8 is singled out is not visible here — confirm.
HIGHLIGHT_ROWS = (0, 8)

plt.style.use('ggplot')
plt.figure(figsize=(15, 12))

for panel, (title, points) in enumerate(
    [('PCA', pca_results), ('t-SNE', tsne_results)], start=1
):
    plt.subplot(1, 2, panel)
    plt.title(title)
    sc = plt.scatter(points[:, 0], points[:, 1], alpha=0.8, s=30)
    for row in HIGHLIGHT_ROWS:
        sc = plt.scatter(points[row, 0], points[row, 1], alpha=0.8, s=30, color='blue')

plt.show()

In [None]:
import tensorflow as tf