In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os,sys,inspect
# Directory of the file that `inspect` associates with the currently executing frame.
# NOTE(review): inside a notebook the frame has no real source file, so this
# presumably resolves relative to the kernel's working directory - confirm.
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
# Walk three directory levels up from current_dir.
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
# Put that ancestor first on the import search path; presumably it is the
# repository root that makes the `coord2vec` imports resolvable - verify.
sys.path.insert(0, os.path.join(parent_dir))

In [66]:
from unittest import TestCase
import re

from geopandas import GeoDataFrame
from shapely import wkt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from coord2vec.config import BUILDINGS_FEATURES_TABLE
from coord2vec.feature_extraction.feature_bundles import karka_bundle_features, create_building_features
from coord2vec.feature_extraction.features_builders import FeaturesBuilder

In [55]:
# Default location of the Beijing housing CSV on the original server; pass
# `csv_path` to get_csv_data() to read the file from anywhere else.
DEFAULT_BEIJING_CSV_PATH = "/data/home/morpheus/coord2vec_noam/coord2vec/evaluation/tasks/house_pricing/Housing price in Beijing.csv"


def get_csv_data(csv_path=DEFAULT_BEIJING_CSV_PATH): # -> Tuple[np.ndarray, pd.DataFrame, np.ndarray]
    """Load the Beijing housing CSV and split it into coordinates, features and target.

    Args:
        csv_path: path of the CSV file to read. Defaults to the original
            server location, so existing zero-argument calls keep working.

    Returns:
        A 3-tuple ``(coords, features, y)``:
        - coords: 1-D object array holding one ``(Lng, Lat)`` tuple per row.
        - features: DataFrame with the tabular feature columns, i.e. every CSV
          column except 'url', 'id', 'Lng', 'Lat', 'Cid', 'tradeTime' and the
          price columns. It is an independent copy, so callers may modify it
          without pandas SettingWithCopy warnings.
        - y: array with the "totalPrice" value of every row.
    """
    # NOTE(review): the file is read with the python engine and no explicit
    # encoding; the source file is presumably not UTF-8, in which case its
    # non-ASCII text will not decode correctly - confirm the real encoding
    # before relying on text columns.
    df = pd.read_csv(csv_path, engine='python')

    # Pair the two coordinate columns directly instead of a row-wise
    # DataFrame.apply, which is very slow on a large table.
    coords = pd.Series(list(zip(df['Lng'], df['Lat'])), index=df.index, dtype=object).values

    feature_columns = [
        "DOM", "followers", "square", "livingRoom", "drawingRoom", "kitchen", "bathRoom",
        "floor", "buildingType", "constructionTime", "renovationCondition", "buildingStructure",
        "ladderRatio", "elevator", "fiveYearsProperty", "subway", "district", "communityAverage",
    ]
    features = df[feature_columns].copy()

    y = df["totalPrice"].values
    return coords, features, y

In [110]:
# Load the per-row (Lng, Lat) coordinates, the tabular feature frame and the totalPrice targets.
coords, csv_features, y = get_csv_data()

In [111]:
# Some "floor" values are not written well: list the entries that consist of
# a single whitespace-separated token.
csv_features.loc[
    csv_features["floor"].map(lambda text: len(text.split()) == 1),
    "floor",
]

92235     �ֻ�ṹ
92251     �ֻ�ṹ
92267     �ֻ�ṹ
92270     ��Ͻṹ
92297     �ֻ�ṹ
92299     ��Ͻṹ
92300     �ֻ�ṹ
92304     ��Ͻṹ
92340     ��Ͻṹ
92349     �ֻ�ṹ
92356     ��Ͻṹ
92398     �ֻ�ṹ
92409     ��Ͻṹ
92414     �ֻ�ṹ
92467     �ֻ�ṹ
92520     ��Ͻṹ
92610     ��Ͻṹ
92660     ��Ͻṹ
92814     �ֻ�ṹ
92845     �ֻ�ṹ
92899     �ֻ�ṹ
113275    ��Ͻṹ
141376    �ֻ�ṹ
208214    ��Ͻṹ
220567    ��Ͻṹ
220569    ��Ͻṹ
220570    ��Ͻṹ
220603    ��Ͻṹ
224349    �ֻ�ṹ
243731    �ֻ�ṹ
244054    �ֻ�ṹ
245394    �ֻ�ṹ
Name: floor, dtype: object

In [103]:
def filter_beijing_csv_data(csv_features, filter_bad_floors=True):
    """Return a copy of the feature frame without rows whose "floor" is malformed.

    A "floor" value counts as well-formed when it is a string made of exactly
    two whitespace-separated tokens. Anything else (a single token, more than
    two tokens, a missing or non-string value) is treated as bad data.

    Args:
        csv_features: DataFrame of tabular features; it is never modified.
        filter_bad_floors: when True, drop the rows with a malformed "floor".
            Pass False to keep every row, e.g. when the "floor" column is
            not going to be used.

    Returns:
        A new DataFrame (always an independent copy) that keeps the original
        index labels of the surviving rows.
    """
    # Nothing to filter when filtering is disabled or the column is absent
    # (the latter mirrors the column check in reformat_bejing_csv_data).
    if not filter_bad_floors or "floor" not in csv_features.columns:
        return csv_features.copy()

    def _is_well_formed(value):
        # Guard against NaN / non-string cells, which have no .split().
        return isinstance(value, str) and len(value.split()) == 2

    # astype(bool) keeps the mask boolean even for an empty frame.
    keep_mask = csv_features["floor"].apply(_is_well_formed).astype(bool)
    # Copy so that later column assignments do not act on a view of the input.
    return csv_features.loc[keep_mask].copy()

In [104]:
def reformat_bejing_csv_data(csv_features_filtered):
    """Return a copy of the frame whose "floor" column holds only its second token.

    Each "floor" value is split on whitespace and replaced by the token at
    position 1 (still a string). Frames without a "floor" column are returned
    as an unchanged copy. The input frame is never modified.
    """
    result = csv_features_filtered.copy()
    if "floor" not in result.columns:
        return result

    def _second_token(text):
        return text.split()[1]

    result["floor"] = result["floor"].map(_second_token)
    return result

In [112]:
# Drop the rows with a malformed "floor" value, then reduce "floor" to its second token.
filtered_csv_features = reformat_bejing_csv_data(filter_beijing_csv_data(csv_features))
# Row counts after and before filtering, to see how many rows were dropped.
len(filtered_csv_features), len(csv_features)

(318819, 318851)

In [12]:
# Build the building-level feature definitions from the karka feature bundle.
geo_feats = create_building_features(karka_bundle_features)
# Wrap them in a FeaturesBuilder.
# NOTE(review): `cache_table` presumably names the table used to cache computed
# feature values - confirm against FeaturesBuilder.
builder = FeaturesBuilder(geo_feats, cache_table=BUILDINGS_FEATURES_TABLE)

In [21]:
# Not a correct use.
# NOTE(review): neither `cls` nor `additional_feats` is defined in any cell
# visible here, so unless they are defined elsewhere this line raises NameError
# on a fresh kernel. It looks like it was pasted from a test-class classmethod -
# confirm the intent, then fix or remove this cell.
cls.builder = FeaturesBuilder(additional_feats, cache_table=BUILDINGS_FEATURES_TABLE)

Unnamed: 0,url,id,Lng,Lat,Cid,tradeTime,DOM,followers,totalPrice,price,...,buildingType,constructionTime,renovationCondition,buildingStructure,ladderRatio,elevator,fiveYearsProperty,subway,district,communityAverage
0,https://bj.lianjia.com/chengjiao/101084782030....,101084782030,116.475489,40.019520,1111027376244,2016-08-09,1464.0,106,415.0,31680,...,1.0,2005,3,6,0.217,1.0,0.0,1.0,7,56021.0
1,https://bj.lianjia.com/chengjiao/101086012217....,101086012217,116.453917,39.881534,1111027381879,2016-07-28,903.0,126,575.0,43436,...,1.0,2004,4,6,0.667,1.0,1.0,0.0,7,71539.0
2,https://bj.lianjia.com/chengjiao/101086041636....,101086041636,116.561978,39.877145,1111040862969,2016-12-11,1271.0,48,1030.0,52021,...,4.0,2005,3,6,0.500,1.0,0.0,0.0,7,48160.0
3,https://bj.lianjia.com/chengjiao/101086406841....,101086406841,116.438010,40.076114,1111043185817,2016-09-30,965.0,138,297.5,22202,...,1.0,2008,1,6,0.273,1.0,0.0,0.0,6,51238.0
4,https://bj.lianjia.com/chengjiao/101086920653....,101086920653,116.428392,39.886229,1111027381174,2016-08-28,927.0,286,392.0,48396,...,4.0,1960,2,2,0.333,0.0,1.0,1.0,1,62588.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318846,https://bj.lianjia.com/chengjiao/BJYZ92311192....,BJYZ92311192,116.546899,39.755236,1111027377053,2016-06-10,1.0,8,350.0,20580,...,4.0,2003,1,6,0.333,0.0,1.0,0.0,3,36545.0
318847,https://bj.lianjia.com/chengjiao/BJYZ92320171....,BJYZ92320171,116.497474,39.810115,1111027377957,2016-06-10,,1,108.8,31006,...,1.0,2009,1,6,0.222,1.0,0.0,1.0,3,54282.0
318848,https://bj.lianjia.com/chengjiao/BJYZ92324217....,BJYZ92324217,116.497256,39.804081,1111027380056,2016-06-05,1.0,2,359.0,35138,...,4.0,2000,3,6,0.500,1.0,1.0,0.0,3,46927.0
318849,https://bj.lianjia.com/chengjiao/BJYZ92333313....,BJYZ92333313,116.501794,39.799347,1111027377054,2016-06-12,,4,720.0,40373,...,4.0,2003,4,2,0.500,0.0,1.0,0.0,3,54842.0


In [None]:
# Baseline: predict totalPrice from the tabular CSV features alone.

# Make every feature numeric. The reformatted "floor" column holds strings, and
# any value that cannot be parsed as a number becomes NaN.
features_numeric = filtered_csv_features.apply(pd.to_numeric, errors="coerce")
# LinearRegression cannot fit rows that contain NaN, so keep complete rows only.
# NOTE(review): this can discard a large share of the data if a column such as
# DOM is frequently missing - consider imputing instead.
features_complete = features_numeric.dropna()

# `y` still has one entry per row of the unfiltered frame. Select the targets of
# the surviving rows by index label so X and y stay aligned row by row.
targets = pd.Series(y, index=csv_features.index)
X = features_complete.values
y_model = targets.loc[features_complete.index].values
assert len(X) == len(y_model), "features and targets must have the same number of rows"

# Fixed seed so the split (and therefore the reported scores) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_model, random_state=42)
models = [LinearRegression()]
scores = []
for model in models:
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    # Mean squared error on the held-out split, one entry per model.
    scores.append(mean_squared_error(y_test, y_test_pred))
scores
