### Preperations with pycharm

In [1]:
%load_ext autoreload
%autoreload 2

import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
sys.path.insert(0, os.path.join(parent_dir))

### imports

In [2]:
from unittest import TestCase
import re

from geopandas import GeoDataFrame
from shapely import wkt
import pandas as pd
import numpy as np
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor


from coord2vec.config import BUILDINGS_FEATURES_TABLE
from coord2vec.feature_extraction.feature_bundles import karka_bundle_features, create_building_features
from coord2vec.feature_extraction.features_builders import FeaturesBuilder

  from tqdm.autonotebook import tqdm


### get csv data

In [3]:
def get_csv_data(use_full_dataset=True): # -> Tuple[Tuple[float, float], pd.DataFrame, Any]:
    server_csv_filder_path = "/data/home/morpheus/coord2vec_noam/coord2vec/evaluation/tasks/house_pricing"
    if use_full_dataset:
        csv_path = f"{server_csv_filder_path}/Housing price in Beijing.csv"
    else:
        small_or_medium = "medium"
        csv_path = f"{server_csv_filder_path}/Housing price in Beijing {small_or_medium}.csv"
    df = pd.read_csv(csv_path, engine='python')
#     print(df)
    df['coord'] = df.apply(lambda row: tuple(row[['Lng', 'Lat']].values), axis=1)
    features = df[["DOM", "followers", "square" ,"livingRoom", "drawingRoom", "kitchen", "bathRoom",
                  "floor", "buildingType", "constructionTime", "renovationCondition", "buildingStructure", "ladderRatio",
                  "elevator", "fiveYearsProperty", "subway", "district", "communityAverage", "coord", "totalPrice"]]
    # in features all csv exept: 'url', 'id', 'Lng', 'Lat', 'coord', "Cid", "tradeTime", 
    return features

In [4]:
use_full_dataset = True
csv_features = get_csv_data(use_full_dataset=use_full_dataset)

## cleaning the data

### generic clean funcion

In [5]:
def generic_clean_col(df, clean_funcs):
    ''' df - data frame
        cols - list of strings contains cols that should be cleaned
        clean_funcs - list of funcs that clean cols that should be cleand in df 
    '''
    for i, col in enumerate(clean_funcs):
        df = clean_funcs[i](df)
    cleaned_df = df.fillna(0)
    return cleaned_df

### clean floor column

In [6]:
# some floors are not writen well
csv_features["floor"][csv_features["floor"].apply(lambda floor: len(floor.split()))==1]

92235     �ֻ�ṹ
92251     �ֻ�ṹ
92267     �ֻ�ṹ
92270     ��Ͻṹ
92297     �ֻ�ṹ
92299     ��Ͻṹ
92300     �ֻ�ṹ
92304     ��Ͻṹ
92340     ��Ͻṹ
92349     �ֻ�ṹ
92356     ��Ͻṹ
92398     �ֻ�ṹ
92409     ��Ͻṹ
92414     �ֻ�ṹ
92467     �ֻ�ṹ
92520     ��Ͻṹ
92610     ��Ͻṹ
92660     ��Ͻṹ
92814     �ֻ�ṹ
92845     �ֻ�ṹ
92899     �ֻ�ṹ
113275    ��Ͻṹ
141376    �ֻ�ṹ
208214    ��Ͻṹ
220567    ��Ͻṹ
220569    ��Ͻṹ
220570    ��Ͻṹ
220603    ��Ͻṹ
224349    �ֻ�ṹ
243731    �ֻ�ṹ
244054    �ֻ�ṹ
245394    �ֻ�ṹ
Name: floor, dtype: object

In [7]:
def clean_floor_col(df):
    # remove data points with no complete data
    cleaned_df = df.copy()
    cleaned_df = cleaned_df[cleaned_df["floor"].apply(lambda floor: len(floor.split()))==2]
    cleaned_df["floor"] = cleaned_df["floor"].apply(lambda floor: floor.split()[1])
    return cleaned_df

In [8]:
cleaned_floor_col = clean_floor_col(csv_features)
len(cleaned_floor_col), len(csv_features)

(318819, 318851)

### clean constructionTime column

In [9]:
# some constructionTime are not numeric
csv_features[csv_features['constructionTime'].apply(lambda time : not time.isnumeric())]['constructionTime']

41        δ֪
82        δ֪
101       δ֪
158       δ֪
160       δ֪
          ..
318825    δ֪
318828    δ֪
318833    δ֪
318839    δ֪
318850    δ֪
Name: constructionTime, Length: 19283, dtype: object

In [10]:
def clean_constructionTime_col(df):
    cleaned_df = df.copy()
    cleaned_df['constructionTime'][cleaned_df['constructionTime'].apply(lambda time : not time.isnumeric())] = 0
    return cleaned_df

### clean the data using the generic clean function

In [11]:
clean_funcs = [clean_floor_col, clean_constructionTime_col] # can add function if needed
cleaned_features = generic_clean_col(csv_features, clean_funcs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### get geographical data

In [12]:
# coords = cleaned_features["coord"].values
unique_coords = cleaned_features["coord"].unique()
shapely_coords_unique = [Point(coord[0], coord[1]) for coord in unique_coords]

coord2coord_id = {coord:i for i, coord in enumerate(cleaned_features["coord"].unique())}
# test - len(cleaned_features["coord"].unique()) == len(coord2coord_id)

cleaned_features["coord_id"] = cleaned_features["coord"].apply(lambda coord: coord2coord_id[coord])

In [13]:
geo_feats = create_building_features(karka_bundle_features)
builder = FeaturesBuilder(geo_feats, cache_table=BUILDINGS_FEATURES_TABLE)
gdf = GeoDataFrame(pd.DataFrame({'geom': shapely_coords_unique}), geometry='geom')
geo_results_list = []
batch_size = 10000
n_samples = len(gdf.geometry)
calculate_geo_features_with_batches = use_full_dataset
if (not calculate_geo_features_with_batches) or (n_samples<=batch_size):
    geo_results = builder.transform(gdf.geometry)
else:
    for batch_start_ind in range(0, n_samples, batch_size):
        batch_end_ind = batch_start_ind+batch_size if batch_start_ind+batch_size<n_samples else n_samples
        geo_results_list.append(builder.transform(gdf.geometry[batch_start_ind:batch_end_ind]))
    geo_results = pd.concat(geo_results_list)
    geo_results = geo_results.reset_index(drop=True)

# print(len(geo_results[0]) + len(geo_results[0]) + len(geo_results[0]), n_samples ))

                                                                     

HBox(children=(FloatProgress(value=0.0, description='Calculating intersection', max=9.0, style=ProgressStyle(d…



HBox(children=(FloatProgress(value=0.0, description='Calculating Features for 4034 geoms', max=34.0, style=Pro…

Inserting Features to building_features:   0%|          | 0/34 [00:00<?, ?feature/s]



                                                                                             

In [14]:
# print(geo_results.shape[0], gdf.shape[0])
# print(geo_results.shape[1], len(builder.all_feat_names))
# print("index" in geo_results.columns, "coord_id" in cleaned_features.columns)
all_features = cleaned_features.merge(geo_results,left_on='coord_id', right_index=True, how='left')

## fit a simple linear regression on the data

In [18]:
X = all_features.drop(columns=["coord", "coord_id", "totalPrice"]).values
y = all_features['totalPrice'].values
X_train, X_test, y_train, y_test = train_test_split(X, y)
n_cat_iter = 150
models = [LinearRegression(),
          CatBoostRegressor(iterations=n_cat_iter, learning_rate=1, depth=3)]
scores = []
for model in models:
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    scores.append(mean_squared_error(y_test, y_test_pred))
print("mean price - ", np.mean(y_test))
print(f"MSE: linear regression - {scores[0]}, catboost - {scores[1]}")
print(f"RMSE: linear regression - {np.sqrt(scores[0])}, catboost - {np.sqrt(scores[1])}")

0:	learn: 159.3914702	total: 10.9ms	remaining: 1.63s
1:	learn: 133.9659971	total: 21.6ms	remaining: 1.6s
2:	learn: 123.6800959	total: 31.8ms	remaining: 1.56s
3:	learn: 117.8735169	total: 56.3ms	remaining: 2.05s
4:	learn: 112.5187110	total: 66.9ms	remaining: 1.94s
5:	learn: 109.3993985	total: 77.7ms	remaining: 1.86s
6:	learn: 106.6194170	total: 89ms	remaining: 1.82s
7:	learn: 105.0817476	total: 99.7ms	remaining: 1.77s
8:	learn: 103.9586299	total: 111ms	remaining: 1.73s
9:	learn: 102.7758533	total: 122ms	remaining: 1.71s
10:	learn: 101.8256744	total: 133ms	remaining: 1.68s
11:	learn: 99.3688980	total: 145ms	remaining: 1.67s
12:	learn: 98.0116059	total: 163ms	remaining: 1.72s
13:	learn: 96.8390057	total: 172ms	remaining: 1.67s
14:	learn: 96.2541806	total: 180ms	remaining: 1.62s
15:	learn: 95.4283284	total: 189ms	remaining: 1.58s
16:	learn: 94.7097625	total: 196ms	remaining: 1.54s
17:	learn: 92.9934356	total: 204ms	remaining: 1.5s
18:	learn: 91.9423734	total: 222ms	remaining: 1.53s
19:	lea