### Preparations with PyCharm

In [None]:
%load_ext autoreload
%autoreload 2

import os,sys,inspect
# Absolute location of the file (or notebook frame) that is executing this cell.
_this_file = os.path.abspath(inspect.getfile(inspect.currentframe()))
current_dir = os.path.dirname(_this_file)

# Walk three directory levels up from the current directory and put that
# directory at the front of the import search path.
# NOTE(review): presumably this is the repository root, so that the
# `coord2vec` package imported below can be resolved - confirm.
_one_level_up = os.path.dirname(current_dir)
_two_levels_up = os.path.dirname(_one_level_up)
parent_dir = os.path.dirname(_two_levels_up)
sys.path.insert(0, parent_dir)

### imports

In [None]:
from unittest import TestCase
import re

from geopandas import GeoDataFrame
from shapely import wkt
import pandas as pd
import numpy as np
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor


from coord2vec.config import BUILDINGS_FEATURES_TABLE
from coord2vec.feature_extraction.feature_bundles import karka_bundle_features, create_building_features
from coord2vec.feature_extraction.features_builders import FeaturesBuilder

### get csv data

In [None]:
def get_csv_data(use_full_dataset=True, csv_folder_path=None, subset_name="medium") -> pd.DataFrame:
    """Load the Beijing housing-price CSV and keep the columns used for modelling.

    Args:
        use_full_dataset: when True, read "Housing price in Beijing.csv";
            otherwise read "Housing price in Beijing {subset_name}.csv".
        csv_folder_path: directory containing the CSV files. When None, the
            server directory this notebook was originally written for is used.
        subset_name: suffix of the reduced CSV file; only used when
            ``use_full_dataset`` is False.

    Returns:
        A new data frame holding the tabular feature columns, a "coord"
        column of ``(Lng, Lat)`` tuples and the target column "totalPrice".
        The raw columns 'url', 'id', 'Lng', 'Lat', 'Cid' and 'tradeTime' are
        not part of the result.
    """
    import os

    if csv_folder_path is None:
        csv_folder_path = "/data/home/morpheus/coord2vec_noam/coord2vec/evaluation/tasks/house_pricing"

    if use_full_dataset:
        csv_name = "Housing price in Beijing.csv"
    else:
        csv_name = f"Housing price in Beijing {subset_name}.csv"
    df = pd.read_csv(os.path.join(csv_folder_path, csv_name), engine='python')

    # Pair the two coordinate columns directly; a row-wise DataFrame.apply
    # builds a Series per row and is far slower on the full dataset.
    df['coord'] = list(zip(df['Lng'], df['Lat']))

    selected_columns = [
        "DOM", "followers", "square", "livingRoom", "drawingRoom", "kitchen", "bathRoom",
        "floor", "buildingType", "constructionTime", "renovationCondition", "buildingStructure", "ladderRatio",
        "elevator", "fiveYearsProperty", "subway", "district", "communityAverage", "coord", "totalPrice",
    ]
    # Return a copy so that callers can add or modify columns without pandas
    # treating the result as a slice of the raw frame.
    return df[selected_columns].copy()

In [None]:
# Choose between the complete CSV (True) and the reduced file (False).
# The same flag is reused further down to decide whether the geographic
# features are computed in batches.
use_full_dataset = True
csv_features = get_csv_data(use_full_dataset=use_full_dataset)

## cleaning the data

### generic clean function

In [None]:
def generic_clean_col(df, clean_funcs):
    """Run a series of cleaning functions over a data frame, then fill the gaps.

    Args:
        df: data frame to clean; it is handed to the first cleaning function.
        clean_funcs: iterable of callables. Each one receives the data frame
            produced by the previous step and returns the cleaned data frame.
            They are applied in iteration order.

    Returns:
        The output of the last cleaning function (or ``df`` when
        ``clean_funcs`` is empty) with every remaining missing value
        replaced by 0.
    """
    # Iterate the callables directly: no index bookkeeping, and any iterable
    # (list, tuple, generator) is accepted.
    for clean_func in clean_funcs:
        df = clean_func(df)
    return df.fillna(0)

### clean floor column

In [None]:
# Some floor entries are not written well: show the "floor" values that
# consist of a single whitespace-separated token instead of two.
csv_features["floor"][csv_features["floor"].apply(lambda floor: len(floor.split()))==1]

In [None]:
def clean_floor_col(df):
    """Keep only rows with a well-formed "floor" value and reduce it to its second token.

    A well-formed value is a string made of exactly two whitespace-separated
    tokens; the "floor" column of the result holds the second token (still as
    a string). Rows whose value is missing, not a string, or has a different
    number of tokens are dropped.

    Args:
        df: data frame with a "floor" column. It is not modified.

    Returns:
        A new data frame containing the surviving rows, original index kept.
    """
    def _tokens(value):
        # Missing values arrive as NaN (a float) and have no .split();
        # treat anything that is not a string as having no tokens.
        return value.split() if isinstance(value, str) else []

    floor_tokens = df["floor"].apply(_tokens)
    is_complete = floor_tokens.apply(len) == 2

    # Filter first, then copy once: the result is an independent frame, so the
    # column assignment below cannot touch (or warn about) the caller's data.
    cleaned_df = df[is_complete].copy()
    cleaned_df["floor"] = floor_tokens[is_complete].apply(lambda parts: parts[1])
    return cleaned_df

In [None]:
# Apply the floor cleaning and compare the row counts after and before it.
cleaned_floor_col = clean_floor_col(csv_features)
len(cleaned_floor_col), len(csv_features)

### clean constructionTime column

In [None]:
# Some constructionTime values are not numeric: show the entries for which
# str.isnumeric() is False.
csv_features[csv_features['constructionTime'].apply(lambda time : not time.isnumeric())]['constructionTime']

In [None]:
def clean_constructionTime_col(df):
    """Replace unusable "constructionTime" values with 0.

    A value is kept when it is a string for which ``str.isnumeric()`` is True,
    or a non-missing number; every other value becomes the integer 0. Kept
    values are left exactly as they were (numeric strings stay strings).

    Args:
        df: data frame with a "constructionTime" column. It is not modified.

    Returns:
        A copy of ``df`` with the cleaned "constructionTime" column.
    """
    def _is_usable(value):
        if isinstance(value, str):
            return value.isnumeric()
        # Non-string entries have no .isnumeric(); accept real numbers only.
        return pd.api.types.is_number(value) and not pd.isna(value)

    cleaned_df = df.copy()
    construction_time = cleaned_df['constructionTime'].astype(object)
    usable = construction_time.apply(_is_usable).astype(bool)
    # Assign the whole column in one step. The previous chained form
    # `frame[col][mask] = 0` writes into a temporary object and has no effect
    # when pandas copy-on-write is active.
    cleaned_df['constructionTime'] = construction_time.where(usable, 0)
    return cleaned_df

### clean the data using the generic clean function

In [None]:
# Cleaning steps, applied in list order by generic_clean_col; after the last
# step the remaining missing values are filled with 0.
clean_funcs = [clean_floor_col, clean_constructionTime_col] # can add function if needed
cleaned_features = generic_clean_col(csv_features, clean_funcs)

### get geographical data

In [None]:
# Distinct coordinate tuples, in order of first appearance in the cleaned data.
unique_coords = cleaned_features["coord"].unique()

# One shapely point per distinct coordinate, built from the first two tuple entries.
shapely_coords_unique = [Point(*coord[:2]) for coord in unique_coords]

# Give each distinct coordinate its position in `unique_coords` as an id.
# Sanity check: len(unique_coords) == len(coord2coord_id)
coord2coord_id = {coord: coord_id for coord_id, coord in enumerate(unique_coords)}

# Tag every row with the id of its coordinate.
cleaned_features["coord_id"] = cleaned_features["coord"].apply(coord2coord_id.__getitem__)

In [None]:
# Feature builder for the selected feature bundle, constructed with the
# buildings feature table as its cache table.
geo_feats = create_building_features(karka_bundle_features)
builder = FeaturesBuilder(geo_feats, cache_table=BUILDINGS_FEATURES_TABLE)

# One geometry per distinct coordinate.
gdf = GeoDataFrame(pd.DataFrame({'geom': shapely_coords_unique}), geometry='geom')

batch_size = 10000
n_samples = len(gdf.geometry)
calculate_geo_features_with_batches = use_full_dataset
geo_results_list = []

# Batching is used only when it is requested AND there is more than one batch of data.
needs_batching = calculate_geo_features_with_batches and n_samples > batch_size
if needs_batching:
    for batch_start_ind in range(0, n_samples, batch_size):
        # The last batch may be shorter than batch_size.
        batch_end_ind = min(batch_start_ind + batch_size, n_samples)
        batch_geometries = gdf.geometry[batch_start_ind:batch_end_ind]
        geo_results_list.append(builder.transform(batch_geometries))
    # Stitch the per-batch results together and renumber the rows from 0.
    geo_results = pd.concat(geo_results_list).reset_index(drop=True)
else:
    geo_results = builder.transform(gdf.geometry)

# print(len(geo_results[0]) + len(geo_results[0]) + len(geo_results[0]), n_samples ))

In [None]:
# Debugging checks kept for reference:
# print(geo_results.shape[0], gdf.shape[0])
# print(geo_results.shape[1], len(builder.all_feat_names))
# print("index" in geo_results.columns, "coord_id" in cleaned_features.columns)
# Attach the geographic features to every row: a row's coord_id is matched
# against the index of geo_results. The left join keeps all rows of
# cleaned_features.
all_features = cleaned_features.merge(geo_results,left_on='coord_id', right_index=True, how='left')

## fit a simple linear regression and a CatBoost regressor on the data

In [None]:
# Number of boosting iterations for the CatBoost model.
n_cat_iter = 150

# Feature matrix: every column except the coordinate helpers and the target.
non_feature_columns = ["coord", "coord_id", "totalPrice"]
X = all_features.drop(columns=non_feature_columns).values
y = all_features['totalPrice'].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

models = [
    LinearRegression(),
    CatBoostRegressor(iterations=n_cat_iter, learning_rate=1, depth=3),
]

# Fit each model on the training split and record its test-set MSE,
# in the same order as `models`.
scores = []
for model in models:
    y_test_pred = model.fit(X_train, y_train).predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    scores.append(test_mse)

print("mean price - ", np.mean(y_test))
print(f"MSE: linear regression - {scores[0]}, catboost - {scores[1]}")
print(f"RMSE: linear regression - {np.sqrt(scores[0])}, catboost - {np.sqrt(scores[1])}")