### Preperations with pycharm

In [1]:
%load_ext autoreload
%autoreload 2

import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
sys.path.insert(0, os.path.join(parent_dir))

### imports

In [2]:
from unittest import TestCase
import re

from geopandas import GeoDataFrame
from shapely import wkt
import pandas as pd
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from coord2vec.config import BUILDINGS_FEATURES_TABLE
from coord2vec.feature_extraction.feature_bundles import karka_bundle_features, create_building_features
from coord2vec.feature_extraction.features_builders import FeaturesBuilder

  from tqdm.autonotebook import tqdm


### get csv data

In [3]:
def get_csv_data(get_full_data=True): # -> Tuple[Tuple[float, float], pd.DataFrame, Any]:
    server_csv_filder_path = "/data/home/morpheus/coord2vec_noam/coord2vec/evaluation/tasks/house_pricing"
    if get_full_data:
        csv_path = f"{server_csv_filder_path}/Housing price in Beijing.csv"
    else:
        csv_path = f"{server_csv_filder_path}/Housing price in Beijing small.csv"
    df = pd.read_csv(csv_path, engine='python')
#     print(df)
    df['coord'] = df.apply(lambda row: tuple(row[['Lng', 'Lat']].values), axis=1)
    features = df[["DOM", "followers", "square" ,"livingRoom", "drawingRoom", "kitchen", "bathRoom",
                  "floor", "buildingType", "constructionTime", "renovationCondition", "buildingStructure", "ladderRatio",
                  "elevator", "fiveYearsProperty", "subway", "district", "communityAverage", "coord", "totalPrice"]]
    # in features all csv exept: 'url', 'id', 'Lng', 'Lat', 'coord', "Cid", "tradeTime", 
    return features

In [4]:
csv_features = get_csv_data(get_full_data=False)

## cleaning the data

### generic clean funcion

In [5]:
def generic_clean_col(df, clean_funcs):
    ''' df - data frame
        cols - list of strings contains cols that should be cleaned
        clean_funcs - list of funcs that clean cols that should be cleand in df 
    '''
    for i, col in enumerate(clean_funcs):
        df = clean_funcs[i](df)
    cleaned_df = df.fillna(0)
    return cleaned_df

### clean floor column

In [6]:
# some floors are not writen well
csv_features["floor"][csv_features["floor"].apply(lambda floor: len(floor.split()))==1]

Series([], Name: floor, dtype: object)

In [7]:
def clean_floor_col(df):
    # remove data points with no complete data
    cleaned_df = df.copy()
    cleaned_df = cleaned_df[cleaned_df["floor"].apply(lambda floor: len(floor.split()))==2]
    cleaned_df["floor"] = cleaned_df["floor"].apply(lambda floor: floor.split()[1])
    return cleaned_df

In [8]:
cleaned_floor_col = clean_floor_col(csv_features)
len(cleaned_floor_col), len(csv_features)

(1000, 1000)

### clean constructionTime column

In [9]:
# some constructionTime are not numeric
csv_features[csv_features['constructionTime'].apply(lambda time : not time.isnumeric())]['constructionTime']

41     δ֪
82     δ֪
101    δ֪
158    δ֪
160    δ֪
189    δ֪
251    δ֪
272    δ֪
293    δ֪
346    δ֪
375    δ֪
455    δ֪
483    δ֪
496    δ֪
553    δ֪
578    δ֪
579    δ֪
704    δ֪
742    δ֪
771    δ֪
788    δ֪
826    δ֪
842    δ֪
949    δ֪
955    δ֪
Name: constructionTime, dtype: object

In [10]:
def clean_constructionTime_col(df):
    cleaned_df = df.copy()
    cleaned_df['constructionTime'][cleaned_df['constructionTime'].apply(lambda time : not time.isnumeric())] = 0
    return cleaned_df

### clean the data using the generic clean function

In [11]:
clean_funcs = [clean_floor_col, clean_constructionTime_col] # can add function if needed
cleaned_features = generic_clean_col(csv_features, clean_funcs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### get geographical data

In [12]:
# coords = cleaned_features["coord"].values
unique_coords = cleaned_features["coord"].unique()
shapely_coords_unique = [Point(coord[0], coord[1]) for coord in unique_coords]

coord2coord_id = {coord:i for i, coord in enumerate(cleaned_features["coord"].unique())}
# test - len(cleaned_features["coord"].unique()) == len(coord2coord_id)

cleaned_features["coord_id"] = cleaned_features["coord"].apply(lambda coord: coord2coord_id[coord])

In [13]:
geo_feats = create_building_features(karka_bundle_features)
builder = FeaturesBuilder(geo_feats, cache_table=BUILDINGS_FEATURES_TABLE)
gdf = GeoDataFrame(pd.DataFrame({'geom': shapely_coords_unique}), geometry='geom')
geo_results = builder.transform(gdf.geometry)
geo_results = geo_results.reset_index()

                                                                     

In [14]:
print(geo_results.shape[0], gdf.shape[0])
print(geo_results.shape[1], len(builder.all_feat_names))
print("index" in geo_results.columns, "coord_id" in cleaned_features.columns)
all_features = cleaned_features.merge(geo_results,left_on='coord_id', right_on='index', how='left')
all_features.columns

693 693
35 34
True True


Index(['DOM', 'followers', 'square', 'livingRoom', 'drawingRoom', 'kitchen',
       'bathRoom', 'floor', 'buildingType', 'constructionTime',
       'renovationCondition', 'buildingStructure', 'ladderRatio', 'elevator',
       'fiveYearsProperty', 'subway', 'district', 'communityAverage', 'coord',
       'totalPrice', 'coord_id', 'index', 'distance_to_major_road_100m',
       'length_of_major_road_100m', 'area_of_nearest_major_road_100m',
       'distance_to_major_road_250m', 'length_of_major_road_250m',
       'area_of_nearest_major_road_250m', 'distance_to_major_road_500m',
       'length_of_major_road_500m', 'area_of_nearest_major_road_500m',
       'distance_to_major_road_1000m', 'length_of_major_road_1000m',
       'area_of_nearest_major_road_1000m', 'distance_to_minor_road_50m',
       'length_of_minor_road_50m', 'area_of_nearest_minor_road_50m',
       'distance_to_minor_road_100m', 'length_of_minor_road_100m',
       'area_of_nearest_minor_road_100m', 'distance_to_minor_road_250

## fit a simple linear regression on the data

In [16]:
X = all_features.drop(columns=["coord", "coord_id", "index", "totalPrice"]).values
y = all_features['totalPrice'].values
X_train, X_test, y_train, y_test = train_test_split(X, y)
models = [LinearRegression()]
scores = []
for model in models:
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    scores.append(mean_squared_error(y_test, y_test_pred))
scores

[24208.409242781807]