### Preperations with pycharm

In [24]:
%load_ext autoreload
%autoreload 2

import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(current_dir)))
sys.path.insert(0, os.path.join(parent_dir))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### imports

In [25]:
from unittest import TestCase
import re

from geopandas import GeoDataFrame
from shapely import wkt
import pandas as pd
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from coord2vec.config import BUILDINGS_FEATURES_TABLE
from coord2vec.feature_extraction.feature_bundles import karka_bundle_features, create_building_features
from coord2vec.feature_extraction.features_builders import FeaturesBuilder

### get csv data

In [26]:
def get_csv_data(): # -> Tuple[Tuple[float, float], pd.DataFrame, Any]:

    server_csv_path = "/data/home/morpheus/coord2vec_noam/coord2vec/evaluation/tasks/house_pricing/Housing price in Beijing.csv"
    df = pd.read_csv(server_csv_path, engine='python')
#     print(df)
    df['coord'] = df.apply(lambda row: tuple(row[['Lng', 'Lat']].values), axis=1)
    features = df[["DOM", "followers", "square" ,"livingRoom", "drawingRoom", "kitchen", "bathRoom",
                  "floor", "buildingType", "constructionTime", "renovationCondition", "buildingStructure", "ladderRatio",
                  "elevator", "fiveYearsProperty", "subway", "district", "communityAverage", "coord", "totalPrice"]]
    # in features all csv exept: 'url', 'id', 'Lng', 'Lat', 'coord', "Cid", "tradeTime", 
    return features

In [27]:
csv_features = get_csv_data()

## cleaning the data

### generic clean funcion

In [28]:
def generic_clean_col(df, clean_funcs):
    ''' df - data frame
        cols - list of strings contains cols that should be cleaned
        clean_funcs - list of funcs that clean cols that should be cleand in df 
    '''
    for i, col in enumerate(clean_funcs):
        df = clean_funcs[i](df)
    cleaned_df = df.fillna(0)
    return cleaned_df

### clean floor column

In [29]:
# some floors are not writen well
csv_features["floor"][csv_features["floor"].apply(lambda floor: len(floor.split()))==1]

92235     �ֻ�ṹ
92251     �ֻ�ṹ
92267     �ֻ�ṹ
92270     ��Ͻṹ
92297     �ֻ�ṹ
92299     ��Ͻṹ
92300     �ֻ�ṹ
92304     ��Ͻṹ
92340     ��Ͻṹ
92349     �ֻ�ṹ
92356     ��Ͻṹ
92398     �ֻ�ṹ
92409     ��Ͻṹ
92414     �ֻ�ṹ
92467     �ֻ�ṹ
92520     ��Ͻṹ
92610     ��Ͻṹ
92660     ��Ͻṹ
92814     �ֻ�ṹ
92845     �ֻ�ṹ
92899     �ֻ�ṹ
113275    ��Ͻṹ
141376    �ֻ�ṹ
208214    ��Ͻṹ
220567    ��Ͻṹ
220569    ��Ͻṹ
220570    ��Ͻṹ
220603    ��Ͻṹ
224349    �ֻ�ṹ
243731    �ֻ�ṹ
244054    �ֻ�ṹ
245394    �ֻ�ṹ
Name: floor, dtype: object

In [30]:
def clean_floor_col(df):
    # remove data points with no complete data
    cleaned_df = df.copy()
    cleaned_df = cleaned_df[cleaned_df["floor"].apply(lambda floor: len(floor.split()))==2]
    cleaned_df["floor"] = cleaned_df["floor"].apply(lambda floor: floor.split()[1])
    return cleaned_df

In [31]:
cleaned_floor_col = clean_floor_col(csv_features)
len(cleaned_floor_col), len(csv_features)

(318819, 318851)

### clean constructionTime column

In [32]:
# some constructionTime are not numeric
csv_features[csv_features['constructionTime'].apply(lambda time : not time.isnumeric())]['constructionTime']

41        δ֪
82        δ֪
101       δ֪
158       δ֪
160       δ֪
          ..
318825    δ֪
318828    δ֪
318833    δ֪
318839    δ֪
318850    δ֪
Name: constructionTime, Length: 19283, dtype: object

In [33]:
def clean_constructionTime_col(df):
    cleaned_df = df.copy()
    cleaned_df[cleaned_df['constructionTime'].apply(lambda time : not time.isnumeric())]['constructionTime'] = 0
    return cleaned_df

### clean the data using the generic clean function

In [34]:
clean_funcs = [clean_floor_col, clean_constructionTime_col] # can add function if needed
cleaned_features = generic_clean_col(csv_features, clean_funcs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


### get geographical data

In [37]:
# for memory reasons choose only n_choose examples
n_choose = 10000
cleaned_features = cleaned_features[:n_choose]

In [38]:
coords = cleaned_features["coord"].values
shapely_coords = [Point(coord[0], coord[1]) for coord in coords]

geo_feats = create_building_features(karka_bundle_features)
builder = FeaturesBuilder(geo_feats, cache_table=BUILDINGS_FEATURES_TABLE)
gdf = GeoDataFrame(pd.DataFrame({'geom': shapely_coords}), geometry='geom')
geo_results = builder.transform(gdf.geometry)


Checking loaded features:   0%|          | 0/34 [00:00<?, ?feature/s][A
                                                                     [A

HBox(children=(FloatProgress(value=0.0, description='Calculating intersection', max=9.0, style=ProgressStyle(d…



HBox(children=(FloatProgress(value=0.0, description='Calculating Features for 10000 geoms', max=27.0, style=Pr…




Inserting Features to building_features:   0%|          | 0/27 [00:00<?, ?feature/s][A
Inserting Features to building_features:   4%|▎         | 1/27 [00:05<02:16,  5.26s/feature][A
Inserting Features to building_features:   7%|▋         | 2/27 [00:09<02:01,  4.88s/feature][A
Inserting Features to building_features:  11%|█         | 3/27 [00:13<01:50,  4.61s/feature][A
Inserting Features to building_features:  15%|█▍        | 4/27 [00:17<01:43,  4.49s/feature][A
Inserting Features to building_features:  19%|█▊        | 5/27 [00:21<01:35,  4.35s/feature][A
Inserting Features to building_features:  22%|██▏       | 6/27 [00:25<01:29,  4.24s/feature][A
Inserting Features to building_features:  26%|██▌       | 7/27 [00:29<01:21,  4.09s/feature][A
Inserting Features to building_features:  30%|██▉       | 8/27 [00:33<01:16,  4.03s/feature][A
Inserting Features to building_features:  33%|███▎      | 9/27 [00:36<01:11,  3.98s/feature][A
Inserting Features to building_features:  37%|█

In [None]:
print(geo_results.shape[0], gdf.shape[0])
print(geo_results.shape[1], len(builder.all_feat_names))
# geo_results

In [44]:
## fit a simple linear regression on the data

88908 10000
34 34


## fit a simple linear regression on the data

In [10]:
X = cleaned_features.values
y = cleaned_features['totalPrice'].values
X_train, X_test, y_train, y_test = train_test_split(X, y)
models = [LinearRegression()]
scores = []
for model in models:
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    scores.append(mean_squared_error(y_test, y_test_pred))
scores

NameError: name 'cleaned_features' is not defined

In [None]:
filtered_csv_features = filtered_csv_features.drop(columns=["floor"])