### Preparations with PyCharm

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits made to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os,sys,inspect
# Directory of the currently executing code, as an absolute path.
current_dir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
# Ancestor three levels above current_dir; it is placed first on the import
# search path so that imports resolve against it before anything else.
parent_dir = os.path.abspath(
    os.path.join(current_dir, os.pardir, os.pardir, os.pardir)
)
sys.path.insert(0, parent_dir)

### imports

In [2]:
from unittest import TestCase
import re

from geopandas import GeoDataFrame
from shapely import wkt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from coord2vec.config import BUILDINGS_FEATURES_TABLE
from coord2vec.feature_extraction.feature_bundles import karka_bundle_features, create_building_features
from coord2vec.feature_extraction.features_builders import FeaturesBuilder

  from tqdm.autonotebook import tqdm


### get csv data

In [73]:
def get_csv_data(csv_path=None): # -> Tuple[np.ndarray, pd.DataFrame]
    """Load the housing-price CSV and split it into coordinates and features.

    Args:
        csv_path: path of the CSV file to read. When omitted, the original
            hard-coded server location is used, so existing zero-argument
            calls keep working unchanged.

    Returns:
        A ``(coords, features)`` pair:
            coords   - 1-D object array holding one ``(Lng, Lat)`` tuple per row.
            features - an independent DataFrame with the selected columns
                       (per the original author's note: every CSV column except
                       'url', 'id', 'Lng', 'Lat', 'coord', 'Cid' and 'tradeTime').

    Raises:
        KeyError: if the file lacks 'Lng', 'Lat' or any of the feature columns.
    """
    server_csv_path = "/data/home/morpheus/coord2vec_noam/coord2vec/evaluation/tasks/house_pricing/Housing price in Beijing.csv"
    if csv_path is None:
        csv_path = server_csv_path

    feature_columns = ["DOM", "followers", "square" ,"livingRoom", "drawingRoom", "kitchen", "bathRoom",
                       "floor", "buildingType", "constructionTime", "renovationCondition", "buildingStructure", "ladderRatio",
                       "elevator", "fiveYearsProperty", "subway", "district", "communityAverage", "totalPrice"]

    df = pd.read_csv(csv_path, engine='python')

    # Pair the two coordinate columns directly instead of running a Python
    # lambda over every row; going through an object Series keeps the result a
    # 1-D array of tuples rather than a 2-D numeric array.
    coords = pd.Series(list(zip(df['Lng'], df['Lat'])), dtype=object).values

    # Explicit copy: later cleaning steps can assign into the frame without
    # pandas treating it as a slice of the raw table.
    features = df[feature_columns].copy()
    return coords, features

In [74]:
# Load the coordinates and the tabular feature columns from the housing CSV.
coords, csv_features = get_csv_data()

## cleaning the data

### generic clean function

In [112]:
def generic_clean_col(df, clean_funcs):
    ''' Run a pipeline of cleaning functions over a data frame, then zero-fill gaps.

        df - data frame to clean; this function never assigns into it (whether
             the individual cleaning functions do is up to them)
        clean_funcs - iterable of callables, each taking a data frame and
                      returning the cleaned data frame; they are applied in
                      iteration order, each receiving the previous one's output

        returns - the result of the last cleaning function with every remaining
                  missing value replaced by 0
    '''
    cleaned_df = df
    # Iterate directly so any iterable works (list, tuple, generator).
    for clean_func in clean_funcs:
        cleaned_df = clean_func(cleaned_df)
    return cleaned_df.fillna(0)

### clean floor column

In [None]:
# some floor values are not written well: they consist of a single
# whitespace-separated token
csv_features.loc[
    csv_features["floor"].apply(lambda floor: len(floor.split())) == 1,
    "floor",
]

In [118]:
def clean_floor_col(df):
    """Keep only rows with a well-formed "floor" value and reduce it to its second token.

    A well-formed value is a string made of exactly two whitespace-separated
    tokens. Rows whose "floor" is anything else (one token, more than two
    tokens, or a missing / non-string value) are dropped.

    Args:
        df: data frame with a "floor" column; it is left unmodified.

    Returns:
        A new data frame containing only the well-formed rows, whose "floor"
        column holds the second token (still as a string).
    """
    def _second_token(floor):
        # Missing cells are read from CSV as float NaN, which has no .split();
        # treat every non-string as incomplete instead of crashing.
        if not isinstance(floor, str):
            return None
        parts = floor.split()
        return parts[1] if len(parts) == 2 else None

    # Split each value once and reuse the result for both filter and rewrite.
    second_tokens = df["floor"].apply(_second_token)
    keep = second_tokens.notna()

    # remove data points with no complete data
    cleaned_df = df[keep].copy()
    cleaned_df["floor"] = second_tokens[keep]
    return cleaned_df

In [119]:
# Sanity check: compare the row count after cleaning the floor column with the
# row count of the unfiltered features.
cleaned_floor_col = clean_floor_col(csv_features)
len(cleaned_floor_col), len(csv_features)

(318819, 318851)

### clean constructionTime column

In [None]:
# some constructionTime values are not numeric strings
csv_features.loc[
    csv_features['constructionTime'].apply(lambda time: not time.isnumeric()),
    'constructionTime',
]

In [None]:
def clean_constructionTime_col(df):
    """Replace non-numeric "constructionTime" strings with 0.

    Only the "constructionTime" cell of an offending row is overwritten; every
    other column of that row keeps its value. Values that are not strings
    (for example missing cells read as NaN) are left untouched rather than
    raising.

    Args:
        df: data frame with a "constructionTime" column; it is left unmodified.

    Returns:
        A copy of ``df`` in which every string "constructionTime" value that
        fails ``str.isnumeric()`` has been replaced by the integer 0.
    """
    cleaned_df = df.copy()
    not_numeric = cleaned_df['constructionTime'].apply(
        lambda time: isinstance(time, str) and not time.isnumeric()
    )
    # Select the column explicitly: assigning through the row mask alone would
    # zero every column of the matching rows, including the price target.
    cleaned_df.loc[not_numeric, 'constructionTime'] = 0
    return cleaned_df

### clean the data using the generic clean function

In [120]:
# The cleaners are passed to generic_clean_col as an ordered list.
clean_funcs = [clean_floor_col, clean_constructionTime_col] # can add function if needed
cleaned_features = generic_clean_col(csv_features, clean_funcs)

## fit a simple linear regression on the data

In [None]:
# Predict totalPrice from every other cleaned column. The target column is
# removed from the design matrix: leaving it in lets the model read the answer
# straight from its inputs and makes the reported error meaningless.
X = cleaned_features.drop(columns=['totalPrice']).values
y = cleaned_features['totalPrice'].values
# Fixed random_state so the train/test split, and therefore the scores, are
# reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
models = [LinearRegression()]
scores = []  # test-set mean squared error, one entry per model, in `models` order
for model in models:
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)
    scores.append(mean_squared_error(y_test, y_test_pred))
scores

In [None]:
# NOTE(review): `filtered_csv_features` is not assigned in any cell visible in
# this notebook, so this line raises NameError on a fresh kernel unless the name
# is defined elsewhere — confirm which frame was meant.
# The cell also rebinds its own input, so a second run would raise KeyError
# because "floor" has already been dropped.
filtered_csv_features = filtered_csv_features.drop(columns=["floor"])