In [1]:
import numpy as np
import pandas as pd
import json
import pickle
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Use matplotlib's built-in 'default' style sheet for every figure in this notebook.
plt.style.use('default')

In [3]:
import os
import sys

# Make the sibling `shared_libs` directory importable. abspath('') resolves
# against the current working directory, so the resulting path depends on
# where the kernel was started.
sys.path.append(
    os.path.join(os.path.abspath(''), '..', 'shared_libs')
)
import data_transform

In [4]:
# Read the listings CSV and preview its first five rows.
# NOTE(review): the captured preview shows an `Unnamed: 0` column, which
# suggests the CSV was written with its index; confirm that the cleaning
# steps below drop it before it can reach the models as a feature.
df = pd.read_csv(filepath_or_buffer='data/data_target_cleared.csv')
df.head(n=5)

Unnamed: 0,status,private pool,propertyType,street,baths,homeFacts,fireplace,city,schools,sqft,zipcode,beds,state,stories,mls-id,PrivatePool,MlsId,target
0,for sale,,multi-family,803 Passmore St,2 Baths,"{'atAGlanceFacts': [{'factValue': '1950', 'fac...",yes,Philadelphia,"[{'rating': ['3/10', '2/10', '3/10'], 'data': ...","1,350 sqft",19111,3 Beds,PA,2.0,,,PAPH853202,195000.0
1,for sale,,lot/land,3609 Summit Ave,,"{'atAGlanceFacts': [{'factValue': '', 'factLab...",,Greensboro,"[{'rating': ['5/10', '5/10', '4/10'], 'data': ...",,27405,,NC,,,,930640,199000.0
2,Active,,Land,4011 Valley Vista Dr,,"{'atAGlanceFacts': [{'factValue': '', 'factLab...",,Lowell,"[{'rating': ['7', '6'], 'data': {'Distance': [...",0,49331,,MI,,,,19058608,189900.0
3,For sale,,Single Family,118 S 17th St,Bathrooms: 2,"{'atAGlanceFacts': [{'factValue': '1909', 'fac...",,Allentown,"[{'rating': ['2/10', '2/10', '2/10'], 'data': ...","Total interior livable area: 1,617 sqft",18104,3 bd,PA,2.0,628531.0,,,189000.0
4,for sale,,single-family home,6820 Quincy St,4 Baths,"{'atAGlanceFacts': [{'factValue': '1925', 'fac...",yes,Philadelphia,"[{'rating': ['3/10', '1/10'], 'data': {'Distan...","5,013 sqft",19119,6 Beds,PA,3.0,,,PAPH851112,579000.0


In [5]:
# First preprocessing step: pass the raw frame through the shared
# `clear_data_base_line` helper and rebind `df` to its result.
# Because `df` is overwritten, re-running this cell on its own feeds the
# already-transformed frame back into the helper.
# NOTE(review): the helper lives in shared_libs/data_transform and is not
# visible here. Presumably `can_drop_rows=True` lets it discard rows and
# `force_rebuild_cached_data=True` makes it recompute what it stores in the
# default-values pickle; confirm against the helper.
df = data_transform.clear_data_base_line(
    df, 
    '../shared_libs/data/default_values.pkl',
    can_drop_rows=True, 
    force_rebuild_cached_data=True
)

In [6]:
# Obtain the four lookup objects that the later city/address steps take as
# arguments (`cities_dict`, `address_dict`, `address_by_zip_dict`,
# `cities_clusters_dict`).
# NOTE(review): presumably each getter reads the given pickle and
# `force_read=True` bypasses any in-memory cache; the getters are defined in
# data_transform, so verify there.
cities_dict = data_transform.get_cities_dict('../shared_libs/data/cities_dict.pkl', force_read=True)
address_dict = data_transform.get_addresses_dict('../shared_libs/data/address_dict.pkl', force_read=True)
address_by_zip_dict = data_transform.get_address_by_zipcode_dict('../shared_libs/data/address_by_zip_dict.pkl', force_read=True)
cities_clusters_dict = data_transform.get_citiess_clusters_dict('../shared_libs/data/cities_clusters_dict.pkl', force_read=True)

In [7]:
# Pass the frame and `cities_dict` to the shared
# `fix_incorrect_states_and_cities` helper and rebind `df` to the result.
# NOTE(review): judging by the name, this corrects state/city values using
# the cities lookup and, with `can_drop_rows=True`, may drop rows it cannot
# fix; confirm in data_transform.
df = data_transform.fix_incorrect_states_and_cities(
    df, 
    '../shared_libs/data/default_values.pkl', 
    cities_dict, 
    can_drop_rows=True
)

In [8]:
# Pass the frame plus all four lookup objects to the shared
# `add_city_features` helper and rebind `df` to the result.
# NOTE(review): presumably this adds city/address-derived feature columns;
# the exact columns and the effect of `force_rebuild_cached_data=True` are
# defined in data_transform and should be confirmed there.
df = data_transform.add_city_features(
    df,
    '../shared_libs/data/default_values.pkl', 
    cities_dict, 
    address_dict,
    address_by_zip_dict,
    cities_clusters_dict,
    force_rebuild_cached_data=True
)

In [9]:
# Pass the frame, the default-values pickle path and the `uscities.csv` path
# to the shared `add_population_features` helper; rebind `df` to the result.
# NOTE(review): presumably this joins population data from uscities.csv onto
# the listings and may drop unmatched rows (`can_drop_rows=True`); confirm in
# data_transform.
df = data_transform.add_population_features(
    df, 
    '../shared_libs/data/default_values.pkl', 
    '../shared_libs/data/uscities.csv', 
    can_drop_rows=True,
    force_rebuild_cached_data=True
)

In [10]:
# Pass the frame through the shared `encode_state_and_city` helper and rebind
# `df` to the result.
# NOTE(review): by its name this encodes the state and city columns; the
# encoder used and whether it is fitted with the target are not visible from
# this notebook, so check data_transform.
df = data_transform.encode_state_and_city(
    df, 
    '../shared_libs/data/default_values.pkl',
    can_drop_rows=True, 
    force_rebuild_cached_data=True
)

In [11]:
# Last preprocessing step: pass the frame through the shared
# `final_tune_pca_and_scale` helper and rebind `df` to the result.
# NOTE(review): by its name this applies PCA and scaling. It runs on the whole
# frame before the train/test split further down; if the transforms are fitted
# here on all rows, information from the test rows leaks into preprocessing.
# Confirm how the helper fits them.
df = data_transform.final_tune_pca_and_scale(
    df, 
    '../shared_libs/data/default_values.pkl', 
    force_rebuild_cached_data=True
)

In [12]:
#####################################################################################################################
# Modelling: train/test split, then fit, evaluate and pickle each regressor
#####################################################################################################################

In [13]:
# Take an independent (deep) copy of the preprocessed frame for modelling.
df_base = df.copy(deep=True)

In [14]:
# Separate the feature matrix from the regression target column.
X = df_base.drop(columns=['target'])
y = df_base['target']

In [15]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [16]:
# Baseline: ordinary least squares fitted on the training split and scored on
# the held-out split. `fit` returns the estimator itself, so the chained form
# leaves `model` bound to the fitted regressor.
model = LinearRegression().fit(X_train, y_train)

y_pred_test = model.predict(X_test)

# MAPE is reported in percent; RMSE is the square root of the MSE.
test_mape_pct = mean_absolute_percentage_error(y_test, y_pred_test) * 100
test_rmse = mean_squared_error(y_test, y_pred_test) ** 0.5

print('MAPE:', test_mape_pct)
print('RMSE:', test_rmse)

MAPE: 89.69147320782015
RMSE: 446652.0059706066


In [17]:
# Pickle whatever estimator `model` currently refers to (the linear regression
# when the cells are run in order) into the shared models directory.
lr_model_path = '../shared_libs/data/models/model_lr.pkl'
with open(lr_model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [18]:
# Random forest: 100 trees capped at depth 15, seeded for reproducibility and
# trained on all available cores.
rf_params = {
    'n_estimators': 100,
    'max_depth': 15,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Report train metrics first, then test metrics, so over-fitting is visible
# at a glance. MAPE is in percent; RMSE is the square root of the MSE.
for split_name, y_true, y_hat in (
    ('Train', y_train, y_pred_train),
    ('Test', y_test, y_pred_test),
):
    print(f'{split_name} MAPE:', mean_absolute_percentage_error(y_true, y_hat)*100)
    print(f'{split_name} RMSE:', mean_squared_error(y_true, y_hat)**0.5)

Train MAPE: 49.846199682951465
Train RMSE: 246257.31453057274
Test MAPE: 54.80158426129014
Test RMSE: 325080.6969704045


In [19]:
# Pickle whatever estimator `model` currently refers to (the random forest
# when the cells are run in order) into the shared models directory.
rfr_model_path = '../shared_libs/data/models/model_rfr.pkl'
with open(rfr_model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [20]:
# AdaBoost over deep decision trees.
# NOTE(review): the trailing numbers in the comments below were carried over
# from the original cell; they look like previously tried settings.
base_tree = DecisionTreeRegressor(
    min_samples_leaf=4,  # 3
    max_depth=27,  # 28
    random_state=42,
)
model = AdaBoostRegressor(
    base_tree,
    random_state=42,
    learning_rate=0.0001,  # 0.01
    n_estimators=100,
)
model.fit(X_train, y_train)

y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# MAPE is reported in percent; RMSE is the square root of the MSE.
train_mape_pct = mean_absolute_percentage_error(y_train, y_pred_train) * 100
train_rmse = mean_squared_error(y_train, y_pred_train) ** 0.5
test_mape_pct = mean_absolute_percentage_error(y_test, y_pred_test) * 100
test_rmse = mean_squared_error(y_test, y_pred_test) ** 0.5

print('Train MAPE:', train_mape_pct)
print('Train RMSE:', train_rmse)
print('Test MAPE:', test_mape_pct)
print('Test RMSE:', test_rmse)

Train MAPE: 15.160181354365495
Train RMSE: 166241.58117034868
Test MAPE: 30.176375171879172
Test RMSE: 310733.2767712826


In [21]:
# Pickle whatever estimator `model` currently refers to (the AdaBoost
# regressor when the cells are run in order) into the shared models directory.
abr_model_path = '../shared_libs/data/models/model_abr.pkl'
with open(abr_model_path, 'wb') as model_file:
    pickle.dump(model, model_file)