In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn import ensemble
import time

from catboost import CatBoostRegressor

import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor

In [2]:
# Let long frames/series print up to 500 rows before pandas truncates them.
pd.options.display.max_rows = 500

In [3]:
# Front-end feature table; later cells look houses up in it by PID.
front_end = pd.read_csv(filepath_or_buffer='../data/ames_housing_price_data_v6.csv')

In [4]:
# Sale prices keyed by PID (only these two columns of the v5 file are kept).
sp = pd.read_csv('../data/ames_housing_price_data_v5.csv').loc[:, ['PID', 'SalePrice']]

In [5]:
def dummify(df, non_dummies, dummies):
    """Encode categorical columns into a fixed, caller-specified set of 0/1 columns.

    Every name in ``dummies`` is expected to have the form ``<original>_<value>``
    where ``<original>`` is one of ``non_dummies``. For each such name a column
    is created that is 1 where ``df[<original>] == <value>`` and 0 elsewhere
    (missing values therefore become 0). Dummy names that match no entry of
    ``non_dummies`` are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``non_dummies``. It is not
        modified; the encoding is done on a copy.
    non_dummies : list of str
        Categorical columns to encode and then drop.
    dummies : list of str
        Names of the indicator columns to create, in the order they should
        be added to the frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the indicator columns added and the
        ``non_dummies`` columns removed.
    """
    encoded = df.copy()
    for dummified in dummies:
        for original in non_dummies:
            prefix = f'{original}_'
            # Match on the prefix rather than on any substring: otherwise a
            # column called e.g. 'Type' would also claim 'BldgType_Duplex'
            # and overwrite the correct indicator with zeros.
            if dummified.startswith(prefix):
                value = dummified[len(prefix):]
                encoded[dummified] = (encoded[original] == value).astype('int64')
    return encoded.drop(non_dummies, axis=1)

In [6]:
# Categorical feature -> the levels that get their own 0/1 indicator column.
# Insertion order matters: it fixes the order of the generated column names.
_dummy_levels = {
    'Neighborhood': [
        'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor',
        'Edwards', 'Gilbert', 'Greens', 'GrnHill', 'IDOTRR', 'Landmrk',
        'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge',
        'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst',
        'StoneBr', 'Timber', 'Veenker',
    ],
    'BldgType': ['2fmCon', 'Duplex', 'Twnhs', 'TwnhsE'],
    'MasVnrType': ['None', 'Stone'],
}

# Indicator column names of the form '<feature>_<level>'.
dummies = [
    f'{feature}_{level}'
    for feature, levels in _dummy_levels.items()
    for level in levels
]

# The raw categorical columns that the indicators replace.
non_dummies = list(_dummy_levels)

In [7]:
def FC_analysis(PID, rowname):
    """Sweep one feature of a single house and predict its price at every step.

    The first row of ``front_end`` whose ``PID`` matches is copied 20 times;
    in copy ``i`` (0..19) the ``rowname`` value is replaced by
    ``value*0.5 + value*i/20``, i.e. 50% to 145% of the original in 5% steps.
    The copies are appended to ``front_end``, the combined frame is converted
    to the back-end feature layout (quality-discrepancy columns + dummies) and
    scored with the CatBoost model stored in the file ``HousePriceCatBoost``.

    Parameters
    ----------
    PID
        Value compared against ``front_end['PID']`` to pick the house. If
        several rows share the PID only the first one is swept.
    rowname : str
        Name of the ``front_end`` column to vary; its value must support
        multiplication by a float.

    Returns
    -------
    pandas.DataFrame
        All rows of ``front_end`` followed by the 20 synthetic rows, with the
        pre-reset row labels in an ``index`` column and the model output in
        ``pred_price``.

    Raises
    ------
    KeyError
        If ``rowname`` is not a column of ``front_end``.
    IndexError
        If no row of ``front_end`` has the requested ``PID``.
    """
    col = front_end.columns.get_loc(rowname)
    matches = front_end[front_end['PID'] == PID]
    if matches.empty:
        raise IndexError(f'PID {PID!r} not found in front_end')
    base_row = matches.iloc[[0]]
    base_value = base_row.iloc[0, col]

    # Build all synthetic rows at once. The swept column is replaced as a
    # whole (not written cell by cell) so an integer column can hold the
    # fractional values without an in-place dtype upcast.
    swept_rows = pd.concat([base_row] * 20)
    swept_rows[rowname] = [base_value * 0.5 + base_value * i / 20 for i in range(20)]

    # DataFrame.append no longer exists in pandas >= 2.0; concatenate once.
    combined = pd.concat([front_end, swept_rows])
    # Returned frame: keeps the old row labels in an 'index' column.
    front_end2 = combined.reset_index()
    # Model input: same rows, but without that label column, so the features
    # match what the top-level scoring cell feeds to the same model.
    # NOTE(review): assumes the saved model was fitted without an 'index'
    # feature -- confirm against the training notebook.
    back_end = combined.reset_index(drop=True)

    back_end['ExterQualDisc'] = back_end['ExterQual'] - back_end['OverallQual']
    back_end['OverallCondDisc'] = back_end['OverallCond'] - back_end['OverallQual']
    back_end['KitchenQualDisc'] = back_end['KitchenQual'] - back_end['OverallQual']
    back_end = back_end.drop(['ExterQual', 'OverallCond', 'KitchenQual'], axis=1)

    back_end = dummify(back_end, non_dummies, dummies)

    cbl = CatBoostRegressor()
    cbl.load_model("HousePriceCatBoost", "cbm")
    cbl_pred = pd.Series(data=cbl.predict(back_end))
    cbl_pred.name = 'pred_price'

    # Both sides carry a fresh 0..n-1 index, so this aligns row for row.
    df = pd.merge(front_end2, cbl_pred, left_index=True, right_index=True)

    return df


In [24]:
# Price sensitivity of house 534177230 to its above-ground living area.
df2 = FC_analysis(PID=534177230, rowname='GrLivArea')

In [25]:
# px.line joins points in row order. In df2 the house's unmodified row comes
# before the synthetic rows (which restart at half the original value), so
# without sorting the curve would double back on itself.
px.line(
    df2[df2['PID'] == 534177230].sort_values('GrLivArea'),
    x='GrLivArea',
    y='pred_price',
)

In [10]:
#transformation of front-end to back-end, and catboost application
back_end = front_end.copy()
back_end['ExterQualDisc']=back_end['ExterQual']-back_end['OverallQual']
back_end['OverallCondDisc']=back_end['OverallCond']-back_end['OverallQual']
back_end['KitchenQualDisc']=back_end['KitchenQual']-back_end['OverallQual']
back_end=back_end.drop(['ExterQual','OverallCond','KitchenQual'],axis=1)

back_end = dummify(back_end, non_dummies, dummies)

cbl = CatBoostRegressor();
cbl.load_model("HousePriceCatBoost", "cbm")
cbl_pred = cbl.predict(back_end)

In [11]:
# Wrap the raw predictions in a named Series so they can be merged as a column.
cbl_pred = pd.Series(cbl_pred, name='pred_price')

In [12]:
# Attach each prediction to its front-end row by matching index labels.
df = front_end.merge(cbl_pred, left_index=True, right_index=True)

In [13]:
# Bring in the sale price for every PID present in both tables.
df = df.merge(sp, on='PID')

In [14]:
# Prediction error as a percentage of the sale price (positive = predicted below the sale price).
df['pricedelta'] = 100 * (1 - df['pred_price'].div(df['SalePrice']))

In [16]:
# Sale price (x) against predicted price (y).
px.scatter(df, x='SalePrice', y='pred_price')

In [23]:
# Percentage error against predicted price, drawn on a 400x600 canvas.
fig = px.scatter(df, x='pred_price', y='pricedelta', width=400, height=600)
fig.show()

In [18]:
# Add "<rating>Disc" columns to front_end: each rating minus OverallQual.
# This modifies front_end in place.
for quality_col in ('ExterQual', 'OverallCond', 'KitchenQual'):
    front_end[f'{quality_col}Disc'] = front_end[quality_col] - front_end['OverallQual']

In [19]:
# Above-ground living area (x) against predicted price (y).
px.scatter(df, x='GrLivArea', y='pred_price')