# Regression with all transactions included

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import xgboost as xgb
import warnings
warnings.simplefilter(action='ignore')
from scipy.optimize import minimize
from scipy.optimize import fmin
import math
from bayes_opt import BayesianOptimization

# Loading Data 

In [2]:
Tot = pd.read_csv('austin_housing_data.csv')
print('Number of Transactions: ' + str(len(Tot)))

Number of Transactions: 24384


In [3]:
# This step randomly sorts the transactions
Tot.sample(frac=1)
Tot.columns

Index(['Unnamed: 0', 'city', 'daysOnMarket', 'hoa', 'latitude', 'longitude',
       'lotSize', 'salePrice', 'pricePerSqFt', 'daysAftSold', 'sqFt', 'state',
       'streetLine', 'daysBtwBldAndSold', 'zip', 'Crime Index', 'idx',
       'Population Density', 'park_index', 'schools_index', 'event_index',
       'fastfood_index', 'Address', 'Number', '2015 Appraisal',
       '2019 Appraisal', '5-year Growth', 'house_age', 'stories_1.0',
       'stories_2.0', 'beds_2.0', 'beds_3.0', 'beds_4.0', 'beds_5.0',
       'baths_1.0', 'baths_1.5', 'baths_2.0', 'baths_2.5', 'baths_3.0',
       'baths_3.5', 'baths_4.0', 'zip_78617.0', 'zip_78653.0', 'zip_78660.0',
       'zip_78702.0', 'zip_78703.0', 'zip_78704.0', 'zip_78721.0',
       'zip_78722.0', 'zip_78723.0', 'zip_78724.0', 'zip_78725.0',
       'zip_78726.0', 'zip_78727.0', 'zip_78728.0', 'zip_78730.0',
       'zip_78732.0', 'zip_78733.0', 'zip_78734.0', 'zip_78735.0',
       'zip_78736.0', 'zip_78738.0', 'zip_78739.0', 'zip_78741.0',
       'z

In [4]:
# Eliminate Columns
colToElim = ['Unnamed: 0', 'city', 'latitude', 'longitude', 'pricePerSqFt', 'state', 'streetLine', 'zip',  'idx', 
             'Population Density', 'Address', 'Number', 'daysOnMarket']
Tot.drop(colToElim, axis=1, inplace=True)
Tot.columns

Index(['hoa', 'lotSize', 'salePrice', 'daysAftSold', 'sqFt',
       'daysBtwBldAndSold', 'Crime Index', 'park_index', 'schools_index',
       'event_index', 'fastfood_index', '2015 Appraisal', '2019 Appraisal',
       '5-year Growth', 'house_age', 'stories_1.0', 'stories_2.0', 'beds_2.0',
       'beds_3.0', 'beds_4.0', 'beds_5.0', 'baths_1.0', 'baths_1.5',
       'baths_2.0', 'baths_2.5', 'baths_3.0', 'baths_3.5', 'baths_4.0',
       'zip_78617.0', 'zip_78653.0', 'zip_78660.0', 'zip_78702.0',
       'zip_78703.0', 'zip_78704.0', 'zip_78721.0', 'zip_78722.0',
       'zip_78723.0', 'zip_78724.0', 'zip_78725.0', 'zip_78726.0',
       'zip_78727.0', 'zip_78728.0', 'zip_78730.0', 'zip_78732.0',
       'zip_78733.0', 'zip_78734.0', 'zip_78735.0', 'zip_78736.0',
       'zip_78738.0', 'zip_78739.0', 'zip_78741.0', 'zip_78744.0',
       'zip_78745.0', 'zip_78747.0', 'zip_78748.0', 'zip_78749.0',
       'zip_78750.0', 'zip_78751.0', 'zip_78752.0', 'zip_78753.0',
       'zip_78754.0', 'zip_78756.

In [5]:
aTot = Tot.copy()
# Delete columns for testing purposes
colToDel = ['daysAftSold', 'daysBtwBldAndSold', 'Crime Index', 'park_index', 'schools_index', 'event_index', 'fastfood_index', 
            'Crime Index Scaled', '2015 Appraisal', '2019 Appraisal' , '5-year Growth', 'event_index_scaled', 'fastfood_index_scaled',
           'park_index_scaled', 'school_index_scaled']
aTot.drop(colToDel, axis=1, inplace=True)
colBefZip = aTot.shape[1]
print("Number of Columns: " + str(colBefZip))
aTot.columns

Number of Columns: 55


Index(['hoa', 'lotSize', 'salePrice', 'sqFt', 'house_age', 'stories_1.0',
       'stories_2.0', 'beds_2.0', 'beds_3.0', 'beds_4.0', 'beds_5.0',
       'baths_1.0', 'baths_1.5', 'baths_2.0', 'baths_2.5', 'baths_3.0',
       'baths_3.5', 'baths_4.0', 'zip_78617.0', 'zip_78653.0', 'zip_78660.0',
       'zip_78702.0', 'zip_78703.0', 'zip_78704.0', 'zip_78721.0',
       'zip_78722.0', 'zip_78723.0', 'zip_78724.0', 'zip_78725.0',
       'zip_78726.0', 'zip_78727.0', 'zip_78728.0', 'zip_78730.0',
       'zip_78732.0', 'zip_78733.0', 'zip_78734.0', 'zip_78735.0',
       'zip_78736.0', 'zip_78738.0', 'zip_78739.0', 'zip_78741.0',
       'zip_78744.0', 'zip_78745.0', 'zip_78747.0', 'zip_78748.0',
       'zip_78749.0', 'zip_78750.0', 'zip_78751.0', 'zip_78752.0',
       'zip_78753.0', 'zip_78754.0', 'zip_78756.0', 'zip_78757.0',
       'zip_78758.0', 'zip_78759.0'],
      dtype='object')

# Linear Regression to predict sale price

In [6]:
# Linear Regression library
from scipy.stats import skew
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


# Objective function definition
X_train = []
y_train = []

def target(x):
    model = Ridge(alpha = x)
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv = 3)).mean()     
    return(-rmse)    

# Select 80% of the data for training
nTot = len(aTot)
msk = np.random.rand(nTot) < 0.8

a = aTot[msk]
na = len(a)
ytrain = a['salePrice'].to_frame()
a.drop(['salePrice'], axis=1, inplace=True)

# The remaining 20% is used for testing
b = aTot[~msk]
nb = len(b)
ytest = b['salePrice'].to_frame()
b.drop(['salePrice'], axis=1, inplace=True)


c = pd.concat((a, b), sort=False).reset_index(drop=True)
numeric_feats = c.dtypes[c.dtypes != "object"].index
skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
c[skewed_feats] = np.log1p(c[skewed_feats])


X_train = c[:na]   
X_test = c[na:]

y_train = np.log1p(ytrain) 
y_test = np.log1p(ytest) 


# define the optimizer
pbounds = {'x': (0, 200)}    
optimizer = BayesianOptimization(f=target, pbounds=pbounds, verbose=0, random_state=1,)
optimizer.maximize(init_points=2,n_iter=50,)
print(optimizer.max)   
sol = optimizer.max['params']['x']
model_ridge = Ridge(alpha = sol)
model_ridge.fit(X_train,y_train)  

{'target': -0.5072333497255505, 'params': {'x': 199.9996423304463}}


Ridge(alpha=199.9996423304463, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [7]:
# Sorting between best and worst zip codes
dfCoef = pd.DataFrame(np.transpose(model_ridge.coef_), index =X_train.columns, columns =['Coef']) 
print(dfCoef.sort_values(by=['Coef'], ascending = False))

                 Coef
zip_78703.0  0.620370
zip_78704.0  0.611962
sqFt         0.591022
zip_78757.0  0.489416
zip_78751.0  0.470517
zip_78756.0  0.462942
zip_78702.0  0.311635
zip_78722.0  0.308071
lotSize      0.300647
zip_78750.0  0.195597
beds_2.0     0.190437
zip_78759.0  0.133327
zip_78730.0  0.126271
zip_78735.0  0.126207
baths_4.0    0.121912
baths_3.5    0.110365
zip_78732.0  0.108303
baths_3.0    0.088122
zip_78745.0  0.077332
zip_78733.0  0.042852
zip_78738.0  0.025828
zip_78752.0  0.022225
zip_78721.0  0.010592
zip_78723.0  0.007781
stories_2.0  0.003401
house_age   -0.000011
stories_1.0 -0.003401
hoa         -0.005104
beds_3.0    -0.006058
zip_78734.0 -0.022757
zip_78739.0 -0.026920
zip_78726.0 -0.049546
baths_1.0   -0.056367
baths_2.0   -0.067325
zip_78749.0 -0.072611
beds_4.0    -0.082040
baths_2.5   -0.100917
zip_78727.0 -0.103517
beds_5.0    -0.104197
baths_1.5   -0.116449
zip_78736.0 -0.141324
zip_78747.0 -0.157394
zip_78741.0 -0.165307
zip_78728.0 -0.192184
zip_78753.

# Mean Square Error for the Train set

In [8]:
Ypredict = model_ridge.predict(X_train)
predictError = y_train -  Ypredict
MSE_Train = mean_squared_error(y_train,Ypredict)
print('Train MSE = ' + str(MSE_Train))

Train MSE = 0.14170173543168338


# Mean Square Error for the Testing set

In [9]:
Ypredict = model_ridge.predict(X_test)
predictError = y_test -  Ypredict
MSE_Test = mean_squared_error(y_test,Ypredict)
print('Test MSE = ' + str(MSE_Test))

Test MSE = 0.14127389041898367


# Here we add one atribute at a time, to measure its effect

In [None]:
results = {'Init' : [MSE_Train, MSE_Test]}
for addFeature in colToDel :
    print(addFeature)
    addTot = aTot.copy()
    addTot[addFeature] = Tot[addFeature]
    
    # Select 80% of the data for training
    nTot = len(addTot)
    msk = np.random.rand(nTot) < 0.8

    a = addTot[msk]
    na = len(a)
    ytrain = a['salePrice'].to_frame()
    a.drop(['salePrice'], axis=1, inplace=True)

    # The remaining 20% is used for testing
    b = addTot[~msk]
    nb = len(b)
    ytest = b['salePrice'].to_frame()
    b.drop(['salePrice'], axis=1, inplace=True)


    c = pd.concat((a, b), sort=False).reset_index(drop=True)
    numeric_feats = c.dtypes[c.dtypes != "object"].index
    skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    c[skewed_feats] = np.log1p(c[skewed_feats])


    X_train = c[:na]   
    X_test = c[na:]

    y_train = np.log1p(ytrain) 
    y_test = np.log1p(ytest) 


    # define the optimizer
    pbounds = {'x': (0, 200)}
    optimizer = BayesianOptimization(f=target, pbounds=pbounds, verbose=0, random_state=1,)
    optimizer.maximize(init_points=2,n_iter=50,)
    print(optimizer.max)   
    sol = optimizer.max['params']['x']
    model_ridge = Ridge(alpha = sol)
    model_ridge.fit(X_train,y_train)      
    
    Ypredict = model_ridge.predict(X_train)
    predictError = y_train -  Ypredict
    MSE_Train = mean_squared_error(y_train,Ypredict)
    print('Train MSE = ' + str(MSE_Train))
    Ypredict = model_ridge.predict(X_test)
    predictError = y_test -  Ypredict
    MSE_Test = mean_squared_error(y_test,Ypredict)
    print('Test MSE = ' + str(MSE_Test))
    results[addFeature] = [MSE_Train, MSE_Test]

daysAftSold
{'target': -0.3920674180488537, 'params': {'x': 126.86423881121245}}
Train MSE = 0.06856991471846069
Test MSE = 0.06945731083931038
daysBtwBldAndSold
{'target': -0.5087180447731318, 'params': {'x': 199.9996423304463}}
Train MSE = 0.13744311840618526
Test MSE = 0.14177275402890865
Crime Index
{'target': -0.5460623983137686, 'params': {'x': 83.4044009405148}}
Train MSE = 0.11367850020391096
Test MSE = 0.11708383018497433
park_index
{'target': -0.460078053334864, 'params': {'x': 199.47200673973134}}
Train MSE = 0.12348802786544788
Test MSE = 0.1223525637531741
schools_index
{'target': -0.4827001651656444, 'params': {'x': 0.1516044385819315}}
Train MSE = 0.10148665916998734
Test MSE = 0.1038730960607079
event_index
{'target': -0.4776816688156184, 'params': {'x': 162.81249403258948}}
Train MSE = 0.12332086995158856
Test MSE = 0.12217594474722897
fastfood_index
{'target': -0.4755474060269471, 'params': {'x': 107.48974069637734}}
Train MSE = 0.114216358005079
Test MSE = 0.11328527

# Printing the results of how each feature impacts the sale price prediction

In [None]:
results

# Calculation of MSE percent improvement - 

In [None]:
TrMSE = results['Init'][0]
TeMSE = results['Init'][1]
invTrMSE = 100/TrMSE
invTeMSE = 100/TeMSE
resPer = {}
for (key, value) in results.items() :
    TrPer = (value[0] - TrMSE) * invTrMSE
    TePer = (value[1] - TeMSE) * invTeMSE    
    resPer[key] = [TrPer, TePer]
resPer    

# Now we repeat the calculation but includying days after sold 'daysAftSold' is the most influential from previous calculations

In [None]:
aTot = Tot.copy()
# Delete columns for testing purposes
colToDel = ['daysBtwBldAndSold', 'Crime Index', 'park_index', 'schools_index', 'event_index', 'fastfood_index', 
            'Crime Index Scaled', '2015 Appraisal', '2019 Appraisal' , '5-year Growth', 'event_index_scaled', 'fastfood_index_scaled',
           'park_index_scaled', 'school_index_scaled']
aTot.drop(colToDel, axis=1, inplace=True)
colBefZip = aTot.shape[1]
print("Number of Columns: " + str(colBefZip))
aTot.columns

# Select 80% of the data for training

In [None]:
nTot = len(aTot)
msk = np.random.rand(nTot) < 0.8

a = aTot[msk]
na = len(a)
ytrain = a['salePrice'].to_frame()
a.drop(['salePrice'], axis=1, inplace=True)

# The remaining 20% is used for testing
b = aTot[~msk]
nb = len(b)
ytest = b['salePrice'].to_frame()
b.drop(['salePrice'], axis=1, inplace=True)


c = pd.concat((a, b), sort=False).reset_index(drop=True)
numeric_feats = c.dtypes[c.dtypes != "object"].index
skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
c[skewed_feats] = np.log1p(c[skewed_feats])


X_train = c[:na]   
X_test = c[na:]

y_train = np.log1p(ytrain) 
y_test = np.log1p(ytest) 


# define the optimizer
pbounds = {'x': (0, 200)}    
optimizer = BayesianOptimization(f=target, pbounds=pbounds, verbose=0, random_state=1,)
optimizer.maximize(init_points=2,n_iter=50,)
print(optimizer.max)   
sol = optimizer.max['params']['x']
model_ridge = Ridge(alpha = sol)
model_ridge.fit(X_train,y_train)  

# Sorting between best and worst zip codes

In [None]:
dfCoef = pd.DataFrame(np.transpose(model_ridge.coef_), index =X_train.columns, columns =['Coef']) 
print(dfCoef.sort_values(by=['Coef'], ascending = False))

In [None]:
# Mean Square Error for the Train set
Ypredict = model_ridge.predict(X_train)
predictError = y_train -  Ypredict
MSE_Train = mean_squared_error(y_train,Ypredict)
print('Train MSE = ' + str(MSE_Train))

In [None]:
# Mean Square Error for the Testing set
Ypredict = model_ridge.predict(X_test)
predictError = y_test -  Ypredict
MSE_Test = mean_squared_error(y_test,Ypredict)
print('Test MSE = ' + str(MSE_Test))

In [None]:
# Here we add one atribute at a time, to measure its effect
results = {'Init' : [MSE_Train, MSE_Test]}
for addFeature in colToDel :
    print(addFeature)
    addTot = aTot.copy()
    addTot[addFeature] = Tot[addFeature]
    
    # Select 80% of the data for training
    nTot = len(addTot)
    msk = np.random.rand(nTot) < 0.8

    a = addTot[msk]
    na = len(a)
    ytrain = a['salePrice'].to_frame()
    a.drop(['salePrice'], axis=1, inplace=True)

    # The remaining 20% is used for testing
    b = addTot[~msk]
    nb = len(b)
    ytest = b['salePrice'].to_frame()
    b.drop(['salePrice'], axis=1, inplace=True)


    c = pd.concat((a, b), sort=False).reset_index(drop=True)
    numeric_feats = c.dtypes[c.dtypes != "object"].index
    skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    c[skewed_feats] = np.log1p(c[skewed_feats])


    X_train = c[:na]   
    X_test = c[na:]

    y_train = np.log1p(ytrain) 
    y_test = np.log1p(ytest) 


    # define the optimizer
    pbounds = {'x': (0, 200)}
    optimizer = BayesianOptimization(f=target, pbounds=pbounds, verbose=0, random_state=1,)
    optimizer.maximize(init_points=2,n_iter=50,)
    print(optimizer.max)   
    sol = optimizer.max['params']['x']
    model_ridge = Ridge(alpha = sol)
    model_ridge.fit(X_train,y_train)      
    
    Ypredict = model_ridge.predict(X_train)
    predictError = y_train -  Ypredict
    MSE_Train = mean_squared_error(y_train,Ypredict)
    print('Train MSE = ' + str(MSE_Train))
    Ypredict = model_ridge.predict(X_test)
    predictError = y_test -  Ypredict
    MSE_Test = mean_squared_error(y_test,Ypredict)
    print('Test MSE = ' + str(MSE_Test))
    results[addFeature] = [MSE_Train, MSE_Test]

In [None]:
# Printing the results
results

In [None]:
# Calculation of MSE percent improvement - 
TrMSE = results['Init'][0]
TeMSE = results['Init'][1]
invTrMSE = 100/TrMSE
invTeMSE = 100/TeMSE
resPer = {}
for (key, value) in results.items() :
    TrPer = (value[0] - TrMSE) * invTrMSE
    TePer = (value[1] - TeMSE) * invTeMSE    
    resPer[key] = [TrPer, TePer]
resPer    

# Repat the calculation but includying Crime Index as the most influential from last calculations

In [None]:
aTot = Tot.copy()
# Eliminate columns with no valuable info (now we include )
colToDel = ['daysBtwBldAndSold', 'park_index', 'schools_index', 'event_index', 'fastfood_index', 
            'Crime Index Scaled', '2015 Appraisal', '2019 Appraisal' , '5-year Growth', 'event_index_scaled', 'fastfood_index_scaled',
           'park_index_scaled', 'school_index_scaled']
aTot.drop(colToDel, axis=1, inplace=True)
colBefZip = aTot.shape[1]
print("Number of Columns: " + str(colBefZip))
print(aTot.dtypes)

In [None]:
# Objective function definition
X_train = []
y_train = []

def target(x):
    model = Ridge(alpha = x)
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv = 3)).mean()     
    return(-rmse)    

# Select 80% of the data for training
nTot = len(aTot)
msk = np.random.rand(nTot) < 0.8

a = aTot[msk]
na = len(a)
ytrain = a['salePrice'].to_frame()
a.drop(['salePrice'], axis=1, inplace=True)

# The remaining 20% is used for testing
b = aTot[~msk]
nb = len(b)
ytest = b['salePrice'].to_frame()
b.drop(['salePrice'], axis=1, inplace=True)


c = pd.concat((a, b), sort=False).reset_index(drop=True)
numeric_feats = c.dtypes[c.dtypes != "object"].index
skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
c[skewed_feats] = np.log1p(c[skewed_feats])


X_train = c[:na]   
X_test = c[na:]

y_train = np.log1p(ytrain) 
y_test = np.log1p(ytest) 


# define the optimizer
pbounds = {'x': (0, 200)}    
optimizer = BayesianOptimization(f=target, pbounds=pbounds, verbose=0, random_state=1,)
optimizer.maximize(init_points=2,n_iter=50,)
print(optimizer.max)   
sol = optimizer.max['params']['x']
model_ridge = Ridge(alpha = sol)
model_ridge.fit(X_train,y_train)  

In [None]:
# Sorting between best and worst zip codes
dfCoef = pd.DataFrame(np.transpose(model_ridge.coef_), index =X_train.columns, columns =['Coef']) 
print(dfCoef.sort_values(by=['Coef'], ascending = False))

In [None]:
# Mean Square Error for the Train set
Ypredict = model_ridge.predict(X_train)
predictError = y_train -  Ypredict
MSE_Train = mean_squared_error(y_train,Ypredict)
print('Train MSE = ' + str(MSE_Train))

In [None]:
# Mean Square Error for the Testing set
Ypredict = model_ridge.predict(X_test)
predictError = y_test -  Ypredict
MSE_Test = mean_squared_error(y_test,Ypredict)
print('Test MSE = ' + str(MSE_Test))

In [None]:
# Here we add one atribute at a time, to measure its effect
results = {'Init' : [MSE_Train, MSE_Test]}
for addFeature in colToDel :
    print(addFeature)
    addTot = aTot.copy()
    addTot[addFeature] = Tot[addFeature]
    
    # Select 80% of the data for training
    nTot = len(addTot)
    msk = np.random.rand(nTot) < 0.8

    a = addTot[msk]
    na = len(a)
    ytrain = a['salePrice'].to_frame()
    a.drop(['salePrice'], axis=1, inplace=True)

    # The remaining 20% is used for testing
    b = addTot[~msk]
    nb = len(b)
    ytest = b['salePrice'].to_frame()
    b.drop(['salePrice'], axis=1, inplace=True)


    c = pd.concat((a, b), sort=False).reset_index(drop=True)
    numeric_feats = c.dtypes[c.dtypes != "object"].index
    skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    c[skewed_feats] = np.log1p(c[skewed_feats])


    X_train = c[:na]   
    X_test = c[na:]

    y_train = np.log1p(ytrain) 
    y_test = np.log1p(ytest) 


    # define the optimizer
    pbounds = {'x': (0, 200)}
    optimizer = BayesianOptimization(f=target, pbounds=pbounds, verbose=0, random_state=1,)
    optimizer.maximize(init_points=2,n_iter=50,)
    print(optimizer.max)   
    sol = optimizer.max['params']['x']
    model_ridge = Ridge(alpha = sol)
    model_ridge.fit(X_train,y_train)      
    
    Ypredict = model_ridge.predict(X_train)
    predictError = y_train -  Ypredict
    MSE_Train = mean_squared_error(y_train,Ypredict)
    print('Train MSE = ' + str(MSE_Train))
    Ypredict = model_ridge.predict(X_test)
    predictError = y_test -  Ypredict
    MSE_Test = mean_squared_error(y_test,Ypredict)
    print('Test MSE = ' + str(MSE_Test))
    results[addFeature] = [MSE_Train, MSE_Test]