In [1]:
# Regression with all transactions included

In [2]:
import math
import warnings
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import xgboost as xgb
import multiprocessing as mp
import matplotlib.pyplot as plt

from scipy.optimize import minimize
from scipy.optimize import fmin

from bayes_opt import BayesianOptimization

sns.set_style('whitegrid')

warnings.simplefilter(action='ignore')

In [3]:
# Loading Data 
Tot = pd.read_csv('./data/austin_housing_data.csv')
print('Number of Transactions: ' + str(len(Tot)))

Number of Transactions: 24384


In [4]:
# Eliminating rows with NaN and inf entries 
Tot = Tot.dropna()
Tot = Tot[~Tot.isin([np.inf, -np.inf])]
# This step randomly sorts the transactions
Tot.sample(frac=1)
print('Number of Transactions: ' + str(len(Tot)))

Number of Transactions: 24384


In [5]:
# Eliminate Columns to determine their individual effect

In [6]:
aTot = Tot.copy()
# Eliminate columns with no valuable info
colToDel = [
    'daysBtwBldAndSold', 'Crime Index', 'park_index', 'schools_index', 'event_index', 'fastfood_index',
    'Number', 'Address', 'streetLine', 'state', 'city', 'latitude', 'longitude' ]
aTot.drop(colToDel, axis=1, inplace=True)
colBefZip = aTot.shape[1]
print("Number of Columns: " + str(colBefZip))
print(aTot.dtypes)

Number of Columns: 69
Unnamed: 0                 int64
daysOnMarket             float64
hoa                      float64
lotSize                  float64
salePrice                  int64
                          ...   
Crime Index Scaled       float64
park_index_scaled        float64
school_index_scaled      float64
event_index_scaled       float64
fastfood_index_scaled    float64
Length: 69, dtype: object


## Divide the data in zip codes

In [7]:
zipCodes = [
    76574,76578,78574,78572,78626,78610,78747,78619,78612,78617,78602,78616,78724,78613,77423,78719,78620,78737,78640,78720,
    78736,76820,78621,78653,78615,78749,78674,78634,78681,78628,78642,78641,78633,76577,78665,78605,78645,78652,78748,78750,
    78734,78754,78660,78725,78654,78564,77447,78669,78744,78664,78700,78753,78717,78701,78732,78726,78745,78702,78704,78721,
    78703,78731,78729,78705,78723,78722,78730,78727,78758,78728,75227,78746,78733,78738,78735,78739,78741,76401,78716,78644,
    78762,78751,78756,78752,78757,78759]

In [17]:
# Linear Regression library
from scipy.stats import skew
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Objective function definition
def target(x, zip):
    X_train = []
    y_train = []

    aTot = Tot.copy()
    # Eliminate columns with no valuable info
    colToDel = [
        'daysBtwBldAndSold', 'Crime Index', 'park_index', 'schools_index', 'event_index', 'fastfood_index',
        'Number', 'Address', 'streetLine', 'state', 'city', 'latitude', 'longitude' ]
    aTot.drop(colToDel, axis=1, inplace=True)

    byZipTot = aTot[aTot['zip'] == zip]
    if len(byZipTot) > 50 :
        byZipTot.drop(['zip'], axis=1, inplace=True)

        zip_key = zip
        zip_value = byZipTot

        # Select 80% of the data for training
        nTot = len(zip_value)
        msk = np.random.rand(nTot) < 0.8

        a = zip_value[msk]
        na = len(a)
        ytrain = a['salePrice'].to_frame()
        a.drop(['salePrice'], axis=1, inplace=True)

        # The remaining 20% is used for testing
        b = zip_value[~msk]
        nb = len(b)
        ytest = b['salePrice'].to_frame()
        b.drop(['salePrice'], axis=1, inplace=True)


        c = pd.concat((a, b), sort=False).reset_index(drop=True)
        numeric_feats = c.dtypes[c.dtypes != "object"].index
        skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
        skewed_feats = skewed_feats[skewed_feats > 0.75]
        skewed_feats = skewed_feats.index
        c[skewed_feats] = np.log1p(c[skewed_feats])


        X_train = c[:na]   
        X_test = c[na:]

        y_train = np.log1p(ytrain) 
        y_test = np.log1p(ytest) 

        model = Ridge(alpha = x)
        rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv = 3)).mean()     
        return(-rmse)
    
    return np.inf

In [19]:
# function to compute model and coefficients for each zip code
def compute_zip_model(zip):
    print('Starting ' + str(zip) + ' >>>')

    aTot = Tot.copy()
    # Eliminate columns with no valuable info
    colToDel = [
        'daysBtwBldAndSold', 'Crime Index', 'park_index', 'schools_index', 'event_index', 'fastfood_index',
        'Number', 'Address', 'streetLine', 'state', 'city', 'latitude', 'longitude' ]
    aTot.drop(colToDel, axis=1, inplace=True)

    byZipTot = aTot[aTot['zip'] == zip]
    if len(byZipTot) > 50 :
        byZipTot.drop(['zip'], axis=1, inplace=True)

        zip_key = zip
        zip_value = byZipTot

        # Select 80% of the data for training
        nTot = len(zip_value)
        msk = np.random.rand(nTot) < 0.8

        a = zip_value[msk]
        na = len(a)
        ytrain = a['salePrice'].to_frame()
        a.drop(['salePrice'], axis=1, inplace=True)

        # The remaining 20% is used for testing
        b = zip_value[~msk]
        nb = len(b)
        ytest = b['salePrice'].to_frame()
        b.drop(['salePrice'], axis=1, inplace=True)


        c = pd.concat((a, b), sort=False).reset_index(drop=True)
        numeric_feats = c.dtypes[c.dtypes != "object"].index
        skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
        skewed_feats = skewed_feats[skewed_feats > 0.75]
        skewed_feats = skewed_feats.index
        c[skewed_feats] = np.log1p(c[skewed_feats])


        X_train = c[:na]   
        X_test = c[na:]

        y_train = np.log1p(ytrain) 
        y_test = np.log1p(ytest) 


        # define the optimizer
        pbounds = { 'x': (0, 200), 'zip': (zip, zip) }
        optimizer = BayesianOptimization(f=target, pbounds=pbounds, verbose=0, random_state=1)
        optimizer.maximize(init_points=2, n_iter=50)
        sol = optimizer.max['params']['x']
        model_ridge = Ridge(alpha = sol)
        model_ridge.fit(X_train,y_train)      
        
        Ypredict = model_ridge.predict(X_train)
        predictError = y_train -  Ypredict
        MSE_Train = mean_squared_error(y_train,Ypredict)
        Ypredict = model_ridge.predict(X_test)
        predictError = y_test -  Ypredict
        MSE_Test = mean_squared_error(y_test,Ypredict)
        
        print(zip_key + '\n' + \
            'Train MSE = ' + str(MSE_Train) + '\n' + \
            'Test MSE = ' + str(MSE_Test) + '\n' + \
            str(optimizer.max))   
        
        aDict = {key : model_ridge.coef_}
        aRes = {key : [MSE_Train, MSE_Test]}
        
        return { 'coef': aDict, 'res': aRes, 'okay': True }

    else:
        return { 'okay': False }

SyntaxError: invalid syntax (<ipython-input-19-ed66aca6f160>, line 68)

In [10]:
coefDict = {}
results = {}

# compute models for each zip code in parallel
with mp.Pool(int(mp.cpu_count() / 2)) as pool:
    zip_results = pool.map(compute_zip_model, zipCodes)
    for result in zip_results:
        if (result['okay']):
            coefDict.update(result['coef'])
            results.update(result['res'])

NameError: name 'compute_zip_model' is not defined

In [68]:
col = ['zip' , 'a0', 'baths', 'beds', 'hoa', 'lotSize',  'daysAftSold', 'sqft', 'stories']
data = [];
for (key, value) in coefDict.items() :
    data.append([key, value[0][0], value[0][1], value[0][2], value[0][3] , value[0][4], value[0][5],value[0][6], value[0][7] ])
df = pd.DataFrame(data, columns = col) 
df.sort_values(by=['daysAftSold'], ascending=False)

Unnamed: 0,zip,a0,baths,beds,hoa,lotSize,daysAftSold,sqft,stories
42,78735,-2.414751,1.13477,-0.044178,0.09579,0.177551,-8.6e-05,0.407492,-0.172822
24,78726,0.5639045,0.05426,-0.038435,0.025913,0.195644,-9.7e-05,0.000204,-0.041018
18,78660,-2.884494e-05,0.412601,0.083738,0.019846,0.043901,-9.8e-05,0.451271,-0.095272
14,78748,-3.424822e-07,0.354188,-0.085414,0.025649,0.111012,-0.000108,0.549086,-0.121485
2,78612,0.0007107959,0.41385,-0.051555,0.054998,0.138901,-0.000111,0.76101,-0.403936
40,78733,0.221703,0.251888,-0.084625,0.028314,0.148047,-0.000115,0.948869,-0.107456
38,78728,-0.5265361,0.093999,0.084297,0.024367,-0.080191,-0.000116,0.000333,-0.0661
20,78744,-1.062752e-05,0.094719,-0.035603,0.04786,-0.063634,-0.000119,0.665371,-0.270236
37,78758,-5.972284e-06,0.293242,-0.276871,0.015311,-0.007791,-0.000121,0.70414,-0.000681
31,78729,-0.09252657,0.0245,0.010277,0.021938,0.008004,-0.000121,0.630695,-0.03615


range(0, 8)