In [20]:
from sklearn.svm import SVR
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Let pandas print up to 500 rows before it truncates a displayed frame.
pd.options.display.max_rows = 500

In [22]:
def fit_mms(df, col):
    """Min-max scale one column of ``df`` in place.

    A new ``MinMaxScaler`` is fitted on ``df[[col]]`` and the column is
    overwritten with the transformed values. Nothing is returned, so the
    fitted scaler is not available to the caller afterwards.
    """
    column = df[[col]]
    scaler = MinMaxScaler().fit(column)
    df[[col]] = scaler.transform(column)
    
def fit_ss(df, col):
    """Standardise one column of ``df`` in place.

    A new ``StandardScaler`` is fitted on ``df[[col]]`` and the column is
    overwritten with the transformed values. Nothing is returned.
    """
    df[[col]] = StandardScaler().fit_transform(df[[col]])

In [4]:
# Read the prepared housing data, then apply every row filter in a single mask.
df = pd.read_csv('./../data/ames_housing_price_data_v5.csv')
df = df[
    ~df['PID'].isin([902207130, 908154205])                # two parcels excluded by id
    & df['SaleCondition'].isin(['Normal', 'Partial'])      # only these sale conditions are kept
    & (df['BedroomAbvGr'] != 0)                            # rows with zero bedrooms are dropped
    & (df['MSZoning_com'] != 'Nonresidential')
].reset_index(drop=True)
# A neighbourhood filter was left disabled in this cell:
#   (df['Neighborhood'] != 'GrnHill') & (df['Neighborhood'] != 'Landmrk')

# Target columns: the sale price and the log-price column shipped with the data.
price = df['SalePrice']
price_log = df['SalePrice_log']

# Basement-related columns (not referenced again in the cells shown here).
basement_categoricals = ['BsmtCond_ord', 'BsmtQual_ord', 'BsmtExposure_ord']

# Garage-related columns (not referenced again in the cells shown here).
garage_categoricals = [
    'GarageQual', 'GarageCond', 'GarageType_com',
    'Garage_age_bin', 'GarageFinish',
]

# Columns earmarked for removal from any feature set. The list is only
# declared here; no cell shown in this notebook applies it yet.
always_drop = [
    'Street_paved', 'RoofMatl', 'SaleType', 'SaleCondition',
    'Garage_age_years', 'Remod_age_years', 'MoSold', 'Utilities',
    'PID', 'PoolArea', 'PoolQC', 'SalePrice',
    'sold_datetime', '2ndFlrSF_log', 'GrLivArea', 'MiscVal',
    'MiscFeature', '1stFlrSF', 'LotArea', 'LotFrontage_log',
    'YrSold',
]

# Additional columns to drop; starts empty.
droplist = list()

# Candidate columns for one-hot encoding.
# Entries that were commented out of this list (each one also appears in
# always_drop): Street_paved, Utilities, RoofMatl, PoolQC, MiscFeature,
# MoSold, SaleType, SaleCondition.
to_dummify = [
    'Alley', 'LandContour', 'LandSlope', 'Neighborhood',
    'BldgType', 'OverallQual', 'OverallCond', 'RoofStyle',
    'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
    'CentralAir', 'KitchenQual', 'FireplaceQu', 'PavedDrive',
    'Fence', 'HeatingQC_ord', 'LotShape_com', 'MSZoning_com',
    'Heating_com', 'Electrical_com', 'LotConfig_com', 'number_floors',
    'attic', 'PUD', 'Functional_ord', 'Remod_age_bin',
]

In [5]:
# Work on an independent copy so the engineered columns never touch `df`.
df6 = df.copy(deep=True)

In [6]:
# Rescale the four rating columns to [0, 1] so the differences computed
# below compare values on a common scale.
for rating_col in ('OverallQual', 'ExterQual', 'OverallCond', 'KitchenQual'):
    fit_mms(df6, rating_col)

# Combined area of the four porch types.
df6['PorchSF'] = (
    df6['OpenPorchSF'] + df6['EnclosedPorch'] + df6['3SsnPorch'] + df6['ScreenPorch']
)

# Gap between the (scaled) overall quality and each specific rating:
# positive means the overall rating is higher than the specific one.
df6['ExterQualDisc'] = df6['OverallQual'] - df6['ExterQual']
df6['OverallCondDisc'] = df6['OverallQual'] - df6['OverallCond']
df6['KitchenQualDisc'] = df6['OverallQual'] - df6['KitchenQual']

# Split basement area into the GLQ + ALQ part and everything else.
df6['BSMT_LowQual'] = df6['TotalBsmtSF'] - df6['BSMT_GLQ'] - df6['BSMT_ALQ']
df6['BSMT_HighQual'] = df6['BSMT_GLQ'] + df6['BSMT_ALQ']

# Above-grade living area per bedroom; rows with zero bedrooms were removed
# when the data was loaded, so the division cannot hit zero.
df6['AreaPerPerson'] = df6['GrLivArea'] / df6['BedroomAbvGr']

# Bin both basement areas with identical edges and labels. The first bin
# (-1, 1] catches an area of 0. The last edge is open-ended: the previous
# hard cap of 2500 turned any larger area into NaN, which one-hot encoding
# with drop_first=True then made indistinguishable from 'No basement'.
basement_bin_edges = [-1, 1, 500, 1000, 1500, np.inf]
basement_bin_labels = ['No basement', '0-500', '500-1000', '1000-1500', '1500+']
df6['BSMT_HighQual_bin'] = pd.cut(
    df6['BSMT_HighQual'], basement_bin_edges, labels=basement_bin_labels
)
df6['BSMT_LowQual_bin'] = pd.cut(
    df6['BSMT_LowQual'], basement_bin_edges, labels=basement_bin_labels
)

In [7]:
# Features passed on to the model; the order here fixes the column order of
# the frame selected from df6.
feat_incl = [
    'GrLivArea_log', 'LotArea_log', 'OverallQual', 'BSMT_LowQual_bin',
    'house_age_years', 'GarageCars', 'MasVnrType', 'FullBath',
    'HalfBath', 'BsmtExposure_ord', 'Neighborhood', 'BldgType',
    'PorchSF', 'ExterQualDisc', 'OverallCondDisc', 'BSMT_HighQual_bin',
    'KitchenQualDisc', 'Fireplaces', 'Pool', 'AreaPerPerson',
]

In [8]:
# Narrow the engineered frame to the chosen features, in feat_incl order.
df7 = df6[feat_incl]
df7

Unnamed: 0,GrLivArea_log,LotArea_log,OverallQual,BSMT_LowQual_bin,house_age_years,GarageCars,MasVnrType,FullBath,HalfBath,BsmtExposure_ord,Neighborhood,BldgType,PorchSF,ExterQualDisc,OverallCondDisc,BSMT_HighQual_bin,KitchenQualDisc,Fireplaces,Pool,AreaPerPerson
0,2.932474,3.897077,0.428571,500-1000,71.210959,2.0,,1,0,1,SWISU,1Fam,166,0.095238,-0.071429,No basement,0.095238,1,0,428.00
1,3.020775,3.626853,0.285714,0-500,25.104110,1.0,Brick Face,2,0,2,Edwards,TwnhsE,105,-0.380952,-0.047619,500-1000,-0.380952,0,0,524.50
2,3.016616,3.910944,0.142857,0-500,109.402740,1.0,,1,0,1,OldTown,1Fam,279,-0.523810,-0.690476,No basement,-0.190476,0,0,519.50
3,3.221414,3.924279,0.714286,0-500,8.838356,2.0,,2,1,1,NWAmes,1Fam,45,0.047619,0.214286,500-1000,0.047619,0,0,555.00
4,3.283753,3.863382,0.571429,No basement,6.501370,2.0,Brick Face,3,0,0,Edwards,1Fam,177,-0.095238,0.238095,No basement,-0.095238,1,0,480.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,2.978637,3.947140,0.428571,500-1000,93.394521,1.0,,1,0,1,BrkSide,1Fam,138,0.095238,-0.071429,No basement,0.428571,1,0,476.00
2467,3.238799,4.136086,0.000000,No basement,54.452055,2.0,,2,0,0,Edwards,1Fam,0,-0.333333,-0.333333,No basement,-0.333333,1,0,433.25
2468,3.301464,3.797268,0.285714,1000-1500,58.619178,3.0,,2,0,1,Crawfor,Duplex,0,-0.047619,-0.214286,No basement,-0.047619,0,0,500.50
2469,3.265290,3.945764,0.571429,0-500,7.501370,2.0,Brick Face,2,1,1,CollgCr,1Fam,96,-0.095238,0.238095,500-1000,-0.095238,1,0,614.00


In [9]:
# Categorical features in df7 that are expanded into indicator columns.
new_dummies = [
    'MasVnrType', 'Neighborhood', 'BldgType',
    'BSMT_HighQual_bin', 'BSMT_LowQual_bin',
]

In [10]:
# Expand each categorical feature into indicator columns, omitting the first
# level of every feature so that level acts as the baseline.
# NOTE(review): df7 is overwritten, so re-running this cell on its own fails
# once the original categorical columns are gone.
df7 = pd.get_dummies(
    df7,
    columns=new_dummies,
    drop_first=True,
)

In [11]:
# Display the frame after one-hot encoding.
df7

Unnamed: 0,GrLivArea_log,LotArea_log,OverallQual,house_age_years,GarageCars,FullBath,HalfBath,BsmtExposure_ord,PorchSF,ExterQualDisc,...,BldgType_Twnhs,BldgType_TwnhsE,BSMT_HighQual_bin_0-500,BSMT_HighQual_bin_500-1000,BSMT_HighQual_bin_1000-1500,BSMT_HighQual_bin_1500+,BSMT_LowQual_bin_0-500,BSMT_LowQual_bin_500-1000,BSMT_LowQual_bin_1000-1500,BSMT_LowQual_bin_1500+
0,2.932474,3.897077,0.428571,71.210959,2.0,1,0,1,166,0.095238,...,0,0,0,0,0,0,0,1,0,0
1,3.020775,3.626853,0.285714,25.104110,1.0,2,0,2,105,-0.380952,...,0,1,0,1,0,0,1,0,0,0
2,3.016616,3.910944,0.142857,109.402740,1.0,1,0,1,279,-0.523810,...,0,0,0,0,0,0,1,0,0,0
3,3.221414,3.924279,0.714286,8.838356,2.0,2,1,1,45,0.047619,...,0,0,0,1,0,0,1,0,0,0
4,3.283753,3.863382,0.571429,6.501370,2.0,3,0,0,177,-0.095238,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,2.978637,3.947140,0.428571,93.394521,1.0,1,0,1,138,0.095238,...,0,0,0,0,0,0,0,1,0,0
2467,3.238799,4.136086,0.000000,54.452055,2.0,2,0,0,0,-0.333333,...,0,0,0,0,0,0,0,0,0,0
2468,3.301464,3.797268,0.285714,58.619178,3.0,2,0,1,0,-0.047619,...,0,0,0,0,0,0,0,0,1,0
2469,3.265290,3.945764,0.571429,7.501370,2.0,2,1,1,96,-0.095238,...,0,0,0,1,0,0,1,0,0,0


In [14]:
# Small sample for a quick look. .loc slices by label and includes both
# endpoints, so this keeps index labels 0 through 25.
test = df7.loc[0:25]

In [15]:
# Display the sampled rows.
test

Unnamed: 0,GrLivArea_log,LotArea_log,OverallQual,house_age_years,GarageCars,FullBath,HalfBath,BsmtExposure_ord,PorchSF,ExterQualDisc,...,BldgType_Twnhs,BldgType_TwnhsE,BSMT_HighQual_bin_0-500,BSMT_HighQual_bin_500-1000,BSMT_HighQual_bin_1000-1500,BSMT_HighQual_bin_1500+,BSMT_LowQual_bin_0-500,BSMT_LowQual_bin_500-1000,BSMT_LowQual_bin_1000-1500,BSMT_LowQual_bin_1500+
0,2.932474,3.897077,0.428571,71.210959,2.0,1,0,1,166,0.095238,...,0,0,0,0,0,0,0,1,0,0
1,3.020775,3.626853,0.285714,25.10411,1.0,2,0,2,105,-0.380952,...,0,1,0,1,0,0,1,0,0,0
2,3.016616,3.910944,0.142857,109.40274,1.0,1,0,1,279,-0.52381,...,0,0,0,0,0,0,1,0,0,0
3,3.221414,3.924279,0.714286,8.838356,2.0,2,1,1,45,0.047619,...,0,0,0,1,0,0,1,0,0,0
4,3.283753,3.863382,0.571429,6.50137,2.0,3,0,0,177,-0.095238,...,0,0,0,0,0,0,0,0,0,0
5,2.971276,3.778151,0.142857,56.123288,2.0,1,0,1,144,0.142857,...,0,0,0,0,0,0,0,1,0,0
6,3.095518,3.569374,0.571429,1.164384,2.0,2,0,4,24,-0.095238,...,0,0,0,0,0,0,0,0,1,0
7,2.948902,4.093247,0.285714,24.265753,2.0,1,0,1,0,-0.047619,...,0,0,0,1,0,0,1,0,0,0
8,3.030195,3.565257,0.428571,2.328767,2.0,1,0,4,44,0.095238,...,0,1,1,0,0,0,1,0,0,0
9,3.127753,3.795185,0.714286,2.084932,2.0,2,0,1,35,0.047619,...,0,1,1,0,0,0,0,0,1,0


In [16]:
# Support-vector regressor with a linear kernel; all other settings are
# left at the library defaults.
model = SVR(kernel='linear')

In [None]:
# Hyperparameter grid: three values each for C and epsilon (9 combinations).
params = dict(
    C=[0.1, 1, 10],
    epsilon=[0.01, 0.1, 1],
)

In [None]:
# Grid search over every C / epsilon combination in `params` for `model`.
# The earlier call omitted the required param_grid argument and raised a
# TypeError. The search is only configured here; call grid.fit(X, y) to run it.
grid = GridSearchCV(model, param_grid=params)