## Import necessary libraries, data

In [47]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

train_data = pd.read_csv("/Users/jamester/Documents/MachineLearning/KaggleComps/HousePrices/train.HousePrices.csv")
test_data = pd.read_csv("/Users/jamester/Documents/MachineLearning/KaggleComps/HousePrices/test.HousePrices.csv")

testAndtrain = pd.concat([train_data, test_data], axis=0, ignore_index=True)
pd.options.display.max_rows = 1500
pd.options.display.max_columns =500
test_copy = test_data.copy()
import xgboost
print(xgboost.__version__)

1.1.1


## Quick data exploration

In [48]:
neigh_grouped = train_data.groupby('Neighborhood').SalePrice.mean().sort_values()
neigh_grouped

Neighborhood
MeadowV     98576.470588
IDOTRR     100123.783784
BrDale     104493.750000
BrkSide    124834.051724
Edwards    128219.700000
OldTown    128225.300885
Sawyer     136793.135135
Blueste    137500.000000
SWISU      142591.360000
NPkVill    142694.444444
NAmes      145847.080000
Mitchel    156270.122449
SawyerW    186555.796610
NWAmes     189050.068493
Gilbert    192854.506329
Blmngtn    194870.882353
CollgCr    197965.773333
Crawfor    210624.725490
ClearCr    212565.428571
Somerst    225379.837209
Veenker    238772.727273
Timber     242247.447368
StoneBr    310499.000000
NridgHt    316270.623377
NoRidge    335295.317073
Name: SalePrice, dtype: float64

In [None]:
#print(train_data.nunique())

# Create train and test sets

In [49]:
X = train_data.drop('SalePrice', axis=1)
y = train_data.SalePrice
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Assign each neighborhood to a number:

In [50]:
def neighborhoodToNum(df):
    n=1
    neigh_grouped = df.groupby('Neighborhood').SalePrice.mean().sort_values()
    neigh_value = {}
    for i in neigh_grouped.index:
        neigh_value[i]=n
        n+=1
    return neigh_value
        
        
neighVal = neighborhoodToNum(train_data)        
neighVal

X_train['Neighborhood'] = X_train.Neighborhood.fillna(value=neighVal)
X_test['Neighborhood'] = X_test.Neighborhood.fillna(value=neighVal)
test_data['Neighborhood'] = test_data.Neighborhood.fillna(value=neighVal)

# Fill missing values

In [51]:
#create fill values for each column
fill_values={'Alley': 'No Alley', 'BsmtFinType1':'NoBsmt', 'GarageType':'NoGarage', 'BsmtQual':'NoBsmt', 'BsmtCond':'NoBsmt', 'BsmtExposure': 'NoBsmt', 'BsmtFinType2':'NoBsmt', 'Electrical':'SBrkr','FireplaceQu':'NoFirePl', 'MasVnrType':0, 'GarageFinish':'NoGar', 'GarageQual':'NoGar', 'GarageCond':'NoGar', 'Fence':'NoFence'}
housing_train = X_train.fillna(value=fill_values)
housing_test = X_test.fillna(value=fill_values)
test_data = test_data.fillna(value=fill_values)

In [52]:
#fill missing values for garage year built with year house built
def fillGarageYrBlt(df, col):
    for i in range(len(df[col])):
        number = df.loc[i, col]
        if np.isnan(number):
            df.loc[i, col] = df.loc[i, 'YearBuilt']
    return df
#train_data = fillGarageYrBlt(train_data, 'GarageYrBlt')
housing_train = fillGarageYrBlt(housing_train, 'GarageYrBlt')
housing_test = fillGarageYrBlt(housing_test, 'GarageYrBlt')
test_data = fillGarageYrBlt(test_data, 'GarageYrBlt')

In [53]:
#fill LotFrontage and MasVnrArea with mean of columns
housing_train['LotFrontage'] = housing_train.LotFrontage.fillna(housing_train.LotFrontage.mean())
housing_test['LotFrontage'] = housing_test.LotFrontage.fillna(housing_test.LotFrontage.mean())
test_data['LotFrontage'] = test_data.LotFrontage.fillna(test_data.LotFrontage.mean())

housing_train['MasVnrArea'] = housing_train.MasVnrArea.fillna(housing_train.MasVnrArea.mean())
housing_test['MasVnrArea'] = housing_test.MasVnrArea.fillna(housing_test.MasVnrArea.mean())
test_data['MasVnrArea'] = test_data.MasVnrArea.fillna(test_data.MasVnrArea.mean())

# Change column types to numerical (Nominal or Ordinal depending on column)

In [54]:
num_Nominal = ['MSSubClass',  'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'MoSold', 'YrSold'] 
cat_toNominal = ['MSZoning', 'Alley','LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st','Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtExposure', 'BsmtFinType2', 'Heating', 'CentralAir','Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond','PavedDrive', 'Fence', 'SaleType', 'SaleCondition']
cat_toOrdinal = ['Neighborhood', 'ExterQual', 'BsmtFinType1', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'Functional']
num_col = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'OverallQual', 'OverallCond']
#transform SalePrice to Log
to_drop = ['Street', 'Utilities', 'Condition2', 'LowQualFinSF', 'PoolQC', 'MiscFeature', 'MiscVal' ]
#check corr between ExterCond and ExtQual; bsmtQual and BsmtCond; 
#order Functional
#some that would be ordinal are nominal objects since na means none of whatever it is
                        

In [55]:
housing_train = housing_train.drop(to_drop, axis=1)
housing_test = housing_test.drop(to_drop, axis=1)
test_data = test_data.drop(to_drop, axis=1)

In [56]:
test_data.fillna(method='ffill', inplace=True)

#test_datahousing.isna().sum()

In [57]:
housing_train[cat_toNominal] = housing_train[cat_toNominal].applymap(str)
housing_test[cat_toNominal] = housing_test[cat_toNominal].applymap(str)
test_data[cat_toNominal] = test_data[cat_toNominal].applymap(str)

In [58]:
features = ['YearBuilt', 'TotRmsAbvGrd', 'FullBath', '1stFlrSF', 'TotalBsmtSF', 'GarageArea', 'GarageCars', 'GrLivArea', 'OverallQual']
X_scaled= X.loc[:, features]
X_scaled = X_scaled - X_scaled.mean(axis=0)/X_scaled.std(axis=0)

from sklearn.cluster import KMeans
kmeans = KMeans(random_state=0, n_init=10, n_clusters=10)
X_cd = kmeans.fit_transform(X_scaled)
X_cd = pd.DataFrame(X_cd, columns=[f'Centroid_{i}' for i in range(X_cd.shape[1])])
X = X.join(X_cd)


In [59]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


#oe = OrdinalEncoder(handle_unknown='use_encoded_value',
#                                unknown_value=-1)
#housing_train[cat_toNominal] = oe.fit_transform(housing_train[cat_toNominal])
#housing_test[cat_toNominal] = oe.fit(housing_test[cat_toNominal])

ohe = OneHotEncoder(handle_unknown='ignore')

housing_trainEncoded = pd.DataFrame(ohe.fit_transform(housing_train[cat_toNominal]).toarray())
housing_trainEncoded.columns = ohe.get_feature_names(cat_toNominal)
#housing_trainEncoded.drop(columns = cat_toNominal, axis=1)
#housing_trainEncoded.head()
housing_trainfinal = pd.concat([housing_train, housing_trainEncoded], axis=1)
housing_trainfinal.drop(columns = cat_toNominal, axis=1, inplace=True)

housing_testEncoded = pd.DataFrame(ohe.transform(housing_test[cat_toNominal]).toarray())
housing_testEncoded.columns = ohe.get_feature_names(cat_toNominal)
#housing_test.drop(columns = cat_toNominal, axis=1)
#housing_testEncoded.head()
housing_testfinal = pd.concat([housing_test, housing_testEncoded], axis=1)
housing_testfinal.drop(columns = cat_toNominal, axis=1, inplace=True)
housing_testfinal.head()

test_dataEncoded = pd.DataFrame(ohe.transform(test_data[cat_toNominal]).toarray())
test_dataEncoded.columns = ohe.get_feature_names(cat_toNominal)
test_datafinal = pd.concat([test_data, test_dataEncoded], axis=1)
test_datafinal.drop(columns = cat_toNominal, axis=1, inplace=True)
test_datafinal.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtFinType1,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Alley_Grvl,Alley_No Alley,Alley_Pave,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_0,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_NoBsmt,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_NoBsmt,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,CentralAir_N,CentralAir_Y,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,FireplaceQu_Ex,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFirePl,FireplaceQu_Po,FireplaceQu_TA,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NoGarage,GarageFinish_Fin,GarageFinish_NoGar,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_NoGar,GarageQual_Po,GarageQual_TA,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_NoGar,GarageCond_Po,GarageCond_TA,PavedDrive_N,PavedDrive_P,PavedDrive_Y,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_NoFence,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,NAmes,5,6,1961,1961,0.0,TA,TA,TA,TA,Rec,468.0,144.0,270.0,882.0,TA,896,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,1961.0,1.0,730.0,140,0,0,0,120,0,6,2010,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1462,20,81.0,14267,NAmes,6,6,1958,1958,108.0,TA,TA,TA,TA,ALQ,923.0,0.0,406.0,1329.0,TA,1329,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,1958.0,1.0,312.0,393,36,0,0,0,0,6,2010,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1463,60,74.0,13830,Gilbert,5,5,1997,1998,0.0,TA,TA,Gd,TA,GLQ,791.0,0.0,137.0,928.0,Gd,928,701,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,1997.0,2.0,482.0,212,34,0,0,0,0,3,2010,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1464,60,78.0,9978,Gilbert,6,6,1998,1998,20.0,TA,TA,TA,TA,GLQ,602.0,0.0,324.0,926.0,Ex,926,678,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,1998.0,2.0,470.0,360,36,0,0,0,0,6,2010,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1465,120,43.0,5005,StoneBr,8,5,1992,1992,0.0,Gd,TA,Gd,TA,ALQ,263.0,0.0,1017.0,1280.0,Ex,1280,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,1992.0,2.0,506.0,0,82,0,0,144,0,1,2010,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [60]:
from sklearn.preprocessing import OrdinalEncoder
orenc = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value=-1)
housing_trainfinal[cat_toOrdinal] = orenc.fit_transform(housing_trainfinal[cat_toOrdinal])
#housing_trainfinal.drop(cat_toOrdinal, axis=1, inplace=True)
housing_trainfinal.head()

housing_testfinal[cat_toOrdinal] = orenc.transform(housing_testfinal[cat_toOrdinal])
#housing_testfinal.drop(cat_toOrdinal)
housing_testfinal.head()

test_datafinal[cat_toOrdinal] = orenc.transform(test_datafinal[cat_toOrdinal])
#test_datafinal.drop(cat_toOrdinal)
test_datafinal.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtFinType1,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Alley_Grvl,Alley_No Alley,Alley_Pave,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_ClyTile,RoofMatl_CompShg,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_0,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_NoBsmt,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_NoBsmt,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,CentralAir_N,CentralAir_Y,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,FireplaceQu_Ex,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NoFirePl,FireplaceQu_Po,FireplaceQu_TA,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_NoGarage,GarageFinish_Fin,GarageFinish_NoGar,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_NoGar,GarageQual_Po,GarageQual_TA,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_NoGar,GarageCond_Po,GarageCond_TA,PavedDrive_N,PavedDrive_P,PavedDrive_Y,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_NoFence,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,12.0,5,6,1961,1961,0.0,3.0,4.0,4.0,4.0,5.0,468.0,144.0,270.0,882.0,4.0,896,0,896,0.0,0.0,1,0,2,1,3.0,5,6.0,0,1961.0,1.0,730.0,140,0,0,0,120,0,6,2010,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1462,20,81.0,14267,12.0,6,6,1958,1958,108.0,3.0,4.0,4.0,4.0,0.0,923.0,0.0,406.0,1329.0,4.0,1329,0,1329,0.0,0.0,1,1,3,1,2.0,6,6.0,0,1958.0,1.0,312.0,393,36,0,0,0,0,6,2010,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1463,60,74.0,13830,8.0,5,5,1997,1998,0.0,3.0,4.0,2.0,4.0,2.0,791.0,0.0,137.0,928.0,2.0,928,701,1629,0.0,0.0,2,1,3,1,3.0,6,6.0,1,1997.0,2.0,482.0,212,34,0,0,0,0,3,2010,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1464,60,78.0,9978,8.0,6,6,1998,1998,20.0,3.0,4.0,4.0,4.0,2.0,602.0,0.0,324.0,926.0,0.0,926,678,1604,0.0,0.0,2,1,3,1,2.0,7,6.0,1,1998.0,2.0,470.0,360,36,0,0,0,0,6,2010,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1465,120,43.0,5005,22.0,8,5,1992,1992,0.0,2.0,4.0,2.0,4.0,0.0,263.0,0.0,1017.0,1280.0,0.0,1280,0,1280,0.0,0.0,2,0,2,1,2.0,5,6.0,0,1992.0,2.0,506.0,0,82,0,0,144,0,1,2010,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [61]:
corrdf = pd.concat([housing_trainfinal, y_train], axis=1)
corr = corrdf.corr()
#corr.LotFrontage.sort_values()
corr.SalePrice.sort_values()

ExterQual               -0.615864
BsmtQual                -0.592928
KitchenQual             -0.580320
FireplaceQu_NoFirePl    -0.471458
HeatingQC               -0.421374
GarageFinish_Unf        -0.412895
GarageType_Detchd       -0.351096
MasVnrType_None         -0.347370
Foundation_CBlock       -0.340451
MSZoning_RM             -0.294014
CentralAir_N            -0.262795
BsmtExposure_No         -0.261313
LotShape_Reg            -0.253679
GarageCond_NoGar        -0.242903
GarageQual_NoGar        -0.242903
GarageType_NoGarage     -0.242903
GarageFinish_NoGar      -0.242903
SaleType_WD             -0.240092
RoofStyle_Gable         -0.223479
PavedDrive_N            -0.207532
Foundation_BrkTil       -0.193061
Electrical_FuseA        -0.179209
SaleCondition_Normal    -0.174969
Exterior1st_MetalSd     -0.174391
HouseStyle_1.5Fin       -0.171764
Exterior2nd_MetalSd     -0.168122
Exterior1st_Wd Sdng     -0.167846
Exterior2nd_Wd Sdng     -0.162750
BsmtFinType2_NoBsmt     -0.155730
BsmtExposure_N

 ## Drop values with less than 1 or -1ish correlation with salesprice

In [62]:
corr_todrop = ['Exterior2nd_AsbShng', 'SaleType_COD', 'Exterior2nd_HdBoard', 'HouseStyle_1.5Unf', 'MSSubClass', 'Heating_Grav', 'HouseStyle_SFoyer', 'PavedDrive_P', 'OverallCond', 'MSZoning_RH', 'FireplaceQu_Po', 'GarageType_CarPort', 'MSZoning_C (all)', 'LotConfig_Inside', 'RoofMatl_CompShg', 'LandSlope_Gtl', 'Exterior1st_BrkComm', 'SaleCondition_AdjLand',  'Heating_Wall', 'HouseStyle_1Story', 'Electrical_FuseP', 'Exterior1st_WdShing', 'Exterior2nd_Plywood', 'Exterior2nd_Brk Cmn', 'Fence_MnWw', 'BsmtHalfBath', 'HouseStyle_2.5Unf', 'Condition1_RRAe', 'SaleCondition_Family', 'Exterior2nd_Wd Shng', 'GarageCond_Po', 'BsmtFinType2_LwQ', 'SaleType_ConLI', 'HouseStyle_SLvl', 'Heating_Floor', 'MasVnrType_BrkCmn', 'BsmtFinType2_BLQ', 'SaleType_ConLD', 'RoofStyle_Gambrel', 'Exterior1st_AsphShn', 'GarageCond_Ex', 'BsmtFinType2_Rec', 'Heating_OthW', 'Id', 'Exterior1st_CBlock', 'GarageType_2Types', 'LotConfig_Corner', 'LandContour_Lvl', 'Alley_Pave', 'GarageQual_Po', 'SaleType_ConLw', 'SaleType_Oth', 'Exterior2nd_Stone', 'RoofMatl_ClyTile', 'GarageCond_Gd', 'BsmtFinSF2', 'Fence_GdPrv', 'BldgType_TwnhsE', 'Exterior1st_Stone', 'Condition1_PosA', 'LotConfig_FR3',      
'MasVnrType_0', 'Condition1_RRNn', 'SaleCondition_Alloca', 'RoofMatl_WdShake', 'Exterior1st_ImStucc', 'Condition1_RRAn',           
'GarageQual_Gd', 'LandContour_Low', 'GarageQual_Ex' , 'MoSold', 'LandSlope_Mod', 'BsmtFinType2_ALQ' ,    
'SaleType_Con', 'LotShape_IR3', 'LandSlope_Sev', 'Exterior2nd_Other', '3SsnPorch', 'HouseStyle_2.5Fin', 'Exterior1st_BrkFace',
'BsmtExposure_Mn', 'Condition1_PosN', 'Exterior2nd_BrkFace']
housing_trainfinal.drop(corr_todrop, axis=1, inplace=True)
housing_testfinal.drop(corr_todrop, axis=1, inplace=True)
test_datafinal.drop(corr_todrop, axis=1, inplace=True)

In [63]:
#housing_trainfinal.applymap(float)
#housing_testfinal.applymap(float)
#test_datafinal.applymap(float)

In [64]:
for row in range(housing_trainfinal.shape[0]): 
         for col in range(housing_trainfinal.shape[1]):
             if housing_trainfinal.iloc[row, col] == 'GLQ':
                 print(row, col)
                 break

In [65]:
finEntSet = pd.concat([housing_trainfinal, housing_testfinal], axis=0, ignore_index=True)

## RandomForestRegressor with GridSearchCV.
## RMSE score: .15195

In [66]:


rfr = RandomForestRegressor()
#rfr.fit(housing_trainfinal, y_train)
#ypred = rfr.predict(housing_testfinal)

In [75]:
param_grid = {  'max_depth': [5, 10, None],  'n_estimators': [200, 300, 500, 700, 1000]}

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid, 

                           n_jobs = -1, verbose = 2)
g_search.fit(housing_trainfinal, y_train)
y_hatrfr = g_search.predict(test_datafinal) 
#.13451



Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [76]:
g_search.best_score_
#g_search.best_params_

0.8084575219365758

In [77]:
len(test_datafinal)

1459

In [79]:
Ames2rfr = pd.DataFrame({'Id':test_copy.Id, 'SalePrice': y_hatrfr})
path = '/Users/jamester/Documents/MachineLearning/KaggleComps/HousePrices/Ames2rfr'
Ames2rfr.to_csv(path, index=False)

## RFR with default params. 
## RMSE score: .15037

In [81]:
rfr.fit(housing_trainfinal, y_train)
y_pred = rfr.predict(test_datafinal)

In [82]:
Ames2Submission1 = pd.DataFrame({'Id': test_copy.Id, 'SalePrice': y_pred})
path = '/Users/jamester/Documents/MachineLearning/KaggleComps/HousePrices/Ames2Submission1'
Ames2Submission1.to_csv(path, index=False)


## xgboost with GridSearchCV
## RMSE: .13525

In [88]:
from xgboost import XGBRegressor 
#params = {'n_estimators' : [100,200,300, 500, 1000, 1500],'max_depth': [4, 8, 10, 12, 14],'eta' : [.01,.05, .1, .2, .3, .4, .5, .7],
#              'max_features': [0.5, 0.3, 0.1] 
#             }

#skf = StratifiedKFold(n_splits=5, shuffle = True, random_state =42)

#random_search = RandomizedSearchCV(xgb, param_distributions=params, scoring = 'neg_mean_squared_error', n_jobs=-1, cv=5, verbose=3, random_state=42)
#xgb.fit(finEntSet, y)
#y_hat = xgb.predict(test_datafinal)
#random_search.best_score_                                   
                                   
                                

In [89]:
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:squarederror'],
              'learning_rate': [.01, .02, .03, 0.05, .07, .09], #so called `eta` value
              'max_depth': [5, 6, 7, 9],
              'min_child_weight': [4,6, 8],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [300, 400, 500, 600, 700]}
xgb = XGBRegressor()
xgb_grid = GridSearchCV(xgb,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)
xgb_grid.fit(housing_trainfinal, y_train)
y_hat = xgb_grid.predict(test_datafinal)
xgb_grid.best_score_


Fitting 2 folds for each of 360 candidates, totalling 720 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




0.8634836143804234

In [90]:
xgb_grid.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.02,
 'max_depth': 5,
 'min_child_weight': 6,
 'n_estimators': 500,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'silent': 1,
 'subsample': 0.7}

In [91]:
Ames2Submissionxgb = pd.DataFrame({'Id': test_copy.Id, 'SalePrice': y_hat})
path = '/Users/jamester/Documents/MachineLearning/KaggleComps/HousePrices/Ames2Submissionxgb'
Ames2Submissionxgb.to_csv(path, index=False)


In [None]:
## VotingRegressor with xgb_grid and rfr
## RMSE: 

In [95]:
from sklearn.ensemble import VotingRegressor
vc = VotingRegressor(estimators=[('rfr', rfr), ('xgb_grid', xgb_grid)])
vc.fit(housing_trainfinal, y_train) 
y_hatvc = vc.predict(test_datafinal) #.14

Fitting 2 folds for each of 360 candidates, totalling 720 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




## Ridge, RidgeCV, linear

In [None]:
from sklearn.linear_model import Ridge, RidgeCV
r_reg = RidgeCV(alphas=np.logspace(-6, 6, 13))
r_reg.fit(housing_trainfinal, y_train)
#lin_reg.intercept_, lin_reg.coef_
y_hatrr = lin_reg.predict(test_datafinal)
y_hatrrTrain = r_reg.predict(housing_testfinal)
#ridge, linear, RidgeCV all .3 something