In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Initialize DF

In [97]:
df = pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/testZeroNAs.csv')

In [98]:
df.drop(['Id','Unnamed: 0' ], axis = 1, inplace = True)

# Begin Feature Selection

In [99]:
df['TotSq'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['OutSq'] = df['3SsnPorch'] + df['EnclosedPorch'] + df['WoodDeckSF'] + df['OpenPorchSF'] + df['ScreenPorch']
df['TotBr'] = df['FullBath'] + 0.5 * df['HalfBath'] + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']

In [100]:
df.drop(['GrLivArea'], axis =1, inplace = True)
droplist = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', '3SsnPorch', 'EnclosedPorch', 'WoodDeckSF', 'OpenPorchSF', 'ScreenPorch', 'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
df.drop(droplist, axis = 1, inplace = True)
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,TotSq,OutSq,TotBr
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,MnPrv,,0,6,2010,WD,Normal,1778.0,260,1.0
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,,Gar2,12500,6,2010,WD,Normal,2658.0,429,1.5
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,MnPrv,,0,3,2010,WD,Normal,2557.0,246,2.5
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,,,0,6,2010,WD,Normal,2530.0,396,2.5
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,,,0,1,2010,WD,Normal,2560.0,226,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1443,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,,,0,6,2006,WD,Normal,1638.0,0,1.5
1444,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,,,0,4,2006,WD,Abnorml,1638.0,24,1.5
1445,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,,,0,9,2006,WD,Abnorml,2448.0,474,2.0
1446,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,MnPrv,Shed,700,7,2006,WD,Normal,1882.0,112,1.5


In [101]:
QualityCols = ['OverallQual','OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC' ]

## Convert Certain Nominal Categoricals to Ordinal

In [102]:
replaceDict = {'Ex': 10, 'Gd':8, 'TA': 6, 'Fa':4, 'Po':2}
df = df.replace({'ExterQual': replaceDict})
df.ExterQual.value_counts()

6     884
8     491
10     55
4      18
Name: ExterQual, dtype: int64

In [103]:
df = df.replace({'ExterCond': replaceDict})
df.ExterCond.value_counts()

6     1249
8      153
4       36
10       9
2        1
Name: ExterCond, dtype: int64

In [104]:
replaceDict = {'Ex': 10, 'Gd':8, 'TA': 6, 'Fa':4, 'Po':2, 'None': 0}
df = df.replace({'BsmtQual': replaceDict})
df.BsmtQual.value_counts()

6     630
8     589
10    137
4      53
0      39
Name: BsmtQual, dtype: int64

In [105]:
replaceDict = {'Ex': 10, 'Gd':8, 'TA': 6, 'Fa':4, 'Po':2, 'None': 0}
df = df.replace({'BsmtCond': replaceDict})
df.BsmtCond.value_counts()

6    1290
8      57
4      56
0      42
2       3
Name: BsmtCond, dtype: int64

In [106]:
df = df.replace({'GarageQual': replaceDict})
df.GarageQual.value_counts()

6    1289
0      76
4      72
8      10
2       1
Name: GarageQual, dtype: int64

In [107]:
df = df.replace({'GarageCond': replaceDict})
df.GarageCond.value_counts()

6     1324
0       76
4       35
8        6
2        6
10       1
Name: GarageCond, dtype: int64

In [108]:
df = df.replace({'KitchenQual': replaceDict})
df.KitchenQual.value_counts()

6     749
8     564
10    105
4      30
Name: KitchenQual, dtype: int64

In [109]:
df = df.replace({'FireplaceQu': replaceDict})
df.FireplaceQu.value_counts()

0     720
8     363
6     279
4      41
2      26
10     19
Name: FireplaceQu, dtype: int64

In [110]:
df = df.replace({'PoolQC': replaceDict})
df.PoolQC.value_counts()

0     1445
10       2
8        1
Name: PoolQC, dtype: int64

## Create a Combined Quality Variable and a Combined Condition variable

In [111]:
dfOld = pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/trainZeroNAs.csv')
dfOld['TotSq'] = dfOld['TotalBsmtSF'] + dfOld['1stFlrSF'] + dfOld['2ndFlrSF']
dfOld['OutSq'] = dfOld['3SsnPorch'] + dfOld['EnclosedPorch'] + dfOld['WoodDeckSF'] + dfOld['OpenPorchSF'] + dfOld['ScreenPorch']
dfOld['TotBr'] = dfOld['FullBath'] + 0.5 * dfOld['HalfBath'] + dfOld['BsmtFullBath'] + 0.5 * dfOld['BsmtHalfBath']

In [112]:
dfOld.drop(['GrLivArea'], axis =1, inplace = True)
droplist = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', '3SsnPorch', 'EnclosedPorch', 'WoodDeckSF', 'OpenPorchSF', 'ScreenPorch', 'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
dfOld.drop(droplist, axis = 1, inplace = True)

In [113]:
dfOld = dfOld.replace({'ExterQual': replaceDict})
dfOld = dfOld.replace({'ExterCond': replaceDict})
dfOld = dfOld.replace({'BsmtQual': replaceDict})
dfOld = dfOld.replace({'BsmtCond': replaceDict})
dfOld = dfOld.replace({'GarageQual': replaceDict})
dfOld = dfOld.replace({'GarageCond': replaceDict})
dfOld = dfOld.replace({'KitchenQual': replaceDict})
dfOld = dfOld.replace({'FireplaceQu': replaceDict})
dfOld = dfOld.replace({'PoolQC': replaceDict})


In [114]:
dfTemp1 = df[['OverallQual','OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']]
df['Quality'] = 0
df['Condition'] = 0

In [115]:
for i in ['OverallQual', 'ExterQual', 'BsmtQual', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'PoolQC']:
    df['Quality'] = df['Quality'] + (df[i] * dfOld.corr().SalePrice[i])

In [116]:
for i in ['OverallCond',  'ExterCond', 'BsmtCond', 'GarageCond']:
    df['Condition'] = df['Condition'] + (df[i] * dfOld.corr().SalePrice[i])

### Drop Old Qual and Cond Columns

In [117]:
for i in ['OverallQual','OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
    df.drop(i, axis = 1, inplace = True)

### Convert None Values to 0

In [118]:
df.loc[df['GarageYrBlt']=='None','GarageYrBlt' ] = 0
df.GarageYrBlt = pd.to_numeric(df.GarageYrBlt)

## Convert Basement Finished and Unfinished Sq ft to percentage finished sq ft

In [119]:
df['BsmtPerFin'] = (df['BsmtFinSF1'] + df['BsmtFinSF2']) / (df['BsmtUnfSF']+ df['BsmtFinSF1'] + df['BsmtFinSF2'])
df = df.drop(['BsmtFinSF2', 'BsmtFinSF1', 'BsmtUnfSF'], axis = 1)
df['BsmtPerFin'] = df['BsmtPerFin'].fillna(0)

## Convert Basement Quality columns to numeric and average the two

In [120]:
replaceDict2 = {'None' : 0, 'Unf' : 1, 'LwQ' : 2, 'Rec' : 3, 'BLQ' : 4, 'ALQ' : 5, 'GLQ': 6}
df = df.replace({'BsmtFinType1': replaceDict2, 'BsmtFinType2': replaceDict2})
df.loc[:,['BsmtFinType1', 'BsmtFinType2']].value_counts()

BsmtFinType1  BsmtFinType2
1             1               415
6             1               398
5             1               166
3             1               112
4             1                85
2             1                53
0             0                39
4             3                19
5             4                15
3             5                15
5             3                14
              2                12
3             2                12
6             5                11
              3                10
4             2                10
3             4                 8
2             3                 8
3             6                 7
2             6                 7
              4                 7
6             2                 7
4             6                 5
2             5                 5
6             4                 5
4             5                 2
5             6                 1
dtype: int64

In [121]:
df['BsmtFinish'] = df['BsmtFinType1'] + df['BsmtFinType2']
df = df.drop(['BsmtFinType1', 'BsmtFinType2'], axis = 1)

## Change Year Remolded and Year Built

In [122]:
df['RemodYN'] = np.where(df['YearBuilt'] == df['YearRemodAdd'], 'No', 'Yes')

In [123]:
df= df.drop(['YearRemodAdd'], axis = 1)

In [124]:
df=df.drop(['YearBuilt'], axis = 1)

## Garage Variables

In [125]:
df = df.drop(['GarageYrBlt','GarageArea'], axis = 1)

## DropTotal Rooms, same info as number of bedrooms and Kitchens

In [126]:
df = df.drop(['TotRmsAbvGrd'], axis =1)

## Drop Year Sold, weak correlation and p-value

In [127]:
df = df.drop(['YrSold'], axis = 1)

## Create  a significant Rooms variable

In [128]:
df['SigRooms'] = df['BedroomAbvGr'] + df['KitchenAbvGr'] + df['TotBr']
df=df.drop(['BedroomAbvGr', 'KitchenAbvGr', 'TotBr'], axis = 1)

## Combine Quality and Condition columns

In [129]:
df['AvgConQual'] = (df['Quality'] + df['Condition'])/2
df = df.drop(['Quality', 'Condition'], axis = 1)

## Convert MSSubClass into categorical

In [130]:
df['MSSubClass']=df['MSSubClass'].map(str)

## Create Dummy Columns and output CSV

In [131]:
catCols = [i for i in df.columns if df[i].dtypes == object]
pd.get_dummies(df, columns = catCols, drop_first = True).to_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.4/FinalTest.csv')
Vdf = pd.get_dummies(df, columns = catCols, drop_first = True)
Vdf

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,LowQualFinSF,Fireplaces,GarageCars,PoolArea,MiscVal,MoSold,TotSq,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,RemodYN_Yes
0,80.0,11622,0.0,0,0,1.0,0,0,6,1778.0,...,0,0,0,1,0,0,0,1,0,0
1,81.0,14267,108.0,0,0,1.0,0,12500,6,2658.0,...,0,0,0,1,0,0,0,1,0,0
2,74.0,13830,0.0,0,1,2.0,0,0,3,2557.0,...,0,0,0,1,0,0,0,1,0,1
3,78.0,9978,20.0,0,1,2.0,0,0,6,2530.0,...,0,0,0,1,0,0,0,1,0,0
4,43.0,5005,0.0,0,0,2.0,0,0,1,2560.0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1443,21.0,1936,0.0,0,0,0.0,0,0,6,1638.0,...,0,0,0,1,0,0,0,1,0,0
1444,21.0,1894,0.0,0,0,1.0,0,0,4,1638.0,...,0,0,0,1,0,0,0,0,0,0
1445,160.0,20000,0.0,0,1,2.0,0,0,9,2448.0,...,0,0,0,1,0,0,0,0,0,1
1446,62.0,10441,0.0,0,0,0.0,0,700,7,1882.0,...,0,0,0,1,0,0,0,1,0,0


# Create 2 Models with parameters from Modeling Notebook

In [132]:
X_train= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.4/x_Train.csv')
X_test= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.4/x_Test.csv')
y_train= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.4/y_Train.csv')
y_test= pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.4/y_Test.csv')


In [133]:
add2XTrain = []
for i in Vdf.columns:
    if i not in X_train.columns:
        add2XTrain.append(i)
add2XTrain

['MSSubClass_150',
 'Condition2_PosA',
 'Exterior1st_AsphShn',
 'Exterior1st_None',
 'Exterior2nd_CBlock',
 'Exterior2nd_None',
 'HeatingQC_Po',
 'Functional_Sev']

In [134]:
X_train[add2XTrain] = 0


In [135]:
add2Vdf = []
for i in X_train.columns:
    if i not in Vdf:
        add2Vdf.append(i)
add2Vdf

['Condition2_RRNn', 'HouseStyle_2.5Fin', 'Exterior1st_Stone', 'Heating_OthW']

In [136]:
Vdf[add2Vdf] = 0

In [142]:
add2XTest = []
for i in Vdf.columns:
    if i not in X_test.columns:
        add2XTest.append(i)
X_test[add2XTest] = 0

In [137]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [169]:
Vdf.to_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.4/FinalTest.csv')

In [155]:
X_test

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,LowQualFinSF,Fireplaces,GarageCars,PoolArea,MiscVal,MoSold,TotSq,...,SaleCondition_Partial,RemodYN_Yes,MSSubClass_150,Condition2_PosA,Exterior1st_AsphShn,Exterior1st_None,Exterior2nd_CBlock,Exterior2nd_None,HeatingQC_Po,Functional_Sev
0,36.000000,2268,106.0,0,0,2,0,0,7,2390,...,0,1,0,0,0,0,0,0,0,0
1,65.000000,7804,0.0,0,2,2,0,0,12,3103,...,0,1,0,0,0,0,0,0,0,0
2,66.000000,16226,0.0,0,1,2,0,0,5,3480,...,0,1,0,0,0,0,0,0,0,0
3,36.867934,5100,0.0,0,1,1,0,0,6,2254,...,0,1,0,0,0,0,0,0,0,0
4,60.000000,8520,0.0,0,1,1,0,0,8,3240,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428,61.265832,8475,0.0,0,0,1,0,0,4,1904,...,0,0,0,0,0,0,0,0,0,0
429,43.000000,3182,16.0,0,1,2,0,0,5,2928,...,0,1,0,0,0,0,0,0,0,0
430,79.000000,9480,224.0,0,2,1,0,0,6,2372,...,0,1,0,0,0,0,0,0,0,0
431,61.000000,7943,192.0,0,0,1,0,0,4,2058,...,0,0,0,0,0,0,0,0,0,0


In [156]:
from sklearn.linear_model import Lasso
logit = Lasso()
logit.fit(X_train, y_train)

Lasso()

In [157]:
LassoPred = logit.predict(Vdf)

In [158]:
LassoPred

array([-141575.31762057, -124071.32984559, -158790.94052311, ...,
       -213613.89895836, -317081.30880098, -251470.73038039])

In [163]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(max_depth= 12,
 max_leaf_nodes= 500,
 max_samples= None,
 min_samples_leaf= 4,
 min_samples_split= 5,
 n_estimators= 300)

In [164]:
rfr.fit(X_train, y_train)
rfrPred = rfr.predict(Vdf)

array([119120.09618399, 146972.03592184, 175092.71771405, ...,
       170071.93473486, 110682.44315085, 194514.3642466 ])

In [165]:
import xgboost as xgb
logitXB = xgb.XGBRegressor(max_depth = 8,
 min_child_weight = 9,
 eta = 0.01,
 subsample = 0.7,
 colsample_bytree = 0.7,
 objective = 'reg:linear',
 eval_metric = 'mae',
 num_boost_round = 519
)

In [168]:
logitXB.train(X_train, y_train)

AttributeError: 'XGBRegressor' object has no attribute 'train'

In [167]:
logitXB.predict(Vdf)

array([ 70172.22,  88597.37, 107290.67, ..., 104737.81,  72606.77,
       125982.54], dtype=float32)