# This Notebook is for Feature selection

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


## Initialize DF

In [54]:
df = pd.read_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/trainZeroNAs.csv')

In [55]:
df.columns

Index(['Unnamed: 0', 'Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
       'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond

## Drop Identifier Rows

In [56]:
df.drop(['Id','Unnamed: 0'], axis = 1, inplace=True)

In [57]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## Check for low count of categorical values

In [58]:
catColumns= df[['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
             'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']]

In [59]:
dropList = []
for i in catColumns.columns:
    for j in catColumns[i].unique():
        if catColumns[catColumns[i] == j].count()[1] == 1:
            print(f'In column {i} value {j} only has 1 row')
            dropList.append([i,j])

In column Utilities value NoSeWa only has 1 row
In column Condition2 value PosA only has 1 row
In column Condition2 value RRAn only has 1 row
In column Condition2 value RRAe only has 1 row
In column RoofMatl value Metal only has 1 row
In column RoofMatl value Membran only has 1 row
In column RoofMatl value Roll only has 1 row
In column RoofMatl value ClyTile only has 1 row
In column Exterior1st value AsphShn only has 1 row
In column Exterior1st value ImStucc only has 1 row
In column Exterior1st value CBlock only has 1 row
In column Exterior2nd value Other only has 1 row
In column Exterior2nd value CBlock only has 1 row
In column ExterCond value Po only has 1 row
In column Heating value Floor only has 1 row
In column HeatingQC value Po only has 1 row
In column Electrical value Mix only has 1 row
In column Functional value Sev only has 1 row
In column MiscFeature value TenC only has 1 row


## Check for Values with High Correlation to Target

In [60]:
for i in dropList:
    df.drop(df[df[i[0]] == i[1]].index, inplace = True)

In [61]:
fig = px.bar(x = df.corr().index, y =df.corr().SalePrice,
            title = 'Correlation of Numerical Variable With Sales Price',
             color_discrete_sequence = ['#5F4B8B']
            )
fig.update_layout(
    barmode='stack', 
    xaxis_categoryorder = 'total descending',
    xaxis_title = 'Feature',
    yaxis_title = 'Correlation',
    title_x=0.5,
    plot_bgcolor = 'white',
    title_font = dict(size = 25),
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        showgrid = False,
        showline = True,
        linecolor = 'black'
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        showgrid = False,
        showline = False,
        linecolor = 'black'
        
    )
    
)
fig

**There are a lot of area and square footage values with high correlation, perhaps we should combine like to a single metric**  

**Combine Counts of Various items to 1 metric**

### Action Plan  
+ Total Square footage = 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF'  
+ Outdoor Square Footage = '3SsnPorch', 'EnclosedPorch', 'WoodDeckSF, 'OpenPorchSF', 'ScreenPorch'  
+ Total Bathrooms = 'Fullbath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'  


In [62]:
df['TotSq'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
df['OutSq'] = df['3SsnPorch'] + df['EnclosedPorch'] + df['WoodDeckSF'] + df['OpenPorchSF'] + df['ScreenPorch']
df['TotBr'] = df['FullBath'] + 0.5 * df['HalfBath'] + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']

### Check if relationship between new TotSq is the same as GrLiving Area

In [63]:
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x = df.sort_values('SalePrice').SalePrice, y = df.sort_values('SalePrice').TotSq, name = 'Total Square Footage'),
    secondary_y = False,
)
fig.add_trace(
    go.Scatter( x = df.sort_values('SalePrice').SalePrice, y = df.sort_values('SalePrice').GrLivArea, name = 'Above Ground Living Area'),
   secondary_y = True,   
)
fig.update_layout(
    title_text = 'Comparing Total Square Footage and Above Ground Living Area',
    title_x=0.5,
    plot_bgcolor = 'lightgray',
    title_font = dict(size = 25)
)
fig.update_xaxes(title_text="Sale Price")
fig.update_yaxes(title_text="<b>Total Square Footage (ft^2)</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Above Ground Living Area (ft^2)</b>", secondary_y=True)
fig.write_image('C:/Users/jmeis/NYC_DSA/HousingPrices/Images/SquareFootageComparisons.png')
fig.show()

**Looks like my combined indoor square footage feature hgas the same relationship as GrLivArea, I feel confident dropping the latter**

In [64]:
df.drop(['GrLivArea'], axis =1, inplace = True)

### Drop columns

In [65]:
droplist = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', '3SsnPorch', 'EnclosedPorch', 'WoodDeckSF', 'OpenPorchSF', 'ScreenPorch', 'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
df.drop(droplist, axis = 1, inplace = True)
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,TotSq,OutSq,TotBr
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,,0,2,2008,WD,Normal,208500,2566,61,3.5
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,,0,5,2007,WD,Normal,181500,2524,298,2.5
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,,0,9,2008,WD,Normal,223500,2706,42,3.5
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,,0,2,2006,WD,Abnorml,140000,2473,307,2.0
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,,0,12,2008,WD,Normal,250000,3343,276,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,,0,8,2007,WD,Normal,175000,2600,40,2.5
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,,0,2,2010,WD,Normal,210000,3615,349,3.0
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,Shed,2500,5,2010,WD,Normal,266500,3492,60,2.0
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,,0,4,2010,WD,Normal,142125,2156,478,2.0


#### Idea, combine all quality metrics into a single quality converting text to ordinal weighted by correlation

In [66]:
QualityCols = ['OverallQual','OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC' ]

**Some condition columns are nominal convert to ordinal with a lowest value of 0 Highest of 10**

In [67]:
## ExterQual

replaceDict = {'Ex': 10, 'Gd':8, 'TA': 6, 'Fa':4, 'Po':2}
df = df.replace({'ExterQual': replaceDict})
df.ExterQual.value_counts()


6     893
8     486
10     50
4      13
Name: ExterQual, dtype: int64

In [68]:
## ExterCond
df = df.replace({'ExterCond': replaceDict})
df.ExterCond.value_counts()

6     1268
8      145
4       27
10       2
Name: ExterCond, dtype: int64

In [69]:
## BsmtQual
replaceDict = {'Ex': 10, 'Gd':8, 'TA': 6, 'Fa':4, 'Po':2, 'None': 0}
df = df.replace({'BsmtQual': replaceDict})
df.BsmtQual.value_counts()

6     641
8     612
10    120
0      35
4      34
Name: BsmtQual, dtype: int64

In [70]:
## BsmtCond
replaceDict = {'Ex': 10, 'Gd':8, 'TA': 6, 'Fa':4, 'Po':2, 'None': 0}
df = df.replace({'BsmtCond': replaceDict})
df.BsmtCond.value_counts()

6    1296
8      65
4      45
0      35
2       1
Name: BsmtCond, dtype: int64

In [71]:
# GarageQual
df = df.replace({'GarageQual': replaceDict})
df.GarageQual.value_counts()

6     1297
0       79
4       48
8       13
10       3
2        2
Name: GarageQual, dtype: int64

In [72]:
# GarageCond
df = df.replace({'GarageCond': replaceDict})
df.GarageCond.value_counts()

6     1313
0       79
4       33
8        9
2        6
10       2
Name: GarageCond, dtype: int64

In [73]:
#KitchenQual
df = df.replace({'KitchenQual': replaceDict})
df.KitchenQual.value_counts()

6     724
8     581
10     99
4      38
Name: KitchenQual, dtype: int64

In [74]:
# FireplaceQu
df = df.replace({'FireplaceQu': replaceDict})
df.FireplaceQu.value_counts()

0     682
8     377
6     308
4      32
10     24
2      19
Name: FireplaceQu, dtype: int64

In [75]:
# PoolQC
df = df.replace({'PoolQC': replaceDict})
df.PoolQC.value_counts()

0     1437
10       2
8        2
4        1
Name: PoolQC, dtype: int64

### Determine correlation of new numeric Columns

In [76]:
fig = px.bar(x = df.corr().index, y =df.corr().SalePrice,
            title = 'Correlation of Numerical Variable With Sales Price',
             color_discrete_sequence = ['#5F4B8B']
            )
fig.update_layout(
    barmode='stack', 
    xaxis_categoryorder = 'total descending',
    xaxis_title = 'Feature',
    yaxis_title = 'Correlation',
    title_x=0.5,
    plot_bgcolor = 'white',
    title_font = dict(size = 25),
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        showgrid = False,
        showline = True,
        linecolor = 'black'
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        showgrid = False,
        showline = False,
        linecolor = 'black'
        
    )
    
)
fig

#### 4 Metrics are definied by a quality and condition, this info seems redundant, let's create a new metric for quality per condition to combine all the info into one value

**Let's graph all of the quality and condition variables against sale price and see what the general relationships are**

In [77]:
dfG = df[['OverallQual','OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 'SalePrice']]


In [78]:
looplist = [['OverallQual','OverallCond'], ['ExterQual', 'ExterCond'], ['BsmtQual', 'BsmtCond'], ['GarageQual', 'GarageCond']]
for i in looplist:
    i.append('SalePrice')
    dfG = df[i]
    dfG = pd.melt(dfG, id_vars = 'SalePrice')
    fig = px.scatter(dfG, x = 'SalePrice', y = 'value', color = 'variable', title = '{} and {} vs Sale Price'.format(i[0], i[1]), opacity = .2,
                    labels = {
                        'variable': ''
                    }
                    )
    
    fig.update_layout(
    yaxis_title = 'Quality (0-10)',
    xaxis_title = 'Sale Price',
    title_x=0.5,
    plot_bgcolor = 'lightgray',
    title_font = dict(size = 25),
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        showgrid = False,
        showline = True,
        linecolor = 'black'
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        showgrid = False,
        showline = True,
        linecolor = 'black'
        
    )
    
)
    fig.write_image('C:/Users/jmeis/NYC_DSA/HousingPrices/Images/{}{}SP.png'.format(i[0], i[1]))
    fig.show()
    
    
    

**Looks like the quality variables are more impactful in general than the condition ones, but I don't think we can combine them without introducing error**

## Create a Combined Quality Variable and a Combined Condition variable

**Combine all quality and condition variables scaled by their correlation to the target**

In [79]:
dfTemp1 = df[['OverallQual','OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'SalePrice']]
df['Quality'] = 0
df['Condition'] = 0

In [80]:
for i in ['OverallQual', 'ExterQual', 'BsmtQual', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'PoolQC']:
    df['Quality'] = df['Quality'] + (df[i] * df.corr().SalePrice[i])


In [81]:
for i in ['OverallCond',  'ExterCond', 'BsmtCond', 'GarageCond']:
    df['Condition'] = df['Condition'] + (df[i] * df.corr().SalePrice[i])

In [82]:
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x = df.sort_values('SalePrice').SalePrice, y = df.sort_values('SalePrice').Quality, name = 'Combined Quality'),
    secondary_y = False,
)
fig.add_trace(
    go.Scatter( x = df.sort_values('SalePrice').SalePrice, y = df.sort_values('SalePrice').OverallQual, name = 'Original OverallQuality'),
   secondary_y = True,   
)
fig.update_layout(
    title_text = 'Comparing Combined Quality Index and OverallQual Variable',
    title_x=0.5,
    plot_bgcolor = 'lightgray',
    title_font = dict(size = 25)
)
fig.update_xaxes(title_text="Sale Price")
fig.update_yaxes(title_text="<b>Combined Quality</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>OverallQual</b>", secondary_y=True)
fig.write_image('C:/Users/jmeis/NYC_DSA/HousingPrices/Images/QualityVsOverallQual.png')
fig.show()

In [83]:
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x = df.sort_values('SalePrice').SalePrice, y = df.sort_values('SalePrice').Condition, name = 'Combined Condition'),
    secondary_y = False,
)
fig.add_trace(
    go.Scatter( x = df.sort_values('SalePrice').SalePrice, y = df.sort_values('SalePrice').OverallCond, name = 'Original OverallCondition'),
   secondary_y = True,   
)
fig.update_layout(
    title_text = 'Comparing Combined Condition Index and OverallCond Variable',
    title_x=0.5,
    plot_bgcolor = 'lightgray',
    title_font = dict(size = 25)
)
fig.update_xaxes(title_text="<b>Sale Price</b>")
fig.update_yaxes(title_text="<b>Combined Condition</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>OverallCond</b>", secondary_y=True)
fig.write_image('C:/Users/jmeis/NYC_DSA/HousingPrices/Images/ConditionVsOverallCond.png')
fig.show()

**Looks Like we managed to get all the informaiton opf all the quality and condition variables into 2 new variables and keep the overall relationship with sale price**

### Drop Old Qual and Cond Columns

In [84]:
for i in ['OverallQual','OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
    df.drop(i, axis = 1, inplace = True)

In [85]:
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,TotSq,OutSq,TotBr,Quality,Condition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,2,2008,WD,Normal,208500,2566,61,3.5,22.612406,2.373841
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,5,2007,WD,Normal,181500,2524,298,2.5,22.238565,2.108627
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,9,2008,WD,Normal,223500,2706,42,3.5,25.720545,2.373841
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,2,2006,WD,Abnorml,140000,2473,307,2.0,24.214194,2.787274
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,12,2008,WD,Normal,250000,3343,276,3.5,26.512674,2.373841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,8,2007,WD,Normal,175000,2600,40,2.5,22.238565,2.373841
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,2,2010,WD,Normal,210000,3615,349,3.0,22.238565,2.285436
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,5,2010,WD,Normal,266500,3492,60,2.0,26.951526,2.443142
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,4,2010,WD,Normal,142125,2156,478,2.0,18.485751,2.285436


# Everything Above was Feature Selection for V1_0

# Begin V1_1 Feature Engineering

In [86]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'Foundation', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'LowQualFinSF',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'PavedDrive', 'PoolArea', 'Fence', 'MiscFeature',
       'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice',
       'TotSq', 'OutSq', 'TotBr', 'Quality', 'Condition'],
      dtype='object')

## Convery GarageYrBlt to numeric

### Convert None Values to 0

In [87]:
df.loc[df['GarageYrBlt']=='None','GarageYrBlt' ] = 0

In [88]:
df.GarageYrBlt = pd.to_numeric(df.GarageYrBlt)

In [89]:
df.GarageYrBlt

0       2003.0
1       1976.0
2       2001.0
3       1998.0
4       2000.0
         ...  
1455    1999.0
1456    1978.0
1457    1941.0
1458    1950.0
1459    1965.0
Name: GarageYrBlt, Length: 1442, dtype: float64

## Let's Look at VIF data and see if we can eliminate some columns

In [90]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
vif_data = pd.DataFrame()
J = df.drop(['SalePrice'], axis =1)
J = J.select_dtypes(['number'])
vif_data['Feature'] = J.columns
vif_data['VIF'] = [vif(J.values, i) for i in range(len(J.columns))]
vif_data

Unnamed: 0,Feature,VIF
0,MSSubClass,4.066255
1,LotFrontage,6.815439
2,LotArea,5.271214
3,YearBuilt,11812.161352
4,YearRemodAdd,19211.420502
5,MasVnrArea,1.826541
6,BsmtFinSF1,15.617675
7,BsmtFinSF2,2.039578
8,BsmtUnfSF,22.718786
9,LowQualFinSF,1.132675


## Find P value for each numeric columns

In [91]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
X = df.drop(['SalePrice'], axis = 1).select_dtypes(['number'])
Y = df.SalePrice
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
pVals = pd.DataFrame()
pVals['Feature'] = X.columns
pVals['P-Value'] = [est2.pvalues[i]for i in range(len(X.columns))]
pVals

Unnamed: 0,Feature,P-Value
0,MSSubClass,0.9057901
1,LotFrontage,7.959627e-08
2,LotArea,0.02239501
3,YearBuilt,3.094222e-08
4,YearRemodAdd,2.098174e-06
5,MasVnrArea,0.02587256
6,BsmtFinSF1,2.619615e-08
7,BsmtFinSF2,0.004866808
8,BsmtUnfSF,2.512031e-05
9,LowQualFinSF,3.509739e-12


# Everything Above is V1_1
## Begin V1_3

In [92]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'Foundation', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'LowQualFinSF',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'PavedDrive', 'PoolArea', 'Fence', 'MiscFeature',
       'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice',
       'TotSq', 'OutSq', 'TotBr', 'Quality', 'Condition'],
      dtype='object')

## Convert Basement Finished and Unfinished Sq ft to percentage finished sq ft

In [93]:
df['BsmtPerFin'] = (df['BsmtFinSF1'] + df['BsmtFinSF2']) / (df['BsmtUnfSF']+ df['BsmtFinSF1'] + df['BsmtFinSF2'])

**Drop the previous 3 columns**

In [94]:
df = df.drop(['BsmtFinSF2', 'BsmtFinSF1', 'BsmtUnfSF'], axis = 1)

In [95]:
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'Foundation', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
       'Electrical', 'LowQualFinSF', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'PavedDrive', 'PoolArea',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice', 'TotSq', 'OutSq', 'TotBr', 'Quality',
       'Condition', 'BsmtPerFin'],
      dtype='object')

## Create Dummy Columns and output CSV V1.1

In [64]:
# List of Categorical columns

catCols = [i for i in df.columns if df[i].dtypes == object]

In [65]:
version = '1.1'
pd.get_dummies(df, columns = catCols, drop_first = True).to_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.1/FullTrainVersion_{}.csv'.format(version,version))

In [66]:
V1_1df = pd.get_dummies(df, columns = catCols, drop_first = True)

In [67]:
V1_1df.SalePrice

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1442, dtype: int64

## Create Train and Test Split and output csv V1.1

In [69]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(V1_1df.drop(['SalePrice'], axis = 1),V1_1df['SalePrice'] , test_size=0.3, shuffle = True, random_state=0)

In [70]:

X_train.to_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.1/x_Train.csv', index=False)
X_test.to_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.1/x_Test.csv', index=False)
y_train.to_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.1/y_Train.csv', index=False)
y_test.to_csv('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.1/y_Test.csv', index=False)


### Create Feature List Text file

In [71]:
with open('C:/Users/jmeis/NYC_DSA/HousingPrices/Data/Version_1.0/FeatureExplanation.txt', 'w') as f:
    f.write('Feature Explanation\n\
            Dropped Low Count Categorical Values\n\
            Created 3 new metrics for total indor square footage, total outdoor square footage, and total bathrooms, dropped other columns\n\
            Created a combined quality and condition variabel weighted by correlation to target\n\
            Changed nominal categorical values to ordinal\n\
            Created dummy variables and dropped first, sampled by shuffle\n\
            Converted GarageYrBlt to numeric, and changed None values to 0\n\
            '
            
           )