In [23]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [24]:
# Read the train and test splits from CSV files given by relative path.
train_df, test_df = [pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')]

In [25]:
# Plain-text preview of the first rows, followed by the frame's
# (rows, columns) shape as the cell's result.
first_rows = train_df.head()
print(first_rows)
train_df.shape

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape   
0   1          60       RL         65.0     8450   Pave   NaN      Reg  \
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold   
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2  \
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

(1460, 81)

In [26]:
# List every column that contains NaN values together with its NaN count.
nan_counts = train_df.isna().sum()
for col, n_missing in nan_counts[nan_counts > 0].items():
    print(col, n_missing)

LotFrontage 259
Alley 1369
MasVnrType 872
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81
PoolQC 1453
Fence 1179
MiscFeature 1406


In [27]:
# Plain-text preview of the first five rows.
print(train_df.head(n=5))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape   
0   1          60       RL         65.0     8450   Pave   NaN      Reg  \
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold   
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2  \
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [28]:
# Drop columns in which more than half of the rows are NaN.
# The cut-off is expressed as a fraction of the frame's own row count rather
# than a hard-coded number of rows, so it really is 50% and keeps working if
# the number of rows changes.
MAX_NAN_FRACTION = 0.5
nan_fraction = train_df.isna().mean()  # per-column share of NaN rows
mostly_nan_cols = nan_fraction[nan_fraction > MAX_NAN_FRACTION].index
train_df = train_df.drop(columns=mostly_nan_cols)

print(train_df.head())

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street LotShape LandContour   
0   1          60       RL         65.0     8450   Pave      Reg         Lvl  \
1   2          20       RL         80.0     9600   Pave      Reg         Lvl   
2   3          60       RL         68.0    11250   Pave      IR1         Lvl   
3   4          70       RL         60.0     9550   Pave      IR1         Lvl   
4   5          60       RL         84.0    14260   Pave      IR1         Lvl   

  Utilities LotConfig  ... EnclosedPorch 3SsnPorch ScreenPorch PoolArea   
0    AllPub    Inside  ...             0         0           0        0  \
1    AllPub       FR2  ...             0         0           0        0   
2    AllPub    Inside  ...             0         0           0        0   
3    AllPub    Corner  ...           272         0           0        0   
4    AllPub       FR2  ...             0         0           0        0   

  MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  
0    

In [29]:
# Show which of the remaining columns still contain NaN, with their counts.
nan_counts = train_df.isna().sum()
for col, n_missing in nan_counts[nan_counts > 0].items():
    print(col, n_missing)

LotFrontage 259
MasVnrArea 8
BsmtQual 37
BsmtCond 37
BsmtExposure 38
BsmtFinType1 37
BsmtFinType2 38
Electrical 1
FireplaceQu 690
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
GarageCond 81


In [30]:
# Fill numeric gaps with the column mean, encode the fireplace/garage
# categorical columns as numbers, and drop the rows that still lack
# basement or electrical information.
#
# NOTE: this cell is not idempotent. Re-running it on already-encoded columns
# maps every value to NaN and then to the "missing" code, so re-load train_df
# before running it again.

# Numeric columns: mean imputation, computed before any rows are dropped.
for col in ('LotFrontage', 'MasVnrArea'):
    train_df[col] = train_df[col].fillna(train_df[col].mean())

# BsmtQual becomes a numeric code (1 = 'Ex' ... 5 = 'Po'). Values that are not
# keys of the map, NaN included, come out of .map() as NaN, and those rows are
# removed by the dropna() below.
bsmtqual_map = {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, 'NA': 6}
train_df['BsmtQual'] = train_df['BsmtQual'].map(bsmtqual_map)
train_df = train_df.dropna(
    subset=['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'Electrical']
)

# GarageYrBlt: mean imputation on the rows that survived the dropna above.
train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(train_df['GarageYrBlt'].mean())

# Category -> code tables. In each table the None entry holds the code used
# for missing values; the fill value is read from it so the table and the
# fill can never disagree (GarageFinish used to declare 4 but was filled with 6).
fireplacequ_map = {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, None: 6}
# Every garage type now has its own code ('BuiltIn' and 'CarPort' used to share 4).
garagetype_map = {'2Types': 1, 'Attchd': 2, 'Basment': 3, 'BuiltIn': 4, 'CarPort': 5, 'Detchd': 6, None: 7}
garagefinish_map = {'Fin': 1, 'RFn': 2, 'Unf': 3, None: 4}
garagequal_map = {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, None: 6}
garagecond_map = {'Ex': 1, 'Gd': 2, 'TA': 3, 'Fa': 4, 'Po': 5, None: 6}

category_maps = {
    'FireplaceQu': fireplacequ_map,
    'GarageType': garagetype_map,
    'GarageFinish': garagefinish_map,
    'GarageQual': garagequal_map,
    'GarageCond': garagecond_map,
}
for col, codes in category_maps.items():
    # Anything .map() leaves as NaN (missing or unlisted values) receives the
    # table's "missing" code.
    train_df[col] = train_df[col].map(codes).fillna(codes[None])

In [31]:
# Sanity check: print any column that still has NaN values (no output means none are left).
nan_counts = train_df.isna().sum()
for col, n_missing in nan_counts[nan_counts > 0].items():
    print(col, n_missing)

In [32]:
# With the cleaning done, remove the Id column and preview the result.
train_df = train_df.drop(columns='Id')
train_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [None]:
# First step toward relating the features to the sale price:
# separate the target column (SalePrice) from the remaining feature columns.
y = train_df['SalePrice']
X = train_df.drop(columns='SalePrice')