In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os
# Authenticate the current user through google.colab's auth helper.
auth.authenticate_user()
# Build a PyDrive auth object and hand it the application-default
# credentials obtained via oauth2client.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Drive client backed by the credentials above; used to fetch files by id.
drive = GoogleDrive(gauth)

In [0]:
# Each entry maps a local file name to the Drive file id it is fetched from.
drive_sources = (
    ('train.csv', '1WFacqgpgPdDQc32iFpBbcl-vr2ng90YH'),
    ('test.csv', '1LxVNmkm_KhN3MA_VdGP4_tOVcNURmIg2'),
)

# Create a Drive file handle per id and write its content to the local name.
fetched_handles = {}
for local_name, file_id in drive_sources:
    handle = drive.CreateFile({'id': file_id})
    handle.GetContentFile(local_name)
    fetched_handles[local_name] = handle

# Keep the original handle names available for any later cell that uses them.
train_downloaded = fetched_handles['train.csv']
test_downloaded = fetched_handles['test.csv']

In [0]:
import pandas as pd
import matplotlib
import numpy as np
import sklearn
import random
import time
import warnings
# Suppress all warnings raised after this point.
# NOTE(review): this also hides deprecation notices from pandas/sklearn —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline

In [0]:
# Load the two CSVs downloaded earlier: train.csv -> data_raw, test.csv -> data_val.
data_raw, data_val = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [0]:
# Work on an independent (deep) copy so data_raw itself is never modified;
# DataFrame.copy() copies deeply by default.
data1 = data_raw.copy()
# Both frames collected in one list (the list holds references, not copies).
data_cleaner = list((data1, data_val))

In [41]:
# Print a summary of the working copy: index range, columns, non-null counts and dtypes.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [48]:
# Let pandas display up to 81 columns (the width of the training frame).
pd.set_option('display.max_columns', 81)

# Show ten randomly chosen rows of the dwelling class next to the sale price.
preview_columns = ['MSSubClass', 'SalePrice']
random_rows = data1[preview_columns].sample(n=10)
print(random_rows)

      MSSubClass  SalePrice
1402  20          193879   
672   20          165000   
491   50          133000   
795   60          171000   
540   20          315000   
613   20          147000   
7     60          200000   
410   20          60000    
508   70          161000   
373   20          123000   


In [58]:
# Columns eligible for the comparison: int64-typed, excluding the 'Id' column.
# Column order is preserved so pairs are reported in the same sequence.
int_columns = [col for col in data1 if col != 'Id' and data1[col].dtype == 'int64']

# Report every ordered pair of distinct columns whose correlation exceeds 0.7.
# Each pair is visited in both directions, so a match is printed twice.
for y in int_columns:
    for x in int_columns:
        if x == y:
            continue
        pair_corr = data1[x].corr(data1[y])
        if pair_corr > 0.7:
            print(y, 'Correlation by:', x)
            print(pair_corr)
            print('-'*10, '\n')

OverallQual Correlation by: SalePrice
0.7909816005838044
---------- 

TotalBsmtSF Correlation by: 1stFlrSF
0.8195299750050337
---------- 

1stFlrSF Correlation by: TotalBsmtSF
0.8195299750050337
---------- 

GrLivArea Correlation by: TotRmsAbvGrd
0.8254893743088427
---------- 

GrLivArea Correlation by: SalePrice
0.7086244776126523
---------- 

TotRmsAbvGrd Correlation by: GrLivArea
0.8254893743088426
---------- 

GarageCars Correlation by: GarageArea
0.8824754142814625
---------- 

GarageArea Correlation by: GarageCars
0.8824754142814625
---------- 

SalePrice Correlation by: OverallQual
0.7909816005838044
---------- 

SalePrice Correlation by: GrLivArea
0.7086244776126523
---------- 



In [59]:
# Data cleaning follows the 4 C's: Correcting, Completing, Creating, Converting.
# Start by counting missing values per column in both frames.
null_reports = (
    ('Train columns with null value:\n', data1),
    ('Test columns with null value:\n', data_val),
)
for heading, frame in null_reports:
    print(heading, frame.isnull().sum())
    print('-'*10)

# Summary statistics for every column (numeric and non-numeric) of the raw training data.
data_raw.describe(include = 'all')

Train columns with null value:
 Id               0   
MSSubClass       0   
MSZoning         0   
LotFrontage      259 
LotArea          0   
Street           0   
Alley            1369
LotShape         0   
LandContour      0   
Utilities        0   
LotConfig        0   
LandSlope        0   
Neighborhood     0   
Condition1       0   
Condition2       0   
BldgType         0   
HouseStyle       0   
OverallQual      0   
OverallCond      0   
YearBuilt        0   
YearRemodAdd     0   
RoofStyle        0   
RoofMatl         0   
Exterior1st      0   
Exterior2nd      0   
MasVnrType       8   
MasVnrArea       8   
ExterQual        0   
ExterCond        0   
Foundation       0   
                ..   
BedroomAbvGr     0   
KitchenAbvGr     0   
KitchenQual      0   
TotRmsAbvGrd     0   
Functional       0   
Fireplaces       0   
FireplaceQu      690 
GarageType       81  
GarageYrBlt      81  
GarageFinish     81  
GarageCars       0   
GarageArea       0   
GarageQual       81  


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
count,1460.0,1460.0,1460,1201.0,1460.0,1460,91,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460.0,1460.0,1460.0,1460.0,1460,1460,1460,1460,1452.0,1452.0,1460,1460,1460,1423,1423,1422,1423,1460.0,1422,1460.0,1460.0,1460.0,1460,1460,1460,1459,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460,1460.0,1460,1460.0,770,1379,1379.0,1379,1460.0,1460.0,1379,1379,1460,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,7,281,54,1460.0,1460.0,1460.0,1460,1460,1460.0
unique,,,5,,,2,2,4,4,2,5,3,25,9,8,5,8,,,,,6,8,15,16,4.0,,4,5,6,4,4,4,6,,6,,,,6,5,2,5,,,,,,,,,,,4,,7,,5,6,,3,,,5,5,3,,,,,,,3,4,4,,,,9,6,
top,,,RL,,,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,,,,,Gable,CompShg,VinylSd,VinylSd,,,TA,TA,PConc,TA,TA,No,Unf,,Unf,,,,GasA,Ex,Y,SBrkr,,,,,,,,,,,TA,,Typ,,Gd,Attchd,,Unf,,,TA,TA,Y,,,,,,,Gd,MnPrv,Shed,,,,WD,Normal,
freq,,,1151,,,1454,50,925,1311,1459,1052,1382,225,1260,1445,1220,726,,,,,1141,1434,515,504,864.0,,906,1282,647,649,1311,953,430,,1256,,,,1428,741,1365,1334,,,,,,,,,,,735,,1360,,380,870,,605,,,1311,1326,1340,,,,,,,3,157,49,,,,1267,1198,
mean,730.5,56.89726,,70.049958,10516.828082,,,,,,,,,,,,,6.099315,5.575342,1971.267808,1984.865753,,,,,,103.685262,,,,,,,,443.639726,,46.549315,567.240411,1057.429452,,,,,1162.626712,346.992466,5.844521,1515.463699,0.425342,0.057534,1.565068,0.382877,2.866438,1.046575,,6.517808,,0.613014,,,1978.506164,,1.767123,472.980137,,,,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,,,,43.489041,6.321918,2007.815753,,,180921.19589
std,421.610009,42.300571,,24.284752,9981.264932,,,,,,,,,,,,,1.382997,1.112799,30.202904,20.645407,,,,,,181.066207,,,,,,,,456.098091,,161.319273,441.866955,438.705324,,,,,386.587738,436.528436,48.623081,525.480383,0.518911,0.238753,0.550916,0.502885,0.815778,0.220338,,1.625393,,0.644666,,,24.689725,,0.747315,213.804841,,,,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,,,,496.123024,2.703626,1.328095,,,79442.502883
min,1.0,20.0,,21.0,1300.0,,,,,,,,,,,,,1.0,1.0,1872.0,1950.0,,,,,,0.0,,,,,,,,0.0,,0.0,0.0,0.0,,,,,334.0,0.0,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,,2.0,,0.0,,,1900.0,,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,1.0,2006.0,,,34900.0
25%,365.75,20.0,,59.0,7553.5,,,,,,,,,,,,,5.0,5.0,1954.0,1967.0,,,,,,0.0,,,,,,,,0.0,,0.0,223.0,795.75,,,,,882.0,0.0,0.0,1129.5,0.0,0.0,1.0,0.0,2.0,1.0,,5.0,,0.0,,,1961.0,,1.0,334.5,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,5.0,2007.0,,,129975.0
50%,730.5,50.0,,69.0,9478.5,,,,,,,,,,,,,6.0,5.0,1973.0,1994.0,,,,,,0.0,,,,,,,,383.5,,0.0,477.5,991.5,,,,,1087.0,0.0,0.0,1464.0,0.0,0.0,2.0,0.0,3.0,1.0,,6.0,,1.0,,,1980.0,,2.0,480.0,,,,0.0,25.0,0.0,0.0,0.0,0.0,,,,0.0,6.0,2008.0,,,163000.0
75%,1095.25,70.0,,80.0,11601.5,,,,,,,,,,,,,7.0,6.0,2000.0,2004.0,,,,,,166.0,,,,,,,,712.25,,0.0,808.0,1298.25,,,,,1391.25,728.0,0.0,1776.75,1.0,0.0,2.0,1.0,3.0,1.0,,7.0,,1.0,,,2002.0,,2.0,576.0,,,,168.0,68.0,0.0,0.0,0.0,0.0,,,,0.0,8.0,2009.0,,,214000.0
