In [1]:
import numpy as np
import pandas as pd

from sklearn import linear_model 
from sklearn import svm
from sklearn.exceptions import ConvergenceWarning
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

import seaborn as sns

import math
import warnings
%matplotlib inline 

import os
import subprocess

In [8]:
# Load train.csv into a DataFrame; the path is relative to the notebook's working directory.
train_data = pd.read_csv("train.csv")

In [9]:
# Load test.csv into a DataFrame; the path is relative to the notebook's working directory.
test_data = pd.read_csv("test.csv")

In [10]:
# Keep both frames in a single list: data[0] is the training frame,
# data[1] is the test frame.
data = [train_data, test_data]

In [11]:
# Preview the first six rows of the training frame (data[0]).
data[0].head(6)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000


In [12]:
# Preview the first six rows of the test frame (data[1]).
data[1].head(6)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
5,1466,60,RL,75.0,10000,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal


Let's deal with cleaning the training data

In [17]:
# Print each training-frame column with its non-null count and dtype.
data[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [49]:
# Per-column count of missing entries in the training frame, as a Series
# indexed by column name.
null_values = data[0].isna().sum()

Let's collect all the columns whose dtype is 'object' ~> we'll need to encode these values later, drop them, or perform PCA

In [47]:
##time ~> O(n)
##space ~> O(n)
# Collect the name of every training-frame column whose dtype is 'object'.
# .items() yields (column label, dtype) pairs directly, so there is no need
# to index the label-indexed dtypes Series by integer position
# (data_types[i]), which pandas has deprecated.
data_types = data[0].dtypes
objects = {column for column, dtype in data_types.items() if dtype == 'object'}

Let's collect all the columns that contain null values

In [80]:
##time ~> O(n)
##space ~> O(n)
# Names of the columns that have at least one missing value.
# null_values is indexed by column name, so .items() pairs each name with
# its count and no separate position counter is needed.
null_columns = {column for column, missing in null_values.items() if missing > 0}

Let's get rid of all NaN values in the training set

In [102]:
# Replace every missing value in the training frame with the most frequent
# value (the mode) of its column, for each column listed in null_columns.
#
# The mode is used for every dtype, numeric or not, so no dtype check is
# needed. The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection, because that chained form is
# not guaranteed to update the parent frame.
train_frame = data[0]
for c in train_frame.columns:
    if c not in null_columns:
        continue
    modes = train_frame[c].mode()
    if modes.empty:
        # The column has no non-null values, so there is nothing to fill with.
        continue
    train_frame[c] = train_frame[c].fillna(modes.iloc[0])

Let's encode all the 'object' types

In [None]:
# Create one LabelEncoder instance (imported from sklearn.preprocessing).
# NOTE(review): presumably meant to encode the 'object' columns in the following
# cell, which is incomplete here — confirm how it is applied per column.
encoder = LabelEncoder()

In [None]:

for c in data[0]:
    if c in objects:
        