In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load Data 

In [2]:
data = pd.read_csv("house_pred.csv")
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Preprocessing

**To fill with mean**
- **LotFrontage** - because it has continuous values


**To fill with 0**
- **MasVnrArea** - the masonry veneer is probably not present (because MasVnrType is also NaN for the same rows), so the area was left empty; hence we can substitute 0.
- **GarageYrBlt** - no garage is present (GarageArea is 0 for those rows), so we can fill with 0, treating 0 as a hypothetical year. (There is no year 0, but for the sake of the model we can use it.)

In [4]:
def standardise(col, data_num, train_mean, train_std):
    """Z-score one column of ``data_num`` using externally supplied statistics.

    Parameters
    ----------
    col : label of the column to look up in ``data_num``.
    data_num : container indexable by ``col`` (a DataFrame in this notebook).
    train_mean : value subtracted from the column.
    train_std : value the centred column is divided by.

    Returns
    -------
    ``(data_num[col] - train_mean) / train_std``.
    """
    centred = data_num[col] - train_mean
    scaled = centred / train_std
    return scaled

In [5]:
# Module-level stores for the per-column statistics that
# preprocessing_pipeline records (train=True) and re-applies (train=False).
mean, std = [], []

In [6]:
def preprocessing_pipeline(raw_data, train=True):
    """Turn the raw housing frame into a standardised numeric feature matrix.

    Steps: keep only non-object columns, drop ``Id``, fill missing
    ``MasVnrArea`` / ``GarageYrBlt`` with 0, fill missing ``LotFrontage`` with
    the training mean, then z-score every feature column.

    With ``train=True`` the per-column means and standard deviations are
    computed from ``raw_data`` and stored in the module-level ``mean`` / ``std``
    lists, replacing any previous content so that re-running the cell is
    idempotent. With ``train=False`` the stored statistics are re-applied,
    matched to the columns by position.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Raw data; it is not modified. Must contain ``Id``, ``LotFrontage``,
        ``MasVnrArea`` and ``GarageYrBlt``, plus ``SalePrice`` when
        ``train=True``.
    train : bool, default True
        Fit-and-transform when True, transform-only when False.

    Returns
    -------
    (X, y) when ``train=True``: the standardised features and a one-column
    ``SalePrice`` frame (left unscaled). Only X when ``train=False``.

    Raises
    ------
    ValueError
        ``train=False`` and the number of numeric feature columns differs from
        the number of stored training statistics.
    """
    global mean, std

    # Only 'Id' needs dropping explicitly: the sparsely populated columns
    # (Alley, PoolQC, ...) are object-typed and already removed by select_dtypes.
    remove_column = ['Id']
    # Build a private frame (no inplace ops on a selection) so the caller's
    # data is untouched and pandas has no reason to warn about writing to a copy.
    numeric = raw_data.select_dtypes(exclude='object').drop(columns=remove_column).copy()

    # fill na values with 0 (reason given in the Preprocessing notes)
    numeric['MasVnrArea'] = numeric['MasVnrArea'].fillna(0)
    numeric['GarageYrBlt'] = numeric['GarageYrBlt'].fillna(0)

    if train:
        # Select the target by name rather than assuming it is the last column.
        features = numeric.drop(columns='SalePrice')
        features['LotFrontage'] = features['LotFrontage'].fillna(features['LotFrontage'].mean())

        col_mean = features.mean()
        col_std = features.std()
        # Rebind (not append) so a second training run cannot leave stale or
        # duplicated statistics behind.
        mean = col_mean.tolist()
        std = col_std.tolist()

        X = (features - col_mean) / col_std
        y = numeric[['SalePrice']]
        return X, y

    n_features = numeric.shape[1]
    if len(mean) != n_features or len(std) != n_features:
        raise ValueError(
            "preprocessing_pipeline(train=False) got {} numeric feature columns but {} "
            "training statistics are stored; run it with train=True on the training "
            "data first and pass data with the same feature columns".format(n_features, len(mean))
        )

    # Impute with the *training* mean. Mean-imputation leaves a column's mean
    # unchanged, so the stored statistic for LotFrontage is exactly that value.
    lot_pos = numeric.columns.get_loc('LotFrontage')
    numeric['LotFrontage'] = numeric['LotFrontage'].fillna(mean[lot_pos])

    # Statistics are stored positionally; label them with the current columns
    # so the arithmetic below aligns column by column.
    train_mean = pd.Series(mean, index=numeric.columns)
    train_std = pd.Series(std, index=numeric.columns)
    X = (numeric - train_mean) / train_std
    return X

In [7]:
X, y = preprocessing_pipeline(data)


In [8]:
X.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0.07335,-0.229293,-0.207071,0.651256,-0.517023,1.050634,0.878367,0.513928,0.575228,-0.288554,...,0.35088,-0.751918,0.216429,-0.359202,-0.116299,-0.270116,-0.068668,-0.087658,-1.598563,0.13873
1,-0.872264,0.451781,-0.091855,-0.071812,2.178881,0.15668,-0.42943,-0.570555,1.171591,-0.288554,...,-0.06071,1.625638,-0.704242,-0.359202,-0.116299,-0.270116,-0.068668,-0.087658,-0.488943,-0.614228


In [9]:
y.head(2)

Unnamed: 0,SalePrice
0,208500
1,181500


In [10]:
X.shape, y.shape

((1460, 36), (1460, 1))

## Check for multicollinearity

In [11]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif(X):
    """Tabulate the variance inflation factor of every column in ``X``.

    Parameters
    ----------
    X : DataFrame whose ``.values`` matrix is passed to
        ``variance_inflation_factor`` once per column index.

    Returns
    -------
    DataFrame with a ``Variables`` column (the labels of ``X``) and a ``VIF``
    column (one score per label, in the same order).
    """
    scores = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]
    return pd.DataFrame({'Variables': X.columns, 'VIF': scores})

In [12]:
def remove_multicollinearity(X, col):
    """Drop ``col`` from ``X`` and recompute the VIF table on what is left.

    Parameters
    ----------
    X : DataFrame of features.
    col : column label or list of labels to remove.

    Returns
    -------
    (vif_table, reduced) - the VIF table of the reduced frame, followed by
    the reduced frame itself.
    """
    reduced = X.drop(columns=col)
    return vif(reduced), reduced

In [13]:
Vif = vif(X)
Vif

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,Variables,VIF
0,MSSubClass,1.653716
1,LotFrontage,1.576385
2,LotArea,1.254031
3,OverallQual,3.255798
4,OverallCond,1.586922
5,YearBuilt,4.103369
6,YearRemodAdd,2.28015
7,MasVnrArea,1.401482
8,BsmtFinSF1,inf
9,BsmtFinSF2,inf


#### Multicollinearity exists
- Columns with an infinite VIF (i.e. high multicollinearity) are all related to square-foot area


In [14]:
vif_new, X_new = remove_multicollinearity(X, ['BsmtFinSF1', '1stFlrSF', '2ndFlrSF'] )
vif_new

Unnamed: 0,Variables,VIF
0,MSSubClass,1.573244
1,LotFrontage,1.575119
2,LotArea,1.253804
3,OverallQual,3.242728
4,OverallCond,1.586299
5,YearBuilt,4.081268
6,YearRemodAdd,2.279908
7,MasVnrArea,1.401448
8,BsmtFinSF2,1.155729
9,BsmtUnfSF,2.57685


## Split the data

In [15]:
from sklearn.model_selection import train_test_split
# Keep 80% of the rows for fitting and hold out the rest for evaluation;
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): X_new was mean-imputed and standardised on all rows before this
# split, so the held-out rows contributed to those statistics - confirm this
# is acceptable for the reported test scores.
X_train, X_test, y_train, y_test = train_test_split(X_new, y, train_size=0.80, random_state=2)

In [16]:
X_train.shape, X_test.shape

((1168, 33), (292, 33))

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 455 to 1192
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1168 non-null   float64
 1   LotFrontage    1168 non-null   float64
 2   LotArea        1168 non-null   float64
 3   OverallQual    1168 non-null   float64
 4   OverallCond    1168 non-null   float64
 5   YearBuilt      1168 non-null   float64
 6   YearRemodAdd   1168 non-null   float64
 7   MasVnrArea     1168 non-null   float64
 8   BsmtFinSF2     1168 non-null   float64
 9   BsmtUnfSF      1168 non-null   float64
 10  TotalBsmtSF    1168 non-null   float64
 11  LowQualFinSF   1168 non-null   float64
 12  GrLivArea      1168 non-null   float64
 13  BsmtFullBath   1168 non-null   float64
 14  BsmtHalfBath   1168 non-null   float64
 15  FullBath       1168 non-null   float64
 16  HalfBath       1168 non-null   float64
 17  BedroomAbvGr   1168 non-null   float64
 18  Kitche

## Check for features that have the highest influence
- Features with correlation >= 0.5
- **But using this selection reduces the R^2 value, so this procedure is not applied**

In [18]:
data[list(X_new.columns.values) + ['SalePrice']].corr().SalePrice.sort_values(ascending=False)[1:11]

OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
GarageYrBlt     0.486362
Name: SalePrice, dtype: float64

In [19]:
# Labels of the 10 entries ranked 2nd-11th by correlation with SalePrice
# (descending sort, slice [1:11] skips position 0).
# NOTE(review): position 0 is presumably SalePrice's correlation with itself - confirm.
# NOTE(review): this is a top-10 cut, not a ">= 0.5" threshold; the displayed
# result includes GarageYrBlt at 0.486.
features = data[list(X_new.columns.values) + ['SalePrice']].corr().SalePrice.sort_values(ascending=False)[1:11].index
features

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
       'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt'],
      dtype='object')

## Model

In [20]:
from sklearn.linear_model import LinearRegression
# Linear regression fitted on the training split produced by train_test_split.
reg = LinearRegression()
# NOTE(review): fit() presumably returns the estimator itself, in which case
# `model` and `reg` refer to the same object - confirm.
model = reg.fit(X_train, y_train)

## Error

In [21]:
pred = model.predict(X_test)
pred[:5]

array([[273421.23874475],
       [191585.47855411],
       [285252.94933405],
       [215573.6654898 ],
       [215954.71070701]])

In [22]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Metric suggested in the Kaggle competition:
# RMSE between log(actual price) and log(predicted price) on the held-out rows.
# NOTE(review): np.abs keeps np.log defined if the model emits a negative
# price, but it scores a prediction of -p exactly like +p; consider flagging
# or clipping such predictions instead.
print(np.sqrt(mean_squared_error(np.log(np.abs(y_test)),np.log(np.abs(pred)))))

0.16229675418488207


In [23]:
r2_score(y_test, pred)

0.8551378072730299

## Predicting with unknown data

In [24]:
data_test = pd.read_csv('test.csv')

In [25]:
X_test = preprocessing_pipeline(data_test, False)

In [26]:
# Keep exactly the feature columns (and column order) the model was fitted on,
# rather than repeating the hand-written drop list from the multicollinearity
# step. Plain reassignment instead of an in-place drop also makes this cell
# safe to re-run (the in-place version raises KeyError the second time).
X_test = X_test[X_new.columns]

In [27]:
X_test.shape

(256, 33)

In [28]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     256 non-null    float64
 1   LotFrontage    256 non-null    float64
 2   LotArea        256 non-null    float64
 3   OverallQual    256 non-null    float64
 4   OverallCond    256 non-null    float64
 5   YearBuilt      256 non-null    float64
 6   YearRemodAdd   256 non-null    float64
 7   MasVnrArea     256 non-null    float64
 8   BsmtFinSF2     256 non-null    float64
 9   BsmtUnfSF      256 non-null    float64
 10  TotalBsmtSF    256 non-null    float64
 11  LowQualFinSF   256 non-null    float64
 12  GrLivArea      256 non-null    float64
 13  BsmtFullBath   256 non-null    float64
 14  BsmtHalfBath   256 non-null    float64
 15  FullBath       256 non-null    float64
 16  HalfBath       256 non-null    float64
 17  BedroomAbvGr   256 non-null    float64
 18  KitchenAbv

In [29]:
prediction = model.predict(X_test)

In [30]:
prediction

array([[145552.02609464],
       [277771.67387564],
       [107445.87668319],
       [218074.00929132],
       [153666.62335651],
       [285137.3457774 ],
       [207921.60812418],
       [230631.74454155],
       [ 69299.55020238],
       [126737.89896252],
       [108249.65405367],
       [ 91239.76270442],
       [152635.59296831],
       [232637.29666675],
       [ 68019.59960972],
       [131938.94024863],
       [105217.34267679],
       [114194.57883591],
       [172712.77447598],
       [165205.66485101],
       [223408.70083864],
       [138667.28452323],
       [223378.26436351],
       [158066.49433541],
       [331671.64521355],
       [115092.89259588],
       [208680.49476845],
       [137935.72991638],
       [ 90539.03688798],
       [149353.07452492],
       [120464.2808261 ],
       [113131.30675126],
       [182199.21980593],
       [217466.191914  ],
       [162144.00469815],
       [240927.13125022],
       [229786.95109186],
       [236094.64640528],
       [1900