# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')

# Get 10 most null valued columns

In [3]:
train_dataset.isnull().sum().nlargest(10)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
dtype: int64

In [4]:
test_dataset.isnull().sum().nlargest(10)

PoolQC          1456
MiscFeature     1408
Alley           1352
Fence           1169
FireplaceQu      730
LotFrontage      227
GarageYrBlt       78
GarageFinish      78
GarageQual        78
GarageCond        78
dtype: int64

# Delete the columns which have more than 50% null values

In [5]:
columns_with_more_nan_values = ['PoolQC' ,'MiscFeature' ,'Alley', 'Fence', 'FireplaceQu']
train_dataset = train_dataset.drop(columns_with_more_nan_values, axis = 1)
test_dataset = test_dataset.drop(columns_with_more_nan_values, axis = 1)
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

# Get the categorical features

In [6]:
categorical_features = train_dataset.select_dtypes(object).columns.values
categorical_features

array(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'], dtype=object)

In [7]:
categorical_data_headers = ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition']

# Fill the null values in categorical data with missing

In [8]:
from sklearn.impute import SimpleImputer
# Declare Simple Imputer
cat_impute = SimpleImputer(strategy = 'constant', fill_value = 'missing')

# Get Mean for all features
for i in categorical_data_headers:
  cat_impute.fit(train_dataset[[i]])
# Tranform Train
  train_dataset[[i]] = cat_impute.transform(train_dataset[[i]])

# Tranform Predict
  test_dataset[[i]] = cat_impute.transform(test_dataset[[i]])

In [9]:
train_dataset.info()   # chake again no of null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [10]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1459 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearBuil

# Now get numerical columns which have null values
These are only numerical columns because we already have fii´lled categorical columns

In [11]:
train_dataset.columns[train_dataset.isnull().any()]


Index(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], dtype='object')

In [12]:
test_dataset.columns[test_dataset.isnull().any()]

Index(['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea'],
      dtype='object')

# Replace that null values with median

In [13]:
# Declare Simple Imputer
num_impute = SimpleImputer(strategy = 'median')
# Get Mean for all features for training data
num_impute.fit(train_dataset[['LotFrontage', 'MasVnrArea', 'GarageYrBlt']])
train_dataset[['LotFrontage', 'MasVnrArea', 'GarageYrBlt']] = num_impute.transform(train_dataset[['LotFrontage', 'MasVnrArea', 'GarageYrBlt']])

In [14]:
#for test data
num_impute.fit(test_dataset[['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea']])
test_dataset[['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea']] = num_impute.transform(test_dataset[['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea']])

In [15]:
train_dataset.columns[train_dataset.isnull().any()]

Index([], dtype='object')

In [16]:
test_dataset.columns[test_dataset.isnull().any()]

Index([], dtype='object')

## Encoding categorical data

In [17]:
train_dataset = pd.get_dummies(train_dataset, columns=categorical_data_headers, prefix=categorical_data_headers)

In [18]:
test_dataset = pd.get_dummies(test_dataset, columns=categorical_data_headers, prefix=categorical_data_headers)

In [19]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 283 entries, Id to SaleCondition_Partial
dtypes: float64(3), int64(35), uint8(245)
memory usage: 782.9 KB


In [20]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 272 entries, Id to SaleCondition_Partial
dtypes: float64(11), int64(26), uint8(235)
memory usage: 756.7 KB


# X_train,y_train, X_test

In [21]:
X = train_dataset.drop(['Id', 'SalePrice'], axis = 1)
y = train_dataset['SalePrice']
X_test_pred = test_dataset.drop(['Id'], axis = 1)

In [22]:
X, X_test_pred = X.align(X_test_pred, join='right', axis=1)

In [23]:
XTRAIN_impute = SimpleImputer(strategy = 'constant', fill_value = 0)
missing = ['MSZoning_missing', 'Utilities_missing', 'Exterior1st_missing',
       'Exterior2nd_missing', 'KitchenQual_missing', 'Functional_missing',
       'SaleType_missing']
# Get Mean for all features
for i in missing:
  XTRAIN_impute.fit(X[[i]])
# Tranform Train
  X[[i]] = XTRAIN_impute.transform(X[[i]])

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

check again for null values

In [25]:
X_test_pred.columns[X_test.isnull().any()]

Index([], dtype='object')

In [26]:
X_train.columns[X_train.isnull().any()]

Index([], dtype='object')

we fill this missing values with 0s because we encoded this columns so 0 means none 

Both X_train and X_test should have same columns

In [27]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 618 to 684
Columns: 271 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(10), int64(33), uint8(228)
memory usage: 661.6 KB


In [28]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 292 entries, 529 to 61
Columns: 271 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(10), int64(33), uint8(228)
memory usage: 165.4 KB


In [29]:
X_train.columns[X_train.isnull().any()]

Index([], dtype='object')

## Training the Multiple Linear Regression model on the Training set

In [36]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.5-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.5


In [37]:
from catboost import CatBoostRegressor
regressor = CatBoostRegressor()
regressor.fit(X_train,y_train)

Learning rate set to 0.04196
0:	learn: 76651.1991765	total: 70.6ms	remaining: 1m 10s
1:	learn: 74497.6308046	total: 80.9ms	remaining: 40.4s
2:	learn: 72553.4406477	total: 98.1ms	remaining: 32.6s
3:	learn: 70676.7552097	total: 117ms	remaining: 29.2s
4:	learn: 69167.5601381	total: 127ms	remaining: 25.3s
5:	learn: 67452.9820391	total: 134ms	remaining: 22.3s
6:	learn: 65924.5290063	total: 140ms	remaining: 19.9s
7:	learn: 64364.6019119	total: 147ms	remaining: 18.3s
8:	learn: 62829.1441035	total: 155ms	remaining: 17.1s
9:	learn: 61313.4414466	total: 161ms	remaining: 15.9s
10:	learn: 59972.6944542	total: 168ms	remaining: 15.1s
11:	learn: 58558.8017804	total: 176ms	remaining: 14.5s
12:	learn: 57219.0323221	total: 185ms	remaining: 14s
13:	learn: 55945.0131704	total: 219ms	remaining: 15.4s
14:	learn: 54736.8123375	total: 235ms	remaining: 15.4s
15:	learn: 53475.4213551	total: 245ms	remaining: 15.1s
16:	learn: 52265.8925793	total: 251ms	remaining: 14.5s
17:	learn: 51066.0373743	total: 259ms	remain

<catboost.core.CatBoostRegressor at 0x7fd721383c10>

## Predicting the Test set results

In [38]:
y_pred = regressor.predict(X_test).astype(int)

In [39]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.8857162519942979

In [40]:
y_test_pred = regressor.predict(X_test_pred)
np.set_printoptions(precision=1)
print(y_test_pred)

[121916.3 166268.5 186545.7 ... 170433.7 112249.6 219672.2]


In [41]:
my_submission = pd.DataFrame({'Id': test_dataset['Id'], 'SalePrice': y_test_pred})
# you could use any filename. We choose submission here
print(my_submission)
my_submission.to_csv('submission_new.csv', index=False)

        Id      SalePrice
0     1461  121916.272905
1     1462  166268.511921
2     1463  186545.682427
3     1464  193951.601007
4     1465  181572.473390
...    ...            ...
1454  2915   76529.713539
1455  2916   80453.239701
1456  2917  170433.721106
1457  2918  112249.553673
1458  2919  219672.218727

[1459 rows x 2 columns]
