### Importing Preprocessed Dataset

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides any convergence or deprecation
# warnings that later cells may emit — consider a narrower filter.
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed house-price table (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='preprocessed_housePrice.csv')

In [3]:
# Preview the first rows to confirm the columns loaded as expected
df.head()

Unnamed: 0,price,resid_area,air_qual,room_num,age,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,parks,Sold,avg_dist,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_River
0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,0,4.0875,1,0,0,1
1,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,1,4.9675,0,1,0,0
2,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,46.19856,38,0.045764,0,4.9675,0,0,0,0
3,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,0,6.065,1,1,0,0
4,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,0,6.0625,0,1,0,0


### Multiple Predictor Logistic Model


In [4]:
# Predictors: every column except the 'Sold' target
x = df.drop(columns=['Sold'], errors='ignore')

In [5]:
# Check that 'Sold' is no longer among the predictor columns
x.head()

Unnamed: 0,price,resid_area,air_qual,room_num,age,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,parks,avg_dist,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_River
0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,4.0875,1,0,0,1
1,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,4.9675,0,1,0,0
2,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,46.19856,38,0.045764,4.9675,0,0,0,0
3,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,6.065,1,1,0,0
4,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,6.0625,0,1,0,0


In [6]:
# Target: the 'Sold' column
y = df.loc[:, 'Sold']

In [7]:
# Preview the first target values
y.head()

0    0
1    1
2    0
3    0
4    0
Name: Sold, dtype: int64

**Multiple Predictor Logistic Regression using sklearn library**

In [None]:
# importing Logistic regression from sklearn
from sklearn.linear_model import LogisticRegression

In [9]:
# Create the estimator with scikit-learn's default settings.
# NOTE(review): per the scikit-learn docs the defaults include an L2 penalty (C=1.0),
# so these coefficients are not expected to match an unpenalised statsmodels Logit fit.
mplr = LogisticRegression()

In [10]:
# Fit the model on all rows of x and y
mplr.fit(X=x, y=y)

LogisticRegression()

In [12]:
# Show the fitted coefficients, then (after a blank gap) the intercept
print("coefficent[beta1] of model: ", mplr.coef_, end="\n\n\n")
print("intercept[beta0] of model: ", mplr.intercept_)

coefficent[beta1] of model:  [[-0.24636683 -0.01730016 -0.11021093  0.86270716 -0.00599886  0.22817894
  -0.21023024  0.18005235 -0.09084899 -0.0070432  -0.00499283 -0.32612166
  -0.10522555 -0.09488202 -0.01512563  0.2018235 ]]


intercept[beta0] of model:  [0.01671404]


**Multiple Predictor Logistic Regression using the statsmodels library**

In [14]:
# importing library
import statsmodels.api as sm

In [15]:
# Put an explicit intercept column ('const') in front of the predictors
x_constant = sm.add_constant(x, prepend=True)

In [16]:
# Confirm the 'const' column was added
x_constant.head()

Unnamed: 0,const,price,resid_area,air_qual,room_num,age,teachers,poor_prop,n_hos_beds,n_hot_rooms,rainfall,parks,avg_dist,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_River
0,1.0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,4.0875,1,0,0,1
1,1.0,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,4.9675,0,1,0,0
2,1.0,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,46.19856,38,0.045764,4.9675,0,0,0,0
3,1.0,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,6.065,1,1,0,0
4,1.0,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,6.0625,0,1,0,0


In [17]:
import statsmodels.discrete.discrete_model as sd

In [18]:
# Fit a statsmodels Logit of the target on the predictors plus constant
sm_mplr = sd.Logit(endog=y, exog=x_constant).fit()

Optimization terminated successfully.
         Current function value: 0.556433
         Iterations 6


In [20]:
# Display the fit statistics and the per-variable coefficient table
sm_mplr.summary()

0,1,2,3
Dep. Variable:,Sold,No. Observations:,506.0
Model:,Logit,Df Residuals:,489.0
Method:,MLE,Df Model:,16.0
Date:,"Fri, 15 Jan 2021",Pseudo R-squ.:,0.1924
Time:,00:52:23,Log-Likelihood:,-281.56
converged:,True,LL-Null:,-348.64
Covariance Type:,nonrobust,LLR p-value:,9.93e-21

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.1383,2.649,-0.807,0.420,-7.331,3.054
price,-0.2741,0.033,-8.313,0.000,-0.339,-0.209
resid_area,0.0192,0.027,0.720,0.471,-0.033,0.071
air_qual,-7.4183,2.691,-2.757,0.006,-12.693,-2.144
room_num,1.1067,0.277,4.001,0.000,0.565,1.649
age,-0.0020,0.007,-0.302,0.762,-0.015,0.011
teachers,0.3150,0.064,4.937,0.000,0.190,0.440
poor_prop,-0.2077,0.034,-6.149,0.000,-0.274,-0.141
n_hos_beds,0.1760,0.071,2.467,0.014,0.036,0.316


- The bottom table lists every variable together with its coefficient.
- It also reports other statistics such as the standard error, z-statistic, p-value and 95% confidence interval.

### Confusion Matrix

In [22]:
# Predicted class probabilities for every row
mplr.predict_proba(x)
# The output is a two-column array:
# first column is for class 0, i.e. the house is not sold
# second column is for class 1, i.e. the house is sold

array([[0.12706792, 0.87293208],
       [0.39754468, 0.60245532],
       [0.9807068 , 0.0192932 ],
       ...,
       [0.28591175, 0.71408825],
       [0.28056904, 0.71943096],
       [0.16346999, 0.83653001]])

In [23]:
# Hard class predictions for every row
y_pred = mplr.predict(x)
y_pred
# output is an array of labels: 1 = sold, 0 = not sold

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,

In [24]:
# Custom decision threshold: True when the class-1 (sold) probability is at least 0.3
y_pred_03 = np.greater_equal(mplr.predict_proba(x)[:, 1], 0.3)

In [25]:
# Boolean predictions under the 0.3 threshold
y_pred_03

array([ True,  True, False,  True, False, False,  True, False, False,
        True,  True,  True, False,  True,  True,  True,  True, False,
        True,  True, False,  True, False, False,  True,  True,  True,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False, False,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True, False,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True,  True,  True,  True,  True, False, False, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False,  True,
       False, False,

In [26]:
from sklearn.metrics import confusion_matrix

In [28]:
# Confusion matrix of actual vs. predicted labels (rows = actual, columns = predicted).
# NOTE: y_pred was produced from the same x the model was fitted on, so these are
# in-sample counts rather than an estimate of out-of-sample accuracy.
confusion_matrix(y_true=y, y_pred=y_pred)

array([[196,  80],
       [ 81, 149]], dtype=int64)

- first row x first column = true negatives [196]
- first row x second column = false positives [80]
- second row x first column = false negatives [81]
- second row x second column = true positives [149]

In [29]:
# Same comparison using the predictions made with the 0.3 probability threshold
confusion_matrix(y_true=y, y_pred=y_pred_03)

array([[121, 155],
       [ 18, 212]], dtype=int64)