In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from scipy import stats
from sklearn import metrics as m
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from statsmodels import api as sm
from collections import namedtuple
%matplotlib notebook

In [2]:
# Silence every warning category for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
def r2(x):
    """Return ``x`` rounded to two decimal places (used for every printed metric)."""
    return round(x, 2)

In [4]:
# Load the training data and discard the first column by position.
# NOTE(review): presumably that column is a row identifier — confirm against train.csv.
hd = pd.read_csv("train.csv").iloc[:,1:]

In [5]:
# Preview the first rows of the loaded frame.
hd.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# (rows, columns) of the frame as loaded.
hd.shape

(1460, 80)

In [7]:
# Per-column dtype and non-null counts; used to spot sparsely populated columns.
hd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [8]:
# Fraction of rows used to derive the non-null threshold for keeping a column.
pc = 0.80

In [9]:
# Despite the name, this value is passed to dropna(thresh=...) further down, i.e. it is the
# minimum number of NON-null entries a column must have to be kept (just over `pc` of all rows).
# The result is a float because `pc` is a float.
min_null = pc * hd.shape[0] + 1

In [10]:
# Display the computed threshold.
min_null

1169.0

In [11]:
# Drop every column (axis=1) that has fewer than `min_null` non-null entries.
# The frame is modified in place, so `hd` keeps its identity.
hd.dropna(axis=1, thresh=min_null, inplace=True)

In [12]:
# Shape after dropping the sparsely populated columns.
hd.shape

(1460, 75)

In [13]:
# Build the correlation matrix once, over numeric columns only: recent pandas releases
# no longer skip object columns silently in DataFrame.corr().
corr_matrix = hd.select_dtypes("number").corr()
sale_price_corr = corr_matrix["SalePrice"]

# Features strongly correlated with SalePrice; the `< 1` bound excludes SalePrice itself.
sale_price_corr[(sale_price_corr >= 0.65) & (sale_price_corr < 1)]

OverallQual    0.790982
GrLivArea      0.708624
Name: SalePrice, dtype: float64

In [14]:
# Independent NumPy copies of the predictor and the target taken from the current frame;
# later changes to `hd` do not affect these arrays.
gla = hd["GrLivArea"].to_numpy(copy=True)
sp = hd["SalePrice"].to_numpy(copy=True)

In [15]:
fig, ax = plt.subplots(figsize=(8,6))
sns.histplot(x=sp, ax=ax)
# Second y-axis so the KDE (density scale) can overlay the histogram (count scale).
ax2 = ax.twinx()
sns.kdeplot(x=sp, color="purple", ax=ax2)
# Label the primary axes explicitly: after twinx() the pyplot "current axes" is the twin,
# whose x-axis is hidden, so a plt.xlabel() call would not be rendered.
ax.set_title("Distribution of Sales Price")
ax.set_xlabel("Sales Price")
plt.show()
# Mean above median together with positive skewness indicates a right-skewed target.
print(
    "Mean Sales Price: {}\nMedian Sales Price: {}\nSkewness: {}".format(
        r2(np.mean(sp)), r2(np.median(sp)), r2(stats.skew(sp))
    )
)

<IPython.core.display.Javascript object>

Mean Sales Price: 180921.2
Median Sales Price: 163000.0
Skewness: 1.88


In [16]:
# Scatter of living area against sale price, with the Pearson coefficient printed alongside.
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x=gla, y=sp, ax=ax)
ax.set_xlabel("GrLivArea")
ax.set_ylabel("Sale Price")
ax.set_title("Relationship between GrLivArea & Sale Price")
pearson_r = stats.pearsonr(gla, sp)[0]  # element 0 is the coefficient
print("Pearson Corr Coeff: {}".format(r2(pearson_r)))
plt.show()

<IPython.core.display.Javascript object>

Pearson Corr Coeff: 0.71


# Simple Linear Regression

In [17]:
# Single-feature baseline: predict SalePrice from GrLivArea only (no outlier filtering yet).
x, y = gla, sp

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=112
)

# The estimator needs a 2-D feature matrix, hence the added column axis on the 1-D arrays.
slm = LinearRegression().fit(X_train[:, np.newaxis], y_train)

ypred = slm.predict(X_test[:, np.newaxis])

In [18]:
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x=x, y=y, ax=ax)
# Fitted line evaluated on every observation (train and test rows alike).
fitted = slm.predict(x.reshape(-1,1))
sns.lineplot(x=x, y=fitted, color="red", ax=ax)
ax.set_title("Relationship Between GrLivArea vs Sale Price")
ax.set_xlabel("GrLivArea")
ax.set_ylabel("Sale Price")
plt.show()
# R^2 is computed on the held-out test split only.
test_r2 = m.r2_score(y_test, ypred)
print("Variability Explained by the Model: {}".format(r2(test_r2)))

<IPython.core.display.Javascript object>

Variability Explained by the Model: 0.52


# Simple Linear Regression - Outliers Removed

In [19]:
# Box plot of the target to visualise points beyond the whiskers.
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(y=y, ax=ax)
ax.set_title("Distribution of Sale Price")
ax.set_ylabel("Sale Price")
plt.show()

<IPython.core.display.Javascript object>

In [20]:
# Tukey fences (1.5 * IQR) for SalePrice. `y` still holds the full, unfiltered SalePrice
# array at this point. Series.quantile is used instead of
# np.percentile(..., interpolation=...) because that NumPy keyword is deprecated in favour
# of `method`; the midpoint rule itself is unchanged.
sale_price_all = pd.Series(y)
q1 = sale_price_all.quantile(0.25, interpolation="midpoint")
q3 = sale_price_all.quantile(0.75, interpolation="midpoint")

iqr = q3 - q1

upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr

# Collect row LABELS (not positions) so the label-based drop below is correct whatever
# index `hd` currently carries.
upper_index = hd.index[hd.SalePrice >= upper]
lower_index = hd.index[hd.SalePrice <= lower]

# reset_index() keeps the old labels as an `index` column, as before; a later cell drops
# `index`/`level_0` by name.
# NOTE(review): prefer reset_index(drop=True) and update that later drop in the same change,
# so row labels never reach the first OLS fit as features.
hd = hd.drop(upper_index.union(lower_index)).reset_index()

In [21]:
# First quartile used for the fences.
q1

129950.0

In [22]:
# Third quartile used for the fences.
q3

214000.0

In [23]:
# Inter-quartile range (q3 - q1).
iqr

84050.0

In [24]:
# Same box plot as before, now drawn from the filtered frame.
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(y=hd.SalePrice, ax=ax)
ax.set_title("Distribution of Sale Price - Outliers Removed")
ax.set_ylabel("Sale Price")
plt.show()

<IPython.core.display.Javascript object>

In [25]:
# Shape after removing SalePrice outliers; the column count grows by one because
# reset_index kept the old row labels as a column.
hd.shape

(1399, 76)

In [26]:
# Box plot of the predictor. `x` is still the unfiltered GrLivArea array here.
# Labels now spell the column name as it appears in the data ("GrLivArea").
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(y=x, ax=ax)
ax.set_title("Distribution of GrLivArea")
ax.set_ylabel("GrLivArea")
plt.show()

<IPython.core.display.Javascript object>

In [27]:
# Tukey fences (1.5 * IQR) for GrLivArea.
# NOTE(review): `x` is still the full, unfiltered GrLivArea array, so the fences are derived
# from rows that include the SalePrice outliers already removed from `hd` — confirm intended.
# Series.quantile replaces np.percentile(..., interpolation=...), whose keyword is deprecated
# in favour of `method`; the midpoint rule itself is unchanged.
living_area_all = pd.Series(x)
q1 = living_area_all.quantile(0.25, interpolation="midpoint")
q3 = living_area_all.quantile(0.75, interpolation="midpoint")

iqr = q3 - q1

upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr

# Collect row LABELS (not positions) so the label-based drop below is correct whatever
# index `hd` currently carries.
upper_index = hd.index[hd.GrLivArea >= upper]
lower_index = hd.index[hd.GrLivArea <= lower]

# reset_index() again keeps the old labels as a column (named `level_0` when an `index`
# column already exists); a later cell drops `index`/`level_0` by name.
# NOTE(review): prefer reset_index(drop=True) and update that later drop in the same change.
hd = hd.drop(upper_index.union(lower_index)).reset_index()

In [28]:
# Box plot of GrLivArea from the filtered frame.
# Labels now spell the column name as it appears in the data ("GrLivArea").
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(y=hd.GrLivArea, ax=ax)
ax.set_title("Distribution of GrLivArea - Outliers Removed")
ax.set_ylabel("GrLivArea")
plt.show()

<IPython.core.display.Javascript object>

In [29]:
# Refit the single-feature baseline on the outlier-filtered frame.
x = hd["GrLivArea"].to_numpy()
y = hd["SalePrice"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=112
)

# The estimator needs a 2-D feature matrix, hence the added column axis on the 1-D arrays.
slm = LinearRegression().fit(X_train[:, np.newaxis], y_train)

ypred = slm.predict(X_test[:, np.newaxis])

In [30]:
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x=x, y=y, ax=ax)
# Fitted line evaluated on every remaining observation (train and test rows alike).
fitted = slm.predict(x.reshape(-1,1))
sns.lineplot(x=x, y=fitted, color="red", ax=ax)
ax.set_title("Relationship Between GrLivArea vs Sale Price - (SP Outliers Removed)")
ax.set_xlabel("GrLivArea")
ax.set_ylabel("Sale Price")
plt.show()
# R^2 is computed on the held-out test split only.
test_r2 = m.r2_score(y_test, ypred)
print("Variability Explained by the Model: {}".format(r2(test_r2)))

<IPython.core.display.Javascript object>

Variability Explained by the Model: 0.49


In [31]:
# Keep the rounded test-set R^2 of the simple linear model for the final comparison.
slm_r2 = r2(m.r2_score(y_test, ypred))

# Polynomial Regression - Outliers Removed

In [32]:
# Feature expander producing polynomial terms up to degree 3.
poly = PolynomialFeatures(degree=3)

In [33]:
# Expand the single GrLivArea column (reshaped to 2-D) into its polynomial terms.
x_poly = poly.fit_transform(x.reshape(-1,1))

In [34]:
# Same split settings (test_size, random_state) as the simple linear model above.
X_train, X_test, y_train, y_test = train_test_split(x_poly, y, test_size=0.25, random_state=112)

In [35]:
# Ordinary linear regression, here fitted on the polynomial terms.
plm = LinearRegression()

In [36]:
# Fit on the training split of the expanded features.
plm.fit(X_train,y_train)

LinearRegression()

In [37]:
# Predictions for the held-out rows.
ypred = plm.predict(X_test)

In [38]:
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x=x, y=y, ax=ax)
# Polynomial fit evaluated on every observation via the pre-expanded feature matrix.
fitted = plm.predict(x_poly)
sns.lineplot(x=x, y=fitted, color="red", ax=ax)
ax.set_title("Relationship Between GrLivArea vs Sale Price - (SP Outliers Removed)")
ax.set_xlabel("GrLivArea")
ax.set_ylabel("Sale Price")
plt.show()
# R^2 is computed on the held-out test split only.
test_r2 = m.r2_score(y_test, ypred)
print("Variability Explained by the Model: {}".format(r2(test_r2)))

<IPython.core.display.Javascript object>

Variability Explained by the Model: 0.5


In [39]:
# Keep the rounded test-set R^2 of the polynomial model for the final comparison.
poly_r2 = r2(m.r2_score(y_test, ypred))

# Multiple Linear Regression

In [40]:
# Number of int/float-typed columns currently in the frame.
len(hd.select_dtypes(["int","float"]).columns)

39

In [41]:
# Numeric columns only, minus the calendar fields, keeping rows without any missing value.
# The `index`/`level_0` columns created by the earlier reset_index calls are numeric too,
# so they are still present here.
calendar_columns = ['YearBuilt', 'YearRemodAdd','YrSold','MoSold']
numeric_frame = hd.select_dtypes(["int","float"])
numerical_columns = numeric_frame.drop(columns=calendar_columns).dropna(how="any")

In [42]:
# Split by position: the last column is the response (SalePrice), everything before it a feature.
y = numerical_columns.iloc[:, -1]
# statsmodels does not add an intercept on its own, so prepend a constant column.
X = sm.add_constant(numerical_columns.iloc[:, :-1])

model = sm.OLS(y, X)
results = model.fit()

In [43]:
# Coefficient table and fit diagnostics for the model with all numeric features.
results.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.871
Model:,OLS,Adj. R-squared:,0.867
Method:,Least Squares,F-statistic:,216.0
Date:,"Wed, 25 May 2022",Prob (F-statistic):,0.0
Time:,14:37:44,Log-Likelihood:,-11986.0
No. Observations:,1053,AIC:,24040.0
Df Residuals:,1020,BIC:,24200.0
Df Model:,32,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6.731e+05,8.48e+04,-7.934,0.000,-8.4e+05,-5.07e+05
level_0,-240.8874,236.044,-1.021,0.308,-704.075,222.300
index,228.8867,225.558,1.015,0.310,-213.724,671.497
MSSubClass,-76.6517,20.950,-3.659,0.000,-117.762,-35.541
LotFrontage,103.4883,39.903,2.593,0.010,25.187,181.790
LotArea,1.0824,0.190,5.706,0.000,0.710,1.455
OverallQual,1.548e+04,880.380,17.586,0.000,1.38e+04,1.72e+04
OverallCond,5332.7001,705.951,7.554,0.000,3947.417,6717.983
MasVnrArea,11.7936,4.596,2.566,0.010,2.776,20.812

0,1,2,3
Omnibus:,67.245,Durbin-Watson:,2.046
Prob(Omnibus):,0.0,Jarque-Bera (JB):,217.048
Skew:,-0.242,Prob(JB):,7.39e-48
Kurtosis:,5.171,Cond. No.,1.09e+16


In [44]:
# Features removed before the final fits. `level_0` and `index` are the row-label columns
# left behind by the earlier reset_index calls.
# NOTE(review): the remaining names were presumably chosen from the first OLS summary — confirm.
pruned_columns = [
    "level_0", "index", "BsmtFinSF2", "BsmtUnfSF",
    "LowQualFinSF", "BsmtHalfBath", "HalfBath",
    "TotRmsAbvGrd", "GarageArea", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]
numerical_columns = numerical_columns.drop(columns=pruned_columns)

In [45]:
# Refit OLS on the pruned feature set; the last column is again the response.
features = numerical_columns.iloc[:, :-1]
y = numerical_columns.iloc[:, -1]

# Prepend the constant column that serves as the intercept term in statsmodels.
X = sm.add_constant(features)

model = sm.OLS(y, X)
results = model.fit()

In [46]:
# Coefficient table and fit diagnostics for the pruned model.
results.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.87
Model:,OLS,Adj. R-squared:,0.867
Method:,Least Squares,F-statistic:,344.8
Date:,"Wed, 25 May 2022",Prob (F-statistic):,0.0
Time:,14:37:44,Log-Likelihood:,-11992.0
No. Observations:,1053,AIC:,24030.0
Df Residuals:,1032,BIC:,24130.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.342e+05,7.85e+04,-9.352,0.000,-8.88e+05,-5.8e+05
MSSubClass,-77.0169,20.772,-3.708,0.000,-117.778,-36.256
LotFrontage,104.2244,39.596,2.632,0.009,26.527,181.922
LotArea,1.1300,0.188,5.996,0.000,0.760,1.500
OverallQual,1.556e+04,871.478,17.853,0.000,1.38e+04,1.73e+04
OverallCond,5260.7153,691.387,7.609,0.000,3904.030,6617.400
MasVnrArea,13.4503,4.554,2.954,0.003,4.515,22.386
BsmtFinSF1,15.4862,2.358,6.567,0.000,10.859,20.114
TotalBsmtSF,18.5640,3.210,5.783,0.000,12.265,24.863

0,1,2,3
Omnibus:,71.018,Durbin-Watson:,2.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,225.922
Skew:,-0.276,Prob(JB):,8.74e-50
Kurtosis:,5.201,Cond. No.,1280000.0


In [47]:
# X here still carries the `const` column added by sm.add_constant above, so the
# scikit-learn model below is given a constant feature in addition to its own intercept.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=112)

In [48]:
# Multivariate linear regression with default settings.
mvlm = LinearRegression()

In [49]:
# Fit on the training split.
mvlm.fit(X_train,y_train)

LinearRegression()

In [50]:
# Predictions for the held-out rows.
ypred = mvlm.predict(X_test)

In [51]:
# Rounded test-set R^2 of the multivariate model, kept for the final comparison.
mvlm_r2 = r2(m.r2_score(y_test,ypred))

In [52]:
# Display the score.
mvlm_r2

0.87

# Support Vector Machine

In [53]:
# Support vector regressor with an RBF kernel; all other hyper-parameters are left at defaults.
svr_model = SVR(kernel="rbf")

In [54]:
# Separate scalers for features and target, so predictions can be mapped back to the
# original target scale with y_sc.
X_sc = StandardScaler()
y_sc = StandardScaler()

In [55]:
# Rebuild X/y as plain arrays straight from numerical_columns (no `const` column this time);
# the last column is the response.
X = numerical_columns.iloc[:,:-1].values
y = numerical_columns.iloc[:,-1].values

In [56]:
# Same test fraction and seed as the other models.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=112)

In [57]:
# Learn the scaling statistics from the training split only and reuse them for the test
# split; re-fitting on X_test would standardise it with its own mean/variance, so the model
# would be evaluated on features scaled differently from the ones it was trained on.
# NOTE: this cell overwrites its inputs, so re-running it without re-running the split
# above scales the data twice.
X_train = X_sc.fit_transform(X_train)
X_test = X_sc.transform(X_test)
# The scaler needs a 2-D column; flatten the result again because SVR expects a 1-D target.
y_train = y_sc.fit_transform(y_train.reshape(-1,1)).ravel()

In [58]:
# Fit on the standardised training features and target.
svr_model.fit(X_train,y_train)

SVR()

In [59]:
# predict() returns a 1-D array, whereas the scaler's inverse_transform operates on a 2-D
# column. Reshape for the call, then flatten so ypred lines up with the 1-D y_test.
ypred = y_sc.inverse_transform(svr_model.predict(X_test).reshape(-1,1)).ravel()

In [60]:
# Rounded test-set R^2 of the SVR model (predictions are back on the original price scale).
svr_r2 = r2(m.r2_score(y_test,ypred))

In [61]:
# Display the score.
svr_r2

0.85

# Decision Tree

In [62]:
# Fresh, unscaled feature matrix and target; the last column is the response.
X = numerical_columns.iloc[:,:-1].values
y = numerical_columns.iloc[:,-1].values

In [63]:
# Decision tree with a fixed seed so repeated runs build the same tree.
dt_model = DecisionTreeRegressor(random_state=112)

In [64]:
# Same test fraction and seed as the other models.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=112)

In [65]:
# Fit on the training split (no feature scaling applied for the tree).
dt_model.fit(X_train,y_train)

DecisionTreeRegressor(random_state=112)

In [66]:
# Predictions for the held-out rows.
ypred = dt_model.predict(X_test)

In [67]:
# Rounded test-set R^2 of the decision tree, kept for the final comparison.
dt_r2 = r2(m.r2_score(y_test,ypred))

In [68]:
# Display the score.
dt_r2

0.72

# Random Forest

In [69]:
# Fresh, unscaled feature matrix and target; the last column is the response.
X = numerical_columns.iloc[:,:-1].values
y = numerical_columns.iloc[:,-1].values

In [70]:
# Random forest of 100 trees with a fixed seed for reproducible results.
forest_model = RandomForestRegressor(n_estimators=100 ,random_state=112)

In [71]:
# Same test fraction and seed as the other models.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=112)

In [72]:
# Fit on the training split (no feature scaling applied for the forest).
forest_model.fit(X_train,y_train)

RandomForestRegressor(random_state=112)

In [73]:
# Predictions for the held-out rows.
ypred = forest_model.predict(X_test)

In [74]:
# Rounded test-set R^2 of the random forest, kept for the final comparison.
forest_r2 = r2(m.r2_score(y_test,ypred))

In [75]:
# Display the score.
forest_r2

0.85

In [76]:
# Record type holding one R^2 score per fitted model, in the order the models were built.
score_fields = [
    "Simple_Linear",
    "Polynomial_Regress",
    "Multivariate_Linear",
    "Support_Vector",
    "Decision_Tree",
    "Random_Forest",
]
regress_model_score = namedtuple("Regression_Model_Scores", score_fields)

In [77]:
# The name `regress_model_score` first holds the namedtuple class and is then rebound to an
# instance of it. Resolve the class from whichever is currently bound, so re-running this
# cell does not fail with "object is not callable".
score_record_type = (
    regress_model_score
    if isinstance(regress_model_score, type)
    else type(regress_model_score)
)
regress_model_score = score_record_type(slm_r2,poly_r2,mvlm_r2,svr_r2,dt_r2,forest_r2)

In [78]:
# Side-by-side test-set R^2 of all fitted models.
regress_model_score

Regression_Model_Scores(Simple_Linear=0.49, Polynomial_Regress=0.5, Multivariate_Linear=0.87, Support_Vector=0.85, Decision_Tree=0.72, Random_Forest=0.85)