In [1]:
# Predict price of house based on features like sqFt, Bedrooms, Bathromms etc

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt,style
style.use('ggplot')

In [3]:
df=pd.read_csv('house.csv')

In [4]:
df.head()

Unnamed: 0,Bathrooms,Offers,Brick,Neighborhood,Bedrooms,SqFt,Price
0,2,2,No,East,2,1790,114300
1,2,3,No,East,4,2030,114200
2,2,1,No,East,3,1740,114800
3,2,3,No,East,3,1980,94700
4,3,3,No,East,3,2130,119800


In [5]:
# Build regression model
#price = m1b + m2o + m3B + m4n + m5bd + m6sqft + m7price +c

In [6]:
#EDA


In [7]:
# Data Preparation

## Feature Encoding
df_dummy = pd.get_dummies(data=df,columns=['Brick','Neighborhood'],drop_first=True)
df_dummy.head()

Unnamed: 0,Bathrooms,Offers,Bedrooms,SqFt,Price,Brick_Yes,Neighborhood_North,Neighborhood_West
0,2,2,2,1790,114300,0,0,0
1,2,3,4,2030,114200,0,0,0
2,2,1,3,1740,114800,0,0,0
3,2,3,3,1980,94700,0,0,0
4,3,3,3,2130,119800,0,0,0


In [8]:
#Separate features and outcome
X=df_dummy.drop(columns='Price')
y=df_dummy['Price']

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)


In [9]:
#Standardize data
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
st.fit(X_train[['Bathrooms','Offers','Bedrooms','SqFt']])
X_train[['Bathrooms','Offers','Bedrooms','SqFt']]=st.transform(X_train[['Bathrooms','Offers','Bedrooms','SqFt']])
X_test[['Bathrooms','Offers','Bedrooms','SqFt']]=st.transform(X_test[['Bathrooms','Offers','Bedrooms','SqFt']])

In [10]:
X_train.columns

Index(['Bathrooms', 'Offers', 'Bedrooms', 'SqFt', 'Brick_Yes',
       'Neighborhood_North', 'Neighborhood_West'],
      dtype='object')

In [11]:
# Model Building
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

In [12]:
print("Train Score :", model.score(X_train,y_train)) # score is R squared = var exp equation / model / total variance
print("Test Score :", model.score(X_test,y_test))

Train Score : 0.8645573840624607
Test Score : 0.8767270416499355


In [13]:
import joblib
#save the std Scaler model
joblib.dump(st,"StandardScaler_house.h5")
#Save the linear regression model
joblib.dump(model,"LinearReg_house.h5")


['LinearReg_house.h5']

In [14]:
model.coef_ # this will be applied as model to predict, the dump file will have these co-efficient

array([ 4308.03979651, -8359.34190396,  3825.63592392, 11075.14885147,
       17672.78996562,  2307.25853536, 21121.92343954])

In [15]:
df_newhouse=pd.read_csv('house new data for pred.csv')

In [16]:
df_newhouse.head()

Unnamed: 0,SqFt,Bedrooms,Bathrooms,Offers,Brick,Neighborhood
0,1520,2,1,1,Yes,East
1,1650,3,2,1,No,East
2,1800,3,2,1,Yes,North


In [17]:
## Feature Encoding
df_newdummy = pd.get_dummies(data=df_newhouse,columns=['Brick','Neighborhood'],drop_first=True)
#df_newdummy.head()
df_newdummy.columns

Index(['SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Brick_Yes',
       'Neighborhood_North'],
      dtype='object')

In [18]:
model_columns = ['Bathrooms', 'Offers', 'Bedrooms', 'SqFt', 'Brick_Yes',
       'Neighborhood_North', 'Neighborhood_West']

In [19]:
set(model_columns)-set(df_newdummy.columns)

{'Neighborhood_West'}

In [20]:
for i in set(model_columns)-set(df_newdummy.columns):
    df_newdummy[i]=0

In [21]:
sc = joblib.load("StandardScaler_house.h5")
model_newhouse = joblib.load("LinearReg_house.h5")

In [22]:
data=df_newdummy[model_columns]
data[['Bathrooms', 'Offers', 'Bedrooms', 'SqFt']]=sc.transform(data[['Bathrooms', 'Offers', 'Bedrooms', 'SqFt']])

In [23]:
model_newhouse.predict(data)


array([105334.94771259, 108225.29443868, 136008.59120158])

In [24]:
model_newhouse.coef_

array([ 4308.03979651, -8359.34190396,  3825.63592392, 11075.14885147,
       17672.78996562,  2307.25853536, 21121.92343954])

In [25]:
# Model Evaluation Techniques
y_train_pred=model_newhouse.predict(X_train)
y_test_pred=model_newhouse.predict(X_test)

In [26]:
from sklearn import metrics
#Mean squared Error on train data set
metrics.mean_squared_error(y_train,y_train_pred)

99296801.31931134

In [27]:
#Root Mean squared Error on train data set
np.sqrt(metrics.mean_squared_error(y_train,y_train_pred))

9964.778036630387

In [28]:
metrics.mean_squared_error(y_test,y_test_pred)

79836245.27037403

In [29]:
metrics.mean_absolute_error(y_train,y_train_pred)

7783.679896648994

In [30]:
metrics.mean_absolute_error(y_test,y_test_pred)

6976.521935732076

In [31]:
y_train_pred.astype(str)

array(['126306.09691814778', '138533.77942300896', '140929.28476429492',
       '127951.78934301929', '130830.14669496026', '169793.37092429015',
       '135983.25624022444', '139943.9312757453', '141576.27811341424',
       '130998.6575191214', '158200.6923943954', '100688.7091309166',
       '109532.39049441455', '97540.24856168855', '148857.93209420494',
       '145347.15415451638', '115621.13671470691', '138639.4186072328',
       '164869.85774464643', '101818.231796092', '94672.50285357524',
       '184103.29736265255', '118985.3779722939', '102896.35213241793',
       '123032.9220834129', '165136.52069878974', '123946.57838667555',
       '124208.68109433536', '110141.69660879596', '116688.73108075898',
       '82443.4426183888', '95055.41663577006', '117599.718323249',
       '138129.72802671263', '119642.89729168707', '108187.6072039453',
       '104586.36197034408', '145101.62881447456', '125669.7152128561',
       '112763.91697686745', '132988.7337337858', '109365.34307413247

In [32]:
# Print observed valyes vs predicted

pd.DataFrame({'Observed' : y_train, "Predicted":y_train_pred})

Unnamed: 0,Observed,Predicted
91,116500,126306.096918
16,147100,138533.779423
90,143100,140929.284764
118,150200,127951.789343
13,126300,130830.146695
...,...,...
67,151900,126454.011331
64,130300,132341.740385
117,117800,116648.374785
47,90300,91691.089705


In [33]:
# Polynomial Features

from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(degree=2)
X_poly = pf.fit_transform(X)



In [34]:

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_poly,y,test_size=0.2,random_state=0)

In [35]:
# Model Building
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

In [36]:
print("Train Score :", model.score(X_train,y_train)) # score is R squared = var exp equation / model / total variance
print("Test Score :", model.score(X_test,y_test))

Train Score : 0.8837479350504942
Test Score : 0.7830807188981486


In [37]:
#Model is overfitted, lets try to regularizeing it with Ridge/Lasso

from sklearn.linear_model import Lasso
lassoModel=Lasso(alpha=1.2,max_iter=20000)
lassoModel.fit(X_train,y_train)
print("Train Score :", lassoModel.score(X_train,y_train)) # score is R squared = var exp equation / model / total variance
print("Test Score :", lassoModel.score(X_test,y_test))

Train Score : 0.9012127558763926
Test Score : 0.784928765969314


In [38]:
help(Lasso)

Help on class Lasso in module sklearn.linear_model._coordinate_descent:

class Lasso(ElasticNet)
 |  Lasso(alpha=1.0, *, fit_intercept=True, precompute=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')
 |  
 |  Linear Model trained with L1 prior as regularizer (aka the Lasso).
 |  
 |  The optimization objective for Lasso is::
 |  
 |      (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
 |  
 |  Technically the Lasso model is optimizing the same objective function as
 |  the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty).
 |  
 |  Read more in the :ref:`User Guide <lasso>`.
 |  
 |  Parameters
 |  ----------
 |  alpha : float, default=1.0
 |      Constant that multiplies the L1 term, controlling regularization
 |      strength. `alpha` must be a non-negative float i.e. in `[0, inf)`.
 |  
 |      When `alpha = 0`, the objective is equivalent to ordinary least
 |      squares, solved by the :class:`LinearR

In [39]:
#Hyperparameter Tunning
# y = m1x1+m2x2+c
# alpha value tuning 

from sklearn.model_selection import GridSearchCV # alpha = [0,1,10,20,100] compare model with all these alpha value for Lasso/Regres
#CV is the cross validation


In [40]:
parameter_grid = {'alpha' : [0,5,2,10,20,100,0.9,1.2,1000,458]}

In [41]:
model = Lasso(max_iter=200000)
gridModel = GridSearchCV(model,param_grid=parameter_grid,cv=2)

In [42]:
gridModel.fit(X_train,y_train)

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [43]:
gridModel.best_estimator_

In [44]:
# Evaluate the model on train and test
print("Train Score:",gridModel.score(X_train,y_train))
print("Test Score:",gridModel.score(X_test,y_test))

Train Score: 0.8639035464068743
Test Score: 0.8791654412820219


In [45]:
#Cross Validation -X_train, y_train are cv

from sklearn.linear_model import Ridge
ridgeModel=Ridge(alpha=0.5)

rid_gridModel = GridSearchCV(ridgeModel,param_grid=parameter_grid,cv=2)

In [46]:
rid_gridModel.fit(X_train,y_train)

In [47]:
rid_gridModel.best_estimator_

In [48]:
# Evaluate the model on train and test
print("Train Score:",rid_gridModel.score(X_train,y_train))
print("Test Score:",rid_gridModel.score(X_test,y_test))

Train Score: 0.8642797973118264
Test Score: 0.8788366293854271


In [49]:
Multicollinearity

NameError: name 'Multicollinearity' is not defined