In [67]:
import pandas as pd

In [68]:
# Load the raw house-sales table; the path is relative to the notebook's working directory.
DATA_PATH = "house_price_prediction.csv"
df = pd.read_csv(DATA_PATH)

In [69]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17210 entries, 0 to 17209
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             17210 non-null  int64  
 1   date           17210 non-null  object 
 2   price          17210 non-null  int64  
 3   bedrooms       17210 non-null  int64  
 4   sqft_living    17210 non-null  int64  
 5   sqft_lot       17210 non-null  int64  
 6   floors         17210 non-null  float64
 7   waterfront     17210 non-null  int64  
 8   condition      17210 non-null  int64  
 9   grade          17210 non-null  int64  
 10  sqft_above     17210 non-null  int64  
 11  sqft_basement  17210 non-null  int64  
 12  yr_built       17210 non-null  int64  
 13  yr_renovated   17210 non-null  int64  
 14  zipcode        17210 non-null  int64  
 15  lat            17210 non-null  float64
 16  long           17210 non-null  float64
 17  sqft_living15  17210 non-null  int64  
 18  sqft_l

In [70]:
# Peek at the first 20 rows to see the raw value formats (note the mixed date layouts).
df.head(20)

Unnamed: 0,id,date,price,bedrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,1702900664,4/16/2015,479000,2,1730,1037,3.5,0,3,8,1730,0,2008,0,98118,47.5594,-122.285,1280,1026
1,8673400177,04-02-2015,525000,3,1730,1074,3.5,0,3,8,1730,0,2006,0,98107,47.6692,-122.392,1370,1185
2,1972202010,08-01-2014,435000,3,1440,1350,3.5,0,3,8,1440,0,2005,0,98103,47.6525,-122.345,1440,1350
3,1972200428,6/25/2014,563500,3,1400,1312,3.5,0,3,8,1400,0,2007,0,98103,47.6534,-122.355,1350,1312
4,1972200426,9/18/2014,525000,2,1310,1268,3.5,0,3,8,1310,0,2007,0,98103,47.6534,-122.355,1350,1288
5,3180100023,1/30/2015,544000,3,1760,1755,3.5,0,3,8,1760,0,1998,0,98105,47.6688,-122.279,1700,1721
6,2559950110,4/22/2015,1230000,2,2470,609,3.0,0,3,11,1910,560,2011,0,98112,47.6182,-122.312,2440,1229
7,6169901185,5/20/2014,490000,5,4460,2975,3.0,0,3,10,3280,1180,2015,0,98119,47.6313,-122.37,2490,4231
8,2770603522,12-11-2014,585000,3,2160,1250,3.0,0,3,8,1830,330,2010,0,98119,47.6515,-122.375,1870,2825
9,1294300038,07-11-2014,450000,3,1810,914,3.0,0,3,8,1380,430,2008,0,98116,47.5732,-122.387,1810,914


# Feature Engineering

# Create new features from existing ones
# -- For this dataset we plan two new features: the age of the house and the average price for its zipcode.
# Age of house = year of sale (taken from the `date` column) - `yr_built`

# - Average price per zipcode: compute with pandas `groupby`

In [71]:
# Parse the sale date. The column mixes layouts such as "4/16/2015" and "04-02-2015",
# so format='mixed' lets pandas infer the format for each value (needs pandas >= 2.0).
# NOTE(review): day/month order is ambiguous for values like "04-02-2015"; confirm the
# source convention before using anything finer than the year.
df['date'] = pd.to_datetime(df['date'],format='mixed')

In [72]:
# Re-check dtypes: 'date' should now be datetime64[ns] instead of object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17210 entries, 0 to 17209
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             17210 non-null  int64         
 1   date           17210 non-null  datetime64[ns]
 2   price          17210 non-null  int64         
 3   bedrooms       17210 non-null  int64         
 4   sqft_living    17210 non-null  int64         
 5   sqft_lot       17210 non-null  int64         
 6   floors         17210 non-null  float64       
 7   waterfront     17210 non-null  int64         
 8   condition      17210 non-null  int64         
 9   grade          17210 non-null  int64         
 10  sqft_above     17210 non-null  int64         
 11  sqft_basement  17210 non-null  int64         
 12  yr_built       17210 non-null  int64         
 13  yr_renovated   17210 non-null  int64         
 14  zipcode        17210 non-null  int64         
 15  lat            1721

In [73]:
# House age at sale time in whole calendar years: sale year minus construction year.
# NOTE(review): a row sold before its recorded build year (e.g. sold 2014, yr_built 2015)
# gets a negative age - decide whether such values should be clipped at 0.
sale_year = df['date'].dt.year
df['age'] = sale_year - df['yr_built']

# Preprocess the features
-- Drop unnecessary columns

In [74]:
# Columns that are not used as model inputs from here on.
# ('date' and 'yr_built' have already been folded into the derived 'age' column.)
droped_col = ['id','date','yr_built','zipcode','lat','long','sqft_living','sqft_lot','yr_renovated']
# Rebind instead of mutating in place, and ignore labels that are already gone, so that
# re-running this cell does not raise KeyError on the second execution.
df = df.drop(columns=droped_col, errors='ignore')

In [75]:
# Sanity check: (rows, columns) after adding 'age' and dropping the unused columns.
df.shape

(17210, 11)

In [76]:
# Skewness of every remaining column; values far from 0 flag heavily
# asymmetric distributions that are candidates for a power transform.
df.skew()

price             3.821753
bedrooms          2.400169
floors            0.614256
waterfront       10.992823
condition         1.062413
grade             0.797259
sqft_above        1.416897
sqft_basement     1.597025
sqft_living15     1.110448
sqft_lot15        8.972974
age               0.482072
dtype: float64

In [77]:
# Distinct values per column - separates low-cardinality columns
# from the continuous ones.
df.nunique()

price            3206
bedrooms           12
floors              6
waterfront          2
condition           5
grade              11
sqft_above        859
sqft_basement     293
sqft_living15     718
sqft_lot15       7592
age               117
dtype: int64

# Dealing with continuous variables that have very high skew

- They can be transformed towards a normal distribution.

- We use a power transform. Options:

- 1. Box-Cox transform (commonly used in banking; requires strictly positive values)

- 2. Log transform

- 3. Square-root transform

In [80]:
from sklearn.preprocessing import PowerTransformer

# Box-Cox (strictly positive input only) applied to the heavily skewed lot-size column;
# the result is stored in a new column next to the original.
# NOTE(review): the transformer is fitted on the full frame, i.e. before the train/test
# split further down, so its parameters are estimated with the test rows included.
lot15_boxcox = PowerTransformer(method='box-cox')
df['sqft_lot15_bc'] = lot15_boxcox.fit_transform(df[['sqft_lot15']])

In [81]:
# Re-check skewness: compare 'sqft_lot15_bc' against the original 'sqft_lot15'.
df.skew()

price             3.821753
bedrooms          2.400169
floors            0.614256
waterfront       10.992823
condition         1.062413
grade             0.797259
sqft_above        1.416897
sqft_basement     1.597025
sqft_living15     1.110448
sqft_lot15        8.972974
age               0.482072
sqft_lot15_bc    -0.111873
dtype: float64

In [82]:
# The Box-Cox column 'sqft_lot15_bc' supersedes the raw one. Rebind rather than mutate
# in place, and tolerate the column already being absent, so the cell can be re-run
# without raising KeyError.
df = df.drop(columns='sqft_lot15', errors='ignore')

In [83]:
# Column count should be unchanged: one column added ('sqft_lot15_bc'), one removed.
df.shape

(17210, 11)

In [84]:
# Display the frame (pandas truncates the middle rows) to confirm the final feature set.
df

Unnamed: 0,price,bedrooms,floors,waterfront,condition,grade,sqft_above,sqft_basement,sqft_living15,age,sqft_lot15_bc
0,479000,2,3.5,0,3,8,1730,0,1280,7,-3.101923
1,525000,3,3.5,0,3,8,1730,0,1370,9,-2.829880
2,435000,3,3.5,0,3,8,1440,0,1440,9,-2.590466
3,563500,3,3.5,0,3,8,1400,0,1350,7,-2.642364
4,525000,2,3.5,0,3,8,1310,0,1350,7,-2.676082
...,...,...,...,...,...,...,...,...,...,...,...
17205,425000,3,1.0,0,4,7,1140,220,1700,60,2.219587
17206,700000,3,1.0,0,4,8,2530,0,2120,35,2.625224
17207,790000,2,1.0,0,3,8,2560,0,1620,10,1.877541
17208,190000,2,1.0,0,2,5,710,0,1680,100,0.985301


In [85]:
# X / Y split: separate the feature matrix (X) from the target (Y)

In [86]:
# Target is the sale price; every other remaining column is a predictor.
Y = df['price']
X = df.drop(columns='price')

In [88]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=7,
)

# For classification we pass stratify=Y; for regression we do not stratify on Y (the target is continuous).

In [89]:
# Confirm the split sizes: features and target must have matching row counts.
X_train.shape,X_test.shape,Y_train.shape,Y_test.shape

((12047, 10), (5163, 10), (12047,), (5163,))

In [93]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge

In [94]:
# Baseline: ordinary least squares on the raw (unscaled) features.
lr = LinearRegression().fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

In [95]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [99]:
# Hold-out metrics for the current Y_pred: (R^2, MAE, MSE).
r2_score(Y_test,Y_pred),mean_absolute_error(Y_test,Y_pred) , mean_squared_error(Y_test,Y_pred)
# NOTE: MSE is a valid regression metric, but it is expressed in squared price units and
# is dominated by large errors, so it is hard to read directly; MAE (or the square root
# of MSE) is easier to interpret.

(0.6459046924692622, 141278.7196100435, 48000217917.92888)

In [100]:
# Mean / max / min of the test target, to put the size of the errors above into context.
Y_test.mean() , Y_test.max() , Y_test.min()

(542355.0619794693, 5570000, 82000)

In [112]:
# L1-regularised regression with a very large penalty (alpha = 1e6).
# NOTE(review): the features are not standardised, so the penalty acts unevenly across
# columns with different scales - consider scaling before regularising.
ls = Lasso(alpha=1_000_000, random_state=7).fit(X_train, Y_train)
Y_pred = ls.predict(X_test)

In [113]:
# Hold-out metrics for the Lasso predictions: (R^2, MAE, MSE).
r2_score(Y_test,Y_pred),mean_absolute_error(Y_test,Y_pred) , mean_squared_error(Y_test,Y_pred)

(0.530166273818098, 166700.58361623762, 63689410060.78186)

In [114]:
# Fitted Lasso coefficients, one per feature column; zeros mark features the L1 penalty removed.
ls.coef_

array([  -0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,  238.66139973,  250.16053782,   82.03132688,
       1075.06759992,   -0.        ])

In [135]:
# L2-regularised regression with a very large penalty (alpha = 1e7).
# NOTE(review): as with Lasso, the features are unscaled, so the shrinkage is not
# comparable across columns.
rd = Ridge(alpha=10_000_000, random_state=7).fit(X_train, Y_train)
Y_pred = rd.predict(X_test)

In [136]:
# Hold-out metrics for the Ridge predictions: (R^2, MAE, MSE).
r2_score(Y_test,Y_pred),mean_absolute_error(Y_test,Y_pred) , mean_squared_error(Y_test,Y_pred)

(0.5316697517856382, 166678.69915769887, 63485602587.08222)

In [137]:
# Fitted Ridge coefficients, one per feature column (shrunk towards zero, not set to zero).
rd.coef_

array([ -37.69952105,   13.84220785,    7.78991668,   15.038598  ,
         72.55296369,  240.31986316,  253.25545979,   83.09074632,
       1149.85885683,  -50.39428412])

# Polynomial Features and Selection by Lasso,Ridge

In [148]:
from sklearn.preprocessing import PolynomialFeatures

In [149]:
# Expand the features into all monomials up to degree 3 (bias column and
# interaction terms included, per the PolynomialFeatures defaults).
poly_expander = PolynomialFeatures(degree=3)
X_poly = poly_expander.fit_transform(X)

In [150]:
# (rows, expanded feature count) of the polynomial design matrix.
X_poly.shape

(17210, 286)

In [153]:
# Split the expanded matrix with the same test_size and seed as the earlier split.
# Note that Y_train / Y_test are rebound here.
X_train_poly, X_test_poly, Y_train, Y_test = train_test_split(
    X_poly,
    Y,
    test_size=0.3,
    random_state=7,
)

In [154]:
# Confirm the split sizes of the polynomial feature matrices and the targets.
X_train_poly.shape,X_test_poly.shape,Y_train.shape,Y_test.shape

((12047, 286), (5163, 286), (12047,), (5163,))

In [159]:
# Ordinary least squares on the degree-3 polynomial features.
# Note: this rebinds `lr`, replacing the earlier baseline model.
lr = LinearRegression().fit(X_train_poly, Y_train)
Y_pred = lr.predict(X_test_poly)

In [160]:
# Hold-out metrics for the polynomial linear model: (R^2, MAE, MSE).
r2_score(Y_test,Y_pred),mean_absolute_error(Y_test,Y_pred) , mean_squared_error(Y_test,Y_pred)

(0.6926458125278124, 126614.18203328343, 41664116024.3907)

In [161]:
# Ridge on the degree-3 polynomial features (alpha = 1e5); rebinds `rd`.
# NOTE(review): the polynomial columns are unscaled and span very different magnitudes,
# which probably makes the solve numerically ill-conditioned - consider standardising.
rd = Ridge(alpha=100_000, random_state=7).fit(X_train_poly, Y_train)
Y_pred = rd.predict(X_test_poly)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [162]:
# Hold-out metrics for the polynomial Ridge model: (R^2, MAE, MSE).
r2_score(Y_test,Y_pred),mean_absolute_error(Y_test,Y_pred) , mean_squared_error(Y_test,Y_pred)

(0.7203407295586899, 124013.29444145523, 37909866746.219505)