In [69]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [98]:
housing_data = pd.read_csv('https://cdn.upgrad.com/UpGrad/temp/76e1a761-6818-4b36-bfd4-174f154c6952/Housing.csv')
housing_data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


###### Map boolean values:

In [99]:
_columns = ['mainroad', 'guestroom', 'basement','hotwaterheating', 'airconditioning', 'prefarea']
def map_boolean(x):
    return x.map({'yes': 1, 'no': 0})

housing_data[_columns] = housing_data[_columns].apply(map_boolean)

In [100]:
housing_data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


In [101]:
status = pd.get_dummies(housing_data['furnishingstatus'])
_dummy_vars = pd.get_dummies(housing_data['furnishingstatus'], drop_first=True)
housing_data = pd.concat([housing_data, _dummy_vars], axis=1)
housing_data.shape

(545, 15)

In [102]:
housing_data.drop(columns=['furnishingstatus'], axis=1, inplace=True)


In [103]:
housing_data_train, housing_data_test = train_test_split(housing_data, train_size=0.7, test_size=0.3, random_state=100)

In [104]:
print(housing_data_train.shape)
print(housing_data_test.shape)

(381, 14)
(164, 14)


#### Scaling: 

In [105]:
housing_data_train.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
count,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0
mean,4756848.0,5116.900262,2.931759,1.272966,1.805774,0.855643,0.170604,0.351706,0.052493,0.299213,0.727034,0.233596,0.414698,0.325459
std,1820366.0,2061.660813,0.736681,0.474651,0.885003,0.351913,0.376657,0.478131,0.223313,0.458515,0.857799,0.423674,0.493318,0.469162
min,1750000.0,1836.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3500000.0,3600.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4305000.0,4500.0,3.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
max,13300000.0,13200.0,6.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0


In [106]:
scaler = MinMaxScaler()

In [107]:
_vars = ['price', 'area', 'bedrooms','bathrooms','stories', 'parking']
housing_data_train[_vars] = scaler.fit_transform(housing_data_train[_vars])

  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [108]:
y_train_data = housing_data_train.pop('price')


### RFE using scikit learn:

In [109]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [110]:
lr = LinearRegression()
lr.fit(housing_data_train, y_train_data)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [111]:
rfe = RFE(lr, 10)
rfe = rfe.fit(housing_data_train, y_train_data)

In [112]:
list(zip(housing_data_train, rfe.support_, rfe.ranking_))

[('area', True, 1),
 ('bedrooms', True, 1),
 ('bathrooms', True, 1),
 ('stories', True, 1),
 ('mainroad', True, 1),
 ('guestroom', True, 1),
 ('basement', False, 3),
 ('hotwaterheating', True, 1),
 ('airconditioning', True, 1),
 ('parking', True, 1),
 ('prefarea', True, 1),
 ('semi-furnished', False, 4),
 ('unfurnished', False, 2)]

In [113]:
housing_data_train.columns[rfe.support_]

Index(['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom',
       'hotwaterheating', 'airconditioning', 'parking', 'prefarea'],
      dtype='object')

#### build model with Stats model

In [114]:
_colms = housing_data_train.columns[rfe.support_]
x_train_rfe = sm.add_constant(housing_data_train[_colms])


In [115]:
lr = sm.OLS(y_train_data, x_train_rfe).fit()

In [116]:
print(lr.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.669
Model:                            OLS   Adj. R-squared:                  0.660
Method:                 Least Squares   F-statistic:                     74.89
Date:                Sun, 19 May 2019   Prob (F-statistic):           1.28e-82
Time:                        04:43:57   Log-Likelihood:                 374.65
No. Observations:                 381   AIC:                            -727.3
Df Residuals:                     370   BIC:                            -683.9
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0027      0.018     

##### we can drop bedrooms: 

In [117]:
_colms = _colms.drop('bedrooms')


In [118]:
_colms

Index(['area', 'bathrooms', 'stories', 'mainroad', 'guestroom',
       'hotwaterheating', 'airconditioning', 'parking', 'prefarea'],
      dtype='object')

In [119]:
x_train_rfe = sm.add_constant(housing_data_train[_colms])
lr = sm.OLS(y_train_data, x_train_rfe).fit()

In [120]:
print(lr.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.666
Model:                            OLS   Adj. R-squared:                  0.658
Method:                 Least Squares   F-statistic:                     82.37
Date:                Sun, 19 May 2019   Prob (F-statistic):           6.67e-83
Time:                        04:44:04   Log-Likelihood:                 373.00
No. Observations:                 381   AIC:                            -726.0
Df Residuals:                     371   BIC:                            -686.6
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.0242      0.013     

In [121]:
_scaling_columns = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']
housing_data_test[_scaling_columns] = scaler.transform(housing_data_test[_scaling_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [124]:
housing_data_test_price = housing_data_test.pop('price')
housing_data_test.describe()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
count,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0
mean,-0.151062,-0.161295,0.063415,0.402439,0.865854,0.195122,0.347561,0.030488,0.353659,-0.128049,0.237805,0.420732,0.329268
std,0.000209,6.5e-05,0.112432,0.414011,0.341853,0.397508,0.477654,0.172452,0.479569,0.289322,0.427043,0.495189,0.471387
min,-0.151372,-0.161387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.333333,0.0,0.0,0.0
25%,-0.151211,-0.161299,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.333333,0.0,0.0,0.0
50%,-0.151101,-0.161299,0.0,0.5,1.0,0.0,0.0,0.0,0.0,-0.333333,0.0,0.0,0.0
75%,-0.150965,-0.161299,0.2,0.5,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
max,-0.150113,-0.161123,0.6,1.5,1.0,1.0,1.0,1.0,1.0,0.666667,1.0,1.0,1.0


In [136]:
x_test_rfe.describe()

Unnamed: 0,const,area,bedrooms,bathrooms,stories,mainroad,guestroom,hotwaterheating,airconditioning,parking,prefarea
count,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0,164.0
mean,1.0,-0.151062,-0.161295,0.063415,0.402439,0.865854,0.195122,0.030488,0.353659,-0.128049,0.237805
std,0.0,0.000209,6.5e-05,0.112432,0.414011,0.341853,0.397508,0.172452,0.479569,0.289322,0.427043
min,1.0,-0.151372,-0.161387,0.0,0.0,0.0,0.0,0.0,0.0,-0.333333,0.0
25%,1.0,-0.151211,-0.161299,0.0,0.0,1.0,0.0,0.0,0.0,-0.333333,0.0
50%,1.0,-0.151101,-0.161299,0.0,0.5,1.0,0.0,0.0,0.0,-0.333333,0.0
75%,1.0,-0.150965,-0.161299,0.2,0.5,1.0,0.0,0.0,1.0,0.0,0.0
max,1.0,-0.150113,-0.161123,0.6,1.5,1.0,1.0,1.0,1.0,0.666667,1.0


In [126]:
_colms = housing_data_train.columns[rfe.support_]
x_test_rfe = sm.add_constant(housing_data_test[_colms])

In [137]:
y_test_predict = lr.predict(x_train_rfe)

In [140]:
y_test_predict

359    0.136896
19     0.471955
159    0.318751
35     0.514123
28     0.481968
267    0.200917
263    0.105156
433    0.117282
217    0.399940
154    0.300265
534    0.138638
96     0.314624
33     0.468577
477    0.142932
129    0.461207
401    0.406145
240    0.243681
21     0.446745
155    0.380023
532    0.048467
190    0.350052
435    0.123770
476    0.220287
124    0.410913
225    0.311653
363    0.206416
227    0.257910
528    0.068670
75     0.360535
241    0.162517
         ...   
172    0.470951
303    0.133351
369    0.114606
238    0.273599
283    0.166964
181    0.352641
63     0.529523
403    0.309222
478    0.119781
2      0.496947
336    0.323521
317    0.328216
431    0.213415
49     0.473360
135    0.448717
4      0.385282
141    0.444904
415    0.308111
386    0.164392
86     0.316863
93     0.430993
507    0.114606
316    0.271196
228    0.196338
280    0.208620
526    0.105859
53     0.475592
350    0.261818
79     0.451167
520    0.200000
Length: 381, dtype: floa

In [141]:
# fig = plt.figure(figsize=(10,8))
# plt.scatter(y_test_predict, housing_data_test_price)
# plt.suptitle('Test data predicted vs actual data')
# plt.show()