In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("Data/kc_house_data.csv")
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            21613 non-null  int64  
 1   date          21613 non-null  object 
 2   price         21613 non-null  float64
 3   bedrooms      21613 non-null  int64  
 4   bathrooms     21613 non-null  float64
 5   floors        21613 non-null  float64
 6   waterfront    21613 non-null  int64  
 7   condition     21613 non-null  int64  
 8   grade         21613 non-null  int64  
 9   yr_built      21613 non-null  int64  
 10  yr_renovated  21613 non-null  int64  
 11  zipcode       21613 non-null  int64  
 12  lat           21613 non-null  float64
 13  long          21613 non-null  float64
dtypes: float64(5), int64(8), object(1)
memory usage: 2.3+ MB


In [4]:
df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540182.2,3.370842,2.114757,1.494309,0.007542,3.40943,7.656873,1971.005136,84.402258,98077.939805,47.560053,-122.213896
std,2876566000.0,367362.2,0.930062,0.770163,0.539989,0.086517,0.650743,1.175459,29.373411,401.67924,53.505026,0.138564,0.140828
min,1000102.0,75000.0,0.0,0.0,1.0,0.0,1.0,1.0,1900.0,0.0,98001.0,47.1559,-122.519
25%,2123049000.0,321950.0,3.0,1.75,1.0,0.0,3.0,7.0,1951.0,0.0,98033.0,47.471,-122.328
50%,3904930000.0,450000.0,3.0,2.25,1.5,0.0,3.0,7.0,1975.0,0.0,98065.0,47.5718,-122.23
75%,7308900000.0,645000.0,4.0,2.5,2.0,0.0,4.0,8.0,1997.0,0.0,98118.0,47.678,-122.125
max,9900000000.0,7700000.0,33.0,8.0,3.5,1.0,5.0,13.0,2015.0,2015.0,98199.0,47.7776,-121.315


In [5]:
## in Korean
print('''
id: 집 고유아이디
date: 집이 팔린 날짜 
price: 집 가격 (타겟변수)
bedrooms: 주택 당 침실 개수
bathrooms: 주택 당 화장실 개수
floors: 전체 층 개수
waterfront: 해변이 보이는지 (0, 1)
condition: 집 청소상태 (1~5)
grade: King County grading system 으로 인한 평점 (1~13)
yr_built: 집이 지어진 년도
yr_renovated: 집이 리모델링 된 년도
zipcode: 우편번호
lat: 위도
long: 경도
''')


id: 집 고유아이디
date: 집이 팔린 날짜 
price: 집 가격 (타겟변수)
bedrooms: 주택 당 침실 개수
bathrooms: 주택 당 화장실 개수
floors: 전체 층 개수
waterfront: 해변이 보이는지 (0, 1)
condition: 집 청소상태 (1~5)
grade: King County grading system 으로 인한 평점 (1~13)
yr_built: 집이 지어진 년도
yr_renovated: 집이 리모델링 된 년도
zipcode: 우편번호
lat: 위도
long: 경도



## Removing column less effective in analysis

In [6]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'floors', 'waterfront',
       'condition', 'grade', 'yr_built', 'yr_renovated', 'zipcode', 'lat',
       'long'],
      dtype='object')

In [7]:
columns_to_drop = ["id", "date", "zipcode", 'lat', 'long']

df= df.drop(columns= columns_to_drop)

# can done in same way like
#df = df.drop(columns_to_drop, axis = 1)

In [8]:
feature_coumns = df.columns.difference(["price"]) # x columns

x = df[feature_coumns]
y = df["price"]

In [9]:
# split the data in to training and test data 
x_train, x_test, y_train, y_test  = train_test_split(x,y,test_size = 0.3, random_state = 42 )
print(x_train.shape, x_test.shape, y_train.shape)

(15129, 8) (6484, 8) (15129,)


## Train model without bagging

In [10]:
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
sm_train_x = sm.add_constant(x_train, has_constant="add") # Add Bias

sm_model = sm.OLS(y_train, sm_train_x) # create a model
fitted_sm_model = sm_model.fit() 
fitted_sm_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.595
Model:,OLS,Adj. R-squared:,0.595
Method:,Least Squares,F-statistic:,2776.0
Date:,"Thu, 22 Apr 2021",Prob (F-statistic):,0.0
Time:,20:54:16,Log-Likelihood:,-208260.0
No. Observations:,15129,AIC:,416500.0
Df Residuals:,15120,BIC:,416600.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.186e+06,1.73e+05,41.548,0.000,6.85e+06,7.52e+06
bathrooms,1.303e+05,3960.833,32.889,0.000,1.23e+05,1.38e+05
bedrooms,-2224.7910,2382.356,-0.934,0.350,-6894.497,2444.915
condition,1.641e+04,3169.013,5.178,0.000,1.02e+04,2.26e+04
floors,1946.3052,4336.838,0.449,0.654,-6554.422,1.04e+04
grade,1.956e+05,2199.540,88.924,0.000,1.91e+05,2e+05
waterfront,7.555e+05,2.26e+04,33.479,0.000,7.11e+05,8e+05
yr_built,-4300.7865,88.073,-48.832,0.000,-4473.420,-4128.153
yr_renovated,12.7325,5.043,2.525,0.012,2.847,22.618

0,1,2,3
Omnibus:,13447.374,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1684794.827
Skew:,3.763,Prob(JB):,0.0
Kurtosis:,54.147,Cond. No.,182000.0


#### Predict in test

In [11]:
sm_test_x = sm.add_constant(x_test, has_constant= "add") # Adding Bias in testing data
sm_model_predict  = fitted_sm_model.predict(sm_test_x) # Predict test data
print("RMSE: {}".format(sqrt(mean_squared_error(sm_model_predict, y_test)))) #RMSE
print(fitted_sm_model.params)

RMSE: 239804.29670858165
const           7.185671e+06
bathrooms       1.302689e+05
bedrooms       -2.224791e+03
condition       1.641020e+04
floors          1.946305e+03
grade           1.955909e+05
waterfront      7.555423e+05
yr_built       -4.300787e+03
yr_renovated    1.273246e+01
dtype: float64


## Check weather Bagging is better or not

In [16]:
import random
bagging_predict_result = [] # creating blank list

for _ in range(10):
    data_index = [data_index for data_index in range(x_train.shape[0])]
    # Training data index in list format
    
    random_data_index = np.random.choice(data_index, x_train.shape[0])
    # Random sampling of 1/10
    
    print(len(set(random_data_index)))
    sm_train_x = x_train.iloc[random_data_index,]
    sm_train_y = y_train.iloc[random_data_index,]
    sm_train_x = sm.add_constant(sm_train_x, has_constant="add")# add bias
    sm_model = sm.OLS(sm_train_y, sm_train_x) # Create a model
    fitted_sm_model = sm_model.fit() # 
    
    sm_test_x = sm.add_constant(x_test, has_constant="add") # add 
    sm_model_predict = fitted_sm_model.predict(sm_test_x) # Predict test data
    bagging_predict_result.append(sm_model_predict) 
    # in the case of lotto extracting method there will be around 63% data extraction
    pred = fitted_sm_model.predict(sm_test_x)
    print(sqrt(mean_squared_error(pred, y_test)))

9519
239940.0547974962
9468
239874.97012540037
9613
239766.5097510191
9487
239726.62461593613
9536
239945.9216411
9616
240774.32180367404
9545
239660.2006980678
9581
239936.2755360765
9567
239706.3203393554
9522
240284.96746343232


In [17]:
 bagging_predict_result[0]

735      5.594574e+05
2830     7.033188e+05
4106     1.109833e+06
16218    1.464170e+06
19964    6.962207e+05
             ...     
12606    5.980657e+05
14393    6.768811e+05
6899     3.280772e+05
85       9.060017e+05
21363    4.348181e+05
Length: 6484, dtype: float64

In [18]:
bagging_predict = [] 
for lst2_index in range(x_test.shape[0]):
    temp_predict = [] # 
    for lst_index in range(len(bagging_predict_result)):
        temp_predict.append(bagging_predict_result[lst_index].values[lst2_index])
    bagging_predict.append(np.mean(temp_predict))

In [19]:
bagging_predict

[562921.1152454566,
 710758.7352401826,
 1120655.5846751085,
 1481758.5667576934,
 700282.5262131498,
 385315.9894761948,
 787784.0897486066,
 484467.47293397103,
 499630.25764976523,
 538893.4175647467,
 642923.316529167,
 406029.98584924557,
 265945.04265498306,
 279118.67320426425,
 337349.3606706732,
 1267817.4547198296,
 319704.06889458,
 1036907.443320429,
 258013.4353135905,
 602841.7345772155,
 389533.7000160736,
 1307536.0864840627,
 824575.2034788397,
 581952.5350672265,
 598370.7158149551,
 571150.1329782752,
 262067.41497981353,
 41384.53678722354,
 564666.4352129652,
 642693.701471598,
 566435.4483452874,
 458436.08881940634,
 552757.36159309,
 687311.9163421128,
 408673.1637791408,
 876278.4353985159,
 942705.6309230796,
 638443.649585333,
 390559.16786453285,
 1085126.8676324342,
 453932.3843231383,
 146779.98646481178,
 488004.37527988025,
 218300.20618094289,
 61415.14641642924,
 -47857.93093632879,
 246948.73243306315,
 281774.4164233111,
 361997.0718671189,
 722346.0

In [20]:
print("RMSE: {}".format(sqrt(mean_squared_error(bagging_predict, y_test))))

RMSE: 239781.84908712833


### Using Function

In [21]:
from sklearn.linear_model import LinearRegression
regression_model = LinearRegression()
linear_model1 = regression_model.fit(x_train, y_train)
predict1 = linear_model1.predict(x_test)
print("RMSE : {}".format(sqrt(mean_squared_error(predict1, y_test))))

RMSE : 239804.29670858147


### Bagging with SK learn

In [23]:
from sklearn.ensemble import BaggingRegressor
bagging_model = BaggingRegressor(base_estimator=regression_model, # linear regression
                                n_estimators=5, # 5 times sampling
                                verbose=1) # show the process
linear_model2 = bagging_model.fit(x_train, y_train)
predict2 = linear_model2.predict(x_test)
print("RMSE: {}".format(sqrt(mean_squared_error(predict2, y_test)))) # Check RMSE

RMSE: 239755.2996558441


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


### lets do sampling more time

In [24]:
bagging_model2 = BaggingRegressor(base_estimator=regression_model,
                                 n_estimators=30,
                                 verbose = 1)

linear_model3 = bagging_model2.fit(x_train, y_train)
predict3 = linear_model3.predict(x_test)

print("RMSE: {}".format(sqrt(mean_squared_error(predict3, y_test))))

RMSE: 239788.8327189367


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
