In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# Data Loading

In [2]:
# from csv file
filepath = "./data/lr-Real-estate.csv"
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [3]:
df.columns

Index(['No', 'X1 transaction date', 'X2 house age',
       'X3 distance to the nearest MRT station',
       'X4 number of convenience stores', 'X5 latitude', 'X6 longitude',
       'Y house price of unit area'],
      dtype='object')

In [4]:
# drop irrelevant columns
df.drop(['No'], axis=1)
df.head(2)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2


In [5]:
# remove columns
df = df.rename(
    columns={
         'X1 transaction date':'date', 
         'X2 house age':'house age',
         'X3 distance to the nearest MRT station':'dist-nearest station',
         'X4 number of convenience stores':'num stores', 
         'X5 latitude':'lat', 
         'X6 longitude':'lon',
         'Y house price of unit area':'house price',
      
    }
)
df.head(2)

Unnamed: 0,No,date,house age,dist-nearest station,num stores,lat,lon,house price
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2


In [6]:
df.shape # num of records, num of features

(414, 8)

In [7]:
df = df.drop(['date'], axis=1)

In [8]:
df.isna().sum()

No                      0
house age               0
dist-nearest station    0
num stores              0
lat                     0
lon                     0
house price             0
dtype: int64

In [9]:
df.duplicated().sum()

np.int64(0)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   No                    414 non-null    int64  
 1   house age             414 non-null    float64
 2   dist-nearest station  414 non-null    float64
 3   num stores            414 non-null    int64  
 4   lat                   414 non-null    float64
 5   lon                   414 non-null    float64
 6   house price           414 non-null    float64
dtypes: float64(5), int64(2)
memory usage: 22.8 KB


In [11]:
df.describe()

Unnamed: 0,No,house age,dist-nearest station,num stores,lat,lon,house price
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,207.5,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,119.655756,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,1.0,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,104.25,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,207.5,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,310.75,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,414.0,43.8,6488.021,10.0,25.01459,121.56627,117.5


In [12]:
# independent and dependent feature test
X = df.drop(['house price'], axis = 1)
y = df['house price']

In [13]:
X.shape, y.shape

((414, 6), (414,))

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42
    )
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
X_train.head(3)

(331, 6) (83, 6) (331,) (83,)


Unnamed: 0,No,house age,dist-nearest station,num stores,lat,lon
192,193,43.8,57.58945,7,24.9675,121.54069
234,235,8.0,2216.612,4,24.96007,121.51361
5,6,7.1,2175.03,3,24.96305,121.51254


In [15]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression(n_jobs=-1)

#training
lin_reg.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,-1
,positive,False


In [16]:
preds = lin_reg.predict(X_test)

In [17]:
X_test.shape, y_test.shape, preds.shape

((83, 6), (83,), (83,))

In [18]:
temp = X_test.copy()
temp['y_test'] = y_test
temp['preds'] = preds
temp

Unnamed: 0,No,house age,dist-nearest station,num stores,lat,lon,y_test,preds
358,359,1.1,193.5845,6,24.96571,121.54089,45.1,46.875387
350,351,13.2,492.2313,5,24.96515,121.53737,42.3,41.168836
373,374,0.0,274.0144,1,24.97480,121.53059,52.2,43.610470
399,400,12.7,170.1289,1,24.97371,121.52984,37.3,40.305651
369,370,20.2,2185.1280,3,24.96322,121.51237,22.8,29.353847
...,...,...,...,...,...,...,...,...
31,32,29.6,769.4034,7,24.98281,121.53408,25.0,43.737138
283,284,33.5,1978.6710,2,24.98674,121.51844,23.5,31.482244
157,158,16.1,815.9314,4,24.97886,121.53464,35.6,42.186099
203,204,15.6,289.3248,5,24.98203,121.54348,46.1,46.097451


In [19]:
# performance meatrics
from sklearn.metrics import (mean_squared_error, root_mean_squared_error, r2_score)

In [20]:
print(f"{mean_squared_error(y_test, preds) = }")
print(f"{root_mean_squared_error(y_test, preds) = }")
print(f"{r2_score(y_test, preds) = }")

mean_squared_error(y_test, preds) = 55.42976176163532
root_mean_squared_error(y_test, preds) = 7.445116638551428
r2_score(y_test, preds) = 0.6695884228951923


In [21]:
lin_reg.coef_

array([-5.80112414e-03, -2.67533830e-01, -4.61245577e-03,  1.09441629e+00,
        2.32955827e+02, -2.45926823e+01])

In [22]:
lin_reg.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': -1,
 'positive': False,
 'tol': 1e-06}

In [24]:
# sve to local directory
import joblib
joblib.dump(lin_reg, "./models/model_1.joblib")


['./models/model_1.joblib']

In [25]:
model = joblib.load("./models/model_1.joblib")
model.predict(X_test)

array([46.87538698, 41.16883617, 43.61046963, 40.3056507 , 29.35384659,
       43.66760173, 45.67492408, 46.30240107, 25.6146765 , 52.7873283 ,
       32.80225481, 35.1775532 , 38.68836523, 25.95094993, 36.28325843,
       32.03846092, 42.90030373, 46.54198139, 30.57291739, 45.22869825,
        2.63550887, 34.80310001, 46.77298972, 44.01118186, 13.39893711,
       40.24854581, 14.58018235, 45.25218776, 36.93741581, 38.04128132,
       12.6649321 , 39.58016188, 38.53495628, 29.43551827, 46.95210797,
       31.19971212, 51.72155916, 13.99057585, 46.75473983, 39.66315156,
       37.20985572, 40.02443113, 49.60359175, 39.28540394, 41.67483062,
       48.69642563, 44.71021557, 23.65016626, 49.59437483, 47.26837976,
       47.64693649, 47.65106564, 40.07584292, 42.20297598, 36.67595541,
       15.91300018, 36.20908699, 36.63719665, 30.96751469, 46.88831461,
       32.5419302 , 33.75834294, 15.437308  , 12.01140546,  7.74528596,
       33.33896449, 30.96365308, 46.18880429, 35.1079123 , 30.93

In [26]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(max_depth=4, min_samples_split=10)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,4
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [27]:
y_pred_dt = dt.predict(X_test)

In [28]:
temp = X_test.copy()
temp['y_test'] = y_test
temp['preds'] = preds
temp

Unnamed: 0,No,house age,dist-nearest station,num stores,lat,lon,y_test,preds
358,359,1.1,193.5845,6,24.96571,121.54089,45.1,46.875387
350,351,13.2,492.2313,5,24.96515,121.53737,42.3,41.168836
373,374,0.0,274.0144,1,24.97480,121.53059,52.2,43.610470
399,400,12.7,170.1289,1,24.97371,121.52984,37.3,40.305651
369,370,20.2,2185.1280,3,24.96322,121.51237,22.8,29.353847
...,...,...,...,...,...,...,...,...
31,32,29.6,769.4034,7,24.98281,121.53408,25.0,43.737138
283,284,33.5,1978.6710,2,24.98674,121.51844,23.5,31.482244
157,158,16.1,815.9314,4,24.97886,121.53464,35.6,42.186099
203,204,15.6,289.3248,5,24.98203,121.54348,46.1,46.097451


In [29]:
print(f"{mean_squared_error(y_test, y_pred_dt) = }")
print(f"{root_mean_squared_error(y_test, y_pred_dt) = }")
print(f"{r2_score(y_test, y_pred_dt) = }")

mean_squared_error(y_test, y_pred_dt) = 39.72385344514002
root_mean_squared_error(y_test, y_pred_dt) = 6.302686208684359
r2_score(y_test, y_pred_dt) = 0.7632098596791491


In [37]:
from sklearn.model_selection import GridSearchCV
dt_reg = DecisionTreeRegressor()
params = {
        'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        'max_depth': [4,5,6,7,8],
        'min_samples_split': [10,15,20],
        'random_state': [42]
}
dt_grid= GridSearchCV(dt_reg, param_grid=params)
dt_grid.fit(X_train, y_train)


0,1,2
,estimator,DecisionTreeRegressor()
,param_grid,"{'criterion': ['squared_error', 'friedman_mse', ...], 'max_depth': [4, 5, ...], 'min_samples_split': [10, 15, ...], 'random_state': [42]}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'absolute_error'
,splitter,'best'
,max_depth,5
,min_samples_split,20
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [33]:
dt_grid.best_params_

{'criterion': 'poisson', 'max_depth': 4, 'min_samples_split': 20}

In [34]:
best_dt_model = dt_grid.best_estimator_

In [35]:
y_pred_dt_grid = best_dt_model.predict(X_test)

In [36]:
print(f"{mean_squared_error(y_test, y_pred_dt_grid) = }")
print(f"{root_mean_squared_error(y_test, y_pred_dt_grid) = }")
print(f"{r2_score(y_test, y_pred_dt_grid) = }")

mean_squared_error(y_test, y_pred_dt_grid) = 41.41467011633694
root_mean_squared_error(y_test, y_pred_dt_grid) = 6.435423072054932
r2_score(y_test, y_pred_dt_grid) = 0.7531310611209359
