In [10]:
# Import necessary libraries

import sqlite3
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

<p style="font-family: Arial; font-size:1.5em;color:black; font-weight:bold"><br>
Reading files
<br><br></p>


In [11]:
# Load the dataset from realest.csv; the path is relative to the notebook's
# working directory, so the notebook must be run from its own folder.
df = pd.read_csv("./DataSet/realest.csv")

In [12]:
# Preview the first five rows to sanity-check how the CSV was parsed.
df.head()

Unnamed: 0,Price,Bedroom,Space,Room,Lot,Tax,Bathroom,Garage,Condition
0,53.0,2.0,967.0,5.0,39.0,652.0,1.5,0.0,0.0
1,55.0,2.0,815.0,5.0,33.0,1000.0,1.0,2.0,1.0
2,56.0,3.0,900.0,5.0,35.0,897.0,1.5,1.0,0.0
3,58.0,3.0,1007.0,6.0,24.0,964.0,1.5,2.0,0.0
4,64.0,3.0,1100.0,7.0,50.0,1099.0,1.5,1.5,0.0


In [13]:
# List the column labels available for choosing features and the target.
df.columns

Index(['Price', 'Bedroom', 'Space', 'Room', 'Lot', 'Tax', 'Bathroom', 'Garage',
       'Condition'],
      dtype='object')

In [14]:
# Summarise dtypes and per-column non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Price      156 non-null    float64
 1   Bedroom    156 non-null    float64
 2   Space      146 non-null    float64
 3   Room       156 non-null    float64
 4   Lot        146 non-null    float64
 5   Tax        147 non-null    float64
 6   Bathroom   156 non-null    float64
 7   Garage     156 non-null    float64
 8   Condition  156 non-null    float64
dtypes: float64(9)
memory usage: 11.2 KB


<p style="font-family: Arial; font-size:1.5em;color:black; font-weight:bold"><br>
Found some null values stored in different fields
<br><br></p>


In [15]:
# Record the (rows, columns) shape before any cleaning, as a baseline to
# compare against once rows with missing values are removed.
df.shape

(157, 9)

<p style="font-family: Arial; font-size:1.5em;color:black; font-weight:bold"><br>
Drop all rows that contain NaN values
<br><br></p>


In [17]:
# Keep only complete observations: discard every row that has a NaN in any
# column.
df = df.dropna(axis=0, how="any")

# Confirm how many rows survive the cleanup.
df.shape

(128, 9)

<p style="font-family: Arial; font-size:1.5em;color:black; font-weight:bold"><br><br>
Prepare data for model training
<br><br></p>


In [18]:
# Predictor columns: every column of the dataset except the target, 'Price'.
feature = [
    'Bedroom',
    'Space',
    'Room',
    'Lot',
    'Tax',
    'Bathroom',
    'Garage',
    'Condition',
]

In [19]:
# Column the models learn to predict. It is kept as a one-element list so that
# df[target] yields a single-column DataFrame rather than a Series.
target = ['Price']

In [20]:
# Split the cleaned frame column-wise: predictors in x, the price column in y.
x = df.loc[:, feature]
y = df.loc[:, target]

In [21]:
# Check that x holds only the predictor columns (no 'Price').
x.head()

Unnamed: 0,Bedroom,Space,Room,Lot,Tax,Bathroom,Garage,Condition
0,2.0,967.0,5.0,39.0,652.0,1.5,0.0,0.0
1,2.0,815.0,5.0,33.0,1000.0,1.0,2.0,1.0
2,3.0,900.0,5.0,35.0,897.0,1.5,1.0,0.0
3,3.0,1007.0,6.0,24.0,964.0,1.5,2.0,0.0
4,3.0,1100.0,7.0,50.0,1099.0,1.5,1.5,0.0


In [22]:
# Check that y holds only the 'Price' column.
y.head()

Unnamed: 0,Price
0,53.0
1,55.0
2,56.0
3,58.0
4,64.0


In [23]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=324,
)

<p style="font-family: Arial; font-size:2.5em;color:black; font-weight:bold"><br><br>
Using Machine Learning
<br><br></p>

<p style="font-family: Arial; font-size:1.5em;color:black; font-weight:bold">
Start Training Regression Model
<br><br></p>

In [24]:
# Baseline model: ordinary least-squares linear regression, default settings.
regressor = LinearRegression()

In [25]:
# Fit the linear model on the training split only; the test split stays unseen.
regressor.fit(x_train, y_train)

In [26]:
# Predict prices for the held-out rows. The result is a 2-D column array
# because the model was fitted on a single-column DataFrame target.
predictResult = regressor.predict(x_test)
predictResult

array([[48.54319755],
       [58.04125521],
       [42.78962908],
       [88.00163219],
       [59.89101235],
       [43.56205554],
       [86.49335918],
       [42.52743114],
       [48.2085339 ],
       [41.276301  ],
       [57.75883908],
       [60.51287049],
       [61.75644135],
       [48.25565809],
       [60.48227205],
       [48.36930001],
       [52.78896937],
       [58.12873513],
       [42.44176388],
       [38.29455388],
       [64.83243643],
       [65.7273065 ],
       [58.52834349],
       [85.59932937],
       [51.02071992],
       [61.6099676 ],
       [42.638201  ],
       [61.23354191],
       [56.22580217],
       [45.7931311 ],
       [87.72401015],
       [58.42577579],
       [60.94802115],
       [82.8764996 ],
       [55.95185841],
       [38.61359251],
       [86.34711052],
       [51.53265493],
       [53.35603155],
       [56.17604612],
       [57.31355322],
       [53.93060939],
       [45.13631711]])

In [27]:
# Root-mean-squared error of the linear model on the test split, expressed in
# the same units as Price.
RMSE = sqrt(
    mean_squared_error(y_true=y_test, y_pred=predictResult)
)

In [28]:
# Show the linear-regression RMSE on the test split.
print(RMSE)

8.06801760262437


<p style="font-family: Arial; font-size:1.5em;color:black; font-weight:bold"><br>
Improve the prediction accuracy by using a different, slightly more complex model
<br><br></p>

In [63]:
# Second model: a decision tree capped at depth 20, trained on the same split.
# Rebinds `regressor`, so the linear model fitted earlier is no longer
# reachable under that name.
#
# random_state is pinned because the tree permutes the candidate features at
# every split; when several splits tie on the criterion, an unseeded tree can
# pick a different one on each run and report a different RMSE.
regressor = DecisionTreeRegressor(max_depth=20, random_state=324)
regressor.fit(x_train, y_train)

In [64]:
# Predict prices for the held-out rows with the decision tree. This overwrites
# the linear model's predictions; the tree returns a flat 1-D array.
predictResult = regressor.predict(x_test)
predictResult

array([63., 45., 47., 70., 67., 42., 85., 36., 53., 42., 67., 56., 63.,
       49., 41., 53., 47., 45., 36., 39., 66., 62., 45., 70., 53., 65.,
       36., 59., 51., 49., 70., 66., 61., 88., 55., 36., 70., 63., 55.,
       51., 58., 51., 53.])

In [65]:
# Peek at the true prices of the first test rows to compare with the
# predictions by eye.
y_test.head()

Unnamed: 0,Price
41,61.0
57,44.0
12,47.0
137,70.0
70,69.0


In [66]:
# Number of test observations. DataFrame.size counts cells (rows x columns),
# which equals the row count here because y_test has a single column.
y_test.size

43

In [67]:
# Root-mean-squared error of the decision tree on the test split; replaces the
# value previously stored in RMSE for the linear model.
RMSE = sqrt(
    mean_squared_error(y_true=y_test, y_pred=predictResult)
)

In [68]:
# Show the decision-tree RMSE on the test split.
print(RMSE)

4.8512765042749555
