In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import cross_val_score, KFold

# K-fold Cross Validation

In [2]:
# Load the semicolon-separated real-estate CSV and, in the same expression,
# discard the row-id and coordinate columns that are not used as features.
data = pd.read_csv("data/real_estate.csv", sep=";").drop(
    columns=["No", "latitude", "longitude"]
)

In [3]:
# Preview the first rows (head() defaults to five) of the frame after the column drop
data.head()

Unnamed: 0,transaction_date,house_age,distance_to_the_nearest_MRT_station,number_of_convenience_stores,house_price_of_unit_area
0,2012.917,32.0,84.87882,10,37.9
1,2012.917,19.5,306.5947,9,42.2
2,2013.583,13.3,561.9845,5,47.3
3,2013.5,13.3,561.9845,5,54.8
4,2012.833,5.0,390.5684,5,43.1


In [4]:
# Inspect the dtype pandas inferred for each remaining column
data.dtypes

transaction_date                       float64
house_age                              float64
distance_to_the_nearest_MRT_station    float64
number_of_convenience_stores             int64
house_price_of_unit_area               float64
dtype: object

In [5]:
# Number of missing (NaN) entries in each column
data.isna().sum()

transaction_date                       0
house_age                              0
distance_to_the_nearest_MRT_station    0
number_of_convenience_stores           0
house_price_of_unit_area               0
dtype: int64

In [6]:
# Total number of missing values across the whole dataset
# (per-column counts summed once more into a single scalar)
data.isna().sum().sum()

0

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
data.describe()

Unnamed: 0,transaction_date,house_age,distance_to_the_nearest_MRT_station,number_of_convenience_stores,house_price_of_unit_area
count,414.0,414.0,414.0,414.0,414.0
mean,2013.148971,17.71256,1083.885689,4.094203,37.980193
std,0.281967,11.392485,1262.109595,2.945562,13.606488
min,2012.667,0.0,23.38284,0.0,7.6
25%,2012.917,9.025,289.3248,1.0,27.7
50%,2013.167,16.1,492.2313,4.0,38.45
75%,2013.417,28.15,1454.279,6.0,46.6
max,2013.583,43.8,6488.021,10.0,117.5


## Work on EDA 
- Most likely `transaction_date` should be dropped or feature-engineered

# Work on your model
## Journey building your model -- you have your model!

- Does my model generalize well?
    - You'll run k-fold cross-validation to see how robust your model is
        - Performance over different splits train/test
        - Preventing/Discovering if you have overfitting issues with slices of your data

# K-fold cross-validation
- K = 5
- Reproducible -- set a fixed `random_seed`
- Shuffle -- make sure your data is shuffled before creating the folds

In [8]:
# Cross-validation settings: a fixed seed makes the shuffled folds reproducible.
random_seed = 42
num_folds = 5

# Shuffle the rows before cutting them into `num_folds` folds.
kfold = KFold(n_splits=num_folds, random_state=random_seed, shuffle=True)

In [9]:
# Echo the splitter's repr to check its configuration
kfold

KFold(n_splits=5, random_state=42, shuffle=True)

In [10]:
# The target (house_price_of_unit_area) is a continuous float, so choose a
# regression model accordingly.
lr = LinearRegression()

In [11]:
# No train/test split is made here, but in a real modelling scenario you should
# hold out a test set first.
# y = target variable
# X = features (every column except the target)
y = data["house_price_of_unit_area"]
X = data.drop(columns=["house_price_of_unit_area"])


# Train/test split
# should go here

In [37]:
# Uncomment to display the cross_val_score docstring while exploring:
# cross_val_score?

In [12]:
# Scoring options: https://scikit-learn.org/stable/modules/model_evaluation.html
# "neg_mean_squared_error" gives the negated MSE of each fold;
# scoring="r2" is the alternative metric to try.
results = cross_val_score(
    lr,
    X,  # in other scenarios, pass the training split here
    y,
    cv=kfold,
    scoring="neg_mean_squared_error",
)

In [60]:
#data.head()

In [13]:
# cross_val_score returns one score per fold
len(results)

5

In [14]:
# Per-fold scores (negated MSE, as requested via scoring="neg_mean_squared_error")
list(results)

[-56.11619009787396,
 -74.45258537879404,
 -74.11612889169152,
 -78.0223953768302,
 -135.29511323704259]

In [15]:
# Average performance across the folds. The scores are negated MSE
# (scoring="neg_mean_squared_error"), so the mean is negative; values closer
# to zero indicate a better fit.
round(np.mean(results),2)

-83.6

In [16]:
# Standard deviation of the per-fold scores: how much performance varies between splits
round(np.std(results), 2)

26.95

In [17]:
# Show the first rows of the data again for reference
data.head()

Unnamed: 0,transaction_date,house_age,distance_to_the_nearest_MRT_station,number_of_convenience_stores,house_price_of_unit_area
0,2012.917,32.0,84.87882,10,37.9
1,2012.917,19.5,306.5947,9,42.2
2,2013.583,13.3,561.9845,5,47.3
3,2013.5,13.3,561.9845,5,54.8
4,2012.833,5.0,390.5684,5,43.1


In [58]:
# Show which row positions fall into the training and test portions of each fold.
# kfold.split(X) yields one (train_index, test_index) pair per fold; enumerate
# numbers the folds from 1, replacing the hand-maintained counter.
for var_i, (train_index, test_index) in enumerate(kfold.split(X), start=1):
    print("SPLIT: ", var_i)
    print("TRAIN:", train_index)
    print("TEST:", test_index)
    print("----")

SPLIT:  1
TRAIN: [  1   2   3   4   5   6   7   8  10  11  12  13  14  16  18  19  20  21
  23  26  27  28  29  32  34  35  36  37  38  40  41  43  44  45  47  48
  49  50  51  52  53  54  58  59  60  61  62  63  64  65  66  67  68  69
  71  74  75  80  81  83  85  86  87  88  89  91  92  95  96  97  98  99
 100 101 102 103 105 106 107 108 109 111 112 113 114 115 117 119 120 121
 122 123 125 127 128 129 130 132 133 134 135 136 138 139 142 143 144 145
 146 147 148 149 150 151 153 154 156 158 159 160 161 162 163 164 166 167
 168 169 170 171 172 174 176 177 178 179 181 182 183 185 186 187 188 189
 190 191 192 193 194 195 196 197 198 199 200 201 202 204 205 206 207 209
 210 211 212 213 214 215 216 217 218 219 220 221 223 224 226 227 228 229
 230 232 233 234 235 236 237 239 240 241 242 243 244 245 246 247 248 249
 250 251 252 253 254 255 256 257 258 259 260 261 263 264 265 267 268 269
 270 273 276 277 278 279 280 281 282 284 285 287 288 289 292 293 294 295
 296 297 298 299 300 301 302 303 3