<a href="https://colab.research.google.com/github/kszymon/neural-network/blob/main/05_regression%20/01_housing_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Problem regresji - prognoza ceny mieszkań

Celem tego notebook'a jest zapoznanie się z problemami regresji. W odróżnieniu od problemów klasyfikacji, gdzie staramy się przyporządkować daną klasę, w modelach regresji przewidujemy ciągłą zmienną docelową.

### Spis treści:
1. [Import bibliotek](#a0)
2. [Załadowanie danych i wstępna eksploracja](#a1)
3. [Podział na zbiór treningowy oraz testowy](#a2)
4. [Standaryzacja danych](#a3)
5. [Budowa modelu](#a4)
6. [Trenowanie sieci](#a5)




### <a name='a0'></a> 1. Import biblotek

In [30]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

np.set_printoptions(precision=12, suppress=True, linewidth=150)
pd.options.display.float_format = '{:.6f}'.format
tf.__version__

'2.18.0'

### <a name='a1'></a> 2. Załadowanie danych i wstępna eksploracja

In [31]:
raw_dataset = pd.read_csv('https://storage.googleapis.com/esmartdata-courses-files/ann-course/housing.csv')
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [32]:
dataset = raw_dataset.copy()
dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [33]:
dataset.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
20639,-121.24,39.37,16.0,2785.0,616.0,1387.0,530.0,2.3886,89400.0,INLAND


In [34]:
dataset.isnull().sum() / len(dataset)

Unnamed: 0,0
longitude,0.0
latitude,0.0
housing_median_age,0.0
total_rooms,0.0
total_bedrooms,0.010029
population,0.0
households,0.0
median_income,0.0
median_house_value,0.0
ocean_proximity,0.0


In [35]:
dataset.dropna(inplace=True)

dataset.isnull().sum() / len(dataset)

Unnamed: 0,0
longitude,0.0
latitude,0.0
housing_median_age,0.0
total_rooms,0.0
total_bedrooms,0.0
population,0.0
households,0.0
median_income,0.0
median_house_value,0.0
ocean_proximity,0.0


In [36]:
dataset.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0,20433.0
mean,-119.570689,35.633221,28.633094,2636.504233,537.870553,1424.946949,499.433465,3.871162,206864.413155
std,2.003578,2.136348,12.591805,2185.269567,421.38507,1133.20849,382.299226,1.899291,115435.667099
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1450.0,296.0,787.0,280.0,2.5637,119500.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5365,179700.0
75%,-118.01,37.72,37.0,3143.0,647.0,1722.0,604.0,4.744,264700.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [37]:
dataset.describe(include=['object'])

Unnamed: 0,ocean_proximity
count,20433
unique,5
top,<1H OCEAN
freq,9034


In [38]:
dataset.ocean_proximity.value_counts()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,9034
INLAND,6496
NEAR OCEAN,2628
NEAR BAY,2270
ISLAND,5


In [39]:
px.histogram(dataset, x='median_house_value')

In [40]:
dataset.median_house_value.value_counts()

Unnamed: 0_level_0,count
median_house_value,Unnamed: 1_level_1
500001.000000,958
137500.000000,119
162500.000000,116
112500.000000,103
187500.000000,92
...,...
321700.000000,1
300800.000000,1
393100.000000,1
203600.000000,1


In [41]:
index_to_drop = dataset[dataset.median_house_value == 500000].index
datset = dataset.drop(index=index_to_drop)
px.histogram(dataset, x='median_house_value')

In [42]:
dataset_dummies = pd.get_dummies(dataset, drop_first=True)
dataset_dummies.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,True,False


### <a name='a2'></a> 3. Podział na zbiór treningowy oraz testowy

In [43]:
train_dataset = dataset_dummies.sample(frac=0.8, random_state=0)
test_dataset = dataset_dummies.drop(train_dataset.index)

print(f'train_dataset length: {len(train_dataset)}')
print(f'test_dataset lenfth: {len(test_dataset)}')

train_dataset length: 16346
test_dataset lenfth: 4087


In [44]:
train_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
14185,-117.08,32.7,37.0,2176.0,418.0,1301.0,375.0,2.875,98900.0,False,False,False,True
6125,-117.91,34.11,20.0,3158.0,684.0,2396.0,713.0,3.525,153000.0,False,False,False,False
14095,-117.1,32.75,11.0,2393.0,726.0,1905.0,711.0,1.3448,91300.0,False,False,False,True
14359,-117.22,32.74,52.0,1260.0,202.0,555.0,209.0,7.2758,345200.0,False,False,False,True
18004,-121.99,37.29,32.0,2930.0,481.0,1336.0,481.0,6.4631,344100.0,False,False,False,False


In [45]:
test_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
10,-122.26,37.85,52.0,2202.0,434.0,910.0,402.0,3.2031,281500.0,False,False,True,False
13,-122.26,37.84,52.0,696.0,191.0,345.0,174.0,2.6736,191300.0,False,False,True,False
19,-122.27,37.84,52.0,1503.0,298.0,690.0,275.0,2.6033,162900.0,False,False,True,False
28,-122.28,37.84,50.0,2082.0,492.0,1131.0,473.0,1.6424,108900.0,False,False,True,False
40,-122.26,37.83,52.0,1665.0,419.0,946.0,395.0,2.0978,155400.0,False,False,True,False


In [46]:
px.scatter_matrix(train_dataset, dimensions=['median_house_value', 'housing_median_age', 'median_income', 'total_rooms'], color='median_house_value', height=700)

In [47]:
train_stats = train_dataset.describe()
train_stats.pop('median_house_value')
train_stats = train_stats.transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,16346.0,-119.564154,2.002618,-124.35,-121.79,-118.49,-118.0,-114.47
latitude,16346.0,35.630318,2.138574,32.55,33.93,34.25,37.71,41.95
housing_median_age,16346.0,28.664505,12.556764,1.0,18.0,29.0,37.0,52.0
total_rooms,16346.0,2622.235776,2169.548287,11.0,1448.0,2119.0,3120.75,39320.0
total_bedrooms,16346.0,535.281659,418.469078,3.0,296.0,432.5,644.0,6445.0
population,16346.0,1416.087055,1103.842065,3.0,784.25,1164.0,1711.0,28566.0
households,16346.0,496.758167,379.109535,3.0,280.0,408.0,600.0,6082.0
median_income,16346.0,3.869337,1.902228,0.4999,2.555675,3.5332,4.744225,15.0001


In [48]:
train_labels = train_dataset.pop('median_house_value')
test_labels = test_dataset.pop('median_house_value')

### <a name='a3'></a> 4. Standaryzacja danych

In [49]:
continuous_columns = [col for col in train_dataset.columns if not col.startswith('ocean_proximity')]

def norm(x):
    normed = x.copy()

    bool_cols = normed.select_dtypes(include=['bool', 'boolean']).columns
    normed[bool_cols] = normed[bool_cols].astype(int)

    normed[continuous_columns] = (normed[continuous_columns] - train_stats.loc[continuous_columns, 'mean']) / \
                                 train_stats.loc[continuous_columns, 'std']
    return normed

In [50]:
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [51]:
normed_train_data.isnull().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,0
population,0
households,0
median_income,0
ocean_proximity_INLAND,0
ocean_proximity_ISLAND,0


In [52]:
normed_train_data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
14185,1.240453,-1.370221,0.663825,-0.205681,-0.280264,-0.10426,-0.321169,-0.522722,0,0,0,1
6125,0.825996,-0.710903,-0.690027,0.246947,0.355387,0.887729,0.570394,-0.181018,0,0,0,0
14095,1.230466,-1.34684,-1.406772,-0.105661,0.455753,0.442919,0.565119,-1.327147,0,0,0,1


In [53]:
normed_test_data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
10,-1.346161,1.037926,1.8584,-0.193697,-0.242029,-0.458478,-0.249949,-0.35024,0,0,1,0
13,-1.346161,1.03325,1.8584,-0.887851,-0.822717,-0.970326,-0.851359,-0.628598,0,0,1,0
19,-1.351154,1.03325,1.8584,-0.515884,-0.567023,-0.657782,-0.584945,-0.665555,0,0,1,0


In [54]:
print(normed_test_data['ocean_proximity_INLAND'].dtype)

int64


In [55]:
normed_test_data = normed_test_data.values
normed_train_data = normed_train_data.values

### <a name='a4'></a> 5. Budowa modelu

In [56]:
def build_model():
    model = Sequential()
    model.add(Dense(1024, kernel_regularizer='l2', activation='relu', input_shape=[len(train_dataset.keys())]))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1))

    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mae', 'mse'])
    return model

In [57]:
model = build_model()
model.summary()


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



### <a name='a5'></a> 6. Trenowanie sieci

In [58]:
history = model.fit(normed_train_data, train_labels.values, epochs=150, validation_split=0.2, verbose=1, batch_size=32)

Epoch 1/150
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step - loss: 36970090496.0000 - mae: 153015.7031 - mse: 36970090496.0000 - val_loss: 6372820480.0000 - val_mae: 58217.6055 - val_mse: 6372820480.0000
Epoch 2/150
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - loss: 5990871552.0000 - mae: 56017.2891 - mse: 5990871552.0000 - val_loss: 4611885568.0000 - val_mae: 48816.4492 - val_mse: 4611885568.0000
Epoch 3/150
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - loss: 4675111424.0000 - mae: 48896.0859 - mse: 4675111424.0000 - val_loss: 4324369920.0000 - val_mae: 46805.7656 - val_mse: 4324369920.0000
Epoch 4/150
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - loss: 4419607040.0000 - mae: 47472.1172 - mse: 4419607040.0000 - val_loss: 4303235584.0000 - val_mae: 46889.5508 - val_mse: 4303235584.0000
Epoch 5/150
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s

In [59]:
def plot_hist(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    hist['rmse'] = np.sqrt(hist['mse'])
    hist['val_rmse'] = np.sqrt(hist['val_mse'])

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['mae'], name='mae', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_mae'], name='val_mae', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='MAE vs. VAL_MAE', xaxis_title='Epoki', yaxis_title='Mean Absolute Error', yaxis_type='log')
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['rmse'], name='rmse', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_rmse'], name='val_rmse', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='RMSE vs. VAL_RMSE', xaxis_title='Epoki', yaxis_title='Root Mean Squared Error', yaxis_type='log')
    fig.show()

plot_hist(history)

In [60]:
test_predictions = model.predict(normed_test_data).flatten()
test_predictions

[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


array([244901.05 , 203906.16 , 164435.47 , ..., 122822.625, 102642.57 , 115323.945], dtype=float32)

In [61]:
pred = pd.DataFrame(test_labels)
pred['predictions'] = test_predictions
pred.head()

Unnamed: 0,median_house_value,predictions
10,281500.0,244901.046875
13,191300.0,203906.15625
19,162900.0,164435.46875
28,108900.0,125499.4375
40,155400.0,155856.078125


In [62]:
fig = px.scatter(pred, 'median_house_value', 'predictions')
fig.add_trace(go.Scatter(x=[0, 500000], y=[0, 500000], mode='lines'))
fig.show()

In [63]:
pred.head()

Unnamed: 0,median_house_value,predictions
10,281500.0,244901.046875
13,191300.0,203906.15625
19,162900.0,164435.46875
28,108900.0,125499.4375
40,155400.0,155856.078125


In [64]:
pred['error'] = pred['median_house_value'] - pred['predictions']
pred.head()

Unnamed: 0,median_house_value,predictions,error
10,281500.0,244901.046875,36598.953125
13,191300.0,203906.15625,-12606.15625
19,162900.0,164435.46875,-1535.46875
28,108900.0,125499.4375,-16599.4375
40,155400.0,155856.078125,-456.078125


In [65]:
px.histogram(pred, 'error', marginal='rug', width=1000)