# Problem regresji - California Housing Price Prediction
### Spis treści:
[1. Ładowanie danych i wstępna eksploracja](#sekcja-1)<br>
[2. Przygotowanie zbioru treningowego i testowego](#sekcja-2)<br>
[3. Standaryzacja danych](#sekcja-3)<br>
[4. Budowa modelu](#sekcja-4)<br>
[5. Ocena modelu](#sekcja-5)

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout
import plotly.express as px

# Display settings: NumPy prints floats with 12 decimals, without scientific
# notation, and wraps lines at 150 characters.
np.set_printoptions(precision=12, suppress=True, linewidth=150)
# pandas renders floats with six decimal places.
pd.options.display.float_format='{:.6f}'.format

In [None]:
# Show the installed TensorFlow version (the saved output reports 2.13.0).
tf.__version__

'2.13.0'

<a name="sekcja-1"></a>
### 1. Ładowanie danych i wstępna eksploracja

In [None]:
import os

# Location of the housing CSV. The default is the original Colab path, which
# needs Google Drive mounted at /content/drive (no mount cell is visible in this
# notebook — NOTE(review): confirm how Drive gets mounted). Set the HOUSING_CSV
# environment variable to run the notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "HOUSING_CSV",
    "/content/drive/MyDrive/Uczenie Maszynowe/Notatniki/housing.csv",
)

raw = pd.read_csv(DATA_PATH)
# Column dtypes and non-null counts of the freshly loaded frame.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [None]:
# Work on a copy so the originally loaded frame `raw` stays unmodified.
data = raw.copy()
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Inspect the last rows of the frame.
data.tail()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND
20639,-121.24,39.37,16.0,2785.0,616.0,1387.0,530.0,2.3886,89400.0,INLAND


In [None]:
# Fraction of missing values per column of the raw frame
# (mean of the boolean null mask == null count / row count).
raw.isnull().mean()

longitude            0.000000
latitude             0.000000
housing_median_age   0.000000
total_rooms          0.000000
total_bedrooms       0.010029
population           0.000000
households           0.000000
median_income        0.000000
median_house_value   0.000000
ocean_proximity      0.000000
dtype: float64

In [None]:
# Drop every row that has at least one missing value. Reassigning (instead of
# inplace=True) keeps the step explicit and chainable.
data = data.dropna()
# Verify: fraction of missing values per column should now be zero everywhere.
data.isnull().sum() / len(data)

longitude            0.000000
latitude             0.000000
housing_median_age   0.000000
total_rooms          0.000000
total_bedrooms       0.000000
population           0.000000
households           0.000000
median_income        0.000000
median_house_value   0.000000
ocean_proximity      0.000000
dtype: float64

In [None]:
# Re-check dtypes and non-null counts after dropping incomplete rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [None]:
# Summary statistics of the numeric columns, transposed to one row per column.
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,20433.0,-119.570689,2.003578,-124.35,-121.8,-118.49,-118.01,-114.31
latitude,20433.0,35.633221,2.136348,32.54,33.93,34.26,37.72,41.95
housing_median_age,20433.0,28.633094,12.591805,1.0,18.0,29.0,37.0,52.0
total_rooms,20433.0,2636.504233,2185.269567,2.0,1450.0,2127.0,3143.0,39320.0
total_bedrooms,20433.0,537.870553,421.38507,1.0,296.0,435.0,647.0,6445.0
population,20433.0,1424.946949,1133.20849,3.0,787.0,1166.0,1722.0,35682.0
households,20433.0,499.433465,382.299226,1.0,280.0,409.0,604.0,6082.0
median_income,20433.0,3.871162,1.899291,0.4999,2.5637,3.5365,4.744,15.0001
median_house_value,20433.0,206864.413155,115435.667099,14999.0,119500.0,179700.0,264700.0,500001.0


In [None]:
# Summary (count / unique / top / freq) restricted to object-typed columns.
data.describe(include=["object"])

Unnamed: 0,ocean_proximity
count,20433
unique,5
top,<1H OCEAN
freq,9034


In [None]:
# Number of rows in each ocean_proximity category.
data["ocean_proximity"].value_counts()

<1H OCEAN     9034
INLAND        6496
NEAR OCEAN    2628
NEAR BAY      2270
ISLAND           5
Name: ocean_proximity, dtype: int64

In [None]:
# Histogram of median_house_value, the column later used as the regression target.
px.histogram(data, x="median_house_value")

In [None]:
# Frequency of each distinct target value. In the saved output 500001 is by far
# the most frequent value (presumably a cap applied in the source data — TODO confirm).
data["median_house_value"].value_counts()

500001.000000    958
137500.000000    119
162500.000000    116
112500.000000    103
187500.000000     92
                ... 
359200.000000      1
51200.000000       1
39800.000000       1
377600.000000      1
47000.000000       1
Name: median_house_value, Length: 3833, dtype: int64

In [None]:
# Keep only the rows whose target differs from 500001.
at_cap = data["median_house_value"].eq(500001)
data = data.loc[~at_cap]

In [None]:
# Re-plot the target distribution after filtering out the 500001 rows.
px.histogram(data, x="median_house_value")

In [None]:
# One-hot encode the non-numeric column(s). drop_first=True omits the first
# level of each encoded column, so that level is represented by all-zero indicators.
data_dummies = pd.get_dummies(data=data, drop_first=True)
data_dummies.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0


<a name="sekcja-2"></a>
### 2. Przygotowanie zbioru treningowego i testowego

In [None]:
# Draw 80% of the rows at random (fixed seed) for training; every row whose
# index label was not drawn goes to the test set.
train_dataset = data_dummies.sample(frac=0.8, random_state=0)
train_index = train_dataset.index
test_dataset = data_dummies.drop(index=train_index)

print(f"Zbiór treningowy: {train_dataset.shape}")
print(f"Zbiór testowy: {test_dataset.shape}")

Zbiór treningowy: (15580, 13)
Zbiór testowy: (3895, 13)


In [None]:
# Separate the regression target from the features. `pop` removes the column
# from the frames in place, so re-running this cell requires re-running the
# split cell first.
target_column = "median_house_value"
train_label, test_label = (
    frame.pop(target_column).to_numpy()
    for frame in (train_dataset, test_dataset)
)

<a name="sekcja-3"></a>
### 3. Standaryzacja danych

In [None]:
from sklearn.preprocessing import StandardScaler

skaler = StandardScaler()

# Learn the per-column mean and standard deviation from the training features
# only, and standardize the training set with them.
normed_train_dataset = skaler.fit_transform(train_dataset)
# Apply the *training* statistics to the test features. Calling fit_transform
# here would re-fit the scaler on the test set, so train and test would be
# scaled differently and test information would leak into preprocessing.
normed_test_dataset = skaler.transform(test_dataset)

<a name="sekcja-4"></a>
### 4. Budowa modelu

In [None]:
def nn_model(input_dim=None):
    """Build and compile the fully connected regression network.

    Architecture: Dense(1024, relu, L2 kernel regularizer) -> Dropout(0.2)
    -> Dense(512, relu) -> Dropout(0.2) -> Dense(128, relu) -> Dropout(0.2)
    -> Dense(1) with no activation (linear output).

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the number of columns of the
        notebook-level ``train_dataset`` is used, so existing ``nn_model()``
        calls behave exactly as before.

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer, MSE loss, and MAE / MSE metrics.
    """
    if input_dim is None:
        # Resolved at call time so the default follows the current feature frame.
        input_dim = len(train_dataset.keys())

    model = Sequential()
    model.add(Dense(1024, kernel_regularizer="l2", activation="relu", input_shape=[input_dim]))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.2))
    # Single unit without activation: the network outputs the predicted value directly.
    model.add(Dense(1))

    model.compile(optimizer="adam",
                loss="mse",
                metrics=["mae", "mse"])

    return model

In [None]:
# Seed the Python, NumPy and TensorFlow random generators before the model is
# created so weight initialisation, dropout masks and batch shuffling are
# repeatable across Restart & Run All. (Bit-exact GPU determinism may
# additionally require tf.config.experimental.enable_op_determinism().)
tf.keras.utils.set_random_seed(0)

model = nn_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1024)              13312     
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_2 (Dense)             (None, 128)               65664     
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 1

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 4 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights from the last (worse) epoch.
es = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=4,
    restore_best_weights=True,
    verbose=1,
)

# epochs=1000 is only an upper bound; early stopping ends training sooner.
# validation_split=0.2 holds out 20% of the training data for validation.
history = model.fit(
    normed_train_dataset,
    train_label,
    epochs=1000,
    validation_split=0.2,
    batch_size=32,
    verbose=1,
    callbacks=[es],
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [None]:
# Per-epoch training history as a table, extended with the epoch number and
# RMSE columns (square roots of the MSE metrics) for train and validation.
df_hist = pd.DataFrame(history.history).assign(
    epoch=history.epoch,
    rmse=lambda frame: np.sqrt(frame["mse"]),
    val_rmse=lambda frame: np.sqrt(frame["val_mse"]),
)
df_hist.head()

Unnamed: 0,loss,mae,mse,val_loss,val_mae,val_mse,epoch,rmse,val_rmse
0,17126267904.0,95444.671875,17126267904.0,4574177792.0,48820.96875,4574177792.0,0,130867.367606,67632.668083
1,4423407616.0,47839.136719,4423407616.0,3792577792.0,44682.640625,3792577792.0,1,66508.70331,61583.908548
2,3997463040.0,45505.535156,3997463040.0,3608647168.0,43764.761719,3608647168.0,2,63225.493592,60072.016514
3,3815079168.0,44475.308594,3815079168.0,3614172160.0,44225.273438,3614172160.0,3,61766.327137,60117.985329
4,3780323328.0,44100.992188,3780323328.0,3460371456.0,42805.605469,3460371456.0,4,61484.334005,58824.922065


<a name="sekcja-5"></a>
### 5. Ocena modelu

In [None]:
import plotly.graph_objects as go

# Training vs. validation RMSE per epoch, one trace per history column.
fig = go.Figure()
for column in ("rmse", "val_rmse"):
    curve = go.Scatter(x=df_hist["epoch"], y=df_hist[column], name=column, mode="markers+lines")
    fig.add_trace(curve)

fig.update_layout(
    width=1000,
    height=500,
    title="RMSE vs. val_RMSE",
    xaxis_title="Epochs",
    yaxis_title="RMSE",
)
fig.show()

In [None]:
# Score the model on the held-out test set; the result lists the loss followed
# by the compiled metrics (mae, mse).
model.evaluate(normed_test_dataset, test_label)



[2636787456.0, 36022.71484375, 2636787456.0]

In [None]:
# Print each test-set score next to its metric name.
test_scores = model.evaluate(normed_test_dataset, test_label)
metric_names = model.metrics_names
for position, name in enumerate(metric_names):
    value = test_scores[position]
    print(f"Metric: {name} = {value}")

Metric: loss = 2636787456.0
Metric: mae = 36022.71484375
Metric: mse = 2636787456.0


In [None]:
# Model predictions for the standardized test features.
predictions = model.predict(normed_test_dataset)



In [None]:
# Actual vs. predicted values side by side. `predictions` comes out of the
# single-unit output layer as one column, so it is flattened to one dimension.
df = pd.DataFrame(
    {
        "median_house_value": test_label,
        "predictions": predictions.ravel(),
    }
)
df.head()

Unnamed: 0,median_house_value,predictions
0,281500.0,249149.34375
1,191300.0,210429.671875
2,162900.0,192335.6875
3,93800.0,172321.34375
4,155400.0,187460.03125


In [None]:
# Actual (x) against predicted (y) values; the straight line from (0, 0) to
# (500000, 500000) marks where prediction equals the actual value.
fig = px.scatter(df, x="median_house_value", y="predictions")
identity_line = go.Scatter(x=[0,500000], y=[0,500000], mode="lines")
fig.add_trace(identity_line)

In [None]:
# Rebuild the comparison table and add the prediction error (actual - predicted).
predicted_values = predictions.ravel()
df = pd.DataFrame(
    {
        "median_house_value": test_label,
        "predictions": predicted_values,
        "error": test_label - predicted_values,
    }
)
df.head()

Unnamed: 0,median_house_value,predictions,error
0,281500.0,249149.34375,32350.65625
1,191300.0,210429.671875,-19129.671875
2,162900.0,192335.6875,-29435.6875
3,93800.0,172321.34375,-78521.34375
4,155400.0,187460.03125,-32060.03125


In [None]:
# Distribution of the prediction errors (actual - predicted), with a rug plot in the margin.
px.histogram(df, x="error", marginal="rug", width=1000)