<a href="https://colab.research.google.com/github/harnalashok/deeplearning/blob/main/simpleNN_housing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Last amended: 29th October, 2022
# California Housing Dataset
# Experiments with NN
# https://www.kaggle.com/datasets/camnugent/california-housing-prices

In [26]:
# 1.0
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 1.1
import tensorflow as tf

# 1.2 Helper libraries
import numpy as np
import matplotlib.pyplot as plt


In [3]:
# 2.0 Mount Google Drive at /gdrive so files stored there can be read below.
#     This cell needs the Colab runtime (it imports google.colab).
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [4]:
# 2.1 Folder on the mounted Google Drive that holds the California housing data.
#     The trailing slash matters: the file name is appended with `+` when reading.

path = "/gdrive/MyDrive/Colab_data_files/california_housing_dataset/"


In [29]:
# 2.2 Load the housing CSV from the Drive folder into a DataFrame
data = pd.read_csv(filepath_or_buffer = path + "housing.csv")

In [30]:
# 2.2.1 Peek at the first five rows
data.head(n = 5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
# 2.2.2 (rows, columns) of the frame as loaded, before the target column is removed
data.shape

(20640, 10)

In [31]:
# 2.3 Pop out target: remove 'median_income' from the feature frame and keep it
#     as the regression target y. 'median_house_value' stays in `data` as a feature.
#     NOTE: pop() mutates `data`, so running this cell a second time without
#     reloading the CSV raises KeyError.
y = data.pop('median_income')

In [32]:
# 2.3.1 One column fewer now that the target has been popped
data.shape   # (20640, 9)

(20640, 9)

In [33]:
# 3.0 Count the missing values in every column
data.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_house_value      0
ocean_proximity         0
dtype: int64

In [34]:
# 3.1 Replace the missing total_bedrooms entries with 0 (rows that already
#     have a value are left untouched)
data.loc[data['total_bedrooms'].isna(), 'total_bedrooms'] = 0

In [35]:
# 3.2 First two rows after filling the nulls
data.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,358500.0,NEAR BAY


In [36]:
# 4.0 Replace the text categories of ocean_proximity with integer codes.
#     fit_transform() learns the categories and encodes the column in one call.
#     The fitted encoder stays available as `le`.
le = LabelEncoder()
data['ocean_proximity'] = le.fit_transform(data['ocean_proximity'])

In [37]:
# 4.1 Inspect the first three rows: ocean_proximity should now be numeric
data.iloc[:3]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,452600.0,3
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,358500.0,3
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,352100.0,3


In [38]:
# 5.0 Scale data: standardise every column of `data`.
#     fit_transform() learns the per-column statistics and applies them in one call.
#     NOTE: the scaler is fitted on the full data set, i.e. before the train/test
#     split made further down, so test rows contribute to the scaling statistics.
ss = StandardScaler()
X = ss.fit_transform(data)

In [39]:
# 5.1 First five rows of the scaled feature matrix
X[:5, :]

array([[-1.32783522,  1.05254828,  0.98214266, -0.8048191 , -0.95459349,
        -0.9744286 , -0.97703285,  2.12963148,  1.2910888 ],
       [-1.32284391,  1.04318455, -0.60701891,  2.0458901 ,  1.3569129 ,
         0.86143887,  1.66996103,  1.31415614,  1.2910888 ],
       [-1.33282653,  1.03850269,  1.85618152, -0.53574589, -0.81027221,
        -0.82077735, -0.84363692,  1.25869341,  1.2910888 ],
       [-1.33781784,  1.03850269,  1.85618152, -0.62421459, -0.7038057 ,
        -0.76602806, -0.73378144,  1.16510007,  1.2910888 ],
       [-1.33781784,  1.03850269,  1.85618152, -0.46240395, -0.59733918,
        -0.75984669, -0.62915718,  1.17289952,  1.2910888 ]])

In [44]:
# 6.0 Split data into train and test partitions (15% held out for testing).
#     random_state is fixed so that the split -- and therefore every metric
#     reported further down -- is the same after Restart & Run All.

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.15,
    random_state = 42
)
display(X_train.shape)
display(X_test.shape)

(17544, 9)

(3096, 9)

## Model

In [122]:
# 6.0 Start with an empty sequential model; layers are added one at a time below
model = tf.keras.models.Sequential()

In [123]:
# 6.1 Input layer: one input per feature column.
#     The width is read from the training matrix instead of hard-coding 9, so
#     the model stays consistent if the number of feature columns changes.
model.add( tf.keras.layers.Input(shape = (X_train.shape[1],) ))

In [124]:
# 6.2 Hidden layer: fully connected with ReLU activation.
#     Experiment: make it 5 units first and then 20 (not more or less).
hidden_layer = tf.keras.layers.Dense(units = 20, activation = 'relu')
model.add(hidden_layer)

In [117]:
# 6.2.1 Optional experiment: add a dropout layer after the hidden layer.
#       When doing so, also increase the number of units in the Dense layer
#       from 20 to 40. Uncomment the next line to enable it:
#model.add(tf.keras.layers.Dropout(rate = 0.5 ))

In [125]:
# 6.3 Output layer: a single unit with no activation (linear output).
#     Experiment: first try activation = 'sigmoid', then remove it again.
#     A sigmoid squashes predictions into (0, 1), while the target values
#     (e.g. 8.3252 in the first row) go well beyond 1.
output_layer = tf.keras.layers.Dense(units = 1)
model.add(output_layer)

In [126]:
# 6.4 Model summary: prints each layer with its output shape and parameter count
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_23 (Dense)            (None, 20)                200       
                                                                 
 dense_24 (Dense)            (None, 1)                 21        
                                                                 
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________


In [127]:
# 6.5 Compile model: mean absolute error is both the training loss and the
#     reported metric.
#     Experiment: try the default optimizer first and then 'adam'
#     (it may not make much difference).
model.compile(
    optimizer = 'adam',
    loss = 'mae',
    metrics = ['mae'],
)

In [None]:
# 7.0 Train for 70 epochs and keep the per-epoch record in `history`.
#     The test split is passed as validation data, so the validation metrics
#     shown during training are computed on X_test / y_test.
history = model.fit(
    x = X_train,
    y = y_train,
    validation_data = (X_test, y_test),
    epochs = 70,
)

In [None]:
# 7.1 Score the trained model on the test split.
#     The result is [loss, mae]; both numbers coincide because the loss is
#     also 'mae'.
model.evaluate(x = X_test, y = y_test)



[2.8809735774993896, 2.8809735774993896]

In [None]:
######################### I am done #################