<a href="https://colab.research.google.com/github/harnalashok/deeplearning/blob/main/simpleNN_housing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Last amended: 29th October, 2022
# California Housing Dataset
# Experiments with NN
# https://www.kaggle.com/datasets/camnugent/california-housing-prices

In [12]:
# 1.0
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 1.1
import tensorflow as tf

# 1.2 Helper libraries
import numpy as np
import matplotlib.pyplot as plt


In [13]:
# 2.0 Mount Google Drive at /gdrive (Colab-only: needs the google.colab package).
#     The dataset folder used below lives under this mount point.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [14]:
# 2.1 Folder on the mounted Drive that holds the California housing CSV
#     (keep the trailing slash: the file name is appended by string concatenation)

path = "/gdrive/MyDrive/Colab_data_files/california_housing_dataset/"


In [27]:
# 2.2 Load the housing CSV from the Drive folder into a DataFrame
csv_file = path + "housing.csv"
data = pd.read_csv(csv_file)

In [28]:
# 2.2.1 Peek at the first five rows to check the columns and value ranges
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [29]:
# 2.2.2 (rows, columns) of the raw frame; the target column is still included
data.shape

(20640, 10)

In [30]:
# 2.3 Pop out target: removes 'median_house_value' from `data` in place
#     and keeps it as the Series `y`.
#     NOTE: not re-runnable -- a second run raises KeyError because the
#     column is already gone; reload the CSV (step 2.2) first.
y = data.pop('median_house_value')

In [31]:
# 2.3.1 Only the predictor columns remain after popping the target
data.shape   # (20640, 9)

(20640, 9)

In [36]:
# 3.0 Check NULL: count missing values per column.
#     NOTE: in top-to-bottom order this runs before the imputation in
#     step 3.1, so on a fresh run it reports the missing values of the raw
#     file; an all-zero result means the cell was re-run after the fill.
data.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
dtype: int64

In [34]:
# 3.0.1 Median of total_bedrooms -- the value step 3.1 uses for imputation
data['total_bedrooms'].median()

435.0

In [35]:
# 3.1 Impute missing total_bedrooms entries with the column median

bedrooms_median = data['total_bedrooms'].median()
data['total_bedrooms'] = data['total_bedrooms'].fillna(bedrooms_median)

In [22]:
# 3.2 Show the first two rows of the predictor frame
data[:2]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY


In [37]:
# 4.0
# Integer-encode ocean_proximity: every category string is replaced by an
# integer code, learned and applied in a single fit_transform call.
# NOTE: integer codes imply an ordering the categories do not have;
#       one-hot encoding is the usual alternative for nominal features.

le = LabelEncoder()
data['ocean_proximity'] = le.fit_transform(data['ocean_proximity'])

In [38]:
# 4.1 Check data again: ocean_proximity should now hold integer codes
data.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,3
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,3
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,3


In [39]:
# 5.0 Scale data: standardise every column (including the encoded
#     ocean_proximity codes) with a single fit_transform call.
#     NOTE: the scaler is fitted on ALL rows, i.e. before the train/test
#     split further down, so test-set statistics leak into the training
#     features.
ss = StandardScaler()
X = ss.fit_transform(data)

In [40]:
# 5.1 First five standardised rows (a plain array: column names are gone)
X[:5]

array([[-1.32783522,  1.05254828,  0.98214266, -0.8048191 , -0.97247648,
        -0.9744286 , -0.97703285,  2.34476576,  1.2910888 ],
       [-1.32284391,  1.04318455, -0.60701891,  2.0458901 ,  1.35714343,
         0.86143887,  1.66996103,  2.33223796,  1.2910888 ],
       [-1.33282653,  1.03850269,  1.85618152, -0.53574589, -0.82702426,
        -0.82077735, -0.84363692,  1.7826994 ,  1.2910888 ],
       [-1.33781784,  1.03850269,  1.85618152, -0.62421459, -0.71972345,
        -0.76602806, -0.73378144,  0.93296751,  1.2910888 ],
       [-1.33781784,  1.03850269,  1.85618152, -0.46240395, -0.61242263,
        -0.75984669, -0.62915718, -0.012881  ,  1.2910888 ]])

In [41]:
# 5.2 Split data: hold out 15% of the rows as the test set.
#     random_state pins the shuffle so the same rows land in each set on
#     every run, which keeps results comparable across experiments.

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.15,
    random_state = 42
)
# Show both shapes; rows should add up to the full data set
display(X_train.shape, X_test.shape)

(17544, 9)

(3096, 9)

## Model

In [76]:
# 6.0 Start an empty Sequential model; layers are appended cell by cell.
#     Seed Python, NumPy and TensorFlow first so that weight initialisation
#     and dropout masks repeat from run to run.
tf.keras.utils.set_random_seed(42)
model = tf.keras.Sequential()

In [77]:
# 6.1 Input layer: one input per feature column.
#     The width is taken from the training matrix instead of hard-coding 9,
#     so the model still fits if columns are added or removed upstream.
model.add( tf.keras.layers.Input(shape = (X_train.shape[1],) ))

In [78]:
# 6.2 Start: first hidden layer (40 ReLU units)
model.add(tf.keras.layers.Dense(40, activation = 'relu'))  # Make it 5 and then 20 (not more or less)

In [79]:
# 6.2.1 Experiment with adding a dropout layer,
#       but then increase the number of units in the Dense layer from 20 to 40.
#       rate = 0.5 randomly zeroes half of the incoming activations during training.
model.add(tf.keras.layers.Dropout(rate = 0.5 ))

In [80]:
# 6.2.2 Second hidden layer (20 ReLU units), again followed by 50% dropout
model.add(tf.keras.layers.Dense(20, activation = 'relu'))  
model.add(tf.keras.layers.Dropout(rate = 0.5 ))

In [81]:
# 6.2.3 Third hidden layer (10 ReLU units); no dropout follows it as coded
model.add(tf.keras.layers.Dense(10, activation = 'relu'))  
# Optional experiment: uncomment the next line to add dropout here as well
#model.add(tf.keras.layers.Dropout(rate = 0.5 ))

In [82]:
# 6.3 Output layer: a single unit producing the predicted value.
#     Experiment first with activation of sigmoid
#     and then with no activation function (the variant coded here).
model.add(tf.keras.layers.Dense(1))    # No activation argument: try 'sigmoid' first, then remove it again

In [83]:
# 6.4 Model summary: layer output shapes and parameter counts
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_11 (Dense)            (None, 40)                400       
                                                                 
 dropout_6 (Dropout)         (None, 40)                0         
                                                                 
 dense_12 (Dense)            (None, 20)                820       
                                                                 
 dropout_7 (Dropout)         (None, 20)                0         
                                                                 
 dense_13 (Dense)            (None, 10)                210       
                                                                 
 dense_14 (Dense)            (None, 1)                 11        
                                                                 
Total params: 1,441
Trainable params: 1,441
Non-traina

In [84]:
# 6.5 Compile model
#     Mean absolute error serves as both the training loss and the
#     reported metric, so the metric simply repeats the loss.
#     Expt: try first with the default optimizer and then with 'adam';
#     it may not make much difference.
model.compile(
    optimizer = 'adam',
    loss = 'mae',
    metrics = ['mae'],
)

In [85]:
# 7.0 Train for 70 epochs and keep the per-epoch curves in `history`.
#     NOTE: the test split doubles as validation data here, so the
#     evaluate() call in 7.1 scores rows already monitored during training.
history = model.fit(
    x = X_train,
    y = y_train,
    epochs = 70,
    validation_data = (X_test, y_test),
)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


In [None]:
# 7.1 Score the trained model on the test split.
#     Returns [loss, metric]; both are MAE here, hence two equal numbers.
#     NOTE(review): a recorded MAE below 1 is tiny next to raw house values
#     in the hundreds of thousands -- presumably that run used a rescaled
#     target; confirm before quoting the figure.
model.evaluate(X_test,y_test)



[0.6291962265968323, 0.6291962265968323]

In [None]:
######################### I am done #################