In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Read the CSV and Perform Basic Data Cleaning

In [5]:
# Load the data

housing_df = pd.read_csv('Data\clean_merged_data.csv')

In [6]:
housing_df.head()

Unnamed: 0,City,Longitude,latitude,Population,median_age,median_income,median_house_value,total_rooms,Bedrooms,Households,ocean_proximity,max_temp,Humidity,Cloudiness,wind_speed,Description,County,Employees,Establishments
0,Mission Viejo,-117.66,33.61,789,16,8.4112,286900,2022,254,270,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255
1,Mission Viejo,-117.66,33.62,1962,16,6.2177,256600,4065,661,636,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255
2,Mission Viejo,-117.67,33.61,1972,24,5.7871,227400,3859,661,624,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255
3,Mission Viejo,-117.66,33.61,1713,17,6.0471,248400,3464,519,530,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255
4,Mission Viejo,-117.66,33.61,860,21,7.1497,274000,1932,266,286,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255


In [7]:
#drop unnecessary/low value columns 'Longitude','latitude','City','County'

housing_df.drop(['City', 'County','Longitude','latitude'],axis=1, inplace=True)


In [8]:
#convert categorical values

housing_df=pd.get_dummies(housing_df)
housing_df.head()

Unnamed: 0,Population,median_age,median_income,median_house_value,total_rooms,Bedrooms,Households,max_temp,Humidity,Cloudiness,...,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,Description_broken clouds,Description_clear sky,Description_few clouds,Description_haze,Description_overcast clouds,Description_scattered clouds,Description_smoke,Description_thunderstorm
0,789,16,8.4112,286900,2022,254,270,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0
1,1962,16,6.2177,256600,4065,661,636,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0
2,1972,24,5.7871,227400,3859,661,624,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0
3,1713,17,6.0471,248400,3464,519,530,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0
4,860,21,7.1497,274000,1932,266,286,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0


In [9]:
# Split our preprocessed data into our features and target arrays
X = housing_df.drop(columns = ["median_house_value"])
y = housing_df['median_house_value']


In [10]:
# Split the preprocessed data into a training and testing dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=1/3)

In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Linear Regression

In [12]:
# Define the model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [13]:
# Fit the model
model.fit(X_train_scaled, y_train)

LinearRegression()

In [14]:
print(model.coef_)
print(model.intercept_)

[-5.69367879e+04  7.85802194e+03  7.04750226e+04  6.11368447e+03
  6.88516828e+03  4.80234909e+04  5.92503369e+03  1.34505638e+04
  1.97150188e+04 -3.15475934e+03  3.16834920e+05 -3.06643388e+05
 -7.52356209e+17 -6.39623928e+17 -2.47361426e+16 -4.84719566e+17
 -3.95203352e+17  3.01199093e+17  5.20520942e+17  4.42966001e+17
  4.95151312e+16  2.08613792e+17  2.50728456e+17  8.12950648e+16
  3.79938159e+16]
225385.53027222757


In [15]:
X.columns 
#'Bedrooms','Households','Employees','ocean_proximity_<1H OCEAN',

coef_df = pd.DataFrame(data= model.coef_,index= X.columns, columns = ['coef_value'])
coef_df

Unnamed: 0,coef_value
Population,-56936.79
median_age,7858.022
median_income,70475.02
total_rooms,6113.684
Bedrooms,6885.168
Households,48023.49
max_temp,5925.034
Humidity,13450.56
Cloudiness,19715.02
wind_speed,-3154.759


In [16]:
y_pred = model.predict(X_test_scaled)

In [17]:
results = pd.DataFrame({'Actual':y_test,'Predicted':y_pred})
results

Unnamed: 0,Actual,Predicted
9412,500001,330847.780272
9331,308900,270935.780272
4120,189800,183135.780272
10851,103600,104095.780272
618,366700,235647.780272
...,...,...
10120,137900,124063.780272
8009,500001,374239.780272
3005,156300,195543.780272
9492,179200,213583.780272


In [18]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error: 49596.080332173035
Mean Squared Error: 4560634611.67735
Root Mean Squared Error: 67532.47079499868


In [19]:
model.score(X_test_scaled, y_test)

0.6469528337450026

## Random Forest Regressor

In [20]:
random_forest = RandomForestRegressor(n_estimators = 1000, random_state = 1,criterion='squared_error',max_features = 'auto',
                                      max_depth = 7)

random_forest.fit(X_train_scaled,y_train)

RandomForestRegressor(max_depth=7, n_estimators=1000, random_state=1)

In [21]:
# Calculated actual v. predicted values for y

y_pred = random_forest.predict(X_test_scaled)
print(len(y_pred))
print(len(y_test))

print(f"y_pred ",y_pred)
print(f"y_test ", y_test)


df=pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
df


3818
3818
y_pred  [398961.56724012 228462.04722042 182703.48924931 ... 167276.99514246
 226195.14834486 264229.80582515]
y_test  9412     500001
9331     308900
4120     189800
10851    103600
618      366700
          ...  
10120    137900
8009     500001
3005     156300
9492     179200
6454     158900
Name: median_house_value, Length: 3818, dtype: int64


Unnamed: 0,Actual,Predicted
9412,500001,398961.567240
9331,308900,228462.047220
4120,189800,182703.489249
10851,103600,83863.092953
618,366700,225737.692548
...,...,...
10120,137900,94539.558996
8009,500001,348411.255977
3005,156300,167276.995142
9492,179200,226195.148345


In [22]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 43042.8736477768
Mean Squared Error: 3723029129.806957
Root Mean Squared Error: 61016.62994468767


In [23]:
random_forest.score(X_test_scaled, y_test)

0.7117934243629898

### Neural Network

In [25]:

import sklearn as skl
import tensorflow as tf


In [35]:
nn_model = tf.keras.models.Sequential()
# Add our first Dense layer, including the input layer
nn_model.add(tf.keras.layers.Dense(units=5, activation="relu", input_dim=25))

In [36]:
# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=5, activation="sigmoid"))

In [37]:
# Summary
nn_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 5)                 130       
                                                                 
 dense_5 (Dense)             (None, 5)                 30        
                                                                 
Total params: 160
Trainable params: 160
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mse"])

In [39]:
# Fit the model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100


Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [40]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

120/120 - 0s - loss: 63281344512.0000 - mse: 63281344512.0000 - 312ms/epoch - 3ms/step
Loss: 63281344512.0, Accuracy: 63281344512.0
