In [1]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries used further down.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Read the CSV and Perform Basic Data Cleaning

In [5]:
# Load the data
# The path uses a forward slash so it resolves on every operating system and
# does not rely on an unrecognised backslash escape inside the string literal.

housing_df = pd.read_csv('Data/clean_merged_data.csv')

In [6]:
# Preview the first rows of the freshly loaded data.
housing_df.head()

Unnamed: 0,City,Longitude,latitude,Population,median_age,median_income,median_house_value,total_rooms,Bedrooms,Households,ocean_proximity,max_temp,Humidity,Cloudiness,wind_speed,Description,County,Employees,Establishments
0,Mission Viejo,-117.66,33.61,789,16,8.4112,286900,2022,254,270,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255
1,Mission Viejo,-117.66,33.62,1962,16,6.2177,256600,4065,661,636,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255
2,Mission Viejo,-117.67,33.61,1972,24,5.7871,227400,3859,661,624,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255
3,Mission Viejo,-117.66,33.61,1713,17,6.0471,248400,3464,519,530,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255
4,Mission Viejo,-117.66,33.61,860,21,7.1497,274000,1932,266,286,<1H OCEAN,94.75,63,59,5.01,broken clouds,Orange,1191075,71255


In [7]:
# Remove the identifier / location columns that are not used as features.
low_value_columns = ['City', 'County', 'Longitude', 'latitude']

housing_df = housing_df.drop(columns=low_value_columns)


In [8]:
#convert categorical values
# One-hot encode every remaining text column. The dtype is pinned to uint8 so
# the indicator columns are 0/1 integers on every pandas version (pandas 2.0
# changed the default to bool, which would alter the tables shown below and
# drop the indicators from describe()).

housing_df = pd.get_dummies(housing_df, dtype="uint8")
housing_df.head()

Unnamed: 0,Population,median_age,median_income,median_house_value,total_rooms,Bedrooms,Households,max_temp,Humidity,Cloudiness,...,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,Description_broken clouds,Description_clear sky,Description_few clouds,Description_haze,Description_overcast clouds,Description_scattered clouds,Description_smoke,Description_thunderstorm
0,789,16,8.4112,286900,2022,254,270,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0
1,1962,16,6.2177,256600,4065,661,636,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0
2,1972,24,5.7871,227400,3859,661,624,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0
3,1713,17,6.0471,248400,3464,519,530,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0
4,860,21,7.1497,274000,1932,266,286,94.75,63,59,...,0,0,1,0,0,0,0,0,0,0


In [9]:
# Bucket the target (median_house_value) into $100k-wide bands for classification.
# Adjust the band table below to change the grouping.

df2 = housing_df.copy()

# (lower bound, upper bound, which bounds are inclusive, band label)
value_bands = [
    (0, 100000, 'both', '<= $100k'),
    (100000, 200000, 'right', 'between $100k and $200k'),
    (200000, 300000, 'right', 'between $200k and $300k'),
    (300000, 400000, 'right', 'between $300k and $400k'),
    (400000, 1000000000, 'right', '>$400k'),
]

for lower, upper, closed, label in value_bands:
    in_band = df2['median_house_value'].between(lower, upper, closed)
    df2.loc[in_band, 'house_value_group'] = label

In [10]:
# Turn the text band labels into integer class codes.
# LabelEncoder numbers the classes in sorted order of the label strings, so the
# codes are identifiers only and do not follow price order.

le = LabelEncoder()
df2['house_value_group'] = le.fit_transform(df2['house_value_group'])

# Print a preview of the frame, the class counts and the encoded column.
for summary in (df2.head(), df2['house_value_group'].value_counts(), df2.house_value_group):
    print(summary)


   Population  median_age  median_income  median_house_value  total_rooms  \
0         789          16         8.4112              286900         2022   
1        1962          16         6.2177              256600         4065   
2        1972          24         5.7871              227400         3859   
3        1713          17         6.0471              248400         3464   
4         860          21         7.1497              274000         1932   

   Bedrooms  Households  max_temp  Humidity  Cloudiness  ...  \
0       254         270     94.75        63          59  ...   
1       661         636     94.75        63          59  ...   
2       661         624     94.75        63          59  ...   
3       519         530     94.75        63          59  ...   
4       266         286     94.75        63          59  ...   

   ocean_proximity_NEAR OCEAN  Description_broken clouds  \
0                           0                          1   
1                           0   

# Split the Data into Training and Testing

In [11]:
# Features: every column of the encoded frame except the raw target value.
X = housing_df.drop(columns=["median_house_value"])

# Target: the integer-encoded house value band.
y = df2["house_value_group"].to_numpy()


In [12]:
# Display the feature matrix.
X

Unnamed: 0,Population,median_age,median_income,total_rooms,Bedrooms,Households,max_temp,Humidity,Cloudiness,wind_speed,...,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,Description_broken clouds,Description_clear sky,Description_few clouds,Description_haze,Description_overcast clouds,Description_scattered clouds,Description_smoke,Description_thunderstorm
0,789,16,8.4112,2022,254,270,94.75,63,59,5.01,...,0,0,1,0,0,0,0,0,0,0
1,1962,16,6.2177,4065,661,636,94.75,63,59,5.01,...,0,0,1,0,0,0,0,0,0,0
2,1972,24,5.7871,3859,661,624,94.75,63,59,5.01,...,0,0,1,0,0,0,0,0,0,0
3,1713,17,6.0471,3464,519,530,94.75,63,59,5.01,...,0,0,1,0,0,0,0,0,0,0
4,860,21,7.1497,1932,266,286,94.75,63,59,5.01,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11448,4005,9,8.3931,10484,1603,1419,95.76,57,20,11.50,...,0,0,0,0,1,0,0,0,0,0
11449,1665,16,7.2554,3781,504,499,95.76,57,20,11.50,...,0,0,0,0,1,0,0,0,0,0
11450,2146,16,6.1504,4390,660,633,95.76,57,20,11.50,...,0,0,0,0,1,0,0,0,0,0
11451,2061,13,7.3681,5415,827,714,95.76,57,20,11.50,...,0,0,0,0,1,0,0,0,0,0


In [13]:
# Display the encoded target array.
y

array([3, 3, 3, ..., 3, 4, 3])

In [14]:
# Summary statistics for each feature column.
X.describe()

Unnamed: 0,Population,median_age,median_income,total_rooms,Bedrooms,Households,max_temp,Humidity,Cloudiness,wind_speed,...,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,Description_broken clouds,Description_clear sky,Description_few clouds,Description_haze,Description_overcast clouds,Description_scattered clouds,Description_smoke,Description_thunderstorm
count,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0,...,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0,11453.0
mean,1418.18231,29.464682,4.020455,2555.83166,530.219244,496.074129,93.964839,55.465118,17.593731,9.627652,...,0.116039,0.06924,0.094036,0.563608,0.233127,0.002532,0.039553,0.059373,0.006374,0.001397
std,1059.406866,12.138935,1.979255,2097.909448,411.703906,370.943204,6.659285,17.079511,26.313119,3.740398,...,0.320286,0.253872,0.291892,0.495959,0.422841,0.050258,0.194915,0.236332,0.079585,0.037352
min,3.0,1.0,0.4999,6.0,2.0,2.0,61.84,9.0,0.0,1.01,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,790.0,20.0,2.6477,1421.0,292.0,279.0,91.4,46.0,0.0,6.91,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1170.0,30.0,3.6458,2077.0,429.0,408.0,95.31,59.0,0.0,11.5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1729.0,37.0,4.9643,3052.0,638.0,599.0,97.93,66.0,20.0,11.5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,15507.0,52.0,15.0001,32054.0,5419.0,5050.0,108.09,95.0,100.0,21.85,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
########## 75/25 train/test split; re-specify test_size as needed
# NOTE: test_size=0.25 is scikit-learn's default and is spelled out here so the
# ratio is visible. This is a 75/25 split, not 80/20 -- pass test_size=0.2 for
# an 80/20 split.
# stratify=y keeps the class proportions of the target equal in both subsets.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, stratify=y
)

In [16]:
# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both subsets, so the test rows never influence the scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [17]:
# Check the shapes of the training and test sets.
named_splits = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)

for split_name, split in named_splits:
    print(f"{split_name} ", split.shape)

X_train  (8589, 25)
X_test  (2864, 25)
y_train  (8589,)
y_test  (2864,)


# Ensemble Learners
Run Random Forest and evaluate

Depending on need, compare multiple algorithms to determine which one gives the best performance:

- Logistic Regression
- Random Forest Regressor
- Neural Network
- Hist Gradient Boosting Regressor



Note: Use a random state of 1 for each algorithm to ensure consistency between tests

# Logistic Regression

In [18]:
# Define the model
from sklearn.linear_model import LogisticRegression

# multi_class is deliberately not passed: 'auto' is already the default, and the
# parameter is deprecated in scikit-learn 1.5+, so naming it only adds a
# warning now and an error once it is removed.
model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=200)

In [19]:
# Fit the model
# Training uses the standardised training features and the encoded target.
model.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [20]:
# Predict the class code of every row in the standardised test set.
y_pred = model.predict(X_test_scaled)
y_pred

array([3, 2, 3, ..., 3, 3, 1])

In [21]:
# Side-by-side table of the true and predicted class codes for the test rows.
results = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
results

Unnamed: 0,Actual,Predicted
0,1,3
1,2,2
2,3,3
3,3,3
4,4,3
...,...,...
2859,1,4
2860,3,2
2861,1,3
2862,3,3


In [22]:
# Fraction of test rows whose predicted class equals the true class.
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6243016759776536


# Random Forest Classifier

In [23]:
# Random forest of 50 trees, each at most 7 levels deep, with a fixed seed.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
rf_model = RandomForestClassifier(n_estimators=50, random_state=1, max_features='sqrt', max_depth=7)

In [24]:
# Train the random forest on the standardised training features.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [25]:
# Predict the class code of every row in the standardised test set.
predictions = rf_model.predict(X_test_scaled)

In [26]:
# Build the confusion matrix with an explicit class order, and derive the row
# and column labels from that same order. The labels therefore always match
# the counts they describe and are formatted consistently, whatever the
# number of classes.
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [27]:
# Report the random forest results: confusion matrix, accuracy and the
# per-class precision / recall / f1 table.
print("Confusion Matrix")
display(cm_df)

print("Accuracy Score : {}".format(acc_score))

print("Classification Report")
report_text = classification_report(y_test, predictions)
print(report_text)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 0,222,0,114,0,0
Actual 1,1,164,31,78,7
Actual_2,50,1,950,102,1
Actual_3,4,4,344,448,3
Actual_4,2,34,71,203,30


Accuracy Score : 0.6333798882681564
Classification Report
              precision    recall  f1-score   support

           0       0.80      0.66      0.72       336
           1       0.81      0.58      0.68       281
           2       0.63      0.86      0.73      1104
           3       0.54      0.56      0.55       803
           4       0.73      0.09      0.16       340

    accuracy                           0.63      2864
   macro avg       0.70      0.55      0.57      2864
weighted avg       0.65      0.63      0.60      2864



In [28]:
# Importance score of each feature in the fitted random forest, one entry per
# feature column.
importances = rf_model.feature_importances_
importances

array([4.07540432e-02, 3.08869820e-02, 3.05064392e-01, 6.11850746e-02,
       2.19524578e-02, 2.20640857e-02, 9.18295491e-02, 9.00722978e-02,
       1.13120975e-02, 3.96582487e-02, 8.38951165e-02, 9.16317876e-02,
       1.36412099e-02, 6.37422329e-02, 1.17457423e-04, 8.74826569e-03,
       3.01775198e-03, 5.99937982e-03, 4.52150723e-03, 3.39326076e-03,
       9.75784724e-05, 1.74205160e-03, 4.16100475e-03, 4.51395305e-04,
       6.07717543e-05])

In [29]:
# Pair every importance score with its column name and rank from most to
# least important.
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.3050643918953775, 'median_income'),
 (0.09182954913263268, 'max_temp'),
 (0.09163178764644841, 'Establishments'),
 (0.09007229779836373, 'Humidity'),
 (0.08389511650494208, 'Employees'),
 (0.06374223291572177, 'ocean_proximity_INLAND'),
 (0.0611850746250479, 'total_rooms'),
 (0.04075404318101802, 'Population'),
 (0.03965824870815149, 'wind_speed'),
 (0.03088698204802871, 'median_age'),
 (0.02206408566192164, 'Households'),
 (0.021952457770797587, 'Bedrooms'),
 (0.013641209868115591, 'ocean_proximity_<1H OCEAN'),
 (0.011312097465447582, 'Cloudiness'),
 (0.008748265686906835, 'ocean_proximity_NEAR BAY'),
 (0.005999379817989007, 'Description_broken clouds'),
 (0.00452150722743291, 'Description_clear sky'),
 (0.004161004745077002, 'Description_scattered clouds'),
 (0.0033932607613688624, 'Description_few clouds'),
 (0.0030177519819783215, 'ocean_proximity_NEAR OCEAN'),
 (0.0017420516021485757, 'Description_overcast clouds'),
 (0.00045139530526276607, 'Description_smoke'),
 (0.000117457

# Gradient Boosting Classifier

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

In [31]:
# Create a gradient boosting classifier with a fixed seed and train it on the
# standardised training features.
model = GradientBoostingClassifier(random_state=1)
model.fit(X=X_train_scaled, y=y_train)


GradientBoostingClassifier(random_state=1)

In [32]:
# Predict the test rows with the gradient boosting model and tabulate the true
# class codes next to the predicted ones.
y_pred = model.predict(X_test_scaled)

df1 = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
df1

Unnamed: 0,Actual,Predicted
0,1,1
1,2,2
2,3,3
3,3,2
4,4,4
...,...,...
2859,1,1
2860,3,2
2861,1,3
2862,3,3


In [33]:
# Mean accuracy of the gradient boosting model on the held-out test set.
model.score(X_test_scaled, y_test)

0.6843575418994413