In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
df = pd.read_csv('Resources/filled_final_table.csv')
df.head()

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks
0,2022-06,2022,Alamance,382500,298,18,452,12,108,2112,443362,644,,5.52,,0.034063,54.0
1,2022-06,2022,Alexander,312450,34,35,28,0,20,1942,370648,59,,5.52,,0.016428,25.0
2,2022-06,2022,Alleghany,389000,54,39,24,0,20,1904,422802,92,,5.52,,-0.0382,0.0
3,2022-06,2022,Anson,170000,22,34,16,0,8,1502,262240,48,,5.52,,0.218638,3.0
4,2022-06,2022,Ashe,485000,76,31,48,0,20,1907,592879,137,,5.52,,-0.088346,5.0


In [3]:
df['violent_crime_rate']= df['violent_crime_rate'].astype(str).str.replace(',','').astype(float)
df.dtypes

Date                        object
Year                         int64
County                      object
median_listing_price         int64
active_listing_count         int64
median_days_on_market        int64
new_listing_count            int64
price_increased_count        int64
price_reduced_count          int64
median_square_feet           int64
average_listing_price        int64
total_listing_count          int64
violent_crime_rate         float64
30_Year_Fixed_Rate         float64
Number_of_Schools          float64
median_listing_price_mm    float64
Number_of_Parks            float64
dtype: object

In [4]:
filter_df = df[df['Year'] <2019]

In [5]:
filter_df['Price_Status'] = np.where(filter_df['median_listing_price_mm']>=0, 'up', 'down')

In [6]:
columns = [
    "Date", "Year", "County",
    "active_listing_count", "median_days_on_market", "new_listing_count",
    "price_increased_count", "price_reduced_count", "median_square_feet",
    "total_listing_count", "violent_crime_rate","Number_of_Parks",
    "Number_of_Schools","30_Year_Fixed_Rate", 
     "Price_Status"]

target = ["Price_Status"]

In [7]:
# Load the data

df_data = filter_df.loc[:, columns].copy()
df_data = df_data.dropna()

df_data.reset_index(inplace=True, drop=True)

df_data.head()
df_data.isnull().sum()

Date                     0
Year                     0
County                   0
active_listing_count     0
median_days_on_market    0
new_listing_count        0
price_increased_count    0
price_reduced_count      0
median_square_feet       0
total_listing_count      0
violent_crime_rate       0
Number_of_Parks          0
Number_of_Schools        0
30_Year_Fixed_Rate       0
Price_Status             0
dtype: int64

In [8]:
# Create our features
X = df_data.drop('Price_Status', axis=1)

X = pd.get_dummies(X)

# Create our target
y = df_data.loc[:, target].copy()

In [9]:
# Check the balance of our target values
y['Price_Status'].value_counts()

up      1448
down    1162
Name: Price_Status, dtype: int64

### Ensemble Learners


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
X_test.shape
y_test.shape
y_train.shape

(1957, 1)

In [11]:
pip install imbalanced-learn==0.9.0

Collecting scikit-learn>=1.0.1
  Downloading scikit_learn-1.0.2-cp37-cp37m-macosx_10_13_x86_64.whl (7.8 MB)
[K     |████████████████████████████████| 7.8 MB 1.5 MB/s eta 0:00:01
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0
    Uninstalling scikit-learn-1.0:
      Successfully uninstalled scikit-learn-1.0
Successfully installed scikit-learn-1.0.2
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install scikit-learn==1.0

Collecting scikit-learn==1.0
  Using cached scikit_learn-1.0-cp37-cp37m-macosx_10_13_x86_64.whl (7.9 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imbalanced-learn 0.9.0 requires scikit-learn>=1.0.1, but you have scikit-learn 1.0 which is incompatible.[0m
Successfully installed scikit-learn-1.0
Note: you may need to restart the kernel to use updated packages.


In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({'Price_Status': 1})

In [33]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
brf_model = brf_model.fit(X_train, y_train)

# Making predictions using the testing data
predictions = brf_model.predict(X_test)

In [34]:
X_train.shape

(1957, 134)

In [16]:
X.describe()

Unnamed: 0,Year,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,total_listing_count,violent_crime_rate,Number_of_Parks,...,County_Union,County_Vance,County_Wake,County_Warren,County_Watauga,County_Wayne,County_Wilkes,County_Wilson,County_Yadkin,County_Yancey
count,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,...,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0,2610.0
mean,2017.181609,568.260536,113.9659,156.97931,15.834483,132.331034,2067.97318,728.490421,285.934483,90.041379,...,0.011494,0.011494,0.011494,0.011494,0.011494,0.011494,0.011494,0.011494,0.011494,0.011494
std,0.75315,650.648846,41.824278,282.25789,45.725834,233.366958,343.581847,917.663347,176.313277,217.889553,...,0.106614,0.106614,0.106614,0.106614,0.106614,0.106614,0.106614,0.106614,0.106614,0.106614
min,2016.0,12.0,18.0,0.0,0.0,0.0,1295.0,14.0,19.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017.0,159.0,84.0,28.0,0.0,20.0,1847.25,179.0,167.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2017.0,367.0,107.0,72.0,4.0,60.0,1996.0,434.0,227.7,25.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2018.0,681.5,137.0,168.0,12.0,144.0,2250.0,917.75,365.3,83.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2018.0,4908.0,330.0,2256.0,488.0,2180.0,3527.0,5986.0,920.3,1627.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 
rf_model = rf_model.fit(X_train, y_train)
Counter(y_train['Price_Status'])

Counter({'down': 864, 'up': 1093})

In [30]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = rf_model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[161, 137],
       [146, 209]])

In [35]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

y_pred = brf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5645004253710181

In [36]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,161,137
Actual 1,146,209


In [37]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       down       0.52      0.54      0.59      0.53      0.56      0.32       298
         up       0.60      0.59      0.54      0.60      0.56      0.32       355

avg / total       0.57      0.57      0.56      0.57      0.56      0.32       653



In [38]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(brf_model.feature_importances_, X.columns), reverse=True)

[(0.0799400428498193, 'median_square_feet'),
 (0.07914113503157279, 'median_days_on_market'),
 (0.07741050378175579, 'total_listing_count'),
 (0.07729372980894128, 'active_listing_count'),
 (0.06939624893777557, 'price_reduced_count'),
 (0.06759715187860406, 'new_listing_count'),
 (0.0632074693089032, '30_Year_Fixed_Rate'),
 (0.05425894278814555, 'violent_crime_rate'),
 (0.047991644873677936, 'Number_of_Schools'),
 (0.04020975669416658, 'Number_of_Parks'),
 (0.0322096100282059, 'price_increased_count'),
 (0.017701253805088038, 'Year'),
 (0.016556278519109777, 'Date_2016-07'),
 (0.008246497465934258, 'Date_2017-04'),
 (0.008193925627171807, 'Date_2017-03'),
 (0.006322129608195239, 'Date_2017-06'),
 (0.006288128030707354, 'Date_2017-12'),
 (0.005655685269294064, 'Date_2018-03'),
 (0.005637827557722277, 'Date_2017-09'),
 (0.0056370624875779676, 'Date_2018-08'),
 (0.00553720588403463, 'Date_2017-02'),
 (0.005467247042240972, 'Date_2018-10'),
 (0.0053651048857870165, 'Date_2018-07'),
 (0.00

### Easy Ensemble AdaBoost Classifier

In [39]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec_model = EasyEnsembleClassifier(n_estimators=200, random_state=1)

# Fitting the model
eec_model = eec_model.fit(X_train, y_train)

# Making predictions using the testing data
predictions = eec_model.predict(X_test)

In [25]:
# Calculated the balanced accuracy score
y_pred = eec_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5827110312884015

In [26]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,192,106
Actual 1,170,185


In [27]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       down       0.53      0.64      0.52      0.58      0.58      0.34       298
         up       0.64      0.52      0.64      0.57      0.58      0.33       355

avg / total       0.59      0.58      0.59      0.58      0.58      0.34       653

