In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
df = pd.read_csv('Resources/filled_final_table.csv')
df.head()

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks
0,2022-06,2022,Alamance,382500,298,18,452,12,108,2112,443362,644,,5.52,,0.034063,54.0
1,2022-06,2022,Alexander,312450,34,35,28,0,20,1942,370648,59,,5.52,,0.016428,25.0
2,2022-06,2022,Alleghany,389000,54,39,24,0,20,1904,422802,92,,5.52,,-0.0382,0.0
3,2022-06,2022,Anson,170000,22,34,16,0,8,1502,262240,48,,5.52,,0.218638,3.0
4,2022-06,2022,Ashe,485000,76,31,48,0,20,1907,592879,137,,5.52,,-0.088346,5.0


In [4]:
df.dtypes

Date                        object
Year                         int64
County                      object
median_listing_price         int64
active_listing_count         int64
median_days_on_market        int64
new_listing_count            int64
price_increased_count        int64
price_reduced_count          int64
median_square_feet           int64
average_listing_price        int64
total_listing_count          int64
violent_crime_rate          object
30_Year_Fixed_Rate         float64
Number_of_Schools          float64
median_listing_price_mm    float64
Number_of_Parks            float64
dtype: object

In [7]:

df['violent_crime_rate']= df['violent_crime_rate'].astype(str).str.replace(',','').astype(float)
df.dtypes

Date                        object
Year                         int64
County                      object
median_listing_price         int64
active_listing_count         int64
median_days_on_market        int64
new_listing_count            int64
price_increased_count        int64
price_reduced_count          int64
median_square_feet           int64
average_listing_price        int64
total_listing_count          int64
violent_crime_rate         float64
30_Year_Fixed_Rate         float64
Number_of_Schools          float64
median_listing_price_mm    float64
Number_of_Parks            float64
dtype: object

In [8]:
df["Year"].unique()

array([2022, 2021, 2020, 2019, 2018, 2017, 2016])

In [9]:
filter_df = df[df['Year'] <2019]

In [14]:
filter_df['Price_Status'] = np.where(filter_df['median_listing_price_mm']>=0, 'up', 'down')

In [15]:
filter_df.head()

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks,Price_Status
4196,2018-12,2018,Alamance,249900,761,78,172,28,116,2060,276731,843,419.9,4.64,94.0,0.021635,54.0,up
4197,2018-12,2018,Alexander,229900,90,102,8,0,16,1924,258096,103,221.9,4.64,49.0,0.108219,25.0,up
4198,2018-12,2018,Alleghany,114900,420,227,4,0,20,1886,179920,438,114.8,4.64,11.0,-0.00087,0.0,down
4199,2018-12,2018,Anson,125000,65,100,8,0,12,1500,152823,78,568.4,4.64,15.0,-0.037721,3.0,down
4200,2018-12,2018,Ashe,65000,899,214,28,0,52,1776,157405,957,,4.64,7.0,0.00077,5.0,up


In [16]:
columns = [
    "Date", "Year", "County",
    "active_listing_count", "median_days_on_market", "new_listing_count",
    "price_increased_count", "price_reduced_count", "median_square_feet",
    "total_listing_count", "violent_crime_rate","Number_of_Parks",
    "Number_of_Schools","30_Year_Fixed_Rate", 
     "Price_Status"]

target = ["Price_Status"]

In [17]:
# Load the data

df_data = filter_df.loc[:, columns].copy()
df_data = df_data.dropna()

df_data.reset_index(inplace=True, drop=True)

df_data.head()
df_data.isnull().sum()

Date                     0
Year                     0
County                   0
active_listing_count     0
median_days_on_market    0
new_listing_count        0
price_increased_count    0
price_reduced_count      0
median_square_feet       0
total_listing_count      0
violent_crime_rate       0
Number_of_Parks          0
Number_of_Schools        0
30_Year_Fixed_Rate       0
Price_Status             0
dtype: int64

In [18]:
# Create our features
X = df_data.drop('Price_Status', axis=1)

X = pd.get_dummies(X)

# Create our target
y = df_data.loc[:, target].copy()

In [19]:
# Check the balance of our target values
y['Price_Status'].value_counts()

up      1448
down    1162
Name: Price_Status, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({'Price_Status': 1})

In [21]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(X.dtypes)

Year                       int64
active_listing_count       int64
median_days_on_market      int64
new_listing_count          int64
price_increased_count      int64
price_reduced_count        int64
median_square_feet         int64
total_listing_count        int64
violent_crime_rate       float64
Number_of_Parks          float64
Number_of_Schools        float64
30_Year_Fixed_Rate       float64
Date_2016-07               uint8
Date_2016-08               uint8
Date_2016-09               uint8
Date_2016-10               uint8
Date_2016-11               uint8
Date_2016-12               uint8
Date_2017-01               uint8
Date_2017-02               uint8
Date_2017-03               uint8
Date_2017-04               uint8
Date_2017-05               uint8
Date_2017-06               uint8
Date_2017-07               uint8
Date_2017-08               uint8
Date_2017-09               uint8
Date_2017-10               uint8
Date_2017-11               uint8
Date_2017-12               uint8
Date_2018-

### Naive Random Oversampling

In [22]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Price_Status': 1})

In [23]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [24]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.49836468475281215

In [25]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[156, 142],
       [187, 168]])

In [26]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       down       0.45      0.52      0.47      0.49      0.50      0.25       298
         up       0.54      0.47      0.52      0.51      0.50      0.25       355

avg / total       0.50      0.50      0.50      0.50      0.50      0.25       653



### SMOTE Oversampling

In [27]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Price_Status': 1})

In [28]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [29]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5017203894507988

In [30]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[158, 140],
       [187, 168]])

In [31]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       down       0.46      0.53      0.47      0.49      0.50      0.25       298
         up       0.55      0.47      0.53      0.51      0.50      0.25       355

avg / total       0.51      0.50      0.50      0.50      0.50      0.25       653



### Undersampling

In [32]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Price_Status': 1})

In [33]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [34]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.49671046412704417

In [35]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[176, 122],
       [212, 143]])

In [36]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       down       0.45      0.59      0.40      0.51      0.49      0.24       298
         up       0.54      0.40      0.59      0.46      0.49      0.23       355

avg / total       0.50      0.49      0.50      0.48      0.49      0.24       653



### Combination (Over and Under) Sampling


In [37]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

Counter(y_resampled)

Counter({'Price_Status': 1})

In [38]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs')
model.fit(X_resampled, y_resampled)

LogisticRegression()

In [39]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.536610265620569

In [40]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[230,  68],
       [248, 107]])

In [41]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       down       0.48      0.77      0.30      0.59      0.48      0.24       298
         up       0.61      0.30      0.77      0.40      0.48      0.22       355

avg / total       0.55      0.52      0.56      0.49      0.48      0.23       653

