In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv(r'C:\Users\earth\Desktop\class_folder\FinalProject\Resources\final_table.csv')
df.head()


Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks
0,2022-06,2022,Cherokee,362500.0,271.0,38.0,148.0,8.0,76.0,1904.0,407510.0,467.0,,5.52,,-0.052164,2.0
1,2022-06,2022,Craven,313500.0,154.0,36.0,196.0,12.0,48.0,1927.0,396716.0,581.0,,5.52,,0.039801,63.0
2,2022-06,2022,Catawba,342450.0,207.0,34.0,224.0,0.0,96.0,1923.0,521108.0,508.0,,5.52,,-0.042098,65.0
3,2022-06,2022,Gaston,330000.0,283.0,30.0,412.0,12.0,200.0,1668.0,377406.0,828.0,,5.52,,0.03937,99.0
4,2022-06,2022,Warren,275000.0,19.0,68.0,12.0,0.0,4.0,1824.0,419452.0,29.0,,5.52,,,1.0


In [4]:
#conditional based column = np.where(df['median_listing_price_mm'] (x>=0), 'up(1)', 'down(0)')
df['Price_Status'] = np.where(df['median_listing_price_mm']>=0, 'up', 'down')
df['violent_crime_rate']= df['violent_crime_rate'].str.replace(',','').astype(float)
df.head()

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks,Price_Status
0,2022-06,2022,Cherokee,362500.0,271.0,38.0,148.0,8.0,76.0,1904.0,407510.0,467.0,,5.52,,-0.052164,2.0,down
1,2022-06,2022,Craven,313500.0,154.0,36.0,196.0,12.0,48.0,1927.0,396716.0,581.0,,5.52,,0.039801,63.0,up
2,2022-06,2022,Catawba,342450.0,207.0,34.0,224.0,0.0,96.0,1923.0,521108.0,508.0,,5.52,,-0.042098,65.0,down
3,2022-06,2022,Gaston,330000.0,283.0,30.0,412.0,12.0,200.0,1668.0,377406.0,828.0,,5.52,,0.03937,99.0,up
4,2022-06,2022,Warren,275000.0,19.0,68.0,12.0,0.0,4.0,1824.0,419452.0,29.0,,5.52,,,1.0,down


In [5]:
df_new1 = df.drop(['Date'], axis=1)
df_new1.dtypes

Year                         int64
County                      object
median_listing_price       float64
active_listing_count       float64
median_days_on_market      float64
new_listing_count          float64
price_increased_count      float64
price_reduced_count        float64
median_square_feet         float64
average_listing_price      float64
total_listing_count        float64
violent_crime_rate         float64
30_Year_Fixed_Rate         float64
Number_of_Schools          float64
median_listing_price_mm    float64
Number_of_Parks            float64
Price_Status                object
dtype: object

In [6]:
df_new1["Year"].unique()

array([2022, 2021, 2020, 2019, 2018, 2017, 2016], dtype=int64)

In [7]:
filter_df = df_new1[df_new1['Year'] <2019]
#filter_df["Year"].unique()

In [8]:

columns = [
     "Year", "County", 
    "active_listing_count", "median_days_on_market", "new_listing_count",
    "price_increased_count", "price_reduced_count", "median_square_feet",
    "total_listing_count", "violent_crime_rate","Number_of_Parks",
    "Number_of_Schools","30_Year_Fixed_Rate", 
     "Price_Status"]

target = ["Price_Status"]

In [9]:
# Load the data

df_data = filter_df.loc[:, columns].copy()
df_data = df_data.dropna()

df_data.reset_index(inplace=True, drop=True)

df_data.head()
df_data.isnull().sum()

Year                     0
County                   0
active_listing_count     0
median_days_on_market    0
new_listing_count        0
price_increased_count    0
price_reduced_count      0
median_square_feet       0
total_listing_count      0
violent_crime_rate       0
Number_of_Parks          0
Number_of_Schools        0
30_Year_Fixed_Rate       0
Price_Status             0
dtype: int64

Split the Data into Training and Testing

In [10]:
# Create our features
X = df_data.drop('Price_Status', axis=1)

X = pd.get_dummies(X)

# Create our target
y = df_data.loc[:, target].copy()



In [11]:
# Check the balance of our target values
y.value_counts()

Price_Status
down            2454
up               156
dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1) 
                                                 
X_test.shape


(653, 104)

Balanced Random Forest ClassifierÂ¶

In [13]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 
rf_model = rf_model.fit(X_train, y_train)
Counter(y_train['Price_Status'])

Counter({'down': 1838, 'up': 119})

In [14]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = rf_model.predict(X_test)
confusion_matrix(y_test, y_pred)


array([[532,  84],
       [  2,  35]], dtype=int64)

In [15]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

# score changes to 1.0 if violent_crime_rate is converted from object to float

0.9047911547911548

In [16]:
# Print the imbalanced classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        down       1.00      0.86      0.93       616
          up       0.29      0.95      0.45        37

    accuracy                           0.87       653
   macro avg       0.65      0.90      0.69       653
weighted avg       0.96      0.87      0.90       653



In [17]:
# List the features sorted in descending order by feature importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)





[(0.3087612589053883, '30_Year_Fixed_Rate'),
 (0.18473001628995933, 'Year'),
 (0.04906316069991865, 'median_days_on_market'),
 (0.048166062353651895, 'active_listing_count'),
 (0.0464933619375409, 'price_reduced_count'),
 (0.04477406812660071, 'Number_of_Schools'),
 (0.04427359752929202, 'median_square_feet'),
 (0.043072549937883814, 'total_listing_count'),
 (0.03830075836727427, 'new_listing_count'),
 (0.03408303704174747, 'Number_of_Parks'),
 (0.028868158011376738, 'violent_crime_rate'),
 (0.023122315373498716, 'price_increased_count'),
 (0.0025734106685933156, 'County_Polk'),
 (0.002333284917366244, 'County_Wilson'),
 (0.0022871939739942007, 'County_Franklin'),
 (0.0022743985346060373, 'County_Transylvania'),
 (0.0022350488561828485, 'County_Clay'),
 (0.0021418858092868494, 'County_Randolph'),
 (0.002138589844203885, 'County_Durham'),
 (0.00210989768339623, 'County_Lincoln'),
 (0.0020443857492807183, 'County_Duplin'),
 (0.002034283614672708, 'County_Columbus'),
 (0.00199520627031112

# Hypertune features

In [18]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=900, random_state=1) 
rf_model = rf_model.fit(X_train, y_train)
Counter(y_train['Price_Status'])

Counter({'down': 1838, 'up': 119})

In [19]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = rf_model.predict(X_test)
confusion_matrix(y_test, y_pred)


array([[532,  84],
       [  0,  37]], dtype=int64)

In [20]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.9318181818181819

In [21]:
# Print the imbalanced classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        down       1.00      0.86      0.93       616
          up       0.31      1.00      0.47        37

    accuracy                           0.87       653
   macro avg       0.65      0.93      0.70       653
weighted avg       0.96      0.87      0.90       653



In [None]:
# heat map of feature correlation
plt.rcParams['figure.figsize']=35,35
g = sns.heatmap(df.corr(),annot=True, fmt = ".1f")

In [None]:
# distribution plot of column
sns.distplot(df['median_listing_price'])

In [None]:
# convert Price_class to numeric, distribution of median_listing_price increase|decrease month/month
df['Price_Status'] = df['Price_Status'].replace(['up','down'],['1','0'])
sns.distplot(df['Price_Status'])

In [None]:
# Reference code for heatmaps :https://www.kaggle.com/code/bsivavenu/house-price-calculation-methods-for-beginners

# most correlated features
corrmat = df.corr()
top_corr_features = corrmat.index[abs(corrmat["median_listing_price"])>0.30]
plt.figure(figsize=(10,10))
g = sns.heatmap(df[top_corr_features].corr(),annot=True,cmap="mako")

In [None]:
# seaborn linear regression fit: median listing price vs median square feet
sns.jointplot(data=df, x="median_listing_price", y="median_square_feet", kind="reg")

# outliers present

In [None]:
#median_listing_price vs average listing price
sns.jointplot(data=df, x="median_listing_price", y="average_listing_price", kind="reg")

In [None]:
# seaborn linear regression fit: median listing price vs median square feet
sns.jointplot(data=df, x="median_listing_price", y="violent_crime_rate", kind="reg")

In [None]:
#median listing price vs Year
sns.jointplot(data=df, x="median_listing_price", y="Year", kind="reg")

In [None]:
# Find Missing Ratio of Dataset
# reference: https://stackoverflow.com/questions/51070985/find-out-the-percentage-of-missing-values-in-each-column-in-the-given-dataset
#percent_missing = df.isnull().sum() * 100 / len(df)
#missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 #'percent_missing': percent_missing})
#missing_value_df

In [None]:
# reference
# https://www.kaggle.com/code/kanncaa1/machine-learning-tutorial-for-beginners/notebook
#https://www.kaggle.com/code/erick5/predicting-house-prices-with-machine-learning/notebook

In [None]:
# dict of county names with values
#county_names = df.County.unique()
#county_di = dict(zip(county_names, range(len(county_names))))
#county_di

In [None]:
# county names with #'s'
#df_new = df.copy()
#df_new['County'].replace(county_di, inplace=True)
#df_new.head()

In [None]:
#df_new.to_csv("c:/tmp/courses.csv",header=False)

In [None]:
# copy the data
#df_max_scaled = df_data.copy()
  
# apply normalization techniques
#for column in df_max_scaled.columns:
    #df_max_scaled[column] = df_max_scaled[column]  / df_max_scaled[column].abs().max()
      
# view normalized data
#display(df_max_scaled)