In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

Read the CSV and Perform Basic Data Cleaning

In [3]:
# Load the merged listings table.
# NOTE(review): this is an absolute, machine-specific location; the notebook will
# only run where this exact file exists.
csv_path = Path(r'C:\Users\earth\Desktop\class_folder\FinalProject\Resources\filled_final_table.csv')
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks
0,2022-06,2022,Alamance,382500,298,18,452,12,108,2112,443362,644,,5.52,,0.034063,54.0
1,2022-06,2022,Alexander,312450,34,35,28,0,20,1942,370648,59,,5.52,,0.016428,25.0
2,2022-06,2022,Alleghany,389000,54,39,24,0,20,1904,422802,92,,5.52,,-0.0382,0.0
3,2022-06,2022,Anson,170000,22,34,16,0,8,1502,262240,48,,5.52,,0.218638,3.0
4,2022-06,2022,Ashe,485000,76,31,48,0,20,1907,592879,137,,5.52,,-0.088346,5.0


In [28]:
# Label each row by the sign of the month-over-month change in median listing
# price: 'up' when median_listing_price_mm >= 0, otherwise 'down'.
# NOTE(review): a NaN in median_listing_price_mm compares False and is therefore
# labelled 'down' — confirm that this is intended.
df['Price_Status'] = np.where(df['median_listing_price_mm'] >= 0, 'up', 'down')

# Convert violent_crime_rate to float. Commas are stripped only while the column
# is still non-numeric (text), so re-running this cell after the conversion no
# longer fails on the `.str` accessor.
crime_rate = df['violent_crime_rate']
if not pd.api.types.is_numeric_dtype(crime_rate):
    crime_rate = crime_rate.str.replace(',', '', regex=False)
df['violent_crime_rate'] = crime_rate.astype(float)
df.head()

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks,Price_Status
0,2022-06,2022,Alamance,382500,298,18,452,12,108,2112,443362,644,,5.52,,0.034063,54.0,up
1,2022-06,2022,Alexander,312450,34,35,28,0,20,1942,370648,59,,5.52,,0.016428,25.0,up
2,2022-06,2022,Alleghany,389000,54,39,24,0,20,1904,422802,92,,5.52,,-0.0382,0.0,down
3,2022-06,2022,Anson,170000,22,34,16,0,8,1502,262240,48,,5.52,,0.218638,3.0,up
4,2022-06,2022,Ashe,485000,76,31,48,0,20,1907,592879,137,,5.52,,-0.088346,5.0,down


In [29]:
# Work on a copy of the table without the Date column, then inspect the dtypes.
df_new1 = df.drop(columns=['Date'])
df_new1.dtypes

Year                         int64
County                      object
median_listing_price         int64
active_listing_count         int64
median_days_on_market        int64
new_listing_count            int64
price_increased_count        int64
price_reduced_count          int64
median_square_feet           int64
average_listing_price        int64
total_listing_count          int64
violent_crime_rate         float64
30_Year_Fixed_Rate         float64
Number_of_Schools          float64
median_listing_price_mm    float64
Number_of_Parks            float64
Price_Status                object
dtype: object

In [30]:
# Distinct years present in the table, in order of first appearance.
pd.unique(df_new1["Year"])

array([2022, 2021, 2020, 2019, 2018, 2017, 2016], dtype=int64)

In [31]:
# Keep only the rows whose Year is strictly before 2019.
filter_df = df_new1.loc[df_new1['Year'].lt(2019)]
# Quick check of the remaining years (left disabled):
# filter_df["Year"].unique()

In [32]:

# Name of the label column, kept as a one-element list so that selecting with it
# yields a DataFrame rather than a Series.
target = ["Price_Status"]

# Columns carried into the modelling table: the predictors followed by the label.
columns = (
    ["Year", "County"]
    + [
        "active_listing_count",
        "median_days_on_market",
        "new_listing_count",
        "price_increased_count",
        "price_reduced_count",
        "median_square_feet",
        "total_listing_count",
    ]
    + [
        "violent_crime_rate",
        "Number_of_Parks",
        "Number_of_Schools",
        "30_Year_Fixed_Rate",
    ]
    + ["Price_Status"]
)

In [33]:
# Build the modelling table: keep only the selected columns, drop every row that
# has a missing value in any of them, and renumber the index from 0.
# Written as one chain (no inplace mutation) so the cell gives the same result
# each time it is run.
df_data = (
    filter_df.loc[:, columns]
    .dropna()
    .reset_index(drop=True)
)

# Confirm that no missing values remain in any column.
df_data.isnull().sum()

Year                     0
County                   0
active_listing_count     0
median_days_on_market    0
new_listing_count        0
price_increased_count    0
price_reduced_count      0
median_square_feet       0
total_listing_count      0
violent_crime_rate       0
Number_of_Parks          0
Number_of_Schools        0
30_Year_Fixed_Rate       0
Price_Status             0
dtype: int64

Split the Data into Training and Testing

In [34]:
# Features: every column except the label, with non-numeric columns expanded
# into indicator (dummy) columns.
X = pd.get_dummies(df_data.drop(columns='Price_Status'))

# Target: selected with the `target` list, so y is a single-column DataFrame.
y = df_data[target].copy()



In [35]:
# Check the balance of our target values:
# number of rows for each distinct Price_Status label in y.
y.value_counts()

Price_Status
up              1448
down            1162
dtype: int64

In [36]:
from sklearn.model_selection import train_test_split

# Split features and target into train and test parts using the default
# proportions; random_state=1 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# (rows, columns) of the held-out feature matrix.
X_test.shape


(653, 104)

Balanced Random Forest Classifier

In [37]:
# Fit a BalancedRandomForestClassifier (100 trees, fixed seed) on the training split.
from imblearn.ensemble import BalancedRandomForestClassifier

rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
# Pass the target as a 1-D Series: handing fit() the single-column DataFrame
# makes scikit-learn raise a DataConversionWarning and flatten it internally.
rf_model = rf_model.fit(X_train, y_train['Price_Status'])

# Class counts in the training target, for reference.
Counter(y_train['Price_Status'])

Counter({'down': 864, 'up': 1093})

In [38]:
# Predict on the held-out split and display the confusion matrix
# (rows: true labels, columns: predicted labels).
# confusion_matrix is already imported in the notebook's import cell, so it is
# not imported again here.
y_pred = rf_model.predict(X_test)
confusion_matrix(y_test, y_pred)


array([[162, 136],
       [162, 193]], dtype=int64)

In [39]:
# Balanced accuracy of the predictions on the held-out split.
# balanced_accuracy_score is already imported in the notebook's import cell.
#
# NOTE(review): an earlier note here said the score changes to 1.0 once
# violent_crime_rate is converted from object to float. The cleaning cell already
# performs that conversion and the recorded score is about 0.54, so that note
# looks stale — confirm before relying on it.
balanced_accuracy_score(y_test, y_pred)

0.5436430664524057

In [40]:
# Print per-class precision, recall and F1 using scikit-learn's
# classification_report (already imported in the notebook's import cell).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        down       0.50      0.54      0.52       298
          up       0.59      0.54      0.56       355

    accuracy                           0.54       653
   macro avg       0.54      0.54      0.54       653
weighted avg       0.55      0.54      0.54       653



In [41]:
# Pair every feature importance with its column name and list the pairs from
# most to least important.
importance_pairs = list(zip(rf_model.feature_importances_, X.columns))
importance_pairs.sort(reverse=True)
importance_pairs





[(0.10671278415466, '30_Year_Fixed_Rate'),
 (0.10096187499467439, 'active_listing_count'),
 (0.10089983780866356, 'median_days_on_market'),
 (0.10030907920154715, 'median_square_feet'),
 (0.09677620884921763, 'total_listing_count'),
 (0.08711799004372474, 'new_listing_count'),
 (0.08689255814566962, 'price_reduced_count'),
 (0.055766691604878356, 'violent_crime_rate'),
 (0.04388434602107332, 'Number_of_Schools'),
 (0.03942473932878671, 'price_increased_count'),
 (0.03647066187110157, 'Number_of_Parks'),
 (0.024335716315164187, 'Year'),
 (0.003395867613404141, 'County_Swain'),
 (0.0028051118133615004, 'County_Rutherford'),
 (0.002511960467145174, 'County_Duplin'),
 (0.0020618194621488304, 'County_Orange'),
 (0.0020215274389601976, 'County_Stanly'),
 (0.0019855941135636856, 'County_Henderson'),
 (0.0019730372687337, 'County_Clay'),
 (0.0018865602554889811, 'County_Stokes'),
 (0.001810053704152922, 'County_Cleveland'),
 (0.0017691052059457463, 'County_Wilkes'),
 (0.0017510276278626, 'Coun

# Hyperparameter tuning: refit with n_estimators raised from 100 to 900

In [54]:
# Refit the balanced random forest with a larger ensemble (900 trees, same seed).
# This rebinds rf_model, replacing the 100-tree model fitted earlier.
# BalancedRandomForestClassifier is already imported in the cell that builds the
# first model, so it is not imported again here.
rf_model = BalancedRandomForestClassifier(n_estimators=900, random_state=1)
# Pass the target as a 1-D Series: handing fit() the single-column DataFrame
# makes scikit-learn raise a DataConversionWarning and flatten it internally.
rf_model = rf_model.fit(X_train, y_train['Price_Status'])

# Class counts in the training target, for reference.
Counter(y_train['Price_Status'])

Counter({'down': 864, 'up': 1093})

In [55]:
# Predict with the 900-tree model and display the confusion matrix
# (rows: true labels, columns: predicted labels).
# confusion_matrix is already imported in the notebook's import cell.
y_pred = rf_model.predict(X_test)
confusion_matrix(y_test, y_pred)


array([[157, 141],
       [153, 202]], dtype=int64)

In [56]:
# Balanced accuracy of the 900-tree model on the held-out split.
# balanced_accuracy_score is already imported in the notebook's import cell.
balanced_accuracy_score(y_test, y_pred)

0.5479298610454675

In [57]:
# Print per-class precision, recall and F1 for the 900-tree model using
# scikit-learn's classification_report (already imported in the import cell).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        down       0.51      0.53      0.52       298
          up       0.59      0.57      0.58       355

    accuracy                           0.55       653
   macro avg       0.55      0.55      0.55       653
weighted avg       0.55      0.55      0.55       653



In [None]:
# Heat map of the pairwise correlations between the numeric columns.
# Non-numeric columns are excluded explicitly: DataFrame.corr() raises on text
# columns in pandas >= 2.0 instead of silently skipping them.
# Note: the figure size is set through plt.rcParams, so it also applies to
# figures created later that do not set their own size.
plt.rcParams['figure.figsize'] = 35, 35
numeric_corr = df.select_dtypes(include='number').corr()
g = sns.heatmap(numeric_corr, annot=True, fmt=".1f")

In [None]:
# Distribution of median_listing_price: density histogram with a KDE overlay.
# sns.distplot is deprecated since seaborn 0.11; histplot with kde=True,
# stat="density" and kde_kws=dict(cut=3) is the replacement seaborn recommends
# for the same picture.
sns.histplot(df['median_listing_price'], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Encode Price_Status numerically (up -> 1, down -> 0) and plot how often the
# median listing price rose versus fell month over month.
# The encoding lives in its own Series, so df['Price_Status'] keeps its
# 'up'/'down' labels and this cell can be re-run without side effects.
# (The previous version wrote the strings '1'/'0' back into df, which was neither
# numeric nor reversible.)
price_status_numeric = df['Price_Status'].map({'up': 1, 'down': 0})

# discrete=True gives one bar per value; sns.distplot is deprecated since
# seaborn 0.11, so histplot is used instead.
sns.histplot(price_status_numeric, discrete=True, stat="density")

In [None]:
# Reference code for heatmaps :https://www.kaggle.com/code/bsivavenu/house-price-calculation-methods-for-beginners

# Heat map restricted to the columns most correlated with median_listing_price
# (absolute correlation above 0.30).
# Non-numeric columns are excluded before calling corr(): pandas >= 2.0 raises on
# text columns instead of silently skipping them.
corrmat = df.select_dtypes(include='number').corr()
top_corr_features = corrmat.index[corrmat["median_listing_price"].abs() > 0.30]
plt.figure(figsize=(10, 10))
g = sns.heatmap(df[top_corr_features].corr(), annot=True, cmap="mako")

In [None]:
# Joint plot with a linear regression fit:
# median listing price (x) against median square feet (y).
sns.jointplot(
    x="median_listing_price",
    y="median_square_feet",
    data=df,
    kind="reg",
)

# Author's observation: outliers are present.

In [None]:
# Joint plot with a linear regression fit:
# median listing price (x) against average listing price (y).
sns.jointplot(
    x="median_listing_price",
    y="average_listing_price",
    data=df,
    kind="reg",
)

In [None]:
# Joint plot with a linear regression fit:
# median listing price (x) against violent crime rate (y).
sns.jointplot(
    x="median_listing_price",
    y="violent_crime_rate",
    data=df,
    kind="reg",
)

In [None]:
# Joint plot with a linear regression fit:
# median listing price (x) against Year (y).
sns.jointplot(
    x="median_listing_price",
    y="Year",
    data=df,
    kind="reg",
)

In [None]:
# Find Missing Ratio of Dataset
# reference: https://stackoverflow.com/questions/51070985/find-out-the-percentage-of-missing-values-in-each-column-in-the-given-dataset
#percent_missing = df.isnull().sum() * 100 / len(df)
#missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 #'percent_missing': percent_missing})
#missing_value_df

In [None]:
# reference
# https://www.kaggle.com/code/kanncaa1/machine-learning-tutorial-for-beginners/notebook
#https://www.kaggle.com/code/erick5/predicting-house-prices-with-machine-learning/notebook

In [None]:
# dict of county names with values
#county_names = df.County.unique()
#county_di = dict(zip(county_names, range(len(county_names))))
#county_di

In [None]:
# county names with #'s'
#df_new = df.copy()
#df_new['County'].replace(county_di, inplace=True)
#df_new.head()

In [None]:
#df_new.to_csv("c:/tmp/courses.csv",header=False)

In [None]:
# copy the data
#df_max_scaled = df_data.copy()
  
# apply normalization techniques
#for column in df_max_scaled.columns:
    #df_max_scaled[column] = df_max_scaled[column]  / df_max_scaled[column].abs().max()
      
# view normalized data
#display(df_max_scaled)