In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
from collections import Counter

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_selection import SelectFromModel

In [2]:
file_path = Path('filled_final_table.csv')
df = pd.read_csv(file_path)
df

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks
0,2022-06,2022,Alamance,382500,298,18,452,12,108,2112,443362,644,,5.52,,0.034063,54.0
1,2022-06,2022,Alexander,312450,34,35,28,0,20,1942,370648,59,,5.52,,0.016428,25.0
2,2022-06,2022,Alleghany,389000,54,39,24,0,20,1904,422802,92,,5.52,,-0.038200,0.0
3,2022-06,2022,Anson,170000,22,34,16,0,8,1502,262240,48,,5.52,,0.218638,3.0
4,2022-06,2022,Ashe,485000,76,31,48,0,20,1907,592879,137,,5.52,,-0.088346,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7191,2016-07,2016,Wayne,165000,593,83,104,4,92,1868,177275,603,442,3.44,88.0,,36.0
7192,2016-07,2016,Wilkes,234900,320,116,44,4,48,2146,337545,380,208.4,3.44,52.0,,34.0
7193,2016-07,2016,Wilson,117500,512,52,92,8,80,1922,150833,646,398.9,3.44,87.0,,70.0
7194,2016-07,2016,Yadkin,210000,146,142,20,0,8,2460,285062,161,231.2,3.44,16.0,,4.0


In [3]:
df['violent_crime_rate']= df['violent_crime_rate'].str.replace(',','').astype(float)

In [4]:
df_drop = df.drop(['Date','median_listing_price_mm','average_listing_price'], axis=1)

In [5]:
df_drop = df_drop.dropna()

In [6]:
df_drop.columns

Index(['Year', 'County', 'median_listing_price', 'active_listing_count',
       'median_days_on_market', 'new_listing_count', 'price_increased_count',
       'price_reduced_count', 'median_square_feet', 'total_listing_count',
       'violent_crime_rate', '30_Year_Fixed_Rate', 'Number_of_Schools',
       'Number_of_Parks'],
      dtype='object')

In [7]:
df_drop

Unnamed: 0,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,Number_of_Parks
4196,2018,Alamance,249900,761,78,172,28,116,2060,843,419.9,4.64,94.0,54.0
4197,2018,Alexander,229900,90,102,8,0,16,1924,103,221.9,4.64,49.0,25.0
4198,2018,Alleghany,114900,420,227,4,0,20,1886,438,114.8,4.64,11.0,0.0
4199,2018,Anson,125000,65,100,8,0,12,1500,78,568.4,4.64,15.0,3.0
4202,2018,Beaufort,245000,272,151,32,0,48,1950,337,221.2,4.64,25.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7191,2016,Wayne,165000,593,83,104,4,92,1868,603,442.0,3.44,88.0,36.0
7192,2016,Wilkes,234900,320,116,44,4,48,2146,380,208.4,3.44,52.0,34.0
7193,2016,Wilson,117500,512,52,92,8,80,1922,646,398.9,3.44,87.0,70.0
7194,2016,Yadkin,210000,146,142,20,0,8,2460,161,231.2,3.44,16.0,4.0


In [8]:
selected_features = df_drop[['Year', 'County','active_listing_count',
       'median_days_on_market', 'new_listing_count', 'price_increased_count',
       'price_reduced_count', 'median_square_feet',
       'total_listing_count', 'violent_crime_rate', '30_Year_Fixed_Rate',
       'Number_of_Schools', 'Number_of_Parks']]
selected_features

Unnamed: 0,Year,County,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,Number_of_Parks
4196,2018,Alamance,761,78,172,28,116,2060,843,419.9,4.64,94.0,54.0
4197,2018,Alexander,90,102,8,0,16,1924,103,221.9,4.64,49.0,25.0
4198,2018,Alleghany,420,227,4,0,20,1886,438,114.8,4.64,11.0,0.0
4199,2018,Anson,65,100,8,0,12,1500,78,568.4,4.64,15.0,3.0
4202,2018,Beaufort,272,151,32,0,48,1950,337,221.2,4.64,25.0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7191,2016,Wayne,593,83,104,4,92,1868,603,442.0,3.44,88.0,36.0
7192,2016,Wilkes,320,116,44,4,48,2146,380,208.4,3.44,52.0,34.0
7193,2016,Wilson,512,52,92,8,80,1922,646,398.9,3.44,87.0,70.0
7194,2016,Yadkin,146,142,20,0,8,2460,161,231.2,3.44,16.0,4.0


In [9]:
X = selected_features
X = pd.get_dummies(X)
y = df_drop[['median_listing_price']]

print("Shape: ", X.shape, y.shape)

Shape:  (2610, 104) (2610, 1)


In [10]:
# Use sklearn's `train_test_split` to split the data into training and testing

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print("Shape: ", X_train.shape, y_train.shape, X_test.shape, y_test.shape)                                                    
                                                    

Shape:  (1957, 104) (1957, 1) (653, 104) (653, 1)


In [11]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=1000, random_state=1) 
rf_model = rf_model.fit(X_train, y_train)

In [12]:
print(f"Training Data Score: {rf_model.score(X_train, y_train)}")
print(f"Testing Data Score: {rf_model.score(X_test, y_test)}")

Training Data Score: 0.9955948591860083
Testing Data Score: 0.9701223712196744


In [13]:
# Make predictions with the model
RandomForestRegressorPredictions = rf_model.predict(X_test)

In [14]:
RandomForestRegressorPredictions

array([142396.29 , 223912.293, 205593.766, 214372.58 , 174432.444,
       277393.789, 283192.847, 347353.911, 144059.084,  99494.75 ,
       227225.549, 132869.275, 176660.224, 215325.422, 185830.352,
       210645.045, 214183.124, 223935.416, 244486.283, 159668.891,
       291662.382, 340581.061, 239611.959, 278401.12 , 276183.497,
       286908.456, 272205.516, 229135.318, 275827.538, 297950.398,
       124568.821, 239207.672, 166576.36 , 131712.941, 193874.71 ,
       138514.766, 144832.319, 344074.696, 267338.9  , 326434.825,
       300724.617, 191971.621, 493388.403, 219144.955, 163503.953,
       130170.78 , 343809.226, 393226.486, 129942.834, 175189.848,
       245022.018, 204416.648, 147173.811, 173134.115, 254218.699,
       239935.747, 281494.35 , 429106.787, 126646.031, 131259.998,
       408928.754, 294159.063, 396095.685, 184938.398, 229507.1  ,
       139396.992, 418600.045, 176326.811, 204593.56 , 195229.185,
       279917.417, 260588.282, 436696.762, 222602.949, 206658.

In [15]:
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5614705407387783, 'median_square_feet'),
 (0.08802895727449206, 'violent_crime_rate'),
 (0.06574498431408445, 'Number_of_Schools'),
 (0.05031801854852657, 'active_listing_count'),
 (0.03794307544888948, 'total_listing_count'),
 (0.028564797714270007, 'Number_of_Parks'),
 (0.016485373852907107, 'County_Dare'),
 (0.01358105974274081, 'County_Chatham'),
 (0.012519733195902683, 'County_Buncombe'),
 (0.011198816759373596, 'median_days_on_market'),
 (0.011172013836695282, 'price_reduced_count'),
 (0.009348394857507016, 'County_Carteret'),
 (0.008830557095004804, '30_Year_Fixed_Rate'),
 (0.007813767024528276, 'new_listing_count'),
 (0.006730581228355788, 'price_increased_count'),
 (0.004346634361814265, 'County_Clay'),
 (0.004123472741641632, 'County_Haywood'),
 (0.004107111055483487, 'County_Hertford'),
 (0.0036802074242333, 'Year'),
 (0.003607763697718617, 'County_Henderson'),
 (0.003573800968378975, 'County_Orange'),
 (0.0027189458012043316, 'County_Polk'),
 (0.002667008937458723, 'Cou

In [16]:
rf_model.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [17]:
rf_model.decision_path(X)

(<2610x2289454 sparse matrix of type '<class 'numpy.int64'>'
 	with 39281214 stored elements in Compressed Sparse Row format>,
 array([      0,    2297,    4584, ..., 2284806, 2287157, 2289454],
       dtype=int32))

In [18]:
rf_model.apply(X)

array([[1259, 1374, 1390, ..., 1376, 1338, 1487],
       [ 893,  293,  382, ...,  310,  422,  284],
       [ 857,  198,  109, ...,  214,   89,  128],
       ...,
       [1192,  893,  944, ...,  953,  829,  937],
       [1794, 1747, 1681, ..., 1767, 1672, 1783],
       [ 260,  361,  126, ...,   85,   27,  337]], dtype=int64)

In [19]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [20]:
# Use the random grid to search for best hyperparameters
#rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf_model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3,
                   estimator=RandomForestRegressor(n_estimators=1000,
                                                   random_state=1),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [21]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [22]:
accuracy = 0 
def evaluate(rf_model, X_test, y_test):
    global accuracy
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [23]:
accuracy

0

In [24]:
base_model = rf_model(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

TypeError: 'RandomForestRegressor' object is not callable