In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from pathlib import Path
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import GradientBoostingClassifier

In [2]:
file_path = Path('final_table_with_conditional.csv')
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks,Price_Status
0,2022-06,2022,Cherokee,362500,271,38,148,8,76,1904,407510,467,,5.52,,-0.052164,2.0,down
1,2022-06,2022,Craven,313500,154,36,196,12,48,1927,396716,581,,5.52,,0.039801,63.0,up
2,2022-06,2022,Catawba,342450,207,34,224,0,96,1923,521108,508,,5.52,,-0.042098,65.0,down
3,2022-06,2022,Gaston,330000,283,30,412,12,200,1668,377406,828,,5.52,,0.03937,99.0,up
4,2022-06,2022,Warren,275000,19,68,12,0,4,1824,419452,29,,5.52,,,1.0,down


In [3]:
df['violent_crime_rate']= df['violent_crime_rate'].str.replace(',','').astype(float)
df.head()

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks,Price_Status
0,2022-06,2022,Cherokee,362500,271,38,148,8,76,1904,407510,467,,5.52,,-0.052164,2.0,down
1,2022-06,2022,Craven,313500,154,36,196,12,48,1927,396716,581,,5.52,,0.039801,63.0,up
2,2022-06,2022,Catawba,342450,207,34,224,0,96,1923,521108,508,,5.52,,-0.042098,65.0,down
3,2022-06,2022,Gaston,330000,283,30,412,12,200,1668,377406,828,,5.52,,0.03937,99.0,up
4,2022-06,2022,Warren,275000,19,68,12,0,4,1824,419452,29,,5.52,,,1.0,down


In [4]:
df_clean = df.drop(['Date'], axis=1)
df_clean.dtypes

Year                         int64
County                      object
median_listing_price         int64
active_listing_count         int64
median_days_on_market        int64
new_listing_count            int64
price_increased_count        int64
price_reduced_count          int64
median_square_feet           int64
average_listing_price        int64
total_listing_count          int64
violent_crime_rate         float64
30_Year_Fixed_Rate         float64
Number_of_Schools          float64
median_listing_price_mm    float64
Number_of_Parks            float64
Price_Status                object
dtype: object

In [5]:
filter_df = df_clean[df_clean['Year'] <2019]
filter_df.head()

Unnamed: 0,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks,Price_Status
4196,2018,Wilkes,224000,306,126,52,0,28,2174,352753,340,243.6,4.64,53.0,-0.025663,34.0,down
4197,2018,Caswell,253900,52,183,0,0,4,1942,354254,52,,4.64,47.0,,37.0,down
4198,2018,Forsyth,245000,1197,85,296,12,204,2234,325987,1445,660.2,4.64,354.0,-0.019608,479.0,down
4199,2018,Surry,179900,145,114,28,0,24,1987,235683,176,89.0,4.64,45.0,-0.024403,13.0,down
4200,2018,Cherokee,199000,545,132,48,0,52,1708,264228,547,272.1,4.64,26.0,-0.004502,2.0,down


In [6]:
# Load the data
columns = [
    "Year", "County", 
    "active_listing_count", "median_days_on_market", "new_listing_count",
    "price_increased_count", "price_reduced_count", "median_square_feet",
    "total_listing_count", "violent_crime_rate",
    "30_Year_Fixed_Rate", "Number_of_Schools",
    "Number_of_Parks", "Price_Status"
]

df = filter_df.loc[:, columns].copy()


# Drop the null rows
df = df.dropna()




df.reset_index(inplace=True, drop=True)

df

Unnamed: 0,Year,County,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,Number_of_Parks,Price_Status
0,2018,Wilkes,306,126,52,0,28,2174,340,243.6,4.64,53.0,34.0,down
1,2018,Forsyth,1197,85,296,12,204,2234,1445,660.2,4.64,354.0,479.0,down
2,2018,Surry,145,114,28,0,24,1987,176,89.0,4.64,45.0,13.0,down
3,2018,Cherokee,545,132,48,0,52,1708,547,272.1,4.64,26.0,2.0,down
4,2018,Orange,364,105,80,0,56,2350,445,153.6,4.64,117.0,126.0,down
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2605,2016,Alleghany,267,116,40,0,20,1876,271,138.1,3.44,10.0,0.0,down
2606,2016,Iredell,1145,74,352,44,404,2890,1474,312.6,3.44,150.0,104.0,down
2607,2016,Wilson,512,52,92,8,80,1922,646,398.9,3.44,87.0,70.0,down
2608,2016,Avery,378,104,48,0,40,1864,380,221.1,3.44,40.0,9.0,down


In [7]:
# defining features

df.columns

Index(['Year', 'County', 'active_listing_count', 'median_days_on_market',
       'new_listing_count', 'price_increased_count', 'price_reduced_count',
       'median_square_feet', 'total_listing_count', 'violent_crime_rate',
       '30_Year_Fixed_Rate', 'Number_of_Schools', 'Number_of_Parks',
       'Price_Status'],
      dtype='object')

In [21]:
df['Price_Status'] = df['Price_Status'].astype('int')
df.dtypes

ValueError: invalid literal for int() with base 10: 'down'

In [8]:
selected_features = df[['active_listing_count', 'median_days_on_market', 'new_listing_count', 'price_increased_count', 'price_reduced_count', 'median_square_feet', 'total_listing_count', 'violent_crime_rate', '30_Year_Fixed_Rate',
       'Number_of_Schools', 'Number_of_Parks']]
selected_features.shape

(2610, 11)

In [9]:
# Defining predictor and target features to X and y respectively.
# Note: Sklearn requires a two-dimensional array of values
# so we use reshape to create this

X = selected_features
y = df[['Price_Status']].values.reshape(-1,1)

print("Shape: ", X.shape, y.shape)

Shape:  (2610, 11) (2610, 1)


In [10]:
# Using sklearn's 'train_test_split' to split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 42)
print("Shape: ", X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Shape:  (1957, 11) (1957, 1) (653, 11) (653, 1)


In [11]:
# Scale the data
X_minmax = MinMaxScaler().fit(X_train)

X_train_scaled = X_minmax.transform(X_train)
X_test_scaled = X_minmax.transform(X_test)

In [12]:
gradientBoostedTree = GradientBoostingClassifier()
gradientBoostedTree

GradientBoostingClassifier()

In [13]:
gradientBoostedTree.fit(X_train_scaled, y_train.flatten())

GradientBoostingClassifier()

In [14]:
print(f"Training Data Score: {gradientBoostedTree.score(X_train, y_train)}")
print(f"Testing Data Score: {gradientBoostedTree.score(X_test, y_test)}")

Training Data Score: 0.512008175779254
Testing Data Score: 0.48392036753445633


In [15]:
gradientBoostedPredictions = gradientBoostedTree.predict(X_test_scaled)

In [16]:
gradientBoostedPredictProba = gradientBoostedTree.predict_proba(X_test_scaled)[:,1]

In [17]:
# Print classification report
print(classification_report(y_test.flatten(), gradientBoostedPredictions,
                            target_names=["Price Increase", "Price Decrease"]))
print("Accuracy:", accuracy_score(y_test.flatten(), gradientBoostedPredictions))

                precision    recall  f1-score   support

Price Increase       0.95      0.98      0.96       608
Price Decrease       0.50      0.33      0.40        45

      accuracy                           0.93       653
     macro avg       0.73      0.65      0.68       653
  weighted avg       0.92      0.93      0.92       653

Accuracy: 0.9310872894333844


In [18]:
print("Gradient Boosting Classifier")
print("========================")
print("Accuracy: ",accuracy_score(y_test.flatten(), gradientBoostedPredictions)) 
print("Precision: ",precision_score(y_test.flatten(), gradientBoostedPredictions, pos_label='up'))
print("Recall: ",recall_score(y_test.flatten(), gradientBoostedPredictions, pos_label='up'))
print("F1-Score: ",f1_score(y_test.flatten(), gradientBoostedPredictions, pos_label='up'))
print("AUC score: ",roc_auc_score(y_test.flatten().astype(float), gradientBoostedPredictions).astype(float))

Gradient Boosting Classifier
Accuracy:  0.9310872894333844
Precision:  0.5
Recall:  0.3333333333333333
F1-Score:  0.4


ValueError: could not convert string to float: 'down'

In [None]:
prt = precision_recall_curve(y_test.flatten(), gradientBoostedPredictProba)
plt.plot(p, r)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve - Gradient Boost')

confusionMatrix = confusion_matrix(y_test.flatten(), gradientBoostedPredictions) 
plt.figure(figsize =(6, 6)) 
sns.heatmap(confusionMatrix, xticklabels = ["Price Decrease", "Price Increase"],  
            yticklabels = ["Price Decrease", "Price Increase"]); 
plt.title("Confusion matrix - Gradient Boost") 
plt.ylabel('Actual Class') 
plt.xlabel('Predicted class') 
plt.show()