In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

Read the CSV and Perform Basic Data Cleaning

In [3]:
df = pd.read_csv(r'C:\Users\earth\Desktop\class_folder\FinalProject\Resources\final_table.csv')
df= df.sort_values('County')
df.head()


Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks
2417,2020-06,2020,Alamance,282450.0,423.0,53.0,352.0,28.0,124.0,2235.0,350926.0,699.0,505.5,3.16,,0.012366,54.0
2982,2020-01,2020,Alamance,246798.0,566.0,75.0,256.0,12.0,172.0,1953.0,288490.0,743.0,505.5,3.62,,0.037269,54.0
4046,2019-02,2019,Alamance,255000.0,681.0,101.0,320.0,32.0,152.0,2181.0,286615.0,780.0,420.2,4.37,,0.043235,54.0
1274,2021-06,2021,Alamance,280000.0,197.0,20.0,388.0,12.0,48.0,1892.0,411055.0,569.0,,2.98,,0.018182,54.0
4981,2018-05,2018,Alamance,239900.0,809.0,64.0,408.0,48.0,200.0,2069.0,273460.0,956.0,419.9,4.59,94.0,,54.0


In [4]:
# Class column based on percent change from 'median_listing_price_mm' : reference:https://stackoverflow.com/questions/21702342/creating-a-new-column-based-on-if-elif-else-condition
def f(row):
    if row['median_listing_price_mm'] == 0:
        val = 'same'
    elif row['median_listing_price_mm'] > 0:
        val = 'up'
    else:
        val = 'down'
    return val

In [5]:
df['Price_Status'] = df.apply(f, axis=1)
df.head()

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks,Price_Status
2417,2020-06,2020,Alamance,282450.0,423.0,53.0,352.0,28.0,124.0,2235.0,350926.0,699.0,505.5,3.16,,0.012366,54.0,up
2982,2020-01,2020,Alamance,246798.0,566.0,75.0,256.0,12.0,172.0,1953.0,288490.0,743.0,505.5,3.62,,0.037269,54.0,up
4046,2019-02,2019,Alamance,255000.0,681.0,101.0,320.0,32.0,152.0,2181.0,286615.0,780.0,420.2,4.37,,0.043235,54.0,up
1274,2021-06,2021,Alamance,280000.0,197.0,20.0,388.0,12.0,48.0,1892.0,411055.0,569.0,,2.98,,0.018182,54.0,up
4981,2018-05,2018,Alamance,239900.0,809.0,64.0,408.0,48.0,200.0,2069.0,273460.0,956.0,419.9,4.59,94.0,,54.0,down


In [6]:
# convert county names to numbers ,dictionary of county names with values
county_names = df.County.unique()
county_di = dict(zip(county_names, range(len(county_names))))
#county_di

In [7]:
# county names with #'s'
df_num = df.copy()
df_num['County'].replace(county_di, inplace=True)
df_num.head()

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks,Price_Status
2417,2020-06,2020,0,282450.0,423.0,53.0,352.0,28.0,124.0,2235.0,350926.0,699.0,505.5,3.16,,0.012366,54.0,up
2982,2020-01,2020,0,246798.0,566.0,75.0,256.0,12.0,172.0,1953.0,288490.0,743.0,505.5,3.62,,0.037269,54.0,up
4046,2019-02,2019,0,255000.0,681.0,101.0,320.0,32.0,152.0,2181.0,286615.0,780.0,420.2,4.37,,0.043235,54.0,up
1274,2021-06,2021,0,280000.0,197.0,20.0,388.0,12.0,48.0,1892.0,411055.0,569.0,,2.98,,0.018182,54.0,up
4981,2018-05,2018,0,239900.0,809.0,64.0,408.0,48.0,200.0,2069.0,273460.0,956.0,419.9,4.59,94.0,,54.0,down


In [8]:
df_num['violent_crime_rate'] = df_num['violent_crime_rate'].str.replace(',','').astype(float)
df_num.dtypes

Date                        object
Year                         int64
County                       int64
median_listing_price       float64
active_listing_count       float64
median_days_on_market      float64
new_listing_count          float64
price_increased_count      float64
price_reduced_count        float64
median_square_feet         float64
average_listing_price      float64
total_listing_count        float64
violent_crime_rate         float64
30_Year_Fixed_Rate         float64
Number_of_Schools          float64
median_listing_price_mm    float64
Number_of_Parks            float64
Price_Status                object
dtype: object

In [9]:
filter_df =df_num.dropna()
#filter_df["Year"].unique() recent years have NaNs.

In [10]:
filter_df.query('County==19')

Unnamed: 0,Date,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,average_listing_price,total_listing_count,violent_crime_rate,30_Year_Fixed_Rate,Number_of_Schools,median_listing_price_mm,Number_of_Parks,Price_Status
4200,2018-12,2018,19,199000.0,545.0,132.0,48.0,0.0,52.0,1708.0,264228.0,547.0,272.1,4.64,26.0,-0.004502,2.0,down
4556,2018-09,2018,19,200000.0,630.0,98.0,108.0,0.0,84.0,1783.0,265680.0,635.0,272.1,4.63,26.0,-0.069551,2.0,down
4433,2018-10,2018,19,199900.0,628.0,112.0,120.0,4.0,96.0,1750.0,262052.0,634.0,272.1,4.83,26.0,-0.0005,2.0,down
4361,2018-11,2018,19,199900.0,569.0,119.0,84.0,0.0,84.0,1766.0,266630.0,571.0,272.1,4.87,26.0,0.0,2.0,same
4660,2018-08,2018,19,214950.0,656.0,92.0,140.0,8.0,92.0,1789.0,270440.0,660.0,272.1,4.55,26.0,-0.016022,2.0,down


In [11]:
filter_df["Year"].unique()

array([2018], dtype=int64)

In [12]:
# leave 'median_listing_price' in columns? Price_Status is classifier for median_listing_price (increased(up)/decreased(down))
columns = [
    "Date", "Year", "County", "median_listing_price",
    "active_listing_count", "median_days_on_market", "new_listing_count",
    "price_increased_count", "price_reduced_count", "median_square_feet",
    "total_listing_count", "violent_crime_rate","Number_of_Parks",
    "Number_of_Schools","30_Year_Fixed_Rate", 
     "Price_Status"]

# conditional column 'Price_status' is condition on 'median_listing...mm' which is "the percentage change
# in the median listing price from the previous month." An increase is 'up', decrease is 'down'.

# df['Price_Status'] = np.where(df['median_listing_price_mm']>=0, 'up', 'down')


# https://www.realtor.com/research/data/

target = ["Price_Status"]

In [13]:
# Load the data

df_data = filter_df.loc[:, columns].copy()
df_data = df_data.dropna()

df_data.reset_index(inplace=True, drop=True)

df_data.head()
df_data.isnull().sum()

Date                     0
Year                     0
County                   0
median_listing_price     0
active_listing_count     0
median_days_on_market    0
new_listing_count        0
price_increased_count    0
price_reduced_count      0
median_square_feet       0
total_listing_count      0
violent_crime_rate       0
Number_of_Parks          0
Number_of_Schools        0
30_Year_Fixed_Rate       0
Price_Status             0
dtype: int64

Split the Data into Training and Testing

In [14]:
# Create our features
X = df_data.drop('Price_Status', axis=1)

X = pd.get_dummies(X)

# Create our target
y = df_data.loc[:, target].copy()



In [15]:
X.describe()


Unnamed: 0,Year,County,median_listing_price,active_listing_count,median_days_on_market,new_listing_count,price_increased_count,price_reduced_count,median_square_feet,total_listing_count,violent_crime_rate,Number_of_Parks,Number_of_Schools,30_Year_Fixed_Rate,Date_2018-08,Date_2018-09,Date_2018-10,Date_2018-11,Date_2018-12
count,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0,335.0
mean,2018.0,51.462687,253234.026866,637.597015,96.61194,171.892537,18.80597,179.558209,2099.773134,829.047761,298.086567,114.208955,97.776119,4.704,0.2,0.2,0.2,0.2,0.2
std,0.0,28.546804,95028.269661,711.972214,27.362041,275.296575,44.647468,305.507052,326.02356,966.525274,174.557044,243.314657,107.536767,0.124056,0.400598,0.400598,0.400598,0.400598,0.400598
min,2018.0,0.0,79900.0,34.0,18.0,4.0,0.0,0.0,1531.0,39.0,62.2,2.0,12.0,4.55,0.0,0.0,0.0,0.0,0.0
25%,2018.0,27.0,179900.0,233.0,77.5,40.0,0.0,36.0,1872.0,289.0,172.3,18.0,37.0,4.63,0.0,0.0,0.0,0.0,0.0
50%,2018.0,50.0,238000.0,398.0,93.0,84.0,4.0,88.0,2062.0,498.0,244.1,47.0,66.0,4.64,0.0,0.0,0.0,0.0,0.0
75%,2018.0,78.0,316187.5,776.5,114.0,194.0,20.0,184.0,2269.5,977.0,383.6,100.0,112.0,4.83,0.0,0.0,0.0,0.0,0.0
max,2018.0,98.0,549000.0,4208.0,208.0,1836.0,360.0,2128.0,3356.0,5361.0,805.1,1627.0,569.0,4.87,1.0,1.0,1.0,1.0,1.0


In [16]:
# Check the balance of our target values
y.value_counts()

Price_Status
down            179
up              133
same             23
dtype: int64

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                  stratify=y)
X_train.shape


(251, 19)

Balanced Random Forest Classifier¶

In [18]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score
rf_model = BalancedRandomForestClassifier(n_estimators=1000, random_state=1) 
rf_model = rf_model.fit(X_train, y_train)
Counter(y_train['Price_Status'])

Counter({'down': 134, 'up': 100, 'same': 17})

In [19]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)


0.38095238095238093

In [20]:
# Calculated the balanced accuracy score
#from sklearn.metrics import balanced_accuracy_score

#balanced_accuracy_score(y_test, y_pred)



In [21]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       down       0.65      0.24      0.85      0.35      0.45      0.19        45
       same       0.12      0.33      0.81      0.17      0.52      0.26         6
         up       0.38      0.58      0.39      0.46      0.48      0.23        33

avg / total       0.50      0.38      0.67      0.38      0.47      0.21        84



In [22]:
# List the features sorted in descending order by feature importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)





[(0.09795703542525905, 'Number_of_Schools'),
 (0.09193276424723894, 'median_days_on_market'),
 (0.08273171677041412, 'median_square_feet'),
 (0.07939639081543948, 'active_listing_count'),
 (0.07834190409836013, 'median_listing_price'),
 (0.07702750494886994, 'total_listing_count'),
 (0.07392663162607271, 'Number_of_Parks'),
 (0.07210056411753138, 'price_reduced_count'),
 (0.06987458003092757, 'County'),
 (0.06615890036893453, 'violent_crime_rate'),
 (0.06598366717949387, 'new_listing_count'),
 (0.044587805700024186, 'price_increased_count'),
 (0.035664475641133786, '30_Year_Fixed_Rate'),
 (0.01898862736710104, 'Date_2018-09'),
 (0.012157858283138858, 'Date_2018-12'),
 (0.011785604421475769, 'Date_2018-10'),
 (0.010764244000218245, 'Date_2018-08'),
 (0.010619724958366394, 'Date_2018-11'),
 (0.0, 'Year')]

In [23]:
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier

In [24]:
ad_model = AdaBoostClassifier(n_estimators=1000, random_state=1) 
ad_model = ad_model.fit(X_train, y_train)
y_pred = ad_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.44047619047619047

In [25]:
gb_model = GradientBoostingClassifier(n_estimators=1000, random_state=1) 
gb_model = gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5476190476190477

Nearest Centroid Classifier 

In [26]:
 from sklearn.neighbors import NearestCentroid
nc_model = NearestCentroid()
nc_model.fit(X_train, y_train)
y_pred = nc_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.27380952380952384

SMOTE Oversampling

In [None]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train)

Counter(y_resampled['Price_Status'])

In [None]:
# Train the random forest classifier model using the resampled data
model_SMOTE =rf_model

model_SMOTE.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
y_pred = model_SMOTE.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

AdaBoostClassifier SMOTE train

In [None]:
# Train the adaboost classifier model using the resampled data
model_SMOTE =ad_model

model_SMOTE.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
y_pred = model_SMOTE.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

SMOTE LogisticRegression

In [None]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(
    X_train, y_train)

Counter(y_resampled['Price_Status'])

In [None]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model_SMOTE = LogisticRegression(solver='lbfgs', random_state=1)

model_SMOTE.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model_SMOTE.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

Combined sampling

In [None]:
# Resample the training data with SMOTEENN

from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)

X_resampled, y_resampled = smote_enn.fit_resample(X, y)

Counter(y_resampled['Price_Status'])

In [None]:
# Train the Logistic Regression model using the resampled data
model_combo = LogisticRegression(solver='lbfgs', random_state=1)

model_combo.fit(X_resampled, y_resampled)

In [None]:
# Calculated the balanced accuracy score
y_pred = model_combo.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

Naive Bayes BernoulliNB

In [None]:
from sklearn.naive_bayes import BernoulliNB
model_naive = BernoulliNB(fit_prior=True)
model_naive.fit(X_train, y_train)
model_naive.predict(X_test)
model_naive.score(X_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model_naive, X, y, scoring='accuracy', n_jobs=-1)
print (scores)

In [None]:
# heat map of feature correlation
plt.rcParams['figure.figsize']=35,35
g = sns.heatmap(df.corr(),annot=True, fmt = ".1f")

In [None]:
# distribution plot of column
sns.distplot(df['median_listing_price'])

In [None]:
# convert Price_class to numeric, distribution of median_listing_price increase|decrease month/month
df['Price_Status'] = df['Price_Status'].replace(['up','down'],['1','0'])
sns.distplot(df['Price_Status'])

In [None]:
# Reference code for heatmaps :https://www.kaggle.com/code/bsivavenu/house-price-calculation-methods-for-beginners

# most correlated features
corrmat = df.corr()
top_corr_features = corrmat.index[abs(corrmat["median_listing_price"])>0.30]
plt.figure(figsize=(10,10))
g = sns.heatmap(df[top_corr_features].corr(),annot=True,cmap="mako")

In [None]:
# seaborn linear regression fit: median listing price vs median square feet
sns.jointplot(data=df, x="median_listing_price", y="median_square_feet", kind="reg")

# outliers present

In [None]:
#median_listing_price vs average listing price
sns.jointplot(data=df, x="median_listing_price", y="average_listing_price", kind="reg")

In [None]:
#median listing price vs Year
sns.jointplot(data=df, x="median_listing_price", y="Year", kind="reg")

In [None]:
# Find Missing Ratio of Dataset
# reference: https://stackoverflow.com/questions/51070985/find-out-the-percentage-of-missing-values-in-each-column-in-the-given-dataset
#percent_missing = df.isnull().sum() * 100 / len(df)
#missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 #'percent_missing': percent_missing})
#missing_value_df

In [None]:
# reference
# https://www.kaggle.com/code/kanncaa1/machine-learning-tutorial-for-beginners/notebook
#https://www.kaggle.com/code/erick5/predicting-house-prices-with-machine-learning/notebook