In [1]:
### RealEstateOracle Version 2.0 Machine Learning algorithm creation and export 
## Written in Python 3.9.5
# Blake Donahoo 8-20-2021

# place knr_sales.mdl and knr_rental.mdl into the same directory with realestateoracle.py

## DONE ##
# stopped running live SQL queries so that *.CSV files can be loaded and compiled into a moveable *.exe with the program
# limited the input data for predictions to state and date 
# built and exported the sales and rental ML models and successfully ran predictions from them

## TO DO ##
# need to add algorithm for automatic encoding/decoding of state and date data 
# need to add graphics export module for matplotlib visuals to be viewed at the cmd line

In [2]:
# main modules to be compiled into realestateoracle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

# modules only necessary for ML construction
import sklearn
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import joblib

In [3]:
# ASCII-art banner plus release notes printed when the program starts.
# Declared as a raw string: the artwork is full of backslashes ("\ ", "\|")
# which a normal string literal parses as invalid escape sequences and warns
# about. The resulting text is unchanged.
startup_logo = r""" 
||\   ||////    ^    ||       ||////   ///// ////////  ^  //////// ||////
|| \  ||       / \   ||       ||      //        ||    / \    ||    ||
||_// ||////  /__ \  ||       ||////  //////    ||   /__ \   ||    ||////
|| \  ||     //    \ ||       ||          //    ||  //    \  ||    ||
||  \ ||//////      \||////   ||//// //////     || //      \ ||    ||////

       
    ///////  ||\       ^     /////// ||      ||//// 
  ///    /// || \     / \   ///      ||      ||
 ///    ///  ||_//   /__ \  ///      ||      ||////
 //    ///   || \   //    \ ///      ||      ||
 ///////     ||  \ //      \ /////// ||///// ||////
-----------------------------------------------------
A primitive data science library for real estate
Creator: Blake Donahoo
Published: 07-15-2021
Version: 2.0
Updated on: 08-20-2021

Data Source: https://www.zillow.com/research/data/

Data & Functionality contained: 
 - Home values in every state since 1996
 - Calculated value changes over time for every state since 1996
 - Forecasting Metrics and predictive algorithms
 - Rental Values in every state since 2014
 
Coming in Version 2.5:
  - Mean/Median/Mode, Standard Deviation for every slice of information
  - Python generated visualizations like ScatterPlots, Histograms & BarGraphs
"""


In [4]:
### Load both cleaned CSV extracts into DataFrames, parsing `Date` as datetimes
# sales:  NaN values were filled with a 6-mo average
# rental: NaN values were filled with a 10-day average
sales = pd.read_csv(
    'realestate_data/Combined_RealEstateData_Cleaned.csv',
    parse_dates=['Date'],
)
rental = pd.read_csv(
    'realestate_data/Rental_FullEDA_fillNaN.csv',
    parse_dates=['Date'],
)

In [5]:


### Move each target ("y") column to position 0 so the modelling cells can
### slice it out of `.values` as column 0.
# The original cell also called `rental.reset_index()` and threw the result
# away, so it had no effect; applying it would have inserted an `index`
# column in front of the target. The call is therefore dropped.

# "y" for rental is R_Monthly
monthly = rental.pop('R_Monthly')
rental.insert(0, 'R_Monthly', monthly)
# "y" for sales is Avg_AggMortgage
mortgage = sales.pop('Avg_AggMortgage')
sales.insert(0, 'Avg_AggMortgage', mortgage)

In [6]:
# Number of rental rows recorded for each distinct date
rental.Date.value_counts()

2017-02-01    106
2020-06-01    106
2017-06-01    106
2018-10-01    106
2017-05-01    106
             ... 
2019-02-01    105
2014-02-01    105
2014-04-01    105
2014-03-01    104
2014-01-01     99
Name: Date, Length: 89, dtype: int64

In [None]:


# Convert string and date-typed columns into numeric values so that they can be used in our algorithms

def _encode_labels(frame, columns):
    """Replace each listed column of `frame` with integer labels, in place.

    The shared module-level `le` encoder is refit for every column, so after
    this runs `le` only remembers the mapping of the last column encoded.
    """
    for column in columns:
        frame[column] = le.fit_transform(np.array(frame[column]))


def _round_to_int(frame, columns):
    """Round each listed column of `frame` to the nearest integer, in place."""
    for column in columns:
        frame[column] = np.rint(np.array(frame[column])).astype(int)


### RENTAL: dates, city names and states become numerical labels
_encode_labels(rental, ['Date', 'CityName', 'Lstate'])
# round long float values to int ('R_Monthly' is the y value)
_round_to_int(rental, [
    'R_Monthly', 'R_Annual', 'R_PriorMonth', 'R_DiffPrevMonth',
    'R_60DayDiff', 'R_60DayChange', 'R_90DayDiff', 'R_90DayChange',
])

### SALES: dates, states and city names become numerical labels
_encode_labels(sales, ['Date', 'Lstate', 'CityName'])
# round long float values to int ('Avg_AggMortgage' is the y value)
_round_to_int(sales, ['Avg_AggMortgage'])

# Every price tier (bottom / middle / top) carries the same set of metrics.
# The previous version filled `<prefix>_AvgAnnual` from `<prefix>_90DayChange`
# (a copy-paste slip), wiping out the annual figures; each column is now
# rounded from its own values.
# NOTE(review): assumes the CSV already has B_/M_/T_AvgAnnual columns (they are
# listed in the commented-out feature list further down) - TODO confirm.
_TIER_METRICS = (
    '30DayChange', '60DayChange', '90DayChange', 'PrincipalMonthly',
    'AvgMortgage', 'InterestMonthly', 'AvgAnnual',
)
for tier_column, prefix in (('BottomTier', 'B'), ('MiddleTier', 'M'), ('TopTier', 'T')):
    _round_to_int(
        sales,
        [tier_column] + ['{}_{}'.format(prefix, metric) for metric in _TIER_METRICS],
    )

In [None]:
# Write the rental table to CSV, then preview its first five rows
rental.to_csv(path_or_buf='rental_prediction_key.csv')
rental.head(n=5)

In [None]:
# Write the sales table to CSV, then preview its first five rows
sales.to_csv(path_or_buf='sales_prediction_key.csv')
sales.head(n=5)

In [None]:


### RENTAL ###
### Predictive modeling and *.mdl export for in-app predictions ###
# The exported model only ever sees the state and the date.
feature_cols = ['Lstate', 'Date']
X_state_date = rental[feature_cols]
y = rental.R_Monthly
knr = KNeighborsRegressor(n_neighbors=10)
knr.fit(X_state_date, y)

# test dataset: wider matrix built from columns 1-9 of the table
rental_array = rental.values
X = rental_array[:,1:10] # rental[ everything else ]
y = rental_array[:,0] # rental['R_Monthly']
test_size = 0.10
seed = 40

# train and fit a reference model on the wider matrix
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, y, test_size=test_size, random_state=seed)
model = KNeighborsRegressor(n_neighbors=10)
model.fit(X_train, Y_train)

# Score the estimator on the SAME two features it is exported with. The
# earlier version passed the wider `X` here, so the printed R^2 described a
# model trained on different inputs than the one shipped in knr_rental.mdl.
print(cross_val_score(knr, X_state_date, y, cv=10, scoring='r2').mean())

In [None]:
# Peek at the final row of the matrix X
X[-1]

In [None]:
### Rental Prediction Test #### predicting the monthly rental rate with only a state and date input

# Features are given as numbers. The earlier version passed strings, which
# relies on scikit-learn's deprecated implicit string-to-number conversion.
#              Lstate  Date
rental_test = [[0, 90]]
rental_test2 = [[20, 120]]
print(knr.predict(rental_test))

In [None]:
# Rental model: write it to disk, read it back and predict with the reloaded copy

joblib.dump(value=knr, filename='knr_rental.mdl')
predict_rental = joblib.load(filename='knr_rental.mdl')
print(predict_rental.predict(rental_test2))

In [None]:
#feature_cols = ['Lstate', 'Date', 'BottomTier', 'B_30DayChange'\
#                , 'B_60DayChange', 'B_90DayChange', 'B_PrincipalMonthly', 'B_AvgMortgage'\
#                , 'B_InterestMonthly', 'B_AvgAnnual', 'MiddleTier','M_30DayChange'\
#                , 'M_60DayChange', 'M_90DayChange', 'M_PrincipalMonthly'\
#                , 'M_AvgMortgage', 'M_InterestMonthly', 'M_AvgAnnual', 'TopTier'\
#                , 'T_30DayChange', 'T_60DayChange', 'T_90DayChange'\
#                , 'T_PrincipalMonthly', 'T_AvgMortgage', 'T_InterestMonthly', 'T_AvgAnnual']

In [None]:


### SALES ###
### Predictive modeling and *.mdl export for in-app predictions ###
# The exported model only ever sees the state and the date.
feature_cols = ['Lstate', 'Date']
X_state_date = sales[feature_cols]
y = sales.Avg_AggMortgage
knr = KNeighborsRegressor(n_neighbors=5)
knr.fit(X_state_date, y)

# test dataset: wider matrix built from columns 1-23 of the table
sales_array = sales.values
X = sales_array[:,1:24] # sales[ everything else ]
Y = sales_array[:,0] # sales['Avg_AggMortgage']
test_size = 0.10
seed = 40

# train and fit a reference model on the wider matrix
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X_train, Y_train)

# Score the estimator on the SAME two features it is exported with. The
# earlier version passed the wider `X` here, so the displayed R^2 described a
# model trained on different inputs than the one shipped in knr_sales.mdl.
cross_val_score(knr, X_state_date, Y, cv=50, scoring='r2').mean()

In [None]:
# Peek at the final row of the matrix X
X[-1]

In [None]:
### Sales Prediction Test #### predicting the average mortgage with only a state and date input
# Features are given as numbers. The earlier version passed strings, which
# relies on scikit-learn's deprecated implicit string-to-number conversion.
# NOTE(review): Date values of 10000 / 15000 are presumably far above any
# label the date encoding produces - confirm these samples are intended.
#              Lstate  Date
sales_test = [[5, 10000]]
sales_test2 = [[40, 15000]]
print(knr.predict(sales_test))

In [None]:
# Sales export and prediction reload / test
# Dumps the `knr` regressor to knr_sales.mdl, loads it back from that file and
# predicts on the second hand-written sample with the reloaded copy.

joblib.dump(knr,'knr_sales.mdl')
predict_sales = joblib.load('knr_sales.mdl')
print(predict_sales.predict(sales_test2))

In [7]:
# Show the banner, then define the labels used to echo the user's menu choice
print(startup_logo)
input1_op1, input1_op2, input1_op3, input1_op4 = (
    'Avg Sales price change over time',
    'Avg Rental Rates change over time',
    'Oracle Prediction',
    'Scatterplots',
)

 
||\   ||////    ^    ||       ||////   ///// ////////  ^  //////// ||////
|| \  ||       / \   ||       ||      //        ||    / \    ||    ||
||_// ||////  /__ \  ||       ||////  //////    ||   /__ \   ||    ||////
|| \  ||     //    \ ||       ||          //    ||  //    \  ||    ||
||  \ ||//////      \||////   ||//// //////     || //      \ ||    ||////

       
    ///////  ||\       ^     /////// ||      ||//// 
  ///    /// || \     / \   ///      ||      ||
 ///    ///  ||_//   /__ \  ///      ||      ||////
 //    ///   || \   //    \ ///      ||      ||
 ///////     ||  \ //      \ /////// ||///// ||////
-----------------------------------------------------
A primitive data science library for real estate
Creator: Blake Donahoo
Published: 07-15-2021
Version: 2.0
Updated on: 08-20-2021

Data Source: https://www.zillow.com/research/data/

Data & Functionality contained: 
 - Home values in every state since 1996
 - Calculated value changes over time for every state since 199

In [8]:
def reset():
    """Wait for the user to type 0, then reopen the main menu.

    The user gets up to three attempts, each with a more insistent prompt.
    As soon as an attempt equals "0" (surrounding whitespace ignored),
    `mainmenu()` is called. If all three attempts are wrong the function
    returns without reopening the menu.

    The earlier version discarded the answers to the second and third
    prompts and kept re-testing the first answer, so a retry could never
    succeed; each answer is now checked as it is read.
    """
    prompts = (
        "Type the number 0 and press enter to return to the main menu \n",
        "Invalid Input Error: Type the number 0 and press enter to return to the main menu \n",
        "You're killing me. Type the number 0 and press enter to return to the main menu \n",
    )
    for prompt in prompts:
        if input(prompt).strip() == "0":
            mainmenu()
            return

In [9]:
# Two-letter state abbreviations in the order the original lookup numbered
# them; a state's code is its 1-based position in this tuple.
# NOTE(review): the training cells encode `Lstate` with LabelEncoder, which
# presumably yields 0-based codes in sorted order (and may include extra
# regions). These hand-assigned codes likely do not line up with that
# encoding - verify against the fitted encoder before trusting predictions.
_STATE_ORDER = (
    'al', 'ak', 'ar', 'az', 'ca', 'co', 'ct', 'de', 'fl', 'ga',
    'hi', 'ia', 'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 'md',
    'me', 'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne', 'nh',
    'nj', 'nm', 'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 'sc',
    'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'wa', 'wi', 'wv', 'wy',
)
_STATE_CODES = {abbr: code for code, abbr in enumerate(_STATE_ORDER, start=1)}

# Model file to load for each market the user can pick.
_MODEL_FILES = {'sales': 'knr_sales.mdl', 'rental': 'knr_rental.mdl'}


def _extract_year(date_text):
    """Return the year of an `mm-dd-yyyy` string as an int, or None if unusable."""
    # Strip the leading "mm-dd-" part; whatever is left must be a plain number.
    remainder = re.sub(r'\d+\-\d+\-', '', date_text.strip())
    try:
        return int(remainder)
    except ValueError:
        return None


def oraclePredict():
    """Ask for a market, a state and a date, then print a model prediction.

    Reads three answers from the keyboard, turns the state abbreviation into
    its numeric code and the date into its year, loads the model file for the
    chosen market and prints `model.predict([[state_code, year]])`.
    Control is handed back to `mainmenu()` afterwards.

    If the market, state or date cannot be interpreted, a message is printed
    and the main menu is reopened without running a prediction. Previously an
    unknown state or date was passed to the model as raw text, and an unknown
    market ended the function silently.
    """
    print('You chose {} \n\n'.format(input1_op3))

    oracleInput_market = input("Type \"rental\" or \"sales\" to pick a market to calculate a prediction for \n\n")
    oracleInput_state = input("Type the two letter abbreviation of the state you want a prediction for (not case sensitive) \n\n")
    oracleInput_date = input("Type the numerical date you want a prediction for in the format: mm-dd-yyyy (with dashes) \n\n")
    print('You chose to predict the {} market in the state of {} around the time of {} \n\n'\
          .format(oracleInput_market,oracleInput_state,oracleInput_date))

    market = oracleInput_market.strip().lower()
    state_code = _STATE_CODES.get(oracleInput_state.strip().lower())
    year = _extract_year(oracleInput_date)

    problem = None
    if market not in _MODEL_FILES:
        problem = 'Unknown market "{}": type rental or sales.'.format(oracleInput_market)
    elif state_code is None:
        problem = 'Unknown state "{}": use a two letter abbreviation such as tx.'.format(oracleInput_state)
    elif year is None:
        problem = 'Could not read the date "{}": use the format mm-dd-yyyy.'.format(oracleInput_date)

    if problem is not None:
        print(problem + "\n\n")
        mainmenu()
        return

    # joblib.load unpickles the file, which can execute arbitrary code:
    # only ship and load model files you produced yourself.
    model = joblib.load(_MODEL_FILES[market])
    # Plain integers, so the model receives numeric features rather than text.
    prediction = model.predict([[state_code, year]])
    print(prediction)
    print("\n\n")
    mainmenu()
    
    

In [None]:
# Manual run of the prediction flow (waits for keyboard input).
# oraclePredict() finishes by calling mainmenu(), which is defined in a later
# cell of this notebook, so that cell has to be executed first.
oraclePredict()

You chose Oracle Prediction 


Type "rental" or "sales" to pick a market to calculate a prediction for 

sales
Type the two letter abbreviation of the state you want a prediction for (not case sensitive) 

tx
Type the numerical date you want a prediction for in the format: mm-dd-yyyy (with dashes) 

01-01-2040


  return f(*args, **kwargs)


You chose to predict the sales market in the state of tx around the time of 01-01-2040 


[3572.4]





In [10]:
def mainmenu():
    """Show the top-level menu and dispatch to the chosen action.

    Option 3 starts `oraclePredict()`. Options 1 and 2 ask for a state filter
    and print the sales or rental table: the whole table for "all", otherwise
    only the rows whose `Lstate` equals the upper-cased answer. `reset()` is
    then called to wait for the user.

    Any other answer prints a hint and shows the menu again. Previously an
    invalid option still asked for a state and then returned without doing
    anything.
    """
    menu_prompt = """What would you like to know? \n\n
    Type the number of the option that best fits your question and press enter.\n\n 
    1 = Avg Sales price table data  2 = Avg Rental Rates table data \n
    3 = Oracle Prediction
    \n\n"""
    state_prompt = """
        What state would you like to know about? \n\n 
        Type the two letter state abbreviation for state isolated metrics \n\n 
        or \"all\" for national metrics.\n\n 
        example: tx (not case sensitive) \n\n"""

    # menu option -> (label echoed back to the user, table to print)
    tables = {
        "1": (input1_op1, sales),
        "2": (input1_op2, rental),
    }

    while True:
        input_1 = input(menu_prompt).strip()
        if input_1 == "3":
            oraclePredict()
            return
        if input_1 in tables:
            break
        print("Invalid option: type 1, 2 or 3 and press enter.\n\n")

    input_2 = input(state_prompt).strip()
    label, table = tables[input_1]
    state = input_2.upper()

    print('You chose {} with a filter of {}\n\n'.format(label, state))
    # NOTE(review): this compares Lstate with the text abbreviation. Once the
    # encoding cell has replaced Lstate with integer labels the filter will
    # match no rows - confirm which form of the tables the menu should use.
    print(table if state == "ALL" else table[table.Lstate == state])
    reset()
        

In [11]:
# Start the interactive menu; the cell keeps running while it waits for keyboard input.
mainmenu()

What would you like to know? 


    Type the number of the option that best fits your question and press enter.

 
    1 = Avg Sales price table data  2 = Avg Rental Rates table data 

    3 = Oracle Prediction
    

1

        What state would you like to know about? 

 
        Type the two letter state abbreviation for state isolated metrics 

 
        or "all" for national metrics.

 
        example: tx (not case sensitive) 

tx
You chose Avg Sales price change over time with a filter of TX


      Avg_AggMortgage  BottomTier  MiddleTier  TopTier Lstate       Date  \
1853      1553.814722       85093      180443   275793     TX 2014-01-01   
1854      1553.814722       85093      180443   275793     TX 2014-01-01   
1855      1553.814722       85093      180443   275793     TX 2014-01-01   
1856      1553.814722       85093      180443   275793     TX 2014-01-01   
1857      1553.814722       85093      180443   275793     TX 2014-01-01   
...               ...         ...       

KeyboardInterrupt: Interrupted by user