In [1]:
### RealEstateOracle Version 2.0 Machine Learning algorithm creation and export 
## Written in Python 3.9.5
# Blake Donahoo 8-20-2021

## DONE ##
# stopped running live SQL queries so that *.CSV files can be loaded and compiled into a moveable *.exe with the program
# limited the input data for predictions to state and date 
# built and exported sales and rental ML models and successfully predicted from them

## TO DO ##
# need to add algorithm for automatic encoding/decoding of state and date data 
# need to add graphics export module for matplotlib visuals to be viewed at the cmd line

In [2]:
# main modules to be compiled into realestateoracle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# modules only necessary for ML construction
import sklearn
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import joblib

In [3]:
# Banner text for the application.
# Raw string: the ASCII art contains backslashes followed by spaces and '|',
# which are invalid escape sequences in a normal string literal (Python warns
# about them and will eventually reject them). The resulting text is unchanged.
startup_logo = r""" 
||\   ||////    ^    ||       ||////   ///// ////////  ^  //////// ||////
|| \  ||       / \   ||       ||      //        ||    / \    ||    ||
||_// ||////  /__ \  ||       ||////  //////    ||   /__ \   ||    ||////
|| \  ||     //    \ ||       ||          //    ||  //    \  ||    ||
||  \ ||//////      \||////   ||//// //////     || //      \ ||    ||////

       
    ///////  ||\       ^     /////// ||      ||//// 
  ///    /// || \     / \   ///      ||      ||
 ///    ///  ||_//   /__ \  ///      ||      ||////
 //    ///   || \   //    \ ///      ||      ||
 ///////     ||  \ //      \ /////// ||///// ||////
-----------------------------------------------------
A primitive data science library for real estate
Creator: Blake Donahoo
Published: 07-15-2021
Version: 2.0
Updated on: 08-20-2021

Data Source: https://www.zillow.com/research/data/

Data & Functionality contained: 
 - Home values in every state since 1996
 - Calculated value changes over time for every state since 1996
 - Forecasting Metrics and predictive algorithms
 - Rental Values in every state since 2014
 
Coming in Version 2.5:
  - Mean/Median/Mode, Standard Deviation for every slice of information
  - Python generated visualizations like ScatterPlots, Histograms & BarGraphs
"""


In [4]:
### Load the pre-cleaned CSV exports into DataFrames; 'Date' is parsed as datetime
# Author's notes on how the files were prepared:
#   - rental NaN values filled with 10-day average
#   - sales NaN values filled with 6-mo average
rental = pd.read_csv(
    'realestate_data/Rental_FullEDA_fillNaN.csv',
    parse_dates=['Date'],
)
sales = pd.read_csv(
    'realestate_data/Combined_RealEstateData_Cleaned.csv',
    parse_dates=['Date'],
)

In [5]:


### Move each target ("y") column to position 0 so the modelling cells can
### slice the target as column 0 and the features as the columns after it.

# "y" for rental is R_Monthly
monthly = rental.pop('R_Monthly')
rental.insert(0, 'R_Monthly', monthly)
# (The former `rental.reset_index()` call here discarded its result, so it
#  changed nothing and has been removed.)

# "y" for sales is Avg_AggMortgage
mortgage = sales.pop('Avg_AggMortgage')
sales.insert(0, 'Avg_AggMortgage', mortgage)

In [6]:


# Convert string and datetime columns into numeric values so that they can be
# used in our algorithms, and round the float columns to whole numbers.
#
# NOTE: run this cell once per fresh load of `rental` / `sales`. Re-running it
# on already-encoded frames would overwrite `label_classes` with integer codes.


def _encode_labels(frame, frame_name, columns, classes_out):
    """Replace each listed column of ``frame`` with integer label codes, in place.

    The shared module-level ``le`` is refitted for every column, so its fitted
    classes are copied into ``classes_out[(frame_name, column)]`` before the
    next fit overwrites them. Position ``i`` in that array is the original
    value that was encoded as ``i``.
    """
    for column in columns:
        frame[column] = le.fit_transform(np.array(frame[column]))
        classes_out[(frame_name, column)] = le.classes_.copy()


def _round_to_int(frame, columns):
    """Round each listed column of ``frame`` to the nearest integer, in place."""
    for column in columns:
        frame[column] = np.rint(np.array(frame[column])).astype(int)


# (frame name, column) -> array of original values, indexed by label code.
# Needed to translate a real state / date into the code the models expect.
label_classes = {}

### RENTAL ###
_encode_labels(rental, 'rental', ['Date', 'CityName', 'Lstate'], label_classes)
_round_to_int(rental, [
    'R_Monthly',  ### y value ###
    'R_Annual',
    'R_PriorMonth',
    'R_DiffPrevMonth',
    'R_60DayDiff',
    'R_60DayChange',
    'R_90DayDiff',
    'R_90DayChange',
])

### SALES ###
_encode_labels(sales, 'sales', ['Date', 'Lstate', 'CityName'], label_classes)

# Every price tier carries the same set of derived columns, prefixed B_/M_/T_.
tier_metrics = ['30DayChange', '60DayChange', '90DayChange', 'PrincipalMonthly',
                'AvgMortgage', 'InterestMonthly', 'AvgAnnual']
sales_round_cols = ['Avg_AggMortgage']  ### y value ###
for tier_col, prefix in (('BottomTier', 'B'), ('MiddleTier', 'M'), ('TopTier', 'T')):
    sales_round_cols.append(tier_col)
    sales_round_cols += [f'{prefix}_{metric}' for metric in tier_metrics]

# Each column is rounded from itself. Previously B_/M_/T_AvgAnnual were
# overwritten with the rounded *_90DayChange values (copy-paste slip).
# NOTE(review): this assumes the *_AvgAnnual columns exist in the sales CSV,
# as the commented-out feature list later in the notebook suggests - confirm.
_round_to_int(sales, sales_round_cols)

In [7]:
# Sanity check: show the last five rows of the encoded / rounded rental frame
rental.tail(5)

Unnamed: 0,R_Monthly,Lstate,CityName,Date,R_Annual,R_PriorMonth,R_DiffPrevMonth,R_60DayDiff,R_60DayChange,R_90DayDiff,R_90DayChange
9414,1196,40,57,86,14352,1346,-150,1341,-145,1190,6
9415,1205,40,57,87,14460,1196,9,1346,-141,1341,-136
9416,1349,40,53,87,16188,1205,144,1196,153,1346,3
9417,1350,40,53,88,16200,1349,1,1205,145,1196,154
9418,1219,40,57,88,14628,1350,-131,1349,-130,1205,14


In [24]:


### RENTAL ###
### Predictive modeling and *.mdl export for in-app predictions ###

# Model that gets exported: predicts R_Monthly from the encoded state and date only.
feature_cols = ['Lstate', 'Date']
X = rental[feature_cols]
y = rental.R_Monthly
knr = KNeighborsRegressor(n_neighbors=10)
knr.fit(X, y)

# test dataset
# `rental_array` was never defined in this notebook (NameError on a fresh
# kernel); build it from the frame the same way the sales cell does.
rental_array = rental.values
X = rental_array[:, 1:10]  # columns 1..9 (the last rental column is not included)
y = rental_array[:, 0]     # rental['R_Monthly']
test_size = 0.10
seed = 40

# train and fit test dataset
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, y, test_size=test_size, random_state=seed)
model = KNeighborsRegressor(n_neighbors=10)
model.fit(X_train, Y_train)

# NOTE(review): this score is computed on the 9-column matrix above, not on
# the two features the exported `knr` was fitted with, so it does not measure
# the exported model. In the rows shown by rental.tail(), R_Annual equals
# 12 * R_Monthly, so the matrix appears to contain the target itself.
# `model` and the X_test / Y_test split are not evaluated anywhere in this cell.
print(cross_val_score(knr, X, y, cv=10, scoring='r2').mean())

0.9954756479019983


In [25]:
# Last row of the feature matrix currently bound to X
X[-1]

array([   40,    57,    88, 14628,  1350,  -131,  1349,  -130,  1205],
      dtype=int64)

In [26]:
### Rental Prediction Test ####

# Inputs are the integer label codes for the two model features, passed as
# numbers in a frame with the same column names the model was fitted on.
# They were previously passed as strings, which scikit-learn only converted
# with a deprecation warning and newer releases refuse.
# NOTE(review): 'Date' is a LabelEncoder code, not a calendar value - confirm
# that 90 and 120 fall inside the range of codes the model was trained on.
rental_test = pd.DataFrame([[0, 90]], columns=['Lstate', 'Date'])
rental_test2 = pd.DataFrame([[20, 120]], columns=['Lstate', 'Date'])
print(knr.predict(rental_test))

[1681.]


  return f(*args, **kwargs)


In [27]:
# Rental model: write to disk, read it back, and predict with the reloaded copy
# (`knr` must still be the rental regressor when this cell runs)
rental_model_file = 'knr_rental.mdl'
joblib.dump(knr, rental_model_file)
predict_rental = joblib.load(rental_model_file)
print(predict_rental.predict(rental_test2))

[1525.3]


  return f(*args, **kwargs)


In [28]:
#feature_cols = ['Lstate', 'Date', 'BottomTier', 'B_30DayChange'\
#                , 'B_60DayChange', 'B_90DayChange', 'B_PrincipalMonthly', 'B_AvgMortgage'\
#                , 'B_InterestMonthly', 'B_AvgAnnual', 'MiddleTier','M_30DayChange'\
#                , 'M_60DayChange', 'M_90DayChange', 'M_PrincipalMonthly'\
#                , 'M_AvgMortgage', 'M_InterestMonthly', 'M_AvgAnnual', 'TopTier'\
#                , 'T_30DayChange', 'T_60DayChange', 'T_90DayChange'\
#                , 'T_PrincipalMonthly', 'T_AvgMortgage', 'T_InterestMonthly', 'T_AvgAnnual']

In [29]:


### SALES ###
### Predictive modeling and *.mdl export for in-app predictions ###

# Model that gets exported: predicts Avg_AggMortgage from the encoded state and date only.
# Note: this rebinds `knr`, which until now held the rental regressor.
feature_cols = ['Lstate', 'Date']
X = sales[feature_cols]
y = sales.Avg_AggMortgage
knr = KNeighborsRegressor(n_neighbors=5).fit(X, y)

# test dataset: target is column 0, features are columns 1..23 of the frame
sales_array = sales.values
Y = sales_array[:, 0]     # sales['Avg_AggMortgage']
X = sales_array[:, 1:24]  # 23 columns - not every remaining sales column
seed = 40
test_size = 0.10

# train and fit test dataset (90/10 split, fixed seed)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed)
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X_train, Y_train)

# NOTE(review): the score below is computed on the 23-column matrix, not on
# the two features the exported `knr` was fitted with; `model` and the
# X_test / Y_test split are not evaluated in this cell.
cross_val_score(knr, X, Y, cv=50, scoring='r2').mean()

0.9528024234542437

In [30]:
# Last row of the feature matrix currently bound to X
X[-1]

array([ 422637,  792474, 1162311,       6,      88,    4840,    7862,
         10582,   11191,   17671,   23577,   17542,   27479,   36571,
          1173,    3639,    2466,    2201,    6824,    4623,    3228,
         10009,    6781], dtype=int64)

In [31]:
### Sales Prediction Test ####

# Inputs are the integer label codes for the two model features, passed as
# numbers in a frame with the same column names the model was fitted on.
# They were previously passed as strings, which scikit-learn only converted
# with a deprecation warning and newer releases refuse.
# NOTE(review): 'Date' is a LabelEncoder code, not a calendar value - confirm
# that 10000 and 15000 are meaningful codes for the sales data.
sales_test = pd.DataFrame([[5, 10000]], columns=['Lstate', 'Date'])
sales_test2 = pd.DataFrame([[40, 15000]], columns=['Lstate', 'Date'])
print(knr.predict(sales_test))

[4210.]


  return f(*args, **kwargs)


In [33]:
# Sales model: write to disk, read it back, and predict with the reloaded copy
# (`knr` must be the sales regressor when this cell runs)
sales_model_file = 'knr_sales.mdl'
joblib.dump(knr, sales_model_file)
predict_sales = joblib.load(sales_model_file)
print(predict_sales.predict(sales_test2))

[3572.4]


  return f(*args, **kwargs)
