In [None]:
# LINEAR & LOGISTIC REGRESSION MODELING ((2)) FOR THE REALESTATE DB (((Rental Market)))

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs
from sklearn.datasets import make_regression

from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

In [164]:
# read data into a DataFrame
# NaN values filled with 24-mo average 

rental = pd.read_csv('realestate_data/Rental_FullEDA_fillNaN.csv', parse_dates = ['Date'])

In [165]:
# Since we will be predicting the amount of rent to be paid in a month in any given state at a date in the future,
#  "y" will be "R_Monthly", so we need to move it to index 0 to make things easier 

monthly = rental.pop('R_Monthly')
rental.insert(0, 'R_Monthly', monthly)
rental.head()


Unnamed: 0,R_Monthly,Lstate,CityName,Date,R_Annual,R_PriorMonth,R_DiffPrevMonth,R_60DayDiff,R_60DayChange,R_90DayDiff,R_90DayChange
0,924,AL,Birmingham,2014-01-01,11088,964.291667,2.791667,963.262153,4.866319,962.231409,7.385417
1,944,AL,Birmingham,2014-02-01,11328,924.0,20.0,964.291667,4.791667,963.262153,7.25
2,937,AL,Birmingham,2014-03-01,11244,944.0,-7.0,924.0,13.0,964.291667,7.0
3,935,AL,Birmingham,2014-04-01,11220,937.0,-2.0,944.0,-9.0,924.0,11.0
4,937,AL,Birmingham,2014-05-01,11244,935.0,2.0,937.0,0.0,944.0,-7.0


In [166]:
rental['Lstate'].value_counts()

CA               890
FL               890
OH               621
NY               445
NC               445
TX               445
CO               445
PA               439
TN               354
CT               267
MA               267
UT               267
SC               266
GA               265
MO               178
OK               178
MI               178
WA               178
LA               178
WI               178
AZ               178
VA               177
KY                89
DC                89
IL                89
MN                89
ID                89
NV                89
MS                89
IN                89
AL                89
MD                89
NE                89
United States     89
NM                89
IA                89
RI                89
OR                89
AR                89
HI                89
KS                88
Name: Lstate, dtype: int64

In [167]:
# Convert string and datetype data into numeric values so that they can be used in our algorithms

le = preprocessing.LabelEncoder()

# convert dates to numerical labels
rental['Date'] = le.fit_transform(np.array(rental['Date']))

# convert Lstate to numerical labels
rental['Lstate'] = le.fit_transform(np.array(rental['Lstate']))

# convert CityName to numerical labels
rental['CityName'] = le.fit_transform(np.array(rental['CityName']))

# round long float values to int
rental['R_Annual'] = np.rint(np.array(rental.R_Annual)).astype(int)
rental['R_Monthly'] = np.rint(np.array(rental.R_Monthly)).astype(int)
rental['R_PriorMonth'] = np.rint(np.array(rental.R_PriorMonth)).astype(int)
rental['R_DiffPrevMonth'] = np.rint(np.array(rental.R_DiffPrevMonth)).astype(int)
rental['R_60DayDiff'] = np.rint(np.array(rental.R_60DayDiff)).astype(int)
rental['R_60DayChange'] = np.rint(np.array(rental.R_60DayChange)).astype(int)
rental['R_90DayDiff'] = np.rint(np.array(rental.R_90DayDiff)).astype(int)
rental['R_90DayChange'] = np.rint(np.array(rental.R_90DayChange)).astype(int)

In [168]:
rental.head()

Unnamed: 0,R_Monthly,Lstate,CityName,Date,R_Annual,R_PriorMonth,R_DiffPrevMonth,R_60DayDiff,R_60DayChange,R_90DayDiff,R_90DayChange
0,924,0,10,0,11088,964,3,963,5,962,7
1,944,0,10,1,11328,924,20,964,5,963,7
2,937,0,10,2,11244,944,-7,924,13,964,7
3,935,0,10,3,11220,937,-2,944,-9,924,11
4,937,0,10,4,11244,935,2,937,0,944,-7


In [169]:
rental.describe()

Unnamed: 0,R_Monthly,Lstate,CityName,Date,R_Annual,R_PriorMonth,R_DiffPrevMonth,R_60DayDiff,R_60DayChange,R_90DayDiff,R_90DayChange
count,9419.0,9419.0,9419.0,9419.0,9419.0,9419.0,9419.0,9419.0,9419.0,9419.0,9419.0
mean,1313.997983,19.440174,52.488162,44.046608,15767.975794,-9118487.0,-9119793.0,-18238290.0,-18239590.0,-27358090.0,-27359380.0
std,408.159519,12.498957,30.592759,25.661204,4897.914224,139655100.0,139655000.0,197080500.0,197080300.0,240855800.0,240855700.0
min,581.0,0.0,0.0,0.0,6972.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0
25%,1055.0,7.0,26.0,22.0,12660.0,1052.5,-58.0,1050.0,-112.0,1047.5,-53.0
50%,1205.0,21.0,52.0,44.0,14460.0,1201.0,4.0,1199.0,6.0,1196.0,5.0
75%,1443.0,31.0,79.0,66.0,17316.0,1440.0,54.0,1438.0,111.0,1436.0,55.0
max,3096.0,40.0,105.0,88.0,37152.0,3096.0,1645.0,3096.0,1645.0,3096.0,1668.0


In [170]:
rental.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9419 entries, 0 to 9418
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   R_Monthly        9419 non-null   int32
 1   Lstate           9419 non-null   int32
 2   CityName         9419 non-null   int32
 3   Date             9419 non-null   int64
 4   R_Annual         9419 non-null   int32
 5   R_PriorMonth     9419 non-null   int32
 6   R_DiffPrevMonth  9419 non-null   int32
 7   R_60DayDiff      9419 non-null   int32
 8   R_60DayChange    9419 non-null   int32
 9   R_90DayDiff      9419 non-null   int32
 10  R_90DayChange    9419 non-null   int32
dtypes: int32(10), int64(1)
memory usage: 441.6 KB


In [171]:
### Predictive modeling and *.mdl export for in-app predictions ###

In [185]:
feature_cols = cols = ['Date', 'Lstate', 'R_DiffPrevMonth', 'R_60DayChange', 'R_90DayChange']
X = rental[feature_cols]
y = rental.R_Monthly
knr = KNeighborsRegressor(n_neighbors=1)
knr.fit(X,y)

KNeighborsRegressor(n_neighbors=1)

In [173]:
### Decision Tree Classification for Accuracy Score ###

In [186]:
# test dataset

rental_array = rental.values
X = rental_array[:,1:10] # rental[ everything else ]
Y = rental_array[:,0] # rental['R_Monthly']
test_size = 0.10
seed = 40

In [187]:
# train and fit test dataset
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(\
                                    X, Y, test_size=test_size, random_state=seed)

model = KNeighborsRegressor(n_neighbors=1)
model.fit(X_train, Y_train)

KNeighborsRegressor(n_neighbors=1)

In [188]:
treeclf = DecisionTreeClassifier(max_depth=75, random_state=seed)
cross_val_score(treeclf, X, Y, cv=10, scoring='accuracy').mean()



0.8736572422615865

In [121]:
### Finding which rows of data are causing the above error, which states that some y-axis values only appear once 
#        (which means they can't be split by our cv count)

In [122]:
rental['R_Monthly'].value_counts()

1019    26
1118    25
1099    25
1080    25
1123    24
        ..
2325     1
2333     1
2341     1
2365     1
2039     1
Name: R_Monthly, Length: 1750, dtype: int64

In [123]:
# turn our value counts into a table with the R_Monthly value to make sanity checks easier:::

value_counts = rental['R_Monthly'].value_counts().rename_axis('unique_values').reset_index(name='counts')
value_counts

Unnamed: 0,unique_values,counts
0,1019,26
1,1118,25
2,1099,25
3,1080,25
4,1123,24
...,...,...
1745,2325,1
1746,2333,1
1747,2341,1
1748,2365,1


In [124]:
### count how many rows have a unique value for R_Monthly:

rental_value_counts = rental['R_Monthly'].value_counts()
single_value_count = 0 
for i in rental_value_counts:
    if i == 1:
        single_value_count+=1
print(single_value_count)

# 550 rows in our "y" column have values that only appear once...remove them for the sake of accuracy? lets try and see:::

550


In [125]:
# compile a list of row indexes with a singularly-occuring R_Monthly value:::

row_indexes = list(set([]))
refined_lst = []
for i in value_counts['unique_values']:
    for k in rental['R_Monthly']:
        if i == k:
            if k not in row_indexes:
                row_indexes.append(rental['R_Monthly'].loc[k])
for i in row_indexes:
    if i not in refined_lst:
        refined_lst.append(i)

In [126]:
print(refined_lst)

[1844, 2351, 1895, 1745, 3022, 1760, 1765, 2397, 1922, 3073, 2369, 2935, 2832, 2356, 2237, 3083, 1223, 2800, 3096, 1471, 1216, 2380, 1739, 2901, 2044, 1209, 2392, 2364, 1188, 1448, 3072, 1783, 1447, 2174, 1184, 1281, 3048, 2349, 3066, 1388, 2400, 1740, 1310, 1437, 1449, 2224, 3017, 1892, 2455, 2010, 2403, 2418, 1377, 3079, 1380, 2335, 1923, 1212, 1811, 2453, 1312, 2840, 1439, 3036, 1284, 3077, 2222, 1168, 1828, 1358, 1830, 1078, 3026, 1832, 2957, 2466, 3080, 2438, 2216, 2289, 1846, 1917, 1407, 1887, 1205, 3091, 1784, 1535, 2328, 1693, 1192, 2409, 1362, 2396, 2298, 2505, 2355, 3030, 1741, 3063, 2241, 1318, 1884, 2257, 2059, 1280, 1412, 2394, 1903, 2325, 1276, 1402, 2240, 1176, 1869, 1695, 1432, 1383, 1099, 1630, 1757, 3013, 1098, 1872, 1401, 2186, 1291, 1144, 2117, 1399, 1937, 1857, 1854, 2429, 1089, 1202, 2244, 2354, 1480, 1866, 2680, 3095, 2200, 1544, 1250, 1691, 1642, 1314, 1807, 2219, 1139, 1762, 2156, 1966, 2370, 1886, 1496, 1717, 1728, 1771, 3049, 1258, 3035, 1924, 2066, 2018, 123

In [127]:
# sanity check ***making sure we captured row indexes correctly before we drop 
# all of the row indexes appended to [row_indexes]. Lets check the first one in the list:::

print(rental.loc[1844])

R_Monthly           2034
Lstate                 5
CityName              90
Date                  51
R_Annual           24408
R_PriorMonth        2020
R_DiffPrevMonth       14
R_60DayDiff         1318
R_60DayChange        716
R_90DayDiff         1341
R_90DayChange        693
Name: 1844, dtype: int64


In [128]:
# drop row indexes according to our list of distinct row indexes
for i in refined_lst:
    rental.drop([i], axis=0, inplace=True)

In [129]:
rental_r = rental.reset_index()

In [130]:
rental_r.describe() # now we have dropped all rows with R_Monthly values that only appear once

Unnamed: 0,index,R_Monthly,Lstate,CityName,Date,R_Annual,R_PriorMonth,R_DiffPrevMonth,R_60DayDiff,R_60DayChange,R_90DayDiff,R_90DayChange
count,8561.0,8561.0,8561.0,8561.0,8561.0,8561.0,8561.0,8561.0,8561.0,8561.0,8561.0,8561.0
mean,5003.290737,1289.205583,20.892653,52.844644,43.912977,15470.467002,-9530826.0,-9532106.0,-19062940.0,-19064210.0,-28595060.0,-28596320.0
std,2674.479815,397.290141,12.185033,30.826518,25.459722,4767.481691,142764100.0,142764000.0,201448300.0,201448200.0,246169700.0,246169600.0
min,0.0,581.0,0.0,0.0,0.0,6972.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0
25%,2958.0,1043.0,8.0,25.0,22.0,12516.0,1041.0,-53.0,1037.0,-102.0,1036.0,-46.0
50%,5138.0,1183.0,23.0,53.0,44.0,14196.0,1181.0,4.0,1179.0,6.0,1176.0,5.0
75%,7278.0,1411.0,31.0,79.0,66.0,16932.0,1407.0,49.0,1401.0,101.0,1401.0,49.0
max,9418.0,3096.0,40.0,105.0,88.0,37152.0,3096.0,1645.0,3096.0,1645.0,3096.0,1668.0


In [131]:
# just to be safe, lets put everything from rental_r into a new dataframe to avoid the possibility that dropped indexes
# are simply hidden and not truly deleted.

rental_data = {'R_Monthly': rental_r.iloc[:,1].values,
               'Date': rental_r.iloc[:,4].values,
               'Lstate': rental_r.iloc[:,2].values,
               'CityName': rental_r.iloc[:,3].values,
               'R_Annual': rental_r.iloc[:,5].values,
               'R_PriorMonth': rental_r.iloc[:,6].values,
               'R_DiffPrevMonth': rental_r.iloc[:,7].values,
               'R_60DayDiff': rental_r.iloc[:,8].values,
               'R_60DayChange': rental_r.iloc[:,9].values,
               'R_90DayDiff': rental_r.iloc[:,10].values,
               'R_90DayChange': rental_r.iloc[:,11].values,
}
rental_df = pd.DataFrame(rental_data, columns=['R_Monthly', 'Date', 'Lstate', 'CityName', 'R_Annual', 'R_PriorMonth'\
                                               , 'R_DiffPrevMonth', 'R_60DayDiff', 'R_60DayChange', 'R_90DayDiff'\
                                               , 'R_90DayChange'])
rental_df

Unnamed: 0,R_Monthly,Date,Lstate,CityName,R_Annual,R_PriorMonth,R_DiffPrevMonth,R_60DayDiff,R_60DayChange,R_90DayDiff,R_90DayChange
0,924,0,0,10,11088,964,3,963,5,962,7
1,944,1,0,10,11328,924,20,964,5,963,7
2,937,2,0,10,11244,944,-7,924,13,964,7
3,935,3,0,10,11220,937,-2,944,-9,924,11
4,937,4,0,10,11244,935,2,937,0,944,-7
...,...,...,...,...,...,...,...,...,...,...,...
8556,1196,86,40,57,14352,1346,-150,1341,-145,1190,6
8557,1205,87,40,57,14460,1196,9,1346,-141,1341,-136
8558,1349,87,40,53,16188,1205,144,1196,153,1346,3
8559,1350,88,40,53,16200,1349,1,1205,145,1196,154


In [132]:
rental_df.value_counts()

R_Monthly  Date  Lstate  CityName  R_Annual  R_PriorMonth  R_DiffPrevMonth  R_60DayDiff  R_60DayChange  R_90DayDiff  R_90DayChange
581        6     28      105       6972      987           -406             807          -226           998          -417             1
1317       86    27      1         15804     1335          -18              1150          167           2413         -1096            1
1318       87    28      23        15816     752            566             1031          287           1004          314             1
           82    38      76        15816     1300           18              1333         -15            1325         -7               1
           79    38      100       15816     1305           13              1303          15            1289          29              1
                                                                                                                                     ..
1091       17    36      80        13092     1086    

In [133]:
rental_df.iloc[:,0].values # rental_df['R_Monthly']

array([ 924,  944,  937, ..., 1349, 1350, 1219])

In [134]:
rental_df.iloc[:,1:10].values # rental_df[ everything else ]

array([[   0,    0,   10, ...,  963,    5,  962],
       [   1,    0,   10, ...,  964,    5,  963],
       [   2,    0,   10, ...,  924,   13,  964],
       ...,
       [  87,   40,   53, ..., 1196,  153, 1346],
       [  88,   40,   53, ..., 1205,  145, 1196],
       [  88,   40,   57, ..., 1349, -130, 1205]], dtype=int64)

In [135]:
# now to try our coef. of determination classification again:

# test dataset


X =  rental_df.iloc[:,1:10].values     # rental[ everything else ]
Y = rental_df.iloc[:,0].values         # rental['R_Monthly']
test_size = 0.30
seed = 7

In [136]:
# train and fit test dataset
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(\
                                    X, Y, test_size=test_size, random_state=seed)

model = KNeighborsRegressor(n_neighbors=1)
model.fit(X_train, Y_train)

KNeighborsRegressor(n_neighbors=1)

In [146]:
# well, our error persisted, and after playing with max_depth and cv value we retained our original accuracy. We'll go 
# ahead and use our original model...

treeclf = DecisionTreeClassifier(max_depth=85, random_state=seed)
cross_val_score(treeclf, X, Y, cv=15, scoring='accuracy').mean()



0.8704640468655586

In [189]:
### Prediction Test ####


#               Date   State  30d  60d   90d  
rental_test = [['1980', '25', '3', '0', '-2']]

print(knr.predict(rental_test))

        

[1237.]


  return f(*args, **kwargs)


In [190]:
# ^^^ Should probably use a different module in the future to encode string data!! 

joblib.dump(knr,'knr_rental.mdl')

['knr_rental.mdl']

In [191]:
predict_knr = joblib.load('knr_rental.mdl')

In [192]:
### Using "rental_test" defined above, a display of how the exported model would be used with user input ###

print(predict_knr.predict(rental_test))

[1237.]


  return f(*args, **kwargs)
