In [None]:
# LINEAR & LOGISTIC REGRESSION MODELING FOR THE REALESTATE DB (((Sales/Mortgage Market)))

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_blobs
from sklearn.datasets import make_regression

from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

import pickle 

In [3]:
# read data into a DataFrame
# NaN values filled with 24-mo average 

sales = pd.read_csv('realestate_data/Sales_FullEDA_fillNaN.csv', parse_dates = ['Date'])

In [4]:
sales.head()

Unnamed: 0,StateName,Lstate,Date,BottomTier,B_PriorMonth,B_DiffPrevMonth,B_60DayDiff,B_60DayChange,B_90DayDiff,B_90DayChange,...,M_PrincipalMonthly,M_AvgMortgage,T_PrincipalMonthly,T_AvgMortgage,B_InterestMonthly,M_InterestMonthly,T_InterestMonthly,B_AvgAnnual,M_AvgAnnual,T_AvgAnnual
0,Alaska,AK,1996-01-01,67187,69615.0,278.041667,69491.58333,518.592014,69376.85764,766.316479,...,352,1093.585278,518,1608.624444,392.554722,741.585278,1090.624444,6942.656666,13123.02333,19303.49333
1,Alaska,AK,1996-02-01,67407,67187.0,220.0,69615.0,603.208333,69491.58333,854.303819,...,353,1096.866111,520,1613.283056,393.449167,743.866111,1093.283056,6965.39,13162.39333,19359.39667
2,Alaska,AK,1996-03-01,67532,67407.0,125.0,67187.0,345.0,69615.0,981.291667,...,354,1097.778889,520,1614.040833,394.525555,743.778889,1094.040833,6978.306666,13173.34667,19368.49
3,Alaska,AK,1996-04-01,67910,67532.0,378.0,67407.0,503.0,67187.0,723.0,...,355,1102.273889,522,1619.767222,396.780555,747.273889,1097.767222,7017.366666,13227.28667,19437.20667
4,Alaska,AK,1996-05-01,68258,67910.0,348.0,67532.0,726.0,67407.0,851.0,...,356,1105.003611,523,1622.238611,398.777222,749.003611,1099.238611,7053.326666,13260.04333,19466.86333


In [6]:
# Convert string and datetype data into numeric values so that they can be used in our algorithms

le = preprocessing.LabelEncoder()

# convert dates to numerical label Identifiers
sales['Date'] = le.fit_transform(np.array(sales['Date']))

# convert Lstate to numerical label Identifiers
sales['Lstate'] = le.fit_transform(np.array(sales['Lstate']))

# convert StateName to numerical label Identifiers
sales['StateName'] = le.fit_transform(np.array(sales['StateName']))

# round long float values to int
sales['BottomTier'] = np.rint(np.array(sales.BottomTier)).astype(int)
sales['B_PriorMonth'] = np.rint(np.array(sales.B_PriorMonth)).astype(int)
sales['B_DiffPrevMonth'] = np.rint(np.array(sales.B_DiffPrevMonth)).astype(int)
sales['B_60DayDiff'] = np.rint(np.array(sales.B_60DayDiff)).astype(int)
sales['B_60DayChange'] = np.rint(np.array(sales.B_60DayChange)).astype(int)
sales['B_90DayDiff'] = np.rint(np.array(sales.B_90DayDiff)).astype(int)
sales['B_90DayChange'] = np.rint(np.array(sales.B_90DayChange)).astype(int)
sales['B_PrincipalMonthly'] = np.rint(np.array(sales.B_PrincipalMonthly)).astype(int)
sales['B_AvgMortgage'] = np.rint(np.array(sales.B_AvgMortgage)).astype(int)
sales['B_InterestMonthly'] = np.rint(np.array(sales.B_InterestMonthly)).astype(int)
sales['B_AvgAnnual'] = np.rint(np.array(sales.B_90DayChange)).astype(int)

sales['MiddleTier'] = np.rint(np.array(sales.MiddleTier)).astype(int)
sales['M_PriorMonth'] = np.rint(np.array(sales.M_PriorMonth)).astype(int)
sales['M_DiffPrevMonth'] = np.rint(np.array(sales.M_DiffPrevMonth)).astype(int)
sales['M_60DayDiff'] = np.rint(np.array(sales.M_60DayDiff)).astype(int)
sales['M_60DayChange'] = np.rint(np.array(sales.M_60DayChange)).astype(int)
sales['M_90DayDiff'] = np.rint(np.array(sales.M_90DayDiff)).astype(int)
sales['M_90DayChange'] = np.rint(np.array(sales.M_90DayChange)).astype(int)
sales['M_PrincipalMonthly'] = np.rint(np.array(sales.M_PrincipalMonthly)).astype(int)
sales['M_AvgMortgage'] = np.rint(np.array(sales.M_AvgMortgage)).astype(int)
sales['M_InterestMonthly'] = np.rint(np.array(sales.M_InterestMonthly)).astype(int)
sales['M_AvgAnnual'] = np.rint(np.array(sales.M_90DayChange)).astype(int)

sales['TopTier'] = np.rint(np.array(sales.TopTier)).astype(int)
sales['T_PriorMonth'] = np.rint(np.array(sales.T_PriorMonth)).astype(int)
sales['T_DiffPrevMonth'] = np.rint(np.array(sales.T_DiffPrevMonth)).astype(int)
sales['T_60DayDiff'] = np.rint(np.array(sales.T_60DayDiff)).astype(int)
sales['T_60DayChange'] = np.rint(np.array(sales.T_60DayChange)).astype(int)
sales['T_90DayDiff'] = np.rint(np.array(sales.T_90DayDiff)).astype(int)
sales['T_90DayChange'] = np.rint(np.array(sales.T_90DayChange)).astype(int)
sales['T_PrincipalMonthly'] = np.rint(np.array(sales.T_PrincipalMonthly)).astype(int)
sales['T_AvgMortgage'] = np.rint(np.array(sales.T_AvgMortgage)).astype(int)
sales['T_InterestMonthly'] = np.rint(np.array(sales.T_InterestMonthly)).astype(int)
sales['T_AvgAnnual'] = np.rint(np.array(sales.T_90DayChange)).astype(int)

In [7]:
sales.head()

Unnamed: 0,StateName,Lstate,Date,BottomTier,B_PriorMonth,B_DiffPrevMonth,B_60DayDiff,B_60DayChange,B_90DayDiff,B_90DayChange,...,M_PrincipalMonthly,M_AvgMortgage,T_PrincipalMonthly,T_AvgMortgage,B_InterestMonthly,M_InterestMonthly,T_InterestMonthly,B_AvgAnnual,M_AvgAnnual,T_AvgAnnual
0,1,0,0,67187,69615,278,69492,519,69377,766,...,352,1094,518,1609,393,742,1091,766,869,972
1,1,0,1,67407,67187,220,69615,603,69492,854,...,353,1097,520,1613,393,744,1093,854,872,889
2,1,0,2,67532,67407,125,67187,345,69615,981,...,354,1098,520,1614,395,744,1094,981,880,779
3,1,0,3,67910,67532,378,67407,503,67187,723,...,355,1102,522,1620,397,747,1098,723,1009,1294
4,1,0,4,68258,67910,348,67532,726,67407,851,...,356,1105,523,1622,399,749,1099,851,945,1040


In [8]:
sales.describe()

Unnamed: 0,StateName,Lstate,Date,BottomTier,B_PriorMonth,B_DiffPrevMonth,B_60DayDiff,B_60DayChange,B_90DayDiff,B_90DayChange,...,M_PrincipalMonthly,M_AvgMortgage,T_PrincipalMonthly,T_AvgMortgage,B_InterestMonthly,M_InterestMonthly,T_InterestMonthly,B_AvgAnnual,M_AvgAnnual,T_AvgAnnual
count,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0,...,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0,15332.0
mean,24.878033,24.918536,153.668797,113942.814832,-6889939.0,-7002879.0,-13893810.0,-14005770.0,-20897670.0,-21008670.0,...,619.892643,1923.207083,923.769958,2865.23526,665.169254,1303.314506,1941.465106,-21008670.0,-21007820.0,-21006970.0
std,14.767491,14.784158,87.513208,62962.732052,122445700.0,122439200.0,172880800.0,172871700.0,211387000.0,211376000.0,...,309.806242,960.400717,451.997445,1401.186642,367.288848,650.594661,949.189409,211376000.0,211376000.0,211376100.0
min,0.0,0.0,0.0,25285.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,-2147484000.0,...,208.0,647.0,346.0,1075.0,148.0,439.0,729.0,-2147484000.0,-2147484000.0,-2147484000.0
25%,12.0,12.0,78.0,67376.75,67053.5,71.0,66769.75,146.0,66524.75,220.0,...,401.0,1245.0,614.0,1904.0,393.0,844.0,1290.0,220.0,306.75,372.75
50%,25.0,25.0,154.0,96228.5,95852.0,269.0,95461.0,538.0,95113.0,800.0,...,532.0,1651.0,797.0,2472.0,562.0,1119.0,1675.0,800.0,1479.5,2147.5
75%,38.0,38.0,229.0,146845.75,146323.5,617.0,145879.5,1227.0,145301.0,1829.0,...,745.0,2312.0,1077.0,3342.0,857.0,1567.0,2264.0,1829.0,3285.0,4702.25
max,50.0,50.0,304.0,422637.0,417797.0,9376.0,414775.0,16541.0,412055.0,22655.0,...,2255.0,6993.0,3387.0,10500.0,2466.0,4738.0,7113.0,22655.0,44780.0,67189.0


In [9]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15332 entries, 0 to 15331
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   StateName           15332 non-null  int32
 1   Lstate              15332 non-null  int64
 2   Date                15332 non-null  int64
 3   BottomTier          15332 non-null  int32
 4   B_PriorMonth        15332 non-null  int32
 5   B_DiffPrevMonth     15332 non-null  int32
 6   B_60DayDiff         15332 non-null  int32
 7   B_60DayChange       15332 non-null  int32
 8   B_90DayDiff         15332 non-null  int32
 9   B_90DayChange       15332 non-null  int32
 10  MiddleTier          15332 non-null  int32
 11  M_PriorMonth        15332 non-null  int32
 12  M_DiffPrevMonth     15332 non-null  int32
 13  M_60DayDiff         15332 non-null  int32
 14  M_60DayChange       15332 non-null  int32
 15  M_90DayDiff         15332 non-null  int32
 16  M_90DayChange       15332 non-null  int3

In [10]:
# generating a 2d classification dataset 
X,y = make_blobs(n_samples=100, centers=5, n_features=10, random_state=1)

In [11]:
# fit final model
sales_LogR = LogisticRegression()
sales_LogR.fit(X,y)

LogisticRegression()

In [12]:
# Multi-class prediction
# new instance where we do not know the answer (test-set with output witheld)

Xnew, _ = make_blobs(n_samples=10, centers=5, n_features=10, random_state=1)
ynew = sales_LogR.predict_proba(Xnew)

for i in range(len(Xnew)):
    print("X=%s, Predicted=%s" % (Xnew[i], ynew[i]))

X=[ -7.62957168  -0.98426898   8.06287876   0.83268813   4.57809873
  -4.64338798   3.46380005   6.72512798 -11.00735177   5.31804569], Predicted=[1.87688096e-04 2.17712527e-04 1.40837732e-04 9.99412654e-01
 4.11079837e-05]
X=[ 6.17492844 10.24140044 -3.41588149  1.8242511   7.22157903  8.72010791
 -8.06902104 -8.45689316 -6.82571975  7.362092  ], Predicted=[2.69650013e-04 1.63072953e-04 9.99271173e-01 1.79837288e-04
 1.16266277e-04]
X=[-1.10317989  3.40629717 -5.42243686  7.48677701 -8.32061875  4.92916702
  0.53167145 -0.22269977 -8.63637503 -6.54243608], Predicted=[2.35318007e-04 9.96280932e-01 3.61778124e-04 4.38728311e-04
 2.68324309e-03]
X=[-2.75862791  3.35504729 -6.11984924  8.14897192 -8.61326472  4.34045228
 -1.36831663  2.05893773 -7.94665917 -4.78510206], Predicted=[2.02240451e-04 9.99066654e-01 1.50013480e-04 2.74702388e-04
 3.06389594e-04]
X=[ 6.20145276  9.77528316 -3.53321672  3.96546096  6.85712076  8.26969706
 -8.1772945  -8.08942043 -5.40447373  7.74800649], Predicte

In [13]:
# generating a Regression dataset

X, y = make_regression(n_samples=100, n_features=10, noise=0.1, random_state=1)

sales_LinR = LinearRegression()
sales_LinR.fit(X,y)

LinearRegression()

In [15]:
# predictions on multiple regressions

Xnew, _ = make_regression(n_samples=10, n_features=10, noise=0.1, random_state=1)
ynew = sales_LinR.predict(Xnew)

for i in range(len(Xnew)):
    print("X=%s, Predicted=%s" % (Xnew[i], ynew[i]))

X=[ 1.13376944 -0.3224172  -0.17242821 -1.09989127 -2.06014071 -0.87785842
  0.58281521  0.04221375 -0.38405435  1.46210794], Predicted=-44.80963489195843
X=[ 0.19829972  0.18656139 -0.67066229  0.11900865 -0.20075807  0.37756379
  1.12948391  0.12182127  0.41005165 -0.22232814], Predicted=13.846487740869263
X=[ 0.86540763 -0.52817175  1.74481176 -2.3015387  -0.61175641 -0.7612069
 -0.24937038  0.3190391  -1.07296862  1.62434536], Predicted=-22.77391294469603
X=[ 0.90085595  0.90159072 -0.12289023 -0.68372786  1.14472371 -0.93576943
  0.53035547 -0.26788808  0.50249434 -1.10061918], Predicted=-81.38985413339785
X=[ 0.42349435 -0.37528495 -0.34385368  0.07734007  0.18515642  0.04359686
  0.69803203 -0.62000084 -0.63873041  1.19891788], Predicted=28.125886856564634
X=[ 0.05080775 -0.74715829  0.19091548 -0.63699565 -0.88762896  2.10025514
  0.61720311  0.12015895  1.6924546  -0.19183555], Predicted=2.1637766939960166
X=[ 0.48851815  0.51292982  1.13162939 -0.07557171  1.25286816  1.51981

In [16]:
# Binary class prediction

Xnew = [[1.6924546, 0.23009474, -0.84520564, 0.53035547, 0.83898341, 0.82797464,\
         -1.09989127, 1.14472371, -1.11731035, 0.76201118]]
ynew = sales_LinR.predict(Xnew)

for i in range(len(Xnew)):
    print("X=%s, Predicted=%s" % (Xnew[i], ynew[i]))

X=[1.6924546, 0.23009474, -0.84520564, 0.53035547, 0.83898341, 0.82797464, -1.09989127, 1.14472371, -1.11731035, 0.76201118], Predicted=241.74695800352603


In [17]:
# alternative classification dataset
sales_array = sales.values
X = sales_array[:,1:35] # sales[ everything else ]
Y = sales_array[:,0] # sales['StateName']
test_size = 0.25
#seed = 5

In [18]:
X

array([[     0,      0,  67187, ...,   1091,    766,    869],
       [     0,      1,  67407, ...,   1093,    854,    872],
       [     0,      2,  67532, ...,   1094,    981,    880],
       ...,
       [    50,    302, 176880, ...,   2547,   2678,   4191],
       [    50,    303, 177881, ...,   2559,   2806,   4380],
       [    50,    304, 178897, ...,   2571,   2935,   4441]], dtype=int64)

In [19]:
Y

array([ 1,  1,  1, ..., 50, 50, 50], dtype=int64)

In [22]:
# train and fit alternative dataset
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(\
                                    X, Y, test_size=test_size, random_state=10)

model = LogisticRegression()
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [24]:
# Multi-class prediction for alternative dataset
# new instance where we do not know the answer (test-set with output witheld)

Xnew, _ = make_blobs(n_samples=10, centers=5, n_features=34, random_state=1)
ynew = model.predict_proba(Xnew)
ynew2 = model.predict(Xnew)

for i in range(len(Xnew)):
    print("X=%s, predict_proba=%s" % (Xnew[i], ynew[i]))

X=[  5.77551858  -2.50211044   7.93426607   3.19501582  -9.16850153
   5.80621021   4.40258496  10.27073125  -6.531327    -7.72595882
   9.50519048   3.52333392  -6.84527892   5.67364391   7.21535184
   7.67495671   2.47456953  -6.79979116  -8.74969326  -9.44042017
 -10.97266349  -5.52367383   7.81854451   0.59244496   0.94045439
   6.66515888  -8.45044835  -4.94934675   0.28863001  11.15987491
   0.74523151  -9.14944403   4.99076751  -4.54598628], predict_proba=[0.0196078  0.019608   0.01960798 0.01960794 0.01960773 0.01960782
 0.01960777 0.01960782 0.01960774 0.01960781 0.01960779 0.01960769
 0.01960783 0.01960778 0.01960782 0.01960795 0.01960779 0.01960781
 0.01960782 0.0196078  0.0196078  0.01960779 0.01960781 0.0196078
 0.01960783 0.01960798 0.01960782 0.01960783 0.01960799 0.01960782
 0.01960778 0.01960798 0.01960771 0.01960797 0.01960783 0.01960802
 0.01960798 0.01960783 0.01960779 0.01960782 0.01960781 0.01960781
 0.01960781 0.0196078  0.01960793 0.01960793 0.01960788 0.0196077

In [25]:
for i in range(len(Xnew)):
    print("X=%s, predicted=%s" % (Xnew[i], ynew2[i]))

X=[  5.77551858  -2.50211044   7.93426607   3.19501582  -9.16850153
   5.80621021   4.40258496  10.27073125  -6.531327    -7.72595882
   9.50519048   3.52333392  -6.84527892   5.67364391   7.21535184
   7.67495671   2.47456953  -6.79979116  -8.74969326  -9.44042017
 -10.97266349  -5.52367383   7.81854451   0.59244496   0.94045439
   6.66515888  -8.45044835  -4.94934675   0.28863001  11.15987491
   0.74523151  -9.14944403   4.99076751  -4.54598628], predicted=35
X=[  8.99811784   0.20542049   8.8630644   -7.62494424  -7.08042486
   7.34988064  -1.76171515  -6.43044861   8.82667091  -3.77795441
   5.85224678   6.06331882   8.42492748   3.35835296   4.14156716
  -3.88982038  -6.04231819   9.14997743  -1.69235607  10.69664489
   2.48691827   1.99640542  -7.60965545   9.91123524  -0.94100713
   1.77891704  -1.82073638  -5.08227267   6.95112039   1.55451683
 -10.12917245   2.28607379  -2.97476541  -0.1395161 ], predicted=6
X=[  3.29176811  -3.88852524   3.02962005   6.48911899  -9.39156501
 

In [26]:
sales.corr()

Unnamed: 0,StateName,Lstate,Date,BottomTier,B_PriorMonth,B_DiffPrevMonth,B_60DayDiff,B_60DayChange,B_90DayDiff,B_90DayChange,...,M_PrincipalMonthly,M_AvgMortgage,T_PrincipalMonthly,T_AvgMortgage,B_InterestMonthly,M_InterestMonthly,T_InterestMonthly,B_AvgAnnual,M_AvgAnnual,T_AvgAnnual
StateName,1.0,0.994211,0.011237,-0.115691,-0.002391,-0.002332,-0.003345,-0.003303,-0.004086,-0.004052,...,-0.149105,-0.149095,-0.159621,-0.159622,-0.115686,-0.14909,-0.159623,-0.004052,-0.004053,-0.004053
Lstate,0.994211,1.0,0.008144,-0.115112,-0.002309,-0.00225,-0.003228,-0.003187,-0.003943,-0.00391,...,-0.148823,-0.148813,-0.159461,-0.159461,-0.115108,-0.148809,-0.159461,-0.00391,-0.00391,-0.003911
Date,0.011237,0.008144,1.0,0.40707,0.097734,0.097529,0.137833,0.13769,0.168459,0.168345,...,0.399503,0.399503,0.390141,0.390149,0.407065,0.399502,0.390153,0.168345,0.168345,0.168346
BottomTier,-0.115691,-0.115112,0.40707,1.0,0.044061,0.043552,0.062044,0.061688,0.075954,0.075666,...,0.969881,0.969881,0.942606,0.94261,0.999999,0.969881,0.942611,0.075666,0.075668,0.07567
B_PriorMonth,-0.002391,-0.002309,0.097734,0.044061,1.0,1.0,0.705965,0.705965,0.575484,0.575484,...,0.045357,0.045355,0.045123,0.045131,0.044063,0.045354,0.045135,0.575484,0.575484,0.575484
B_DiffPrevMonth,-0.002332,-0.00225,0.097529,0.043552,1.0,1.0,0.705949,0.705949,0.575458,0.575458,...,0.044864,0.044862,0.044644,0.044652,0.043555,0.044861,0.044656,0.575458,0.575458,0.575458
B_60DayDiff,-0.003345,-0.003228,0.137833,0.062044,0.705965,0.705949,1.0,1.0,0.815164,0.815164,...,0.063894,0.063902,0.063591,0.063594,0.062053,0.063906,0.063596,0.815164,0.815164,0.815164
B_60DayChange,-0.003303,-0.003187,0.13769,0.061688,0.705965,0.705949,1.0,1.0,0.815155,0.815156,...,0.063549,0.063557,0.063256,0.063259,0.061697,0.063561,0.06326,0.815156,0.815156,0.815156
B_90DayDiff,-0.004086,-0.003943,0.168459,0.075954,0.575484,0.575458,0.815164,0.815155,1.0,1.0,...,0.07825,0.078251,0.077871,0.077881,0.075963,0.078251,0.077885,1.0,1.0,1.0
B_90DayChange,-0.004052,-0.00391,0.168345,0.075666,0.575484,0.575458,0.815164,0.815156,1.0,1.0,...,0.077971,0.077972,0.0776,0.077609,0.075675,0.077972,0.077614,1.0,1.0,1.0


In [28]:
# correlation matrix heatmap 
#sns.heatmap(sales.corr(), annot=True)

In [None]:
###### Decision Tree Classifier ######

In [30]:
### Bottom Tier ### lets try one of three price tiers first
# define X and y

feature_cols = ['Lstate', 'Date', 'BottomTier', 'B_PriorMonth','B_DiffPrevMonth','B_60DayDiff',\
                'B_60DayChange', 'B_90DayDiff', 'B_90DayChange', 'B_PrincipalMonthly', 'B_AvgMortgage',\
               'B_InterestMonthly', 'B_AvgAnnual']

X = sales[feature_cols]

y = sales.StateName

In [31]:
treeclf = DecisionTreeClassifier(max_depth=30, random_state=1)
cross_val_score(treeclf, X, y, cv=50, scoring='accuracy').mean()

0.8208673436801432

In [34]:
### All Three Tiers ###
# define X and y

feature_cols = ['Lstate', 'Date', 'BottomTier', 'B_PriorMonth','B_DiffPrevMonth','B_60DayDiff'\
                , 'B_60DayChange', 'B_90DayDiff', 'B_90DayChange', 'B_PrincipalMonthly', 'B_AvgMortgage'\
               , 'B_InterestMonthly', 'B_AvgAnnual', 'MiddleTier', 'M_PriorMonth','M_DiffPrevMonth'\
               , 'M_60DayDiff','M_60DayChange', 'M_90DayDiff', 'M_90DayChange', 'M_PrincipalMonthly'\
               , 'M_AvgMortgage', 'M_InterestMonthly', 'M_AvgAnnual', 'TopTier', 'T_PriorMonth'\
                , 'T_DiffPrevMonth','T_60DayDiff', 'T_60DayChange', 'T_90DayDiff', 'T_90DayChange'\
                , 'T_PrincipalMonthly', 'T_AvgMortgage', 'T_InterestMonthly', 'T_AvgAnnual']


X = sales[feature_cols]

y = sales.StateName

In [35]:
treeclf = DecisionTreeClassifier(max_depth=30, random_state=1)
cross_val_score(treeclf, X, y, cv=50, scoring='accuracy').mean()

0.8208673436801432