In [353]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib as mpl
import matplotlib.cm as cm

In [354]:
assessor_dir = 'datasets/assessorSequential.csv'
df = pd.read_csv(assessor_dir)

In [355]:
df = df.drop(columns=['Unnamed: 0','X11th.Draw','PIN','Township Code','Neighborhood Code','Age','Longitude','Latitude','ZIP'])

In [356]:
draw_col = df.columns[2:12]
draw_col

Index(['X1st.Draw', 'X2nd.Draw', 'X3rd.Draw', 'X4th.Draw', 'X5th.Draw',
       'X6th.Draw', 'X7th.Draw', 'X8th.Draw', 'X9th.Draw', 'X10th.Draw'],
      dtype='object')

In [357]:
df['avg_draw'] = df[draw_col].mean(axis=1)
df = df.drop(columns=draw_col)

In [358]:
df_full = df.copy()

In [359]:
df_reduced = df.copy()
df_full.head(5)

Unnamed: 0,Date.Sampled,Address,Sale Price,Tract Median Income,avg_draw
0,9/4/2019,100XX S BELL AVE,280000.0,122727.0,2.433
1,7/16/2016,100XX S CALHOUN AVE,,44423.0,9.866
2,12/17/2019,100XX S CALUMET AVE,,40612.0,10.399
3,12/14/2019,100XX S CARPENTER ST,,37207.0,8.663
4,7/14/2021,100XX S CARPENTER ST,,37207.0,9.28


In [360]:
df_full[~df_full.isin(['NaN', 'NaT']).any(axis=1)]
df_full.shape
df_full.dropna(inplace=True)
df_full.shape

(149, 5)

In [361]:
#assign 0 to <7.5 and 1 to >=7.5
df_full['avg_lead_target'] = np.where(df_full['avg_draw']<6, 0, 1)

In [362]:
df_full.head(10)

Unnamed: 0,Date.Sampled,Address,Sale Price,Tract Median Income,avg_draw,avg_lead_target
0,9/4/2019,100XX S BELL AVE,280000.0,122727.0,2.433,0
17,12/20/2016,102XX S ARTESIAN AVE,219000.0,98281.0,5.192,0
20,9/17/2019,102XX S OGLESBY AVE,147400.0,30069.0,27.25,1
21,9/28/2016,103XX S HAMILTON AVE,330000.0,110344.0,6.315,1
36,10/10/2019,105XX S CENTRAL PARK AVE,280000.0,100361.0,10.505,1
42,11/4/2021,105XX S KEDZIE AVE,290000.0,91924.0,14.775,1
44,8/30/2021,105XX S SEELEY AVE,464000.0,110344.0,5.901,0
49,11/30/2021,106XX S EBERHART AVE,86000.0,45273.0,15.01,1
65,12/6/2021,108XX S EGGLESTON AVE,146000.0,41167.0,2.018,0
67,3/2/2022,108XX S HAMLIN AVE,273000.0,118640.0,4.998,0


In [363]:
df_full['avg_lead_target'].value_counts()

1    105
0     44
Name: avg_lead_target, dtype: int64

In [364]:
df_full['sale_price'] = df_full['Sale Price']
df_full['tract_income'] = df_full['Tract Median Income']
df_full = df_full.drop(columns=['Sale Price','Tract Median Income'])

In [365]:
class_0 = df_full[df_full['avg_lead_target']==0]
print(class_0.shape)
class_1 = df_full[df_full['avg_lead_target']==1]

class_1_under = class_1.sample(class_0.shape[0])

df_balanced = pd.concat([class_0, class_1_under], axis=0)

(44, 6)


In [366]:
df_balanced['avg_lead_target'].value_counts()

0    44
1    44
Name: avg_lead_target, dtype: int64

In [367]:
df_balanced

Unnamed: 0,Date.Sampled,Address,avg_draw,avg_lead_target,sale_price,tract_income
0,9/4/2019,100XX S BELL AVE,2.433,0,280000.0,122727.0
17,12/20/2016,102XX S ARTESIAN AVE,5.192,0,219000.0,98281.0
44,8/30/2021,105XX S SEELEY AVE,5.901,0,464000.0,110344.0
65,12/6/2021,108XX S EGGLESTON AVE,2.018,0,146000.0,41167.0
67,3/2/2022,108XX S HAMLIN AVE,4.998,0,273000.0,118640.0
...,...,...,...,...,...,...
246,11/16/2020,15XX E 86TH ST,13.020,1,239900.0,42766.0
343,8/27/2016,18XX W WELLINGTON AVE,10.851,1,1200000.0,178750.0
1488,3/18/2022,72XX W BRYN MAWR AVE,8.810,1,328000.0,83197.0
1680,12/10/2019,89XX S JEFFERY BLVD,8.224,1,200000.0,50426.0


In [368]:
log_reg = smf.logit(formula='avg_lead_target ~ tract_income + sale_price', data=df_balanced).fit()

Optimization terminated successfully.
         Current function value: 0.680425
         Iterations 4


In [369]:
print(log_reg.summary())

                           Logit Regression Results                           
Dep. Variable:        avg_lead_target   No. Observations:                   88
Model:                          Logit   Df Residuals:                       85
Method:                           MLE   Df Model:                            2
Date:                Tue, 25 Apr 2023   Pseudo R-squ.:                 0.01835
Time:                        01:14:48   Log-Likelihood:                -59.877
converged:                       True   LL-Null:                       -60.997
Covariance Type:            nonrobust   LLR p-value:                    0.3264
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -0.1459      0.567     -0.258      0.797      -1.257       0.965
tract_income -6.472e-06   9.23e-06     -0.701      0.483   -2.46e-05    1.16e-05
sale_price    1.606e-06   1.11e-06      1.45

In [370]:
log_reg_reduced = smf.logit(formula='avg_lead_target ~ sale_price', data=df_full).fit()

Optimization terminated successfully.
         Current function value: 0.601887
         Iterations 5


In [371]:
log_reg_reduced.summary()

0,1,2,3
Dep. Variable:,avg_lead_target,No. Observations:,149.0
Model:,Logit,Df Residuals:,147.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 25 Apr 2023",Pseudo R-squ.:,0.008146
Time:,01:14:48,Log-Likelihood:,-89.681
converged:,True,LL-Null:,-90.418
Covariance Type:,nonrobust,LLR p-value:,0.2248

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.5693,0.308,1.848,0.065,-0.034,1.173
sale_price,8.166e-07,7.07e-07,1.155,0.248,-5.69e-07,2.2e-06


In [372]:
df_reduced = df_reduced[['Tract Median Income','avg_draw']]
df_reduced['avg_lead_target'] = np.where(df_reduced['avg_draw']<9, 0, 1)
df_reduced['tract_income'] = df_reduced['Tract Median Income']
df_reduced = df_reduced.drop(columns=['Tract Median Income'])

In [373]:
class_0 = df_reduced[df_reduced['avg_lead_target']==0]
print(class_0.shape)
class_1 = df_reduced[df_reduced['avg_lead_target']==1]

class_1_under = class_1.sample(class_0.shape[0])

df_balanced1 = pd.concat([class_0, class_1_under], axis=0)

(816, 3)


In [374]:
df_balanced1['avg_lead_target'].value_counts()

0    816
1    816
Name: avg_lead_target, dtype: int64

In [375]:
log_reg1 = smf.logit(formula='avg_lead_target ~ tract_income', data=df_balanced1).fit()

Optimization terminated successfully.
         Current function value: 0.692325
         Iterations 3


In [376]:
log_reg1.summary()

0,1,2,3
Dep. Variable:,avg_lead_target,No. Observations:,1104.0
Model:,Logit,Df Residuals:,1102.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 25 Apr 2023",Pseudo R-squ.:,0.0008461
Time:,01:14:49,Log-Likelihood:,-764.33
converged:,True,LL-Null:,-764.97
Covariance Type:,nonrobust,LLR p-value:,0.2552

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.1103,0.148,0.745,0.456,-0.180,0.401
tract_income,-2.324e-06,2.05e-06,-1.136,0.256,-6.33e-06,1.68e-06
