In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dateutil.parser
import seaborn as sns
from IPython.display import Image

In [None]:
# Notebook-wide pandas display settings: raise the column/row truncation
# limits and render floats with three decimal places.
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.max_rows', 200),
    ('display.precision', 3),
):
    pd.set_option(option_name, option_value)

In [None]:
import pickle

# Restore the previously saved model object (used later via predict_proba).
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# point this at artifacts produced by a trusted run of this project.
with open(r"../data/pkl/logreg-alldata-final.pkl", "rb") as model_file:
    logreg = pickle.load(model_file)

In [None]:
# Prepare data

# Load cleaned dataset
df_pluto = pd.read_pickle('../data/pkl/df_pluto-cat-feat-2013-2017.pkl')

# Columns that must not reach the model's feature matrix
drop_mask = [
    'BldgClass', 'FireComp', 'IrrLotCode', 'Sanborn', 'SplitZone', 'ZoneDist1',
    'APPBBL', 'AreaSource', 'BBL', 'BoroCode', 'CondoNo', 'TaxMap',
    'new_bldg_prob', 'ZoneCodeChanged', 'LandUse', 'LandUse2016',
    'ZoneCodeBecameCondo',
    'LandUse2016_11.0', 'LandUse2016_10.0', 'LandUse2016_9.0',
    'LandUse2016_8.0', 'LandUse2016_7.0', 'LandUse2016_6.0',
    'LandUse2016_5.0', 'LandUse2016_4.0', 'LandUse2016_3.0',
    'LandUse2016_2.0', 'LandUse2016_1.0',
]

# Holdout rows: currently vacant lots (LandUse == 11) that are flagged in the
# 'PolicePrct_75.0' column (presumably a one-hot indicator for police precinct 75).
holdout_mask = (df_pluto['LandUse'] == 11) & (df_pluto['PolicePrct_75.0'] == 1)

# The holdout set has to be cut from the unfiltered frame, because 'LandUse'
# is itself listed in drop_mask and disappears in the next step.
df_pluto_holdout = df_pluto[holdout_mask]

# Remove the excluded columns and incomplete rows from the train/test set.
# Index.difference returns the remaining labels sorted, so this frame and the
# holdout frame end up with the same column order.
df_pluto = df_pluto[df_pluto.columns.difference(drop_mask)]
df_pluto = df_pluto.dropna()

# Same column filter and NaN handling for the holdout set
df_pluto_holdout = df_pluto_holdout[df_pluto_holdout.columns.difference(drop_mask)]
df_pluto_holdout = df_pluto_holdout.dropna()

# Unscaled holdout frame, later used for mapping. df_pluto_holdout is rebound
# (not mutated) below, so this reference keeps the dependent variable too.
df_pluto_holdout_ref = df_pluto_holdout

# Keep every flagged-precinct row out of the train/test data
df_pluto = df_pluto[df_pluto['PolicePrct_75.0'] != 1]

# Remove dependent variable from holdout set
df_pluto_holdout = df_pluto_holdout[df_pluto_holdout.columns.difference(['ZoneCodeWasVacant'])]

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Subset data
df_pluto_subset = df_pluto

# Target vector and feature matrix
y = df_pluto_subset['ZoneCodeWasVacant'].astype(int)
X = df_pluto_subset[df_pluto_subset.columns.difference(['ZoneCodeWasVacant'])]

# Create train/test split. Seeded so the scaler statistics (and therefore the
# holdout predictions) are the same on every Restart & Run All.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=42)

# Oversample the training data only; the test split is left untouched.
# fit_resample is the current name of the former fit_sample method.
sm = SMOTE(random_state=42)
xtrain, ytrain = sm.fit_resample(xtrain, ytrain)

# Scale train/test/holdout values with statistics learned from the training data
scaler = StandardScaler().fit(xtrain)
xtrain_s = scaler.transform(xtrain)
xtest_s = pd.DataFrame(scaler.transform(xtest))

# Build the scaled holdout from the unscaled reference frame rather than from
# df_pluto_holdout itself: this cell rebinds df_pluto_holdout, so transforming
# it in place would scale already-scaled values when the cell is re-run.
df_pluto_holdout = pd.DataFrame(scaler.transform(df_pluto_holdout_ref[X.columns]))

In [None]:
# Predict holdout data. predict_proba returns one column per class; column 0 is
# used as "stays vacant" and column 1 as "changes", as in the original mapping.
y_pred = logreg.predict_proba(df_pluto_holdout)

# Save predictions on holdout dataframe.
# Move the original row labels into a column exactly once; the guard keeps a
# re-run of this cell from stacking another index column onto the frame.
if 'VacantChanges' not in df_pluto_holdout_ref.columns:
    df_pluto_holdout_ref = df_pluto_holdout_ref.reset_index()

# Assign positionally instead of merging on the index. Merging produced integer
# columns 0/1 on the first run and '0_x'/'1_x' only on a second run, so the
# old lookups failed on a fresh kernel. A row-count mismatch now raises
# instead of being padded by an outer join.
df_pluto_holdout_ref['VacantChanges'] = y_pred[:, 1]
df_pluto_holdout_ref['VacantRemains'] = y_pred[:, 0]

df_pluto_holdout_ref.head()

In [None]:
# Persist the scored holdout frame. The context manager closes (and flushes)
# the file handle, which a bare open(...) passed to pickle.dump never did.
with open('../data/pkl/holdout_Precinct_75.pkl', 'wb') as output_file:
    pickle.dump(df_pluto_holdout_ref, output_file)