Inspired by and based on https://github.com/rayidghani/magicloops and https://github.com/dssg/MLforPublicPolicy/blob/master/labs/2019/lab6_feature_generation_sol.ipynb

In [352]:
import pipeline_evictions as pipeline
import ml_loop_evictions as loop

import importlib
import datetime
import pandas as pd
import numpy as np

# Read data

In [353]:
# Re-import the pipeline module so that edits made to it on disk are picked up
# without restarting the kernel
importlib.reload(pipeline)

# Location of the input CSV, relative to the notebook's working directory
datafile = "data/tracts.csv"

#Read data, parsing year column to date type
data = pd.read_csv(datafile, parse_dates=['year'])

# Create outcome label

In [354]:
def get_eviction_rate(df, year,geoid):
  """Return the 'eviction-rate' value of one GEOID in one year.

  Parameters:
    df: DataFrame with 'year', 'GEOID' and 'eviction-rate' columns.
    year: value compared for equality against the 'year' column.
    geoid: value compared for equality against the 'GEOID' column.

  Returns:
    The 'eviction-rate' of the first row matching both year and geoid.

  Raises:
    IndexError: if no row matches both conditions.
  """
  same_year = df['year'] == year
  same_geoid = df['GEOID'] == geoid

  matching_rates = df.loc[same_year & same_geoid, 'eviction-rate']

  # Only the first match is used, even if several rows qualify
  return matching_rates.iloc[0]

In [355]:
importlib.reload(pipeline)

from dateutil.relativedelta import relativedelta

#Obtain eviction-rate cutoff for the top 10%, for each year
cutoff_10_percent = {}
for year in range(2000, 2017):
    year = pd.Timestamp(year, 1, 1)
    cutoff_10_percent[year] = data.loc[data['year'] == year, 'eviction-rate'].quantile(.9)

#Index the eviction rates by (GEOID, year) once, so that every look-ahead below is a
#dictionary access instead of a boolean scan over the whole DataFrame.
#setdefault keeps the first row seen for a (GEOID, year) pair, i.e. the same row that
#filtering the frame and taking .iloc[0] would select.
eviction_rate_by_geoid_and_year = {}
for geoid, year, eviction_rate in zip(data['GEOID'], data['year'], data['eviction-rate']):
    eviction_rate_by_geoid_and_year.setdefault((geoid, year), eviction_rate)

#Because the outcome will come from eviction-rate in next 3 years and we have data till 2016,
#features data must be from 2013 or before. Rows after 2013 keep the default value of 0.
last_year_with_full_outcome = pd.Timestamp(2013, 1, 1)

top_10_eviction_rate_in_any_next_3_years_column = np.zeros(len(data))

#enumerate yields the row *position*, which is what indexing the numpy array requires;
#the DataFrame index label only coincides with the position for a default RangeIndex.
for position, (geoid, year) in enumerate(zip(data['GEOID'], data['year'])):

    if year <= last_year_with_full_outcome:

        #Check the eviction rate of each of the next 3 years
        for i in range(1, 4):
            date_in_i_years = year + relativedelta(years=i)

            #Raises KeyError if there is no row for this GEOID in that year
            eviction_rate_in_i_years = eviction_rate_by_geoid_and_year[(geoid, date_in_i_years)]

            #A missing (NaN) eviction rate compares False, so it never counts as top 10%
            if eviction_rate_in_i_years >= cutoff_10_percent[date_in_i_years]:
                #If we found one year that meets requirement, we are done with looping
                top_10_eviction_rate_in_any_next_3_years_column[position] = 1
                break

data['top_10_percent_in_any_next_3_years'] = top_10_eviction_rate_in_any_next_3_years_column

label ='top_10_percent_in_any_next_3_years'

# Create temporal train and test sets

In [356]:
importlib.reload(pipeline)

#Create sets of train and test data, based on different split thresholds.
#Each split threshold corresponds to the starting date of the testing data.

#Splits according to https://docs.google.com/spreadsheets/d/1ipqsgThz7hdXXyyNpTuqa4J1inc088lop7lhFsAQ_r0/edit#gid=0
split_thresholds = [pd.Timestamp(start_year, 1, 1) for start_year in range(2004, 2014)]

#Column used for splitting training and test data
date_column = 'year'

#Amount of data used for test set
test_window = relativedelta(years=4)

#Gap needed between training and test set
gap_training_test = relativedelta(years=3)

#Generate train and test sets
train_test_sets = pipeline.create_temp_validation_train_and_testing_sets(
    data, date_column, label, split_thresholds, test_window, gap_training_test
)

# Process data

In [357]:
importlib.reload(pipeline)

#Impute data on continuous columns for each training and test set

#--->PENDING
#In the meantime, imputing all float columns with mean

#Do not consider GEOID column: it is excluded by name, because dropping the first
#float column by position would discard the wrong column whenever GEOID is not
#the first float column (or is not stored as float at all).
float_columns = [
    column for column in data.columns
    if data[column].dtype == 'float' and column != 'GEOID'
]

for train_test_set in train_test_sets:
    train_data = train_test_set['x_train']
    test_data = train_test_set['x_test']

    #fill na values with mean
    #NOTE(review): each frame is passed on its own, so the test set is presumably
    #imputed with its own mean rather than the training mean; confirm against
    #pipeline.fill_na_columns_with_mean
    pipeline.fill_na_columns_with_mean(train_data, float_columns)
    pipeline.fill_na_columns_with_mean(test_data, float_columns)


# Create features

In [358]:
import feature_generation as fg

importlib.reload(pipeline)
importlib.reload(fg)

#Features have to be generated independently for each different train/test set
for train_test_set in train_test_sets:

    #Build both feature tables (train first, then test) before touching the set
    train_features, test_features = (
        fg.create_features(train_test_set[split_name])
        for split_name in ('x_train', 'x_test')
    )

    #Replace raw data in train_test_set with features generated
    train_test_set['x_train'] = train_features
    train_test_set['x_test'] = test_features

In [359]:
import feature_generation as fg

importlib.reload(pipeline)
importlib.reload(fg)

#Write the training data of the last train/test set to disk for manual inspection
a = train_test_sets[-1]['x_train']
a.to_csv("hi.csv", header=True)

In [360]:
#Print the name of every column of `a` that still holds at least one null value
for column in a:
  has_nulls = a[column].isna().to_numpy().any()
  if has_nulls:
    print(column)

In [361]:
 #OLD VERSION: 
#   



  

#train_test_sets
# len(train_test_sets[0]['x_train'].columns)

# for column in train_test_sets[0]['x_train'].columns:
#   print(column)

# # train_test_sets[0]['x_train'].drop(columns=['GEOID', 'name'], inplace=True)
  
# train_test_sets[0]['x_train']
# train_test_sets[0]['x_test'].head()

# for column in train_test_sets[0]['x_test'].columns:
#   print(column)  

# Build classifiers and generate parameter grids

In [362]:
importlib.reload(pipeline)

#Specific models we want to run ('SVM' is currently left out)
models_to_run = [
    'DT', 'LR', 'RF', 'ET', 'KNN',
    'NB', 'BA', 'AB', 'GB',
]

#Get all possible models and their different sets of parameters
models, parameters_grid = pipeline.get_models_and_parameters()

# Loop over models and different training/test sets

In [None]:
# Re-import both project modules so that edits made to them on disk are picked up
importlib.reload(pipeline)
importlib.reload(loop)

import warnings

# Silences every warning for the rest of the kernel session, not just for this cell
warnings.filterwarnings('ignore')

# Run the selected models over the train/test sets. The result is later grouped by
# 'test_set_start_date' and indexed by metric name, so it is presumably a DataFrame
# with one row per model/parameters/train-test-set combination; confirm against ml_loop_evictions
results = loop.iterate_over_models_and_training_test_sets(models_to_run, models, parameters_grid, train_test_sets)
# Last expression of the cell: displays the results
results

2019-06-06 23:56:54.870567: Running DT with params: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2} on train/test set 2004-01-01 00:00:00
2019-06-06 23:56:55.116467: Running DT with params: {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 5} on train/test set 2004-01-01 00:00:00
2019-06-06 23:56:55.355735: Running DT with params: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2} on train/test set 2004-01-01 00:00:00
2019-06-06 23:56:55.598672: Running DT with params: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 5} on train/test set 2004-01-01 00:00:00
2019-06-06 23:56:55.860621: Running DT with params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2} on train/test set 2004-01-01 00:00:00
2019-06-06 23:56:56.103249: Running DT with params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5} on train/test set 2004-01-01 00:00:00
2019-06-06 23:56:56.343158: Running DT with params: {'criterion': 'gini', 'max_depth':

2019-06-06 23:57:37.254356: Running ET with params: {'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 100} on train/test set 2004-01-01 00:00:00
2019-06-06 23:57:37.701850: Running ET with params: {'criterion': 'entropy', 'max_depth': 5, 'n_estimators': 1000} on train/test set 2004-01-01 00:00:00
2019-06-06 23:57:39.003032: Running ET with params: {'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 100} on train/test set 2004-01-01 00:00:00
2019-06-06 23:57:39.487438: Running ET with params: {'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 1000} on train/test set 2004-01-01 00:00:00
2019-06-06 23:57:41.413598: Running KNN with params: {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'uniform'} on train/test set 2004-01-01 00:00:00
2019-06-06 23:57:41.699603: Running KNN with params: {'algorithm': 'auto', 'n_neighbors': 3, 'weights': 'distance'} on train/test set 2004-01-01 00:00:00
2019-06-06 23:57:42.150396: Running KNN with params: {'algorithm': 'auto', 'n_nei

2019-06-06 23:58:00.869563: Running DT with params: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 2} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:01.163797: Running DT with params: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:01.454425: Running DT with params: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 2} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:01.743415: Running DT with params: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_split': 5} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:02.044799: Running DT with params: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 2} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:02.343520: Running DT with params: {'criterion': 'entropy', 'max_depth': 100, 'min_samples_split': 5} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:02.632590: Running LR with params: {'C': 0.00

2019-06-06 23:58:46.499510: Running KNN with params: {'algorithm': 'ball_tree', 'n_neighbors': 3, 'weights': 'uniform'} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:46.940516: Running KNN with params: {'algorithm': 'ball_tree', 'n_neighbors': 3, 'weights': 'distance'} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:47.383914: Running KNN with params: {'algorithm': 'ball_tree', 'n_neighbors': 5, 'weights': 'uniform'} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:47.829295: Running KNN with params: {'algorithm': 'ball_tree', 'n_neighbors': 5, 'weights': 'distance'} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:48.290447: Running KNN with params: {'algorithm': 'ball_tree', 'n_neighbors': 10, 'weights': 'uniform'} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:48.721508: Running KNN with params: {'algorithm': 'ball_tree', 'n_neighbors': 10, 'weights': 'distance'} on train/test set 2005-01-01 00:00:00
2019-06-06 23:58:49.161300: Running KNN with pa

2019-06-06 23:59:40.504957: Running RF with params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 10, 'n_jobs': -1} on train/test set 2006-01-01 00:00:00
2019-06-06 23:59:40.955914: Running RF with params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 100, 'n_jobs': -1} on train/test set 2006-01-01 00:00:00
2019-06-06 23:59:41.696192: Running RF with params: {'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 10, 'n_jobs': -1} on train/test set 2006-01-01 00:00:00
2019-06-06 23:59:42.164942: Running RF with params: {'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1} on train/test set 2006-01-01 00:00:00
2019-06-06 23:59:42.900926: Running RF with params: {'max_depth': 5, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 10, 'n_jobs': -1} on train/test set 2006-01-01 00:00:00
2019-06-06 23:59:43.346568: Running RF with params

2019-06-07 00:00:23.778512: Running AB with params: {'algorithm': 'SAMME', 'n_estimators': 10} on train/test set 2006-01-01 00:00:00
2019-06-07 00:00:24.129347: Running AB with params: {'algorithm': 'SAMME', 'n_estimators': 100} on train/test set 2006-01-01 00:00:00
2019-06-07 00:00:25.569421: Running AB with params: {'algorithm': 'SAMME.R', 'n_estimators': 10} on train/test set 2006-01-01 00:00:00
2019-06-07 00:00:25.947042: Running AB with params: {'algorithm': 'SAMME.R', 'n_estimators': 100} on train/test set 2006-01-01 00:00:00
2019-06-07 00:00:27.489206: Running GB with params: {'learning_rate': 0.001, 'n_estimators': 10, 'subsample': 0.1} on train/test set 2006-01-01 00:00:00
2019-06-07 00:00:27.835390: Running GB with params: {'learning_rate': 0.001, 'n_estimators': 10, 'subsample': 1.0} on train/test set 2006-01-01 00:00:00
2019-06-07 00:00:28.217315: Running GB with params: {'learning_rate': 0.001, 'n_estimators': 100, 'subsample': 0.1} on train/test set 2006-01-01 00:00:00
20

2019-06-07 00:01:24.669165: Running RF with params: {'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 100, 'n_jobs': -1} on train/test set 2007-01-01 00:00:00
2019-06-07 00:01:25.741841: Running RF with params: {'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 10, 'n_jobs': -1} on train/test set 2007-01-01 00:00:00
2019-06-07 00:01:26.463695: Running RF with params: {'max_depth': 50, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 100, 'n_jobs': -1} on train/test set 2007-01-01 00:00:00
2019-06-07 00:01:27.400120: Running ET with params: {'criterion': 'gini', 'max_depth': 2, 'n_estimators': 100} on train/test set 2007-01-01 00:00:00
2019-06-07 00:01:27.968731: Running ET with params: {'criterion': 'gini', 'max_depth': 2, 'n_estimators': 1000} on train/test set 2007-01-01 00:00:00
2019-06-07 00:01:30.758134: Running ET with params: {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 100} on train/test se

2019-06-07 00:02:43.963969: Running DT with params: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2} on train/test set 2008-01-01 00:00:00
2019-06-07 00:02:44.300145: Running DT with params: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 5} on train/test set 2008-01-01 00:00:00
2019-06-07 00:02:44.621423: Running DT with params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2} on train/test set 2008-01-01 00:00:00
2019-06-07 00:02:45.012508: Running DT with params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5} on train/test set 2008-01-01 00:00:00
2019-06-07 00:02:45.376318: Running DT with params: {'criterion': 'gini', 'max_depth': 50, 'min_samples_split': 2} on train/test set 2008-01-01 00:00:00
2019-06-07 00:02:45.762386: Running DT with params: {'criterion': 'gini', 'max_depth': 50, 'min_samples_split': 5} on train/test set 2008-01-01 00:00:00
2019-06-07 00:02:46.168461: Running DT with params: {'criterion': 'gini', 'max_depth

# Observe best models for each train/test set, for different metrics

In [None]:
importlib.reload(pipeline)

#Lets obtain the best model for each train/test set, for each metric
metrics_to_display = ['p_at_5','p_at_10', 'auc-roc']

best_models_per_metric = {}

for metric in metrics_to_display:
    #Highest value of the metric within each train/test set, aligned row by row with results.
    #The aggregation is named by the string 'max': passing the Python builtin max is
    #deprecated by pandas and will stop being mapped to the groupby max.
    best_value_per_set = results.groupby(['test_set_start_date'])[metric].transform('max')

    #boolean mask of the rows that reach that best value (ties are all kept)
    idx = best_value_per_set == results[metric]

    #save table of best models at the specific metric
    best_models_per_metric[metric] = results[idx]

### Best models for Precision at 5%

In [None]:
# Display a positional subset of columns of the rows that scored best on 'p_at_5'.
# NOTE(review): the column positions are hard-coded; presumably they pick the model
# description and the metrics at 5% - confirm, since they shift if the column order changes.
best_models_per_metric['p_at_5'].iloc[:, [0,2,3,4,11,12,13]]

### Best models for Precision at 10%

In [None]:
# Display a positional subset of columns of the rows that scored best on 'p_at_10'.
# NOTE(review): the column positions are hard-coded; presumably they pick the model
# description and the metrics at 10% - confirm, since they shift if the column order changes.
best_models_per_metric['p_at_10'].iloc[:, [0,2,3,4,14,15,16]]

### Best models for AUC-ROC

In [None]:
# Display a positional subset of columns of the rows that scored best on 'auc-roc'.
# NOTE(review): the column positions are hard-coded; presumably position 26 is the
# AUC-ROC column - confirm, since positions shift if the column order changes.
best_models_per_metric['auc-roc'].iloc[:, [0,2,3,4,26]]

### Plot of all model types performance at different train/test sets, for the different metrics

In [None]:
# Re-import the pipeline module so that edits made to it on disk are picked up
importlib.reload(pipeline)

# One pass (and, judging by the helper's name, one plot) per metric of interest
for metric in metrics_to_display:
    #For each model, find the set of parameters that work the best in each train/test set
    best_models = pipeline.get_best_models_of_each_type_for_each_train_test_set(models_to_run,results,'test_set_start_date', metric)
    # Hand the per-model best results to the plotting helper for this metric
    pipeline.plot_models_in_time(models_to_run, best_models, metric)
   