# HW3 Analysis

## Import and load data

In [1]:
import ml_pipeline as pp
import pandas as pd
import datetime as dt
import numpy as np
#from sklearn import metrics
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
file = './data/projects_2012_2013.csv'
df = pp.load_csv(file)

## Data transformations

### Helper functions for hw3 specific data

#### Convert date columns to datetime

In [3]:
df.columns

Index(['projectid', 'teacher_acctid', 'schoolid', 'school_ncesid',
       'school_latitude', 'school_longitude', 'school_city', 'school_state',
       'school_metro', 'school_district', 'school_county', 'school_charter',
       'school_magnet', 'teacher_prefix', 'primary_focus_subject',
       'primary_focus_area', 'secondary_focus_subject', 'secondary_focus_area',
       'resource_type', 'poverty_level', 'grade_level',
       'total_price_including_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'date_posted', 'datefullyfunded'],
      dtype='object')

In [4]:
df.date_posted = pp.col_datetime(df, 'date_posted')

In [5]:
df.datefullyfunded = pp.col_datetime(df,'datefullyfunded')

#### Create labels: 1 if not fully funded in 60 days, 0 if fully funded in 60 days

In [6]:
df = pp.create_label(df, pred_time=60)

#### Identify feature columns with null values

In [9]:
for x in pp.na_col(df):
    if x in feature_cols:
        print(x)

school_metro
school_district
primary_focus_subject
primary_focus_area
secondary_focus_subject
secondary_focus_area
resource_type
grade_level
students_reached


## Run variations of models: 
### Decision trees, KNN, Logistic Regression, Linear SVM, Random forests, Bagging, Boosting

In [8]:
windows = [dt.datetime(2012,1,1), dt.datetime(2012,7,1), dt.datetime(2013,1,1), dt.datetime(2013,7,1), dt.datetime(2014,1,1)]
pred_time = 60 #days
label_col = 'label'
split_col = 'date_posted'
feature_cols= [x for x in df.columns if x not in ['projectid', 'teacher_acctid', 'schoolid', 'school_ncesid', 'school_latitude', 'school_longitude', 'date_posted', 'datefullyfunded', 'label']]
impute_info = ((pp.most_freq, ['school_metro','primary_focus_subject','primary_focus_area','resource_type','grade_level']), (np.nanmedian,['students_reached']))
bucketdict= {'total_price_including_optional_support': (4, ('lowest', 'low', 'medium', 'high')), 'students_reached':(4, ('lowest', 'low', 'medium', 'high'))}
top_k = {'school_state':30, 'school_city':30,'school_district':30, 'school_county':30}
seed=12345

In [9]:
# models = [
#     {'type': 'Dtree', 'clf': DecisionTreeClassifier(), 'params': {'criterion': ['entropy', 'gini'], 'max_depth': [10,20,30],'min_samples_split': [100, 300,500], 'random_state': [seed]}},
#     {'type': 'LR', 'clf': LogisticRegression(), 'params':{'penalty': ['l1','l2'], 'C': [0.1, 1.0, 10.0, 100.0], 'solver': ['liblinear'], 'random_state': [seed]}},
#     {'type': 'SVM', 'clf': LinearSVC(), 'params':{'penalty': ['l2'], 'C': [0.1, 1.0, 10.0, 100.0], 'random_state': [seed]}},
#     {'type': 'Bagging_dtree', 'clf': BaggingClassifier(), 'params':{'n_estimators': [100, 500, 1000], 'base_estimator':[None], 'random_state':[seed]}},
#     {'type': 'ADABoost_dtree', 'clf': AdaBoostClassifier(), 'params':{'n_estimators': [100, 500, 1000], 'base_estimator':[None], 'random_state':[seed]}},
#     {'type': 'GBoost', 'clf': GradientBoostingClassifier(), 'params': {'n_estimators': [100, 500, 1000], 'min_samples_split': [100, 300,500], 'random_state':[seed]}},
#     {'type': 'ExtraTrees', 'clf': ExtraTreesClassifier(),'params': {'n_estimators': [100, 500, 1000], 'criterion': ['entropy', 'gini'],'min_samples_split': [100, 300,500], 'max_depth': [10,20,30],'random_state':[seed], 'n_jobs':[5]}},
#     {'type': 'Random Forest', 'clf': RandomForestClassifier(), 'params':{'n_estimators': [100, 500, 1000], 'criterion': ['entropy', 'gini'], 'random_state': [seed]}},
#     {'type': 'KNN', 'clf': KNeighborsClassifier(), 'params':{'n_neighbors': [5,7], 'weights': ['uniform','distance'], 'metric':['minkowski'],'p': [1,2], 'n_jobs': [4]}},
#     {'type': 'NB', 'clf': GaussianNB(),'params':{'priors':[None]}}
# ]

models = [{'type': 'Dtree', 'clf': DecisionTreeClassifier(), 'params': {'criterion': ['entropy', 'gini'], 'max_depth': [10,20,30],'min_samples_split': [100, 300,500], 'random_state': [seed]}}]
thresholds = [1, 2, 5, 10, 20,30, 50]


In [10]:
#this code was run separately and saved in csv for future use, but this is how it is run
resdf=pp.run_models(models, thresholds, windows, df, feature_cols, label_col, split_col, impute_info, bucketdict, top_k, pred_time, pred_unit = 'day', filename = './data/finalrun')


Dtree: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 100, 'random_state': 12345}
Dtree: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 300, 'random_state': 12345}
Dtree: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 500, 'random_state': 12345}
Dtree: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 100, 'random_state': 12345}
Dtree: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 300, 'random_state': 12345}
Dtree: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 500, 'random_state': 12345}
Dtree: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 100, 'random_state': 12345}
Dtree: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 300, 'random_state': 12345}
Dtree: {'criterion': 'entropy', 'max_depth': 30, 'min_samples_split': 500, 'random_state': 12345}
Dtree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 100, 'random_state': 12345}
Dtree: {'criterion': 'g

KeyboardInterrupt: 

In [52]:
l = set(x)- set(y)

In [55]:
new =y.drop(l,axis=1)
len(new.columns)

227

In [54]:
'teacher_prefix_Dr.' in new.columns

True

In [97]:
resdf.head()

Unnamed: 0,school_city,school_state,school_metro,school_district,school_county,school_charter,school_magnet,teacher_prefix,primary_focus_subject,primary_focus_area,secondary_focus_subject,secondary_focus_area,resource_type,poverty_level,grade_level,total_price_including_optional_support,students_reached,eligible_double_your_impact_match
1,Ventura,CA,urban,Ventura Unif School District,Ventura,f,f,Mrs.,Civics & Government,History & Civics,Literature & Writing,Literacy & Language,Books,highest poverty,Grades 3-5,"(-0.001, 0.00358]","(0.021, 0.0291]",t
2,Los Angeles,CA,urban,Los Angeles Unif Sch Dist,Los Angeles,f,f,Ms.,Literacy,Literacy & Language,Social Sciences,History & Civics,Technology,high poverty,Grades 3-5,"(0.00851, 1.0]","(0.0291, 0.0872]",f
16,Vine Grove,KY,suburban,Hardin Co School District,Hardin,f,f,Ms.,Environmental Science,Math & Science,Health & Life Science,Math & Science,Supplies,high poverty,Grades 6-8,"(0.00358, 0.00593]","(0.0872, 1.0]",f
18,Van Nuys,CA,urban,Los Angeles Unif Sch Dist,Los Angeles,f,f,Ms.,Literature & Writing,Literacy & Language,,,Books,highest poverty,Grades 3-5,"(-0.001, 0.00358]","(0.021, 0.0291]",t
23,El Monte,CA,suburban,Mountain View School District-Los Angeles,Los Angeles,f,f,Mrs.,History & Geography,History & Civics,Special Needs,Special Needs,Technology,highest poverty,Grades PreK-2,"(0.00851, 1.0]","(0.021, 0.0291]",f


In [78]:
resdf.columns

Index(['school_city', 'school_state', 'school_metro', 'school_district',
       'school_county', 'school_charter', 'school_magnet', 'teacher_prefix',
       'primary_focus_subject', 'primary_focus_area',
       'secondary_focus_subject', 'secondary_focus_area', 'resource_type',
       'poverty_level', 'grade_level', 'eligible_double_your_impact_match'],
      dtype='object')

In [22]:
resdf = pp.load_csv('./data/finalrun2.csv')
resdf.info

<bound method DataFrame.info of               type                                            details  \
0            Dtree  criteria: entropy, depth: 10, min_leaf: 100, s...   
1            Dtree  criteria: entropy, depth: 10, min_leaf: 100, s...   
2            Dtree  criteria: entropy, depth: 10, min_leaf: 100, s...   
3            Dtree  criteria: entropy, depth: 10, min_leaf: 100, s...   
4            Dtree  criteria: entropy, depth: 10, min_leaf: 100, s...   
5            Dtree  criteria: entropy, depth: 10, min_leaf: 100, s...   
6            Dtree  criteria: entropy, depth: 10, min_leaf: 100, s...   
7            Dtree  criteria: entropy, depth: 10, min_leaf: 300, s...   
8            Dtree  criteria: entropy, depth: 10, min_leaf: 300, s...   
9            Dtree  criteria: entropy, depth: 10, min_leaf: 300, s...   
10           Dtree  criteria: entropy, depth: 10, min_leaf: 300, s...   
11           Dtree  criteria: entropy, depth: 10, min_leaf: 300, s...   
12           Dtree 

## Comparing model results

#### Start by finding models with highest precision, recall, or auc.

##### Models have the highest percision at a threshold of 1%

In [63]:
resdf.sort_values('precision', ascending=False)

Unnamed: 0,type,details,baseline,threshold_pct,precision,recall,auc,train_set_num,train_start,test_start
483,LR,"penalty: l2, c: 1.0, solver: liblinear, seed: ...",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01
525,SVM,"penalty: l2, c: 100.0, seed: 12345",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01
455,LR,"penalty: l1, c: 1.0, solver: liblinear, seed: ...",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01
462,LR,"penalty: l1, c: 10.0, solver: liblinear, seed:...",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01
469,LR,"penalty: l1, c: 100.0, solver: liblinear, seed...",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01
476,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01
490,LR,"penalty: l2, c: 10.0, solver: liblinear, seed:...",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01
497,LR,"penalty: l2, c: 100.0, solver: liblinear, seed...",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01
504,SVM,"penalty: l2, c: 0.1, seed: 12345",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01
511,SVM,"penalty: l2, c: 1.0, seed: 12345",0.315889,1,0.589862,0.018667,0.506337,2,2012-01-01,2013-01-01


##### Models have the highest recall at a threshold of 50%

In [64]:
resdf.sort_values('recall', ascending=False)

Unnamed: 0,type,details,baseline,threshold_pct,precision,recall,auc,train_set_num,train_start,test_start
405,Dtree,"criteria: gini, depth: 10, min_leaf: 500, seed...",0.315889,50,0.323844,0.512615,0.509203,2,2012-01-01,2013-01-01
454,LR,"penalty: l1, c: 0.1, solver: liblinear, seed: ...",0.315889,50,0.323844,0.512615,0.509203,2,2012-01-01,2013-01-01
342,Dtree,"criteria: entropy, depth: 10, min_leaf: 500, s...",0.315889,50,0.323844,0.512615,0.509203,2,2012-01-01,2013-01-01
468,LR,"penalty: l1, c: 10.0, solver: liblinear, seed:...",0.315889,50,0.323659,0.512323,0.508990,2,2012-01-01,2013-01-01
503,LR,"penalty: l2, c: 100.0, solver: liblinear, seed...",0.315889,50,0.323659,0.512323,0.508990,2,2012-01-01,2013-01-01
461,LR,"penalty: l1, c: 1.0, solver: liblinear, seed: ...",0.315889,50,0.323659,0.512323,0.508990,2,2012-01-01,2013-01-01
475,LR,"penalty: l1, c: 100.0, solver: liblinear, seed...",0.315889,50,0.323659,0.512323,0.508990,2,2012-01-01,2013-01-01
482,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.315889,50,0.323659,0.512323,0.508990,2,2012-01-01,2013-01-01
496,LR,"penalty: l2, c: 10.0, solver: liblinear, seed:...",0.315889,50,0.323659,0.512323,0.508990,2,2012-01-01,2013-01-01
489,LR,"penalty: l2, c: 1.0, solver: liblinear, seed: ...",0.315889,50,0.323659,0.512323,0.508990,2,2012-01-01,2013-01-01


##### Decision trees have the highest AUC

In [71]:
aucdf = resdf.sort_values('auc', ascending=False)
aucdf

Unnamed: 0,type,details,baseline,threshold_pct,precision,recall,auc,train_set_num,train_start,test_start
397,Dtree,"criteria: gini, depth: 10, min_leaf: 300, seed...",0.315889,30,0.332617,0.315882,0.511611,2,2012-01-01,2013-01-01
334,Dtree,"criteria: entropy, depth: 10, min_leaf: 300, s...",0.315889,30,0.332617,0.315882,0.511611,2,2012-01-01,2013-01-01
439,Dtree,"criteria: gini, depth: 30, min_leaf: 300, seed...",0.315889,30,0.332156,0.315444,0.511291,2,2012-01-01,2013-01-01
481,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.315889,30,0.332156,0.315444,0.511291,2,2012-01-01,2013-01-01
355,Dtree,"criteria: entropy, depth: 20, min_leaf: 300, s...",0.315889,30,0.332156,0.315444,0.511291,2,2012-01-01,2013-01-01
376,Dtree,"criteria: entropy, depth: 30, min_leaf: 300, s...",0.315889,30,0.332156,0.315444,0.511291,2,2012-01-01,2013-01-01
418,Dtree,"criteria: gini, depth: 20, min_leaf: 300, seed...",0.315889,30,0.332156,0.315444,0.511291,2,2012-01-01,2013-01-01
565,ADABoost_dtree,"n: 50, base: None",0.315889,30,0.331849,0.315152,0.511078,2,2012-01-01,2013-01-01
530,SVM,"penalty: l2, c: 100.0, seed: 12345",0.315889,30,0.331849,0.315152,0.511078,2,2012-01-01,2013-01-01
453,LR,"penalty: l1, c: 0.1, solver: liblinear, seed: ...",0.315889,30,0.331695,0.315007,0.510971,2,2012-01-01,2013-01-01


In [72]:
aucdf.groupby(['train_set_num']).first()

Unnamed: 0_level_0,type,details,baseline,threshold_pct,precision,recall,auc,train_start,test_start
train_set_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,SVM,"penalty: l2, c: 0.1, seed: 12345",0.256917,50,0.262864,0.511573,0.507787,2012-01-01,2012-07-01
2,Dtree,"criteria: gini, depth: 10, min_leaf: 300, seed...",0.315889,30,0.332617,0.315882,0.511611,2012-01-01,2013-01-01
3,LR,"penalty: l1, c: 0.1, solver: liblinear, seed: ...",0.284647,5,0.354167,0.062202,0.508534,2012-01-01,2013-07-01


#### From above, we can see that finding models that do well on a measure is too simplistic. It's better to identify models that do better on precision, recall by a threshold and time period

In [61]:
best_prec = resdf.sort_values('precision', ascending=False)
best_prec.groupby(['train_set_num', 'threshold_pct']).nth([0])

Unnamed: 0_level_0,Unnamed: 1_level_0,type,details,baseline,precision,recall,auc,train_start,test_start
train_set_num,threshold_pct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.256917,0.481818,0.018777,0.505897,2012-01-01,2012-07-01
1,2,SVM,"penalty: l2, c: 0.1, seed: 12345",0.256917,0.379363,0.029523,0.506412,2012-01-01,2012-07-01
1,5,LR,"penalty: l1, c: 1.0, solver: liblinear, seed: ...",0.256917,0.311893,0.060699,0.507199,2012-01-01,2012-07-01
1,10,LR,"penalty: l2, c: 1.0, solver: liblinear, seed: ...",0.256917,0.283981,0.110534,0.507088,2012-01-01,2012-07-01
1,20,LR,"penalty: l1, c: 1.0, solver: liblinear, seed: ...",0.256917,0.267597,0.208314,0.505594,2012-01-01,2012-07-01
1,30,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.256917,0.264563,0.308928,0.506007,2012-01-01,2012-07-01
1,50,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.256917,0.262864,0.511573,0.507787,2012-01-01,2012-07-01
2,1,LR,"penalty: l2, c: 1.0, solver: liblinear, seed: ...",0.315889,0.589862,0.018667,0.506337,2012-01-01,2013-01-01
2,2,SVM,"penalty: l2, c: 100.0, seed: 12345",0.315889,0.474654,0.030042,0.507344,2012-01-01,2013-01-01
2,5,Dtree,"criteria: gini, depth: 30, min_leaf: 300, seed...",0.315889,0.37235,0.058918,0.50653,2012-01-01,2013-01-01


In [60]:
resdf.groupby(['train_set_num', 'threshold_pct']).agg({'precision' : 'max', 'recall' : 'max', 'auc': 'max'})

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,auc
train_set_num,threshold_pct,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0.481818,0.018777,0.505897
1,2,0.379363,0.029523,0.506412
1,5,0.311893,0.060699,0.507199
1,10,0.283981,0.110534,0.507088
1,20,0.267597,0.208314,0.505594
1,30,0.264563,0.308928,0.506007
1,50,0.262864,0.511573,0.507787
2,1,0.589862,0.018667,0.506337
2,2,0.474654,0.030042,0.507344
2,5,0.37235,0.058918,0.50653


#### Choose model for the 5% threshold (target percent of population)

In [11]:
t_5 = resdf[resdf['threshold_pct']==5].sort_values('precision', ascending=False)
t_5.groupby('train_set_num').nth([0,1])

Unnamed: 0_level_0,type,details,baseline,threshold_pct,precision,recall,auc,train_start,test_start
train_set_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,LR,"penalty: l1, c: 1.0, solver: liblinear, seed: ...",0.256917,5,0.311893,0.060699,0.507199,2012-01-01,2012-07-01
1,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.256917,5,0.311893,0.060699,0.507199,2012-01-01,2012-07-01
2,Dtree,"criteria: entropy, depth: 20, min_leaf: 300, s...",0.315889,5,0.37235,0.058918,0.50653,2012-01-01,2013-01-01
2,Dtree,"criteria: entropy, depth: 30, min_leaf: 300, s...",0.315889,5,0.37235,0.058918,0.50653,2012-01-01,2013-01-01
3,LR,"penalty: l1, c: 0.1, solver: liblinear, seed: ...",0.284647,5,0.354167,0.062202,0.508534,2012-01-01,2013-07-01
3,LR,"penalty: l2, c: 0.1, solver: liblinear, seed: ...",0.284647,5,0.354167,0.062202,0.508534,2012-01-01,2013-07-01
