In [1]:
import os, sys

# Locate the project root (the directory named "mvesc") starting from the
# Models_Results directory, then put <root>/ETL at the front of sys.path so
# the shared ETL helper module can be imported by later cells.
#
# The starting directory can be overridden with the MVESC_MODELS_RESULTS_DIR
# environment variable (the path must contain "mvesc"). When the variable is
# unset, the original hard-coded location is used, so existing setups keep
# working unchanged.
pathname = os.path.dirname(os.environ.get(
    "MVESC_MODELS_RESULTS_DIR", "/home/jgutman/mvesc/Models_Results/"))
full_pathname = os.path.abspath(pathname)
# Everything before the first "mvesc" in the path is the parent of the root.
split_pathname = full_pathname.split(sep="mvesc")
base_pathname = os.path.join(split_pathname[0], "mvesc")
parentdir = os.path.join(base_pathname, "ETL")
sys.path.insert(0,parentdir)

In [2]:
from mvesc_utility_functions import *

In [3]:
# all model import statements
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [4]:
#from sklearn.grid_search import ParameterGrid
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import *
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix
from sklearn.preprocessing import Imputer, StandardScaler, RobustScaler

import yaml
import numpy as np
import pandas as pd

In [5]:
query = """select * from model.outcome"""

In [6]:
# Sanity check: run the module-level `query` against the database and print
# how many rows it returns.
# NOTE(review): postgres_pgconnection_generator is not defined in this
# notebook; presumably it comes from the star import of
# mvesc_utility_functions — confirm.
with postgres_pgconnection_generator() as connection:
        # (the body of this block is indented 8 spaces rather than 4)
        with connection.cursor() as cursor:
            cursor.execute(query)
            # all rows are fetched into memory just to count them
            results = cursor.fetchall()
            print(len(results))
        # NOTE(review): the query is a plain SELECT, so this commit looks
        # unnecessary — confirm before removing.
        connection.commit()

11777


In [7]:
def df2num(rawdf):
    """ Convert data frame with numeric variables and strings to numeric dataframe

    :param pd.dataframe rawdf: raw data frame
    :returns pd.dataframe df: a data frame with strings converted to dummies, other columns unchanged
    :rtype: pd.dataframe
    Rules:
    - 1. numeric columns unchanged;
    - 2. strings converted to dummies (including a "ColumnName_nan" dummy
         for missing values);
    - 3. the most frequent string is taken as reference and its dummy column
         is dropped
    - 4. new column name is: "ColumnName_Category"
    (e.g., column 'gender' with 80 'M' and 79 'F'; the dummy column left is 'gender_F')
    - 5. a frame with no non-numeric columns is returned as a copy, unchanged

    The input frame is never modified.
    """
    numeric_df = rawdf.select_dtypes(include=[np.number])
    str_columns = [col for col in rawdf.columns if col not in numeric_df.columns]
    if not str_columns:
        # Nothing to encode. Without this guard get_dummies / mode().loc[0]
        # would be called on an empty column selection and raise.
        return numeric_df.copy()

    dummy_col_df = pd.get_dummies(rawdf[str_columns], dummy_na=True)
    # mode() sorts ties, so row 0 holds the first most-frequent value per column
    most_frequent_values = rawdf[str_columns].mode().loc[0].to_dict()
    reference_cols = ["{}_{}".format(key, value) for key, value in most_frequent_values.items()]
    # join on the shared index, then drop one reference dummy per column
    return numeric_df.join(dummy_col_df).drop(reference_cols, axis=1)

In [121]:
def define_clfs_params():
    """
    Build the catalogue of candidate classifiers and their hyperparameter grids.

    Both dictionaries are keyed by the same model nicknames (e.g. 'logit',
    'DT', 'RF'), so a nickname can be used to look up an estimator in the
    first and its grid in the second.

    :returns: (clfs, grid) where clfs maps nickname -> unfitted estimator and
        grid maps nickname -> dict of parameter name -> list of values to try
        (an empty dict means the estimator is used with its constructor
        settings only)
    :rtype: tuple(dict(str: estimator), dict(str: dict))
    """
    # model_options[model_classes_selected] determines which of these models
    # are actually run, all parameter options in grid run for each selected model

    clfs = {
        'logit': LogisticRegression(),
        # very large C makes the regularization penalty negligible
        'LR_no_penalty': LogisticRegression(C=1e6),
        'DT': DecisionTreeClassifier(),
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'SVM': svm.SVC(kernel='linear', probability=False),
        'GB': GradientBoostingClassifier(
            learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'SGD': SGDClassifier(loss="hinge", penalty="l2"),
        'KNN': KNeighborsClassifier(n_neighbors=3)
    }

    # One entry per nickname in clfs. (The original literal listed 'DT' twice
    # with identical contents; the redundant second entry has been removed.)
    grid = {
        'logit': {'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1.0,10.0]},
        'LR_no_penalty': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100],
            'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
        'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100],
            'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
        'SGD': {'loss': ['hinge','log','perceptron'], 'penalty': ['l2','l1','elasticnet']},
        'ET': {'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,
            'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
        'GB': {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],
            'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]},
        'NB' : {},
        'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
        'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],
            'algorithm': ['auto','ball_tree','kd_tree']}
    }
    return clfs, grid

In [9]:
def clf_loop(clfs, params, train_X, train_y,
        criterion, models_to_run, cv_folds):
    """
    Grid-search every requested model and collect the fitted searches.

    The result maps each model nickname (string) to its fitted GridSearchCV
    object, which exposes attributes such as best_score_ and best_estimator_.
    The nickname is printed before each search and the best cross-validated
    score after it.

    :param dict(str:estimator) clfs: estimators keyed by nickname, as
        returned by define_clfs_params
    :param dict(str:dict) params: hyperparameter grids keyed by nickname, as
        returned by define_clfs_params
    :param pandas.DataFrame train_X: feature matrix; index is student_lookup
    :param pandas.Series(int) train_y: 0/1 outcome labels; index is
        student_lookup
    :param string criterion: scoring name used for model selection, read
        from model_options (e.g. 'f1')
    :param list[string] models_to_run: nicknames of the models to fit, read
        from model_options (e.g. ['logit', 'DT'])
    :param cv_folds: cross-validation specification handed to GridSearchCV
        as its cv argument
    :rtype dict(string: GridSearchCV)
    """
    # Look up all estimators first so an unknown nickname raises before any
    # grid search has been started.
    estimators = [clfs[nickname] for nickname in models_to_run]

    fitted_searches = {}
    for model_name, estimator in zip(models_to_run, estimators):
        print(model_name)
        search = GridSearchCV(estimator, params[model_name],
            scoring=criterion, cv=cv_folds)
        fitted_searches[model_name] = search
        search.fit(train_X, train_y)

        print("model: {model} cv_score: {score}".format(
            model=model_name, score=search.best_score_))
    return fitted_searches

In [10]:
def temporal_cohort_test_split(joint_df, cohort_grade_level_begin,
    cohorts_held_out, cohorts_training):
    """ Split joint_df into train and test rows by cohort value.

    :param pd.DataFrame joint_df: features and outcomes, one row per student
    :param str cohort_grade_level_begin: name of the column in joint_df that
        holds each row's cohort value
    :param list[int] cohorts_held_out: cohort values that make up the test set
    :param cohorts_training: either the string 'all' (train on every row
        whose cohort is not held out) or a list of cohort values to train on
    :returns: (train, test) data frames selected from joint_df
    """
    cohort_values = joint_df[cohort_grade_level_begin]
    is_held_out = cohort_values.isin(cohorts_held_out)
    if cohorts_training == 'all':
        # everything that is not in the test set
        is_training = ~is_held_out
    else:
        is_training = cohort_values.isin(cohorts_training)
    return joint_df[is_training], joint_df[is_held_out]

In [11]:
def measure_performance(outcomes, predictions):
    """ Compute curve-based performance summaries for one set of predictions.

    :param list[int] outcomes: true labels
    :param list[float] predictions: predicted scores for the same rows
    :returns: dict with key 'pr_curve' holding the result of
        precision_recall_curve and key 'roc_curve' holding the result of
        roc_curve (a confusion matrix is not computed)
    :rtype: dict
    """
    pr_result = precision_recall_curve(outcomes, predictions)
    roc_result = roc_curve(outcomes, predictions)
    return {'pr_curve': pr_result, 'roc_curve': roc_result}

In [12]:
def build_outcomes_plus_features(model_options):
    """ Assemble one numeric data frame of outcome, cohort and features.

    Reads the outcome table and every requested feature table from the
    'model' schema, left-joins the features onto the labeled students and
    converts the result to numeric columns with df2num.

    Assumes:
    - model.outcome has a 'student_lookup' column, an outcome column (named
      by model_options['outcome_name']) and a cohort column (named by
      model_options['cohort_grade_level_begin'], e.g. 'cohort_9th' holding
      the year each student is seen in 9th grade)
    - every feature table has a 'student_lookup' column plus the columns
      listed for it in model_options['features_included']

    :param dict model_options: options as returned by read_in_yaml
    :returns: data frame indexed by student_lookup
    :rtype: pd.DataFrame
    """
    with postgres_pgconnection_generator() as connection:
        label_columns = ['student_lookup',
            model_options['outcome_name'],
            model_options['cohort_grade_level_begin']]
        # rows missing the student_lookup, the outcome or the cohort are
        # dropped, so only fully labeled students remain
        joint_label_features = read_table_to_df(connection,
            table_name = 'outcome', schema = 'model', nrows = -1,
            columns = label_columns).dropna()

        for table, column_names in model_options['features_included'].items():
            feature_table = read_table_to_df(connection, table_name = table,
                schema = 'model', nrows = -1,
                columns=(['student_lookup'] + column_names))
            # left join: keep every labeled student, with or without features
            joint_label_features = pd.merge(joint_label_features,
                feature_table, how = 'left', on = 'student_lookup')

    # index by student and turn categorical columns into numeric dummies
    return df2num(joint_label_features.set_index('student_lookup'))

In [13]:
def read_in_yaml(filename=os.path.join(base_pathname,
    'Models_Results', 'model_options.yaml')):
    """ Load the model options YAML file and check its basic structure.

    :param str filename: path of the YAML options file; defaults to
        <base_pathname>/Models_Results/model_options.yaml
    :returns: the parsed options
    :rtype: dict
    :raises AssertionError: if the file does not parse to a dict, or if
        'features_included' is not a dict, or 'model_classes_selected' /
        'cohorts_held_out' are not lists
    :raises KeyError: if one of those three keys is missing
    """
    with open(filename, 'r') as f:
        # safe_load builds only plain YAML types (dict, list, str, numbers),
        # never arbitrary Python objects. Bare yaml.load(f) without a Loader
        # is unsafe and is rejected by current PyYAML.
        model_options = yaml.safe_load(f)
    # The original checks were written as type(x == dict), which is always
    # truthy and therefore could never fail; these actually test the types.
    assert isinstance(model_options, dict), \
        "model options file must contain a mapping at the top level"
    assert isinstance(model_options['features_included'], dict), \
        "'features_included' must be a mapping of table name to column list"
    assert isinstance(model_options['model_classes_selected'], list), \
        "'model_classes_selected' must be a list"
    assert isinstance(model_options['cohorts_held_out'], list), \
        "'cohorts_held_out' must be a list"
    return model_options

In [68]:
model_options = read_in_yaml()
print(model_options)

{'validation_criterion': 'accuracy', 'n_folds': 10, 'features_included': {'demographics': ['ethnicity', 'gender']}, 'user_description': 'initial_skeleton_pipeline_test', 'cohorts_training': 'all', 'cohort_grade_level_begin': 'cohort_9th', 'random_seed': 2187, 'missing_impute_strategy': 'default_mean_mode', 'feature_scaling': 'none', 'model_test_holdout': 'temporal_cohort', 'file_save_name': 'gender_ethnicity', 'model_classes_selected': ['logit', 'DT', 'SVM'], 'cohorts_held_out': [2012], 'parameter_cross_validation_scheme': 'leave_cohort_out', 'outcome_name': 'not_on_time'}


In [15]:
# set seed for this program from model_options
np.random.seed(model_options['random_seed'])

In [16]:
# Based on options, draw in data and select the appropriate
# labeled outcome column (outcome_name)
# cohort identification column (cohort_grade_level_begin)
# subset of various feature columns from various tables (features_included)

outcome_plus_features = build_outcomes_plus_features(model_options)

In [17]:
outcome_plus_features.head()

Unnamed: 0_level_0,not_on_time,cohort_9th,ethnicity_A,ethnicity_B,ethnicity_H,ethnicity_I,ethnicity_M,ethnicity_nan,gender_F,gender_nan
student_lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
57296.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58652.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57294.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69065.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63909.0,1,2006,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Split the full data set into `train` and `test` according to the
# 'model_test_holdout' option.
if model_options['model_test_holdout'] == 'temporal_cohort':
    # if using temporal cohort model performance validation,
    # we choose the cohorts in cohorts_held_out for the test set;
    # the training rows are chosen by the 'cohorts_training' option
    train, test = temporal_cohort_test_split(outcome_plus_features,
        model_options['cohort_grade_level_begin'],
        model_options['cohorts_held_out'],
        model_options['cohorts_training'])

else:
    # if not using temporal test set, split randomly:
    # 20% of rows go to the test set, seeded from the options for repeatability
    # NOTE(review): train_test_split is presumably provided by the star
    # import from sklearn.cross_validation — confirm.
    train, test = train_test_split(outcome_plus_features, test_size=0.20,
        random_state=model_options['random_seed'])

In [37]:
print(pd.unique(train.cohort_9th))
print(pd.unique(test.cohort_9th))

[2006 2007 2008 2009 2010 2011]
[2012]


In [38]:
# get subtables for each for easy reference
train_X = train.drop([model_options['outcome_name'],
    model_options['cohort_grade_level_begin']],axis=1)
test_X = test.drop([model_options['outcome_name'],
    model_options['cohort_grade_level_begin']],axis=1)
train_y = train[model_options['outcome_name']]
test_y = test[model_options['outcome_name']]

In [122]:
clfs, params = define_clfs_params()

In [40]:
# Choose the cross-validation specification (`cohort_kfolds`) that the grid
# search uses inside the training set, based on the
# 'parameter_cross_validation_scheme' option.
if model_options['parameter_cross_validation_scheme'] == 'none':
    # no need to further manipulate train dataset
    cohort_kfolds = 2 # hacky way to have GridSearchCV fit to 2 k-folds
elif model_options['parameter_cross_validation_scheme'] == 'leave_cohort_out':
    # choose another validation set amongst the training set to
    # estimate parameters and model selection across cohort folds
    print('leave_cohort_out')
    cohort_kfolds = LeaveOneLabelOut(train[model_options['cohort_grade_level_begin']])
elif model_options['parameter_cross_validation_scheme'] == 'k_fold':
    # ignore cohorts; fold membership is derived from the row index
    print('k_fold_parameter_estimation')
    cohort_kfolds = LabelKFold(train.index, n_folds=model_options['n_folds'])
else:
    # Fail loudly. Only printing here would leave cohort_kfolds undefined on
    # a fresh kernel, or silently reuse the value from an earlier run of
    # this cell.
    raise ValueError('unknown cross-validation strategy: {!r}'.format(
        model_options['parameter_cross_validation_scheme']))

leave_cohort_out


In [46]:
# Inspect the folds: print the label array, then for every fold the distinct
# cohort years that land in its training part and in its validation part.
# NOTE(review): `.labels` and the iteration below require a cross-validation
# object; under the 'none' scheme cohort_kfolds is the integer 2 and this
# cell would fail.
# NOTE(review): the column name cohort_9th is hard-coded here rather than
# read from model_options['cohort_grade_level_begin'].
print(cohort_kfolds.labels)
for train_fold, val_fold in cohort_kfolds:
    # the fold members are used as positional row numbers into `train`
    print('train: ', np.unique(train.iloc[train_fold].cohort_9th))
    print('validation: ', np.unique(train.iloc[val_fold].cohort_9th))

[2006 2006 2006 ..., 2011 2011 2011]
train:  [2007 2008 2009 2010 2011]
validation:  [2006]
train:  [2006 2008 2009 2010 2011]
validation:  [2007]
train:  [2006 2007 2009 2010 2011]
validation:  [2008]
train:  [2006 2007 2008 2010 2011]
validation:  [2009]
train:  [2006 2007 2008 2009 2011]
validation:  [2010]
train:  [2006 2007 2008 2009 2010]
validation:  [2011]


In [115]:
# Override two options in memory for this experiment; the values loaded from
# the YAML file are replaced for all cells run after this one.
model_options['model_classes_selected'] = ['LR_no_penalty'] # nicknames must be keys of clfs
model_options['validation_criterion'] = 'average_precision' # other criteria noted here: 'log_loss', 'accuracy', 'f1'

In [123]:
clfs

{'AB': AdaBoostClassifier(algorithm='SAMME',
           base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
             max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=None, splitter='best'),
           learning_rate=1.0, n_estimators=200, random_state=None),
 'DT': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=None, splitter='best'),
 'ET': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=F

In [124]:
best_validated_models = clf_loop(clfs, params, train_X, train_y,
    criterion=model_options['validation_criterion'],
    models_to_run=model_options['model_classes_selected'],
    cv_folds=cohort_kfolds)

LR_no_penalty
model: LR_no_penalty cv_score: 0.34711971229428645


In [105]:
# Score the held-out test features with the best estimator of each model.
# NOTE(review): test_set_scores is reassigned on every iteration and never
# stored per model, so after the loop only the last model's scores remain.
for model_name, model in best_validated_models.items():
    clf = model.best_estimator_
    # use decision_function scores when the estimator has that method,
    # otherwise fall back to the second predict_proba column
    # (presumably the probability of label 1 — confirm class order)
    if hasattr(clf, "decision_function"):
        test_set_scores = clf.decision_function(test_X)
    else:
        test_set_scores = clf.predict_proba(test_X)[:,1]

In [126]:
best_validated_models['LR_no_penalty'].grid_scores_

[mean: 0.34712, std: 0.02759, params: {}]

In [130]:
best_validated_models['LR_no_penalty'].best_estimator_.coef_

array([[ 0.69831928,  1.14873057,  1.11487351,  1.52102882,  0.75204177,
         2.6156949 , -0.14418855,  0.        ]])

In [133]:
outcome_plus_features.columns

Index(['not_on_time', 'cohort_9th', 'ethnicity_A', 'ethnicity_B',
       'ethnicity_H', 'ethnicity_I', 'ethnicity_M', 'ethnicity_nan',
       'gender_F', 'gender_nan'],
      dtype='object')

In [109]:
# Class-probability predictions of the grid-searched decision tree on the
# training and test features.
# NOTE(review): this needs a 'DT' entry in best_validated_models, but the
# override cell earlier in the notebook sets model_classes_selected to
# ['LR_no_penalty'] only. The execution counts are out of order, so this cell
# relies on state from an earlier run and would raise KeyError on
# Restart & Run All.
predicted_train = best_validated_models['DT'].predict_proba(train_X)
predicted_test = best_validated_models['DT'].predict_proba(test_X)

In [131]:
np.mean([probs == [0.5, 0.5] for probs in predicted_train])

0.0

In [74]:
np.mean([probs == [0.5, 0.5] for probs in predicted_test])

1.0

In [132]:
predicted_train

array([[ 0.76382782,  0.23617218],
       [ 0.76382782,  0.23617218],
       [ 0.76382782,  0.23617218],
       ..., 
       [ 0.76382782,  0.23617218],
       [ 0.76382782,  0.23617218],
       [ 0.76382782,  0.23617218]])

In [89]:
pd.crosstab(outcome_plus_features.not_on_time, outcome_plus_features.gender_F)

gender_F,0.0,1.0
not_on_time,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4502,4366
1,1566,1343


In [90]:
pd.crosstab(outcome_plus_features.not_on_time, outcome_plus_features.ethnicity_B)

ethnicity_B,0.0,1.0
not_on_time,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8799,69
1,2847,62


In [93]:
outcome_plus_features.sum()

not_on_time          2909.0
cohort_9th       23666760.0
ethnicity_A            50.0
ethnicity_B           131.0
ethnicity_H           712.0
ethnicity_I            11.0
ethnicity_M           267.0
ethnicity_nan           7.0
gender_F             5709.0
gender_nan              0.0
dtype: float64

In [135]:
sum(outcome_plus_features.ethnicity_nan)

7.0

In [137]:
pd.isnull(outcome_plus_features).sum()

not_on_time      0
cohort_9th       0
ethnicity_A      0
ethnicity_B      0
ethnicity_H      0
ethnicity_I      0
ethnicity_M      0
ethnicity_nan    0
gender_F         0
gender_nan       0
dtype: int64

In [141]:
test_null = outcome_plus_features.replace({'cohort_9th': {2006: np.nan}})

In [142]:
train, test = temporal_cohort_test_split(test_null,
    model_options['cohort_grade_level_begin'],
    model_options['cohorts_held_out'],
    model_options['cohorts_training'])

In [144]:
test_null.dropna(subset=['not_on_time', 'cohort_9th'], inplace=True)

In [145]:
test_null

Unnamed: 0_level_0,not_on_time,cohort_9th,ethnicity_A,ethnicity_B,ethnicity_H,ethnicity_I,ethnicity_M,ethnicity_nan,gender_F,gender_nan
student_lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
37133.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
37132.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37131.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
37130.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37129.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37128.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37127.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37126.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
37125.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37122.0,0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [148]:
cohorts_training = [2006, 2007, 2008]
cohorts_heldout = [2011, 2012]

In [154]:
assert(max(cohorts_training) < min(cohorts_heldout)), "Training years do not completely precede test years"

In [157]:
np.min(train.cohort_9th)

2007.0

In [158]:
cohort_grade_level_begin = 'cohort_9th'

In [171]:
model_options.keys()

dict_keys(['validation_criterion', 'n_folds', 'features_included', 'user_description', 'cohorts_training', 'cohort_grade_level_begin', 'random_seed', 'missing_impute_strategy', 'feature_scaling', 'model_test_holdout', 'file_save_name', 'model_classes_selected', 'cohorts_held_out', 'parameter_cross_validation_scheme', 'outcome_name'])

In [174]:
# Option names whose presence in model_options is checked below.
required_keys = {
    'validation_criterion',
    'features_included',
    'cohorts_training',
    'cohorts_held_out',
    'file_save_name',
    'model_classes_selected',
    'outcome_name',
    'cohort_grade_level_begin',
    'model_test_holdout',
    'random_seed',
}

In [183]:
# Fail early if any required option is absent, and name the missing ones in
# the message (the original message was the placeholder "error message").
missing_keys = sorted(key for key in required_keys if key not in model_options)
assert not missing_keys, \
    "model_options is missing required keys: {}".format(", ".join(missing_keys))