In [1]:
import os, sys
pathname = os.path.dirname("/home/jgutman/mvesc/Models_Results/")
full_pathname = os.path.abspath(pathname)
split_pathname = full_pathname.split(sep="mvesc")
base_pathname = os.path.join(split_pathname[0], "mvesc")
parentdir = os.path.join(base_pathname, "ETL")
sys.path.insert(0,parentdir)

In [2]:
from mvesc_utility_functions import *

In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import ParameterGrid
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import *
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve

import yaml
import numpy as np
import pandas as pd

In [3]:
query = """select * from model.outcome"""

In [102]:
with postgres_pgconnection_generator() as connection:
        with connection.cursor() as cursor:
            cursor.execute(query)
            results = cursor.fetchall()
            print(len(results))
        connection.commit()

11777


In [6]:
np.random.seed(model_options['randomSeed'])

In [134]:
def build_outcomes_plus_features(model_options):
    with postgres_pgconnection_generator() as connection:
        # get labeled outcomes
        # Assumes:
        # model.outcome table contains a column (name given in cohort_grade_level_begin) for each cohort base year we choose
        # e.g. 'cohort_9th' contains the year each student is seen in 9th grade
        # and contains an outcome column (name given in outcome_name)
        # and 'student_lookup' columns
        # Usage:
        # select train, validation, and test based on values in column
        # 'cohort_grade_level_begin' according to value in 'cohorts_held_out'
        outcomes_with_student_lookup = read_table_to_df(connection,
            table_name = 'outcome', schema = 'model', nrows = -1,
            columns = ['student_lookup', model_options['outcome_name'], model_options['cohort_grade_level_begin']])
        # drop students without student_lookup, outcome, or cohort identifier
        # can use subset = [colnames] to drop based on NAs in certain columns only
        outcomes_with_student_lookup.dropna(inplace=True)
        joint_label_features = outcomes_with_student_lookup.copy()

        # get all requested input features
        # Assumes:
        # every features table contains 'student_lookup'
        # plus a column for the requested possible features

        for table, column_names in model_options['features_included'].items():
            features = read_table_to_df(connection, table_name = table,
                schema = 'model', nrows = -1,
                columns=(['student_lookup'] + column_names))
        # join to only keep features that have labeled outcomes
            joint_label_features = pd.merge(joint_label_features, features,
                how = 'left', on = 'student_lookup')

    # build dataframe containing student_lookup, outcome, cohort,
    # and all features as numeric non-categorical values
    joint_label_features.set_index('student_lookup', inplace=True)
    joint_label_features = df2num(joint_label_features)
    return joint_label_features

In [75]:
def df2num(rawdf):
    """ Convert data frame with numeric variables and strings to numeric dataframe

    :param pd.dataframe rawdf: raw data frame
    :returns pd.dataframe df: a data frame with strings converted to dummies, other columns unchanged
    :rtype: pd.dataframe
    Rules:
    - 1. numeric columns unchanged;
    - 2. strings converted to dummeis;
    - 3. the most frequent string is taken as reference
    - 4. new column name is: "ColumnName_Category"
    (e.g., column 'gender' with 80 'M' and 79 'F'; the dummy column left is 'gender_F')

    """
    numeric_df = rawdf.select_dtypes(include=[np.number])
    str_columns = [col for col in rawdf.columns if col not in numeric_df.columns]
    dummy_col_df = pd.get_dummies(rawdf[str_columns], dummy_na=True)
    numeric_df = numeric_df.join(dummy_col_df)
    most_frequent_values = rawdf[str_columns].mode().loc[0].to_dict()
    reference_cols = ["{}_{}".format(key, value) for key, value in most_frequent_values.items()]
    numeric_df.drop(reference_cols, axis=1, inplace=True)
    return numeric_df

In [93]:
from sklearn.cross_validation import LeaveOneLabelOut

In [97]:
cohort_kfolds = LeaveOneLabelOut(joint_label_features[modelOptions['cohort_grade_level_begin']])
len(cohort_kfolds)

7

In [103]:
def define_clfs_params():
    clfs = {'logit': LogisticRegression(),
    'DT': DecisionTreeClassifier()
    }

    grid = {'logit': {},
        'DT': {}
    }
    return clfs, grid

In [104]:
clfs, params = define_clfs_params()

In [105]:
params

{'DT': {}, 'logit': {}}

In [118]:
with open('model_options.yaml', 'r') as f:
    model_options = yaml.load(f)
assert(type(model_options)==dict)
assert(type(model_options['features_included']))

In [135]:
outcome_plus_features = build_outcomes_plus_features(model_options)

In [110]:
model_options['features_included']

[{'demographics': ['ethnicity', 'gender']}, {'grades': ['gpa_8th']}]

In [115]:
type(model_options['features_included'])

dict

In [136]:
outcome_plus_features

Unnamed: 0_level_0,not_on_time,cohort_9th,ethnicity_A,ethnicity_B,ethnicity_H,ethnicity_I,ethnicity_M,ethnicity_nan,gender_F,gender_nan
student_lookup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
57296.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58652.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57294.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69065.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
63909.0,1,2006,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
57292.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
57290.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
57288.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57285.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57284.0,0,2006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [122]:
def temporal_cohort_test_split(joint_df, cohort_grade_level_begin,
    cohorts_held_out):
    """ Splits the given joint_df of features & outcomes and
    returns a train/test dataset
    :param pd.DataFrame joint_df:
    :param list[int] cohorts_held_out:
    """
    train = joint_df[~joint_df[cohort_grade_level_begin].isin(cohorts_held_out)]
    test = joint_df[joint_df[cohort_grade_level_begin].isin(cohorts_held_out)]
    return train, test

In [138]:
train, test = temporal_cohort_test_split(outcome_plus_features,
    model_options['cohort_grade_level_begin'],
    model_options['cohorts_held_out'])

In [128]:
print(pd.unique(train.cohort_9th))
print(pd.unique(test.cohort_9th))

[2006 2007 2008 2009 2010 2011]
[2012]


In [141]:
cohort_kfolds = LeaveOneLabelOut(train[model_options['cohort_grade_level_begin']])

In [142]:
random_kfolds = LabelKFold(train.index,
            n_folds=model_options[n_folds],
            random_state=model_options['random_seed'])

NameError: name 'n_folds' is not defined

In [None]:
def clf_loop(clfs, params, criterion, models_to_run, cv_folds, X_train, X_test, y_train, y_test):
    best_validated_models = dict()
    for index,clf in enumerate([clfs[x] for x in models_to_run]):
        model_name=models_to_run[index]
        print(model_name)
        parameter_values = params[models_to_run[index]]
        param_grid = ParameterGrid(parameter_values)
        best_validated_models[model_name] = GridSearchCV(clf, param_grid, scoring=criterion, cv=cv_folds)
        model_cv_score = best_validated_models[model_name].best_score_
        print("model: {model} score: {score}".format(model=model_name), score=model_cv_score)
    return best_validated_models