In [1]:
import os, sys
# Derive the repository root by cutting the notebook directory's absolute path
# at the first occurrence of "mvesc", then put the ETL directory on the import
# search path.
# NOTE(review): the notebook directory below is a hard-coded absolute path for
# one checkout; it has to be edited by hand on any other machine.
pathname = os.path.dirname("/home/jgutman/mvesc/Models_Results/")
full_pathname = os.path.abspath(pathname)
split_pathname = full_pathname.split(sep="mvesc")
base_pathname = os.path.join(split_pathname[0], "mvesc")
parentdir = os.path.join(base_pathname, "ETL")
# Guard the insert so that re-running this cell does not stack duplicate
# entries at the front of sys.path.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

In [2]:
from mvesc_utility_functions import *

In [3]:
# SQL text executed by the row-count cell: every column of model.outcome.
query = "select * from model.outcome"

In [79]:
# Run `query`, keep every returned row in `results`, and print the row count.
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute(query)
        results = cursor.fetchall()
    print(len(results))
    connection.commit()

11777


In [80]:
# Options for a single model run; later cells look values up by key.
modelOptions = dict(
    modelClassSelected='logit',
    model_performance_estimate_scheme='temporal_cohort',
    parameter_cross_validation_scheme='leave_cohort_out',
    n_folds=10,
    file_save_name='gender_ethnicity_logit.pkl',
    # used to seed NumPy's global random state
    randomSeed=2187,
    user_description='initial skeleton pipeline test',
    # name of the cohort column read from model.outcome
    cohort_grade_level_begin='cohort_9th',
    cohorts_held_out=[2012],
    # maps a table name to the list of column names taken from that table
    features_included={'demographics': ['ethnicity', 'gender']},
    # name of the label column read from model.outcome (alternative: 'is_dropout')
    outcome_name='not_on_time',
)

In [6]:
import numpy as np
# Seed NumPy's global random state from the configured value.
np.random.seed(seed=modelOptions['randomSeed'])

In [91]:
# Assemble the modelling frame: the labelled outcome rows, left-joined with
# each configured feature table, indexed by student_lookup and passed through
# df2num.
label_columns = [
    'student_lookup',
    modelOptions['outcome_name'],
    modelOptions['cohort_grade_level_begin'],
]

with postgres_pgconnection_generator() as connection:
    # nrows = -1 presumably requests every row -- TODO confirm against
    # read_table_to_df.
    outcomes_with_student_lookup = read_table_to_df(
        connection, table_name='outcome', schema='model', nrows=-1,
        columns=label_columns)
    # Discard rows with a missing student_lookup, outcome, or cohort value.
    # (dropna accepts subset=[...] to restrict the check to chosen columns.)
    outcomes_with_student_lookup = outcomes_with_student_lookup.dropna()
    joint_label_features = outcomes_with_student_lookup.copy()

    for feature_table, feature_columns in modelOptions['features_included'].items():
        features = read_table_to_df(
            connection, table_name=feature_table, schema='model', nrows=-1,
            columns=['student_lookup'] + feature_columns)

        # Left join keeps every labelled row and ignores feature rows whose
        # student_lookup has no labelled outcome.
        joint_label_features = joint_label_features.merge(
            features, how='left', on='student_lookup')

joint_label_features = df2num(joint_label_features.set_index('student_lookup'))

In [94]:
# Display the cohort column of the assembled frame.
joint_label_features['cohort_9th']

student_lookup
57296.0     2006
58652.0     2006
57294.0     2006
69065.0     2006
63909.0     2006
57292.0     2006
57290.0     2006
57288.0     2006
57285.0     2006
57284.0     2006
57282.0     2006
41726.0     2006
57279.0     2006
57278.0     2006
57277.0     2006
57276.0     2006
57275.0     2006
57274.0     2006
57273.0     2006
57271.0     2006
57270.0     2006
57268.0     2006
57266.0     2006
57265.0     2006
57264.0     2006
57259.0     2006
58523.0     2006
36739.0     2006
57255.0     2006
57254.0     2006
            ... 
70804.0     2012
19548.0     2012
699419.0    2012
19308.0     2012
19154.0     2012
19150.0     2012
19149.0     2012
19123.0     2012
19061.0     2012
19055.0     2012
700797.0    2012
19044.0     2012
19040.0     2012
19039.0     2012
19038.0     2012
19037.0     2012
19036.0     2012
19035.0     2012
19034.0     2012
700539.0    2012
19033.0     2012
700311.0    2012
19032.0     2012
19028.0     2012
19027.0     2012
19026.0     2012
19025.0     2012

In [75]:
def df2num(rawdf):
    """ Convert data frame with numeric variables and strings to numeric dataframe

    :param pd.dataframe rawdf: raw data frame
    :returns pd.dataframe df: a data frame with strings converted to dummies, other columns unchanged
    :rtype: pd.dataframe
    Rules:
    - 1. numeric columns unchanged;
    - 2. strings converted to dummies (a "ColumnName_nan" dummy is included,
         because get_dummies is called with dummy_na=True);
    - 3. the most frequent string is taken as reference and its dummy is dropped
    - 4. new column name is: "ColumnName_Category"
    (e.g., column 'gender' with 80 'M' and 79 'F'; the dummy column left is 'gender_F')

    Edge cases:
    - a frame with no non-numeric columns is returned as a copy of its
      numeric columns instead of raising;
    - a reference dummy is dropped only when it actually exists, so a
      non-numeric column that get_dummies leaves un-encoded does not raise
      KeyError;
    - a frame with no rows (hence no mode) is returned without dropping
      anything.
    """
    numeric_df = rawdf.select_dtypes(include=[np.number])
    str_columns = [col for col in rawdf.columns if col not in numeric_df.columns]
    if not str_columns:
        # Nothing to encode: skip get_dummies/mode on a zero-column selection,
        # where there is no first mode row to read.
        return numeric_df.copy()

    non_numeric_df = rawdf[str_columns]
    dummy_col_df = pd.get_dummies(non_numeric_df, dummy_na=True)
    encoded_df = numeric_df.join(dummy_col_df)

    # Row 0 of mode() holds the first most-frequent value of every column.
    modes = non_numeric_df.mode()
    if modes.empty:
        return encoded_df
    most_frequent_values = modes.iloc[0].to_dict()

    candidate_cols = ("{}_{}".format(key, value)
                      for key, value in most_frequent_values.items())
    # Keep only names that are real columns so drop() cannot raise KeyError.
    reference_cols = [col for col in candidate_cols if col in encoded_df.columns]
    return encoded_df.drop(reference_cols, axis=1)

In [93]:
from sklearn.cross_validation import LeaveOneLabelOut

In [97]:
# Build the cross-validation splitter from the cohort column and show its
# number of folds.
cohort_labels = joint_label_features[modelOptions['cohort_grade_level_begin']]
cohort_kfolds = LeaveOneLabelOut(cohort_labels)
len(cohort_kfolds)

7