In [1]:
import os, sys
# Derive the "mvesc" repository root from the notebook's directory and make the
# ETL directory inside it importable.
# NOTE(review): the starting path is absolute and user-specific; it only
# resolves on a machine with this exact directory layout.
pathname = os.path.dirname("/home/jgutman/mvesc/Models_Results/")
full_pathname = os.path.abspath(pathname)
# Element 0 is everything that precedes the first "mvesc" in the path.
split_pathname = full_pathname.split("mvesc")
base_pathname = os.path.join(split_pathname[0], "mvesc")
parentdir = os.path.join(base_pathname, "ETL")
# Prepend (not append) so the ETL directory is searched first on import.
sys.path[:0] = [parentdir]

In [2]:
from mvesc_utility_functions import *

In [3]:
# Select every column of every row in the model.outcome table.
query = "select * from model.outcome"

In [4]:
# Execute the outcome query, keep all fetched rows in `results`, and print
# how many rows came back.
with postgres_pgconnection_generator() as connection:
    with connection.cursor() as cursor:
        cursor.execute(query)
        results = cursor.fetchall()
        row_count = len(results)
        print(row_count)
    # Commit after the cursor has been closed, while the connection is still open.
    connection.commit()

11997


In [5]:
# Settings for one modelling run; other cells read these values by key.
modelOptions = {
    "modelClassSelected": "logit",
    "model_performance_estimate_scheme": "temporal_cohort",
    "parameter_cross_validation_scheme": "leave_cohort_out",
    "n_folds": 10,
    "file_save_name": "gender_ethnicity_logit.pkl",
    "randomSeed": 2187,
    "user_description": "initial skeleton pipeline test",
    "cohort_grade_level_begin": "cohort_9th",
    "cohorts_held_out": [2012],
    # features_included maps a table name to the list of column names
    # to read from that table
    "features_included": {"demographics": ["ethnicity", "gender"]},
    "outcome_name": "is_dropout",
}

In [6]:
import numpy as np
# Seed NumPy's global random state with the value configured in modelOptions.
np.random.seed(modelOptions["randomSeed"])

In [77]:
# Columns read from model.outcome: the student key plus the outcome and
# cohort columns named in modelOptions.
label_columns = [
    'student_lookup',
    modelOptions['outcome_name'],
    modelOptions['cohort_grade_level_begin'],
]

with postgres_pgconnection_generator() as connection:
    outcomes_with_student_lookup = read_table_to_df(
        connection, table_name='outcome', schema='model', nrows=-1,
        columns=label_columns)

    # Work on a copy so the frame read from the outcome table stays untouched.
    joint_label_features = outcomes_with_student_lookup.copy()

    # One read and one merge per configured feature table.
    for table, column_names in modelOptions['features_included'].items():
        features = read_table_to_df(
            connection, table_name=table, schema='model', nrows=-1,
            columns=['student_lookup'] + column_names)

        # Left join on the student key: every row with an outcome is kept,
        # feature rows without a matching outcome are dropped.
        joint_label_features = joint_label_features.merge(
            features, how='left', on=['student_lookup'])

# Replace the non-numeric columns with dummy columns.
joint_label_features = df2num(joint_label_features)

In [78]:
# Show only the first ten rows instead of rendering the entire joined frame.
joint_label_features.head(10)

Unnamed: 0,student_lookup,is_dropout,cohort_9th,ethnicity_A,ethnicity_B,ethnicity_H,ethnicity_I,ethnicity_M,ethnicity_nan,gender_F,gender_nan
0,59136.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,54013.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,54216.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,54382.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,58677.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,57110.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,57727.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,57705.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,58340.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,54577.0,1,2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
def df2num(rawdf):
    """ Convert data frame with numeric variables and strings to a data frame of numeric and dummy columns

    :param pd.dataframe rawdf: raw data frame
    :returns pd.dataframe df: the numeric columns unchanged, followed by the dummy
        columns built from every non-numeric column
    :rtype: pd.dataframe
    Rules:
    - 1. numeric columns unchanged;
    - 2. strings converted to dummies, including a missing-value indicator
         column "ColumnName_nan" for each converted column;
    - 3. the most frequent string is taken as reference and its dummy column is
         dropped (on ties, the value in the first row of ``DataFrame.mode()``)
    - 4. new column name is: "ColumnName_Category"
    (e.g., column 'gender' with 80 'M' and 79 'F'; the dummy columns left are
    'gender_F' and 'gender_nan')
    - 5. a frame without any non-numeric column is returned as a copy of its
         numeric columns;
    - 6. when no most frequent value exists at all (``DataFrame.mode()`` is
         empty), no reference column is dropped.

    """
    numeric_df = rawdf.select_dtypes(include=[np.number])
    str_columns = [col for col in rawdf.columns if col not in numeric_df.columns]

    # Nothing to encode: looking up the mode of zero columns would raise KeyError.
    if not str_columns:
        return numeric_df.copy()

    dummy_col_df = pd.get_dummies(rawdf[str_columns], dummy_na=True)
    numeric_df = numeric_df.join(dummy_col_df)

    modes = rawdf[str_columns].mode()
    # mode() has no rows when there is no value to count, so there is no
    # reference category to remove.
    if modes.empty:
        return numeric_df

    # Row 0 of mode() holds one most-frequent value per non-numeric column.
    most_frequent_values = modes.iloc[0].to_dict()
    reference_cols = ["{}_{}".format(key, value) for key, value in most_frequent_values.items()]
    return numeric_df.drop(columns=reference_cols)