In [72]:
import os, sys

parentdir = os.path.abspath('/home/xcheng/mvesc/ETL')
sys.path.insert(0,parentdir)
from mvesc_utility_functions import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from IPython.display import Image
from IPython.display import display
from numpy import random
import pickle
from estimate_prediction_model import *
from write_to_database import write_scores_to_db
from optparse import OptionParser
import re
from make_predictions_for_unlabeled_students import * 
import sqlalchemy
%load_ext autotime
%matplotlib inline

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 10.9 ms


In [97]:
"""
Generate individual Risk Scores and Factors
"""
def topK_features_logit(model, data, feature_names, topK=3):
    """
    Rank one observation's features by their logistic-regression contribution.

    Each feature's contribution is its model coefficient multiplied by the
    observation's value for that feature; the names of the topK largest
    contributions are returned. Similar helpers could be written for other
    model types.

    :param sklearn.object model: fitted model exposing a 2D ``coef_`` array;
        it should be LogisticRegression()
    :param 1D np.array data: one student's feature values, in the same order
        as the model coefficients
    :param list[str] feature_names: feature names, in that same order
    :param int topK: number of feature names to return
    :return: names of the topK features, largest contribution first
    :rtype: list of str
    """
    coefficients = np.transpose(model.coef_)[:, 0]
    contributions = coefficients * data
    # argsort is ascending; reverse it so the largest contribution comes first
    descending_order = contributions.argsort()[::-1]
    name_array = np.array(feature_names)
    winners = name_array[descending_order[:topK]]
    return list(winners)

def risk_score2level(score, percentiles, risk_levels):
    """ 
    Find the risk level for a risk score given threshold scores.

    The returned level is ``risk_levels[k]`` where ``k`` is the number of
    thresholds strictly greater than ``score``; a score equal to a threshold
    therefore falls into the higher-risk level.

    :param float score: risk score/probability; e.g. 0.862
    :param percentiles: threshold scores for risk levels, e.g. [0.9552, 0.8977, 0.7821];
        may be a list, a numpy array or a pandas Series
    :param risk_levels: risk levels top to bottom, e.g. ['High', 'Medium', 'Low', 'Safe'];
        needs one more entry than there are thresholds
    :return: the entry of risk_levels selected for this score
    """
    # np.asarray lets a plain Python list be compared element-wise with the score
    thresholds = np.asarray(percentiles)
    ind = int((thresholds > score).sum())
    return(risk_levels[ind])

def get_school_district(df, grade, year=2015):
    """ 
    Add school and district information to a table of students.

    Rows are read from clean.all_snapshots where the snapshot grade equals
    ``grade - 1`` and the school year equals ``year`` (presumably the grade the
    students attended during ``year``, one year before the prediction grade --
    confirm). The result is inner-joined to ``df`` on student_lookup, so
    students without a matching snapshot row are dropped.

    :param pd.dataframe df: data frame with at least a student_lookup column
    :param int grade: prediction grade; snapshots are filtered on grade - 1
    :param int year: school year of the snapshot rows to select
    :return pd.dataframe df: df inner-joined with grade, school_year,
        school_code and district
    """
    # int() casts guarantee only numbers are ever formatted into the SQL text
    select_current_grade = """
            select student_lookup, grade, school_year, school_code, district
            from clean.all_snapshots
            where grade={g} and school_year={yr}
            """.format(g=int(grade)-1, yr=int(year))
    with postgres_pgconnection_generator() as conn:
        # read_sql_query runs on the connection itself; no explicit cursor is needed
        df_school_etc = pd.read_sql_query(select_current_grade, conn)
    return df.merge(df_school_etc, on='student_lookup')

def column_names_mathing_processed2raw(processed_feature_column_names, raw_feature_column_names):
    """
    Build a lookup from each processed feature column name to a raw column name.

    Resolution order for a processed name:
    1. if it is itself a raw column name, it maps to itself;
    2. if it contains 'gender' or 'ethnicity', it maps to the text before its
       first underscore;
    3. otherwise it maps to everything up to and including the first token
       after '_gr_' (e.g. 'art_gpa_gr_5_isnull' -> 'art_gpa_gr_5').

    :param processed_feature_column_names: iterable of processed column names
    :param raw_feature_column_names: container of raw column names
    :return dict: processed column name -> raw column name
    """
    def _to_raw(processed_name):
        if processed_name in raw_feature_column_names:
            return processed_name
        if 'gender' in processed_name or 'ethnicity' in processed_name:
            return processed_name.split('_')[0]
        pieces = processed_name.split('_gr_')
        return pieces[0] + '_gr_' + pieces[1].split('_')[0]

    return {name: _to_raw(name) for name in processed_feature_column_names}
#def main():
# filename_list = ['08_17_2016_grade_6_param_set_8_logit_jg_97',
#                  '08_17_2016_grade_7_param_set_17_logit_jg_98',
#                  '08_17_2016_grade_8_param_set_16_logit_jg_111',
#                  '08_17_2016_grade_9_param_set_16_logit_jg_111',
#                  '08_17_2016_grade_10_param_set_22_logit_jg_122']
filename_list = ['08_17_2016_grade_7_param_set_17_logit_jg_98']
topK = 3  # number of top risk factors reported per student
schema, table = 'model', 'individual_risk_scores_factors_test'  # to_sql target
dir_pkls = '/mnt/data/mvesc/Models_Results/pkls'
if_exists = 'append'  # passed to to_sql: rows of every model are appended
threshold_percentiles = [0.95, 0.85, 0.70]  # score quantiles separating the risk levels
risk_levels = ['High', 'Medium', 'Low', 'Safe']  # one more level than thresholds

random_seed = 62571


for filename in filename_list:
    print("- Processing pkl: ", filename)
    
    # load saved model; the model name is the third-from-last '_' token (e.g. 'logit')
    model_name = filename.split('_')[-3]
    clf, options = read_in_model(filename, model_name)
    grade = options['prediction_grade_level']

    # fetch and process feature data
    features_num, features_raw = build_test_feature_set(options, current_year=2016, return_raw=True)
    features_processed = test_impute_and_scale(features_num, options)

    # create matching from processed feature names to raw feature names
    raw_feature_column_names = set(features_raw.columns)
    processed_feature_column_names = set(features_processed)
    colnames_matching_processed2raw = column_names_mathing_processed2raw(processed_feature_column_names, 
                                                                         raw_feature_column_names)

    
    # predict; fall back to decision_function when the model has no predict_proba
    if hasattr(clf, "predict_proba"):
        risk_probas = clf.predict_proba(features_processed)[:,1]
    else:
        risk_probas = clf.decision_function(features_processed)

    # top factors per student, reported under their raw column names
    top_individual_features = []
    for i in range(features_processed.shape[0]):
        x = np.array(features_processed.iloc[i, :])
        top_feature_names_raw = [colnames_matching_processed2raw[c] for c in topK_features_logit(clf, x, features_processed.columns, topK=topK)]
        top_individual_features.append(top_feature_names_raw)
    top_risk_factor_names = ['risk_factor_'+str(i) for i in range(1, topK+1)]
    top_individual_features = pd.DataFrame(top_individual_features, 
                                           columns=top_risk_factor_names)

    # individual risk score, level & factors
    individual_scores_factors = pd.DataFrame()
    individual_scores_factors['student_lookup'] = features_raw.index

    # assign risk score & levels
    individual_scores_factors['risk_score'] =  risk_probas
    percentiles = individual_scores_factors.risk_score.quantile(q=threshold_percentiles)
    student_risk_levels = [risk_score2level(s, percentiles, risk_levels) for s in individual_scores_factors.risk_score]
    individual_scores_factors['risk_level'] = student_risk_levels
    individual_scores_factors = pd.concat([individual_scores_factors, top_individual_features], axis=1)

    # get top risk values (the raw value of each top factor, stored as text)
    top_feature_values = {'risk_factor_'+str(i):[] for i in range(1, topK+1)}
    for risk_i in top_feature_values:
        for student_i in range(features_processed.shape[0]):
            # .loc replaces the removed DataFrame.ix; the frame has a default
            # 0..n-1 index here, so label and position coincide
            column_in_features_raw = individual_scores_factors.loc[student_i, risk_i]
            top_feature_values[risk_i].append(str(features_raw[column_in_features_raw].iloc[student_i]))
    top_feature_values = pd.DataFrame(top_feature_values)
    top_feature_values = top_feature_values.rename(columns={x:x+'_value' for x in top_feature_values.columns})
    individual_scores_factors = pd.concat([individual_scores_factors, top_feature_values], axis=1)
    
    # subset the data to only include current students and current grades
    individual_scores_factors = get_school_district(individual_scores_factors, grade)

    # model and its file name
    individual_scores_factors['model'] = model_name
    individual_scores_factors['model_file'] = filename
    individual_scores_factors.sort_values(by=['risk_score', 'district', 'school_code'],inplace=True, ascending=False)


    # output to postgres
    eng = postgres_engine_generator()
    individual_scores_factors.to_sql(table, eng, schema = schema, if_exists=if_exists, index=False)
    print('- Processed ', filename)


- Processing pkl:  08_17_2016_grade_7_param_set_17_logit_jg_98
- Processed  08_17_2016_grade_7_param_set_17_logit_jg_98
time: 26.3 s


In [80]:
#dict_dtype = {'grade':sqlalchemy.types.Integer, 'school_year':sqlalchemy.types.Integer}
# Inspect the dtype of every column of individual_scores_factors.
individual_scores_factors.dtypes

student_lookup         float64
risk_score             float64
risk_level              object
risk_factor_1           object
risk_factor_2           object
risk_factor_3           object
risk_factor_1_value     object
risk_factor_2_value     object
risk_factor_3_value     object
grade                    int64
school_year              int64
school_code             object
district                object
model                   object
model_file              object
dtype: object

time: 4.18 ms


In [83]:
# Check the Python type of a single cell (row position 1, column position 7).
type(individual_scores_factors.iloc[1, 7])

numpy.float64

time: 2.66 ms


In [86]:
# Select the three risk-factor value columns and preview the first rows.
df = individual_scores_factors[['risk_factor_1_value', 'risk_factor_2_value', 'risk_factor_3_value']]
df.head()

Unnamed: 0,risk_factor_1_value,risk_factor_2_value,risk_factor_3_value
1174,0.0,21.0,-3.25057
1911,0.5,45.55,-2.04246
1232,0.0,-3.25057,59.0
2373,1.15,-2.30091,31.0
1726,0.0,-1.93167,0.9


time: 9.3 ms


In [92]:
# Check the Python type of one stored value (row position 407, first column of df).
type(df.iloc[407, 0])

str

time: 3.17 ms


In [96]:
# str() turns None into the literal text 'None'; relevant because the scoring
# loop stores raw feature values through str().
str(None)

'None'

time: 2.04 ms


In [55]:
# all features in raw end with gr_num
def column_names_mathing_processed2raw(processed_feature_column_names, raw_feature_column_names):
    """
    Map every processed feature column name to a raw feature column name.

    A processed name that is already a raw name maps to itself; a name
    containing 'gender' or 'ethnicity' maps to its text before the first
    underscore; any other name is cut after the first token following '_gr_'
    (e.g. 'art_gpa_gr_5_isnull' -> 'art_gpa_gr_5').

    :param processed_feature_column_names: iterable of processed column names
    :param raw_feature_column_names: container of raw column names
    :return dict: processed column name -> raw column name
    """
    categorical_markers = ('gender', 'ethnicity')
    lookup = {}
    for processed_name in processed_feature_column_names:
        if processed_name in raw_feature_column_names:
            raw_name = processed_name
        elif any(marker in processed_name for marker in categorical_markers):
            raw_name = processed_name.split('_')[0]
        else:
            pieces = processed_name.split('_gr_')
            raw_name = pieces[0] + '_gr_' + pieces[1].split('_')[0]
        lookup[processed_name] = raw_name
    return lookup
# Display the mapping; both arguments must already exist in the kernel (they are
# assigned inside the scoring loop).
column_names_mathing_processed2raw(processed_feature_column_names, raw_feature_column_names)

{'absence_gr_5': 'absence_gr_5',
 'absence_gr_6': 'absence_gr_6',
 'academic_inv_gr_5': 'academic_inv_gr_5',
 'academic_inv_gr_6': 'academic_inv_gr_6',
 'art_gpa_gr_5': 'art_gpa_gr_5',
 'art_gpa_gr_5_isnull': 'art_gpa_gr_5',
 'art_gpa_gr_6': 'art_gpa_gr_6',
 'art_gpa_gr_6_isnull': 'art_gpa_gr_6',
 'atheletics_gr_6': 'atheletics_gr_6',
 'avg_address_change_to_gr_5': 'avg_address_change_to_gr_5',
 'avg_address_change_to_gr_5_isnull': 'avg_address_change_to_gr_5',
 'avg_address_change_to_gr_6': 'avg_address_change_to_gr_6',
 'avg_address_change_to_gr_6_isnull': 'avg_address_change_to_gr_6',
 'avg_city_change_to_gr_5': 'avg_city_change_to_gr_5',
 'avg_city_change_to_gr_5_isnull': 'avg_city_change_to_gr_5',
 'avg_city_change_to_gr_6': 'avg_city_change_to_gr_6',
 'avg_city_change_to_gr_6_isnull': 'avg_city_change_to_gr_6',
 'avg_district_change_to_gr_5': 'avg_district_change_to_gr_5',
 'avg_district_change_to_gr_5_isnull': 'avg_district_change_to_gr_5',
 'avg_district_change_to_gr_6': 'avg_d

time: 12.5 ms


In [53]:
# Sanity check of the substring test that column_names_mathing_processed2raw
# uses to recognise gender/ethnicity columns.
'gender' in 'gender_M'

True

time: 2.09 ms


In [None]:
### Generate a CSV for our partner
# Reads the stored scores, levels and top risk factors from Postgres and writes
# them to a CSV file in the current working directory.
schema, table = 'model', 'individual_risk_scores_factors'
csvfile = 'current_student_predictions_logit_20160817.csv'
# schema/table are the constants above, not user input, so formatting them
# into the statement is acceptable here
sql_select = """
        select student_lookup, grade, school_year, school_code, district,
        risk_score, risk_level, risk_factor_1, risk_factor_2, risk_factor_3, 
        risk_factor_1_value, risk_factor_2_value, risk_factor_3_value
        from {s}.{t}
        order by grade, district, school_code, risk_score desc;
        """.format(s=schema, t=table)
with postgres_pgconnection_generator() as conn:
    # read_sql_query runs on the connection itself; no explicit cursor is needed
    df = pd.read_sql_query(sql_select, conn)
        
df.to_csv(csvfile, index=False)

In [5]:
# Load two saved logistic-regression pickles into `pkl` and `pkl2` for inspection.
# The three tab_* names are not referenced in the visible cells.
tab_reports = 'model.reports' # it has batch_name, precision, recall
tab_feature_scores = 'model.feature_scores'
tab_prediction = 'model.predictions'
dir_pkls = '/mnt/data/mvesc/Models_Results/pkls'
pkls_logit = ['08_12_2016_grade_7_param_set_17_logit_ht_19082']
# NOTE: pkls_logit is rebound here from a list of names to a single file-path string
pkls_logit = os.path.join(dir_pkls, pkls_logit[0])+'_logit.pkl'
# NOTE(review): pickle.load can execute code contained in the file; only load trusted pickles
with open(pkls_logit, 'rb') as handle:
    pkl = pickle.load(handle)
    
with open(dir_pkls+'/08_12_2016_grade_7_param_set_17_logit_ht_19078_logit.pkl', 'rb') as handle:
    pkl2 = pickle.load(handle)
    

time: 53.6 ms


In [6]:
# Look up the prediction grade level stored in the pickle's model options.
pkl['model_options']['prediction_grade_level']

7

time: 4.45 ms


In [12]:
# Take the last '_'-separated token of cohort_grade_level_begin, drop its final
# two characters and convert the rest to int (presumably stripping an ordinal
# suffix such as 'th' -- TODO confirm the stored format).
int(pkl['model_options']['cohort_grade_level_begin'].split('_')[-1][:-2])

7

time: 2.52 ms


'11'

time: 2.29 ms


In [3]:
# it seems the robust-scaled features are centered on the median and rescaled by the IQR;
# we may assume all features have similar scales

"""
Procedures to generate individual scores and top risk factors of logistic regression
1. load the features of current students at a certain grade;
2. load the corresonding model pickle file;
3. make predictions to get raw score (0, 1)
4. rank the scores and assiged risk level and risk score (5% high risk, 5% medium, 5% low; rescale 20% to 1 to 10);
5. find the top risk factors/features/column names;
6. save as pdf output to csv
"""
# Settings for the prototype run further down in this cell.
schema, table = 'model', 'individual_risk_scores_factors'  # to_sql target
dir_pkls = '/mnt/data/mvesc/Models_Results/pkls'  # directory holding the model pickles
pickle_file = '08_12_2016_grade_7_param_set_17_logit_ht_19082_logit.pkl'
if_exists = 'replace'  # passed to to_sql: an existing table is replaced
random_seed = 62571  # seeds np.random for the synthetic feature matrix
topK = 3  # number of risk_factor_<i> columns
both_positive_negative = True  # not referenced elsewhere in the visible cells
num_students = 20  # number of synthetic student rows
student_column = 'student_lookup'
pkl_model_key = 'estimator'  # key of the fitted model inside the pickle dict

def topK_features_logit(model, data, feature_name, topN=3):
    """
    Find the topN features of a logistic regression for a single observation.

    Each feature's contribution is its coefficient multiplied by the
    observation's value; the names with the largest contributions are returned.

    :param model: fitted model exposing a 2D ``coef_`` array
    :param 1D np.array data: one observation's feature values, ordered like the coefficients
    :param list[str] feature_name: feature names, in that same order
    :param int topN: number of feature names to return
    :return: names of the topN features, largest contribution first
    :rtype: list of str
    """
    importances = np.transpose(model.coef_)[:, 0]*data
    # argsort is ascending; reverse it so the largest contribution comes first
    indices = importances.argsort()[::-1]
    # use the names and count passed in, not the global `features` and a fixed 3
    return(list(np.array(feature_name)[indices[:topN]]))

# NOTE(review): pickle.load can execute code contained in the file; only load trusted pickles
with open(os.path.join(dir_pkls, pickle_file), 'rb') as handle:
    pkl = pickle.load(handle)
features = list(pkl['features']) # to pull feature data later

###!!! generate random all_features_dataframe with student_lookups for testing 
np.random.seed(random_seed)
all_data4prediction = np.random.rand(num_students, len(features))
all_data4prediction = pd.DataFrame(all_data4prediction, columns=features)
all_data4prediction[student_column] = range(1, all_data4prediction.shape[0]+1)
all_data4prediction = all_data4prediction[[student_column]+features]
###!!! random all_features_dataframe generated

student_lookups = all_data4prediction[student_column]
all_feature_data = all_data4prediction[features]
# all_feature_data = Robust_Scale(all_feature_data) #processed
risk_probas = pkl[pkl_model_key].predict_proba(all_feature_data)[:,1]
predictions = pkl[pkl_model_key].predict(all_feature_data)
top_individual_features = []
for i in range(all_feature_data.shape[0]):
    x = np.array(all_feature_data.iloc[i, :])
    # the helper defined in this cell is topK_features_logit (there is no topN_features_logit)
    top_individual_features.append(topK_features_logit(pkl[pkl_model_key], x, features, topN=topK))

top_risk_factor_names = ['risk_factor_'+str(i) for i in range(1, topK+1)]
top_individual_features = pd.DataFrame(top_individual_features, columns=top_risk_factor_names)

# individual risk score & factors
individual_scores_factors = pd.DataFrame()
individual_scores_factors[student_column] = student_lookups
individual_scores_factors['risk_score'] =  risk_probas
individual_scores_factors = pd.concat([individual_scores_factors, top_individual_features], axis=1)

# individual risk factors values
top_feature_values = {'risk_factor_'+str(i):[] for i in range(1, topK+1)}
for risk_i in top_feature_values:
    for student_i in range(all_data4prediction.shape[0]):
        # .loc replaces the removed DataFrame.ix; both frames carry a default
        # 0..n-1 index here, so label and position coincide
        column_in_alldata = individual_scores_factors.loc[student_i, risk_i]
        top_feature_values[risk_i].append(all_data4prediction.loc[student_i, column_in_alldata])
top_feature_values = pd.DataFrame(top_feature_values)
top_feature_values = top_feature_values.rename(columns={x:x+'_value' for x in top_feature_values.columns})
individual_scores_factors = pd.concat([individual_scores_factors, top_feature_values], axis=1)

# model and its file name
individual_scores_factors['model'] = str(pkl[pkl_model_key])
individual_scores_factors['model_file'] = pickle_file

# NOTE(review): this writes scores computed from random test data and, with
# if_exists='replace', replaces the target table -- confirm the target before running
eng = postgres_engine_generator()
individual_scores_factors.to_sql(table, eng, schema = schema, if_exists=if_exists, index=False)

time: 3.37 s


In [4]:
# Preview the first rows of the scores/factors frame built in the previous cell.
individual_scores_factors.head()

Unnamed: 0,student_lookup,risk_score,risk_factor_1,risk_factor_2,risk_factor_3,risk_factor_1_value,risk_factor_2_value,risk_factor_3_value,model,model_file
0,1,0.758139,ethnicity_M,district_gr_6_Maysville,read_normalized_gr_5_isnull,0.722477,0.848921,0.876151,"LogisticRegression(C=1.0, class_weight=None, d...",08_12_2016_grade_7_param_set_17_logit_ht_19082...
1,2,0.2131,ethnicity_M,disadvantagement_gr_6_economic,art_gpa_gr_6,0.416974,0.965883,0.760293,"LogisticRegression(C=1.0, class_weight=None, d...",08_12_2016_grade_7_param_set_17_logit_ht_19082...
2,3,0.493108,ethnicity_M,art_gpa_gr_6,disadvantagement_gr_6_economic,0.654893,0.943058,0.789732,"LogisticRegression(C=1.0, class_weight=None, d...",08_12_2016_grade_7_param_set_17_logit_ht_19082...
3,4,0.516004,ethnicity_M,district_gr_6_Maysville,read_normalized_gr_5_isnull,0.87806,0.996202,0.76615,"LogisticRegression(C=1.0, class_weight=None, d...",08_12_2016_grade_7_param_set_17_logit_ht_19082...
4,5,0.792288,ethnicity_M,district_gr_6_Maysville,art_gpa_gr_6,0.900475,0.863978,0.803051,"LogisticRegression(C=1.0, class_weight=None, d...",08_12_2016_grade_7_param_set_17_logit_ht_19082...


time: 18.6 ms


In [6]:
def topK_features_logit(model, data, feature_names, topK=3):
    """
    Return the names of the topK features with the largest coefficient * value
    product for a single observation.

    :param model: fitted model exposing a 2D ``coef_`` array
    :param 1D np.array data: one observation's feature values, ordered like the coefficients
    :param list[str] feature_names: feature names, in that same order
    :param int topK: number of feature names to return
    :return: feature names, largest contribution first
    :rtype: list of str
    """
    per_feature_weight = np.transpose(model.coef_)[:, 0]
    contributions = per_feature_weight * data
    # ascending argsort, flipped so the largest contribution leads
    ranked = contributions.argsort()[::-1]
    top_positions = ranked[:topK]
    return list(np.array(feature_names)[top_positions])

def risk_score2level(score, percentiles, risk_levels):
    """
    Find the risk level for a risk score given threshold scores.

    The returned level is ``risk_levels[k]`` where ``k`` is the number of
    thresholds strictly greater than ``score``; a score equal to a threshold
    therefore falls into the higher-risk level.

    :param float score: risk score/probability
    :param percentiles: threshold scores (list, numpy array or pandas Series)
    :param risk_levels: risk levels top to bottom; needs one more entry than
        there are thresholds
    :return: the entry of risk_levels selected for this score
    """
    # np.asarray lets a plain Python list be compared element-wise with the score
    thresholds = np.asarray(percentiles)
    ind = int((thresholds > score).sum())
    return(risk_levels[ind])

def get_school_district(df, grade, year=2015):
    """
    Add school and district information to a table of students.

    Rows are read from clean.all_snapshots where the snapshot grade equals
    ``grade - 1`` and the school year equals ``year``, then inner-joined to
    ``df`` on student_lookup, so students without a matching snapshot row are
    dropped.

    :param pd.dataframe df: data frame with at least a student_lookup column
    :param int grade: prediction grade; snapshots are filtered on grade - 1
    :param int year: school year of the snapshot rows to select
    :return pd.dataframe: df inner-joined with grade, school_year,
        school_code and district
    """
    # int() casts guarantee only numbers are ever formatted into the SQL text
    select_current_grade = """
            select student_lookup, grade, school_year, school_code, district
            from clean.all_snapshots
            where grade={g} and school_year={yr}
            """.format(g=int(grade)-1, yr=int(year))
    with postgres_pgconnection_generator() as conn:
        # read_sql_query runs on the connection itself; no explicit cursor is needed
        df_school_etc = pd.read_sql_query(select_current_grade, conn)
    return df.merge(df_school_etc, on='student_lookup')

#def main():
filename_list = ['08_17_2016_grade_6_param_set_8_logit_jg_97',
                 '08_17_2016_grade_7_param_set_17_logit_jg_98',
                 '08_17_2016_grade_8_param_set_16_logit_jg_111',
                 '08_17_2016_grade_9_param_set_16_logit_jg_111',
                 '08_17_2016_grade_10_param_set_22_logit_jg_122']
#filename_list = ['08_17_2016_grade_7_param_set_17_logit_jg_98']
topK = 3  # number of top risk factors reported per student
schema, table = 'model', 'individual_risk_scores_factors'  # to_sql target
dir_pkls = '/mnt/data/mvesc/Models_Results/pkls'
if_exists = 'append'  # passed to to_sql: rows of every model are appended
random_seed = 62571


for filename in filename_list:
    # load saved model; the model name is the third-from-last '_' token (e.g. 'logit')
    print("- Processing pkl: ", filename)
    model_name = filename.split('_')[-3]
    clf, options = read_in_model(filename, model_name)
    grade = options['prediction_grade_level']

    # fetch and process feature data
    features_num, features_raw = build_test_feature_set(options, current_year=2016, return_raw=True)
    features_processed = test_impute_and_scale(features_num, options)

    # predict; fall back to decision_function when the model has no predict_proba
    if hasattr(clf, "predict_proba"):
        risk_probas = clf.predict_proba(features_processed)[:,1]
    else:
        risk_probas = clf.decision_function(features_processed)

    predictions = clf.predict(features_processed)

    # top factors per student, reported under their processed column names
    top_individual_features = []
    for i in range(features_processed.shape[0]):
        x = np.array(features_processed.iloc[i, :])
        top_individual_features.append(topK_features_logit(clf, x, features_processed.columns, topK=topK))

    top_risk_factor_names = ['risk_factor_'+str(i) for i in range(1, topK+1)]
    top_individual_features = pd.DataFrame(top_individual_features, 
                                           columns=top_risk_factor_names)

    # individual risk score, level & factors
    individual_scores_factors = pd.DataFrame()
    individual_scores_factors['student_lookup'] = features_raw.index

    # assign risk score & levels
    individual_scores_factors['risk_score'] =  risk_probas
    percentiles = individual_scores_factors.risk_score.quantile(q=[0.95, 0.85, 0.70])
    risk_levels = ['High', 'Medium', 'Low', 'Safe']
    student_risk_levels = [risk_score2level(s, percentiles, risk_levels) for s in individual_scores_factors.risk_score]
    individual_scores_factors['risk_level'] = student_risk_levels
    individual_scores_factors = pd.concat([individual_scores_factors, top_individual_features], axis=1)

    # value of each top factor, taken from the processed (imputed/scaled) features
    top_feature_values = {'risk_factor_'+str(i):[] for i in range(1, topK+1)}
    for risk_i in top_feature_values:
        for student_i in range(features_processed.shape[0]):
            # .loc replaces the removed DataFrame.ix; the frame has a default
            # 0..n-1 index here, so label and position coincide
            column_in_features_processed = individual_scores_factors.loc[student_i, risk_i]
            top_feature_values[risk_i].append(features_processed[column_in_features_processed].iloc[student_i])
    top_feature_values = pd.DataFrame(top_feature_values)
    top_feature_values = top_feature_values.rename(columns={x:x+'_value' for x in top_feature_values.columns})
    individual_scores_factors = pd.concat([individual_scores_factors, top_feature_values], axis=1)

    # subset the data to only include current students and current grades
    individual_scores_factors = get_school_district(individual_scores_factors, grade)

    # model and its file name
    individual_scores_factors['model'] = model_name
    individual_scores_factors['model_file'] = filename
    individual_scores_factors.sort_values(by=['risk_score', 'district', 'school_code'],inplace=True, ascending=False)


    # output to postgres
    eng = postgres_engine_generator()
    individual_scores_factors.to_sql(table, eng, schema = schema, if_exists=if_exists, index=False)
    print('- Processed ', filename)


- Processing pkl:  08_17_2016_grade_6_param_set_8_logit_jg_97
- Processed  08_17_2016_grade_6_param_set_8_logit_jg_97
- Processing pkl:  08_17_2016_grade_7_param_set_17_logit_jg_98
- Processed  08_17_2016_grade_7_param_set_17_logit_jg_98
- Processing pkl:  08_17_2016_grade_8_param_set_16_logit_jg_111
- Processed  08_17_2016_grade_8_param_set_16_logit_jg_111
- Processing pkl:  08_17_2016_grade_9_param_set_16_logit_jg_111
- Processed  08_17_2016_grade_9_param_set_16_logit_jg_111
- Processing pkl:  08_17_2016_grade_10_param_set_22_logit_jg_122
- Processed  08_17_2016_grade_10_param_set_22_logit_jg_122
time: 36.3 s


In [9]:
# Shape (rows, columns) of the numeric feature matrix left over from the last
# loop iteration; the original spelling `feaures_num` is not defined in any visible cell.
features_num.shape

(2379, 283)

time: 2.54 ms


In [7]:
### Generate a CSV for our partner
# Reads the stored scores, levels and top risk factors from Postgres and writes
# them to a CSV file in the current working directory.
schema, table = 'model', 'individual_risk_scores_factors'
csvfile = 'current_student_predictions_logit_20160817.csv'
# schema/table are the constants above, not user input, so formatting them
# into the statement is acceptable here
sql_select = """
        select student_lookup, grade, school_year, school_code, district,
        risk_score, risk_level, risk_factor_1, risk_factor_2, risk_factor_3, 
        risk_factor_1_value, risk_factor_2_value, risk_factor_3_value
        from {s}.{t}
        order by grade, district, school_code, risk_score desc;
        """.format(s=schema, t=table)
with postgres_pgconnection_generator() as conn:
    # read_sql_query runs on the connection itself; no explicit cursor is needed
    df = pd.read_sql_query(sql_select, conn)
        
df.to_csv(csvfile, index=False)

time: 786 ms


In [33]:
# Display the scores frame joined with school/district info; the result is not assigned.
get_school_district(individual_scores_factors, grade)

Unnamed: 0,student_lookup,risk_score,risk_level,risk_factor_1,risk_factor_2,risk_factor_3,risk_factor_1_value,risk_factor_2_value,risk_factor_3_value,model,model_file,grade,school_code,district
0,23779.0,1.000000,High,discipline_incidents_gr_6,art_gpa_gr_5,humanities_gpa_gr_5,21.000000,-4.000000,-2.600000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,8,Logan_Hocking
1,27633.0,0.999906,High,discipline_incidents_gr_6,humanities_gpa_gr_5,stem_gpa_gr_5,9.000000,-2.600000,-2.000000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,8,Logan_Hocking
2,23991.0,0.999836,High,discipline_incidents_gr_6,humanities_gpa_gr_5,stem_gpa_gr_5,10.000000,-1.950000,-1.150000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,8,Logan_Hocking
3,23945.0,0.999737,High,discipline_incidents_gr_6,humanities_gpa_gr_5,num_pf_classes_gr_6,10.000000,-1.950000,1.000000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,8,Logan_Hocking
4,26101.0,0.999229,High,discipline_incidents_gr_6,humanities_gpa_gr_5,stem_gpa_gr_5,9.000000,-1.250000,-1.000000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,8,Logan_Hocking
5,13240.0,0.999032,High,discipline_incidents_gr_6,humanities_gpa_gr_5,iss_gr_6,3.000000,-2.250000,5.000000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,MAMS,Maysville
6,25973.0,0.998392,High,discipline_incidents_gr_6,humanities_gpa_gr_5,num_pf_classes_gr_6,8.000000,-1.400000,1.000000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,8,Logan_Hocking
7,24084.0,0.998275,High,discipline_incidents_gr_6,stem_gpa_gr_5,humanities_gpa_gr_5,9.000000,-1.000000,-0.550000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,8,Logan_Hocking
8,25723.0,0.998094,High,discipline_incidents_gr_6,read_normalized_gr_4,num_pf_classes_gr_6,9.000000,-0.682836,1.000000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,8,Logan_Hocking
9,23955.0,0.998005,High,discipline_incidents_gr_6,humanities_gpa_gr_5,oss_gr_5,5.000000,-1.750000,3.000000,logit,08_12_2016_grade_7_param_set_17_logit_ht_19082,6,8,Logan_Hocking


time: 443 ms


In [28]:
# NOTE(review): `inddf` is not defined in any visible cell -- presumably a
# leftover kernel variable; confirm or define it before this cell.
inddf.columns

Index(['student_lookup', 'risk_score', 'risk_level', 'risk_factor_1',
       'risk_factor_2', 'risk_factor_3', 'risk_factor_1_value',
       'risk_factor_2_value', 'risk_factor_3_value', 'model', 'model_file',
       'grade', 'school_code', 'district'],
      dtype='object')

time: 2.46 ms
