In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
#import math
import opticl
import os
from pyomo import environ
from pyomo.environ import *
from imp import reload
import embed_mip as em
import math

# NOTE: 
There is a difference in approach between OptiCL and JANOS in that we are learning constraints, while JANOS is predicting and optimizing variables.

We will proceed as if learning a constraint or the objective function. We will therefore not be using `college_applications6000` data.

# Step 1: Setup Optimization Problem

In [2]:
"""
Model constants
"""
scholarships = [0, 2.5]  # lower and upper bound of the scholarship
#n_simulations = 5
#student_sizes = [50, 100, 500, 1000]

student_size = 100
n_applications = student_size
BUDGET = int(0.2 * n_applications)


In [89]:
def init_conceptual_model():
    model = ConcreteModel('StudentEnroll')
    
    '''
    Decision variables
    '''
    N = range(n_applications)   # number of applications
    model.x = Var(['merit'], domain = NonNegativeReals, bounds = (0, 2.5))  # the data is scaled so the max should be 1 and not 2.5
    
    
    '''
    Objective function
    '''
    def obj_function(model):
        return 0
    
    model.OBJ = Objective(rule = obj_function, sense=maximize)    
    
    '''
    Constraints
    '''    
    # 1. Budget constraint
    def budget_constraint(model):
        return sum(model.x[i] for i in ['merit']) <= 1
    
    model.constraint1 = Constraint(rule = budget_constraint)
    

    return model   
   

# Step 2: Data Processing

In [90]:
# Import the data used for training the predictive models
enrolment_data = pd.read_csv('college_student_enroll-s1-1.csv')
enrolment_data.head()

Unnamed: 0,StudentID,SAT,GPA,merit,enroll
0,1,1507,3.72,1.64,0
1,2,1532,3.93,0.52,0
2,3,1487,3.77,1.67,0
3,4,1259,3.05,1.21,1
4,5,1354,3.39,1.65,1


In [91]:
# Standardize SAT and GPA scores for both datasets using StandardScaler()
scaler_sat = MinMaxScaler().fit(enrolment_data[["SAT"]])
scaler_gpa = MinMaxScaler().fit(enrolment_data[["GPA"]])
enrolment_data['SAT'] = scaler_sat.transform(enrolment_data[['SAT']])
enrolment_data['GPA'] = scaler_gpa.transform(enrolment_data[['GPA']])

enrolment_data = enrolment_data.drop(['StudentID'], axis=1)

enrolment_data.head()

Unnamed: 0,SAT,GPA,merit,enroll
0,0.874138,0.770492,1.64,0
1,0.917241,0.885246,0.52,0
2,0.839655,0.797814,1.67,0
3,0.446552,0.404372,1.21,1
4,0.610345,0.590164,1.65,1


In [92]:
enrolment_data.to_csv('enrolment_data.csv', index=False)

In [93]:
# For 'student_size' number of students, randomly sample from enrolment_data
sampled_data = enrolment_data.sample(student_size)

y = sampled_data['enroll']
X = sampled_data.drop(['enroll'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

# Step 3: Learn the Predictive Models


In [94]:
version = 'StudentEnroll_v1'
alg_list = ['mlp']
outcome_list = {'offer_accepted': {'outcome_type': ['objective', 1], 'task_type': 'binary', 'alg_list':alg_list, 
                                   'X_train':X_train, 'y_train':y_train, 'X_test':X_test, 'y_test':y_test}}

In [95]:
performance = pd.DataFrame()

if not os.path.exists('results/'):
    os.makedirs('results/')

for outcome in outcome_list.keys():
    print(f'Learning a constraint for {outcome}')
    
    alg_list = outcome_list[outcome]['alg_list']
    task_type = outcome_list[outcome]['task_type']
    for alg in alg_list:
        X_train = outcome_list[outcome]['X_train']
        y_train = outcome_list[outcome]['y_train']
        X_test = outcome_list[outcome]['X_test']
        y_test = outcome_list[outcome]['y_test']
        
        if not os.path.exists('results/%s/' % alg):
            os.makedirs('results/%s/' % alg)
        print(f'Training {alg}')
        s = 1

        ## Run shallow/small version of RF
        alg_run = 'rf_shallow' if alg == 'rf' else alg

        m, perf = opticl.run_model(X_train, y_train, X_test, y_test, alg_run, outcome, task = task_type,
                               seed = s, cv_folds = 5, 
                               # metric = 'r2',
                               save = False
                              )

        ## Save model
        constraintL = opticl.ConstraintLearning(X_train, y_train, m, alg)
        constraint_add = constraintL.constraint_extrapolation(task_type)
        constraint_add.to_csv('results/%s/%s_%s_model.csv' % (alg, version, outcome), index = False)

        ## Extract performance metrics
        try:
            perf['auc_train'] = roc_auc_score(y_train >= threshold, m.predict(X_train))
            perf['auc_test'] = roc_auc_score(y_test >= threshold, m.predict(X_test))
        except: 
            perf['auc_train'] = np.nan
            perf['auc_test'] = np.nan

        perf['seed'] = s
        perf['outcome'] = outcome
        perf['alg'] = alg
        perf['save_path'] = 'results/%s/%s_%s_model.csv' % (alg, version, outcome)
        
            
        perf.to_csv('results/%s/%s_%s_performance.csv' % (alg, version, outcome), index = False)
        
        performance = performance.append(perf)
        print()
print('Saving the performance...')
performance.to_csv('results/%s_performance.csv' % version, index = False)
print('Done!')

Learning a constraint for offer_accepted
Training mlp
------------- Initialize grid  ----------------
------------- Running model  ----------------
Algorithm = mlp, metric = None
saving... results/mlp_offer_accepted_trained.pkl
------------- Model evaluation  ----------------
-------------------training evaluation-----------------------
Train Score: 1.0
-------------------testing evaluation-----------------------
Test Score: 0.9987212276214834

Saving the performance...
Done!


In [96]:
performance

Unnamed: 0,save_path,seed,cv_folds,task,parameters,best_params,valid_score,train_score,test_score,auc_train,auc_test,outcome,alg
0,results/mlp/StudentEnroll_v1_offer_accepted_mo...,1,5,binary,"{'hidden_layer_sizes': [(10,), (20,), (50,), (...","{'hidden_layer_sizes': (10,)}",1.0,1.0,0.998721,,,offer_accepted,mlp


# Step 4: Predictive Model Selection and Optimization

In [113]:
model_master = opticl.model_selection(performance, outcome_list)
model_master[['lb', 'ub', 'SCM_counterfactuals', 'features', 'trust_region', 'dataset_path',
              'clustering_model', 'max_violation', 'enlargement', 'var_features', 'contex_features']] = None
model_master.loc[0, 'lb'] = None
model_master.loc[0, 'ub'] = None
model_master.loc[0, 'SCM_counterfactuals'] = None
model_master.at[0, 'features'] = [col for col in X.columns]
model_master.loc[0, 'trust_region'] = True
model_master.loc[0, 'dataset_path'] = 'enrolment_data.csv'
model_master.loc[0, 'clustering_model'] = None
model_master.loc[0, 'max_violation'] = None
model_master.at[0, 'enlargement'] = [0, 0, 0] # 0: no enlargment, 1: CH-enlar; 0: constraint, 1: obj penalty; UB/penalty ceoff
model_master.at[0, 'var_features'] = ['merit']
model_master.at[0, 'contex_features'] = {'GPA': 0.6, 'SAT':0.5}
model_master

          outcome model_type  \
0  offer_accepted        mlp   

                                           save_path    task  objective  
0  results/mlp/StudentEnroll_v1_offer_accepted_mo...  binary          1  


Unnamed: 0,outcome,model_type,save_path,task,objective,lb,ub,SCM_counterfactuals,features,trust_region,dataset_path,clustering_model,max_violation,enlargement,var_features,contex_features
0,offer_accepted,mlp,results/mlp/StudentEnroll_v1_offer_accepted_mo...,binary,1,,,,"[SAT, GPA, merit]",True,enrolment_data.csv,,,"[0, 0, 0]",[merit],"{'GPA': 0.6, 'SAT': 0.5}"


In [110]:
def getSolution(model, X):
    solution = {}
    palatability = 0
    count = 0
    for v in model.getVars():
        if 'y[' in v.varName:
            solution[list(X.columns)[count]] = [v.y]
            print(v.varName)
            count += 1
    return solution

In [114]:
result = {}
conceptual_model = init_conceptual_model()
MIP_final_model = em.optimization_MIP(conceptual_model, model_master)
opt = SolverFactory('gurobi')
results = opt.solve(MIP_final_model)
value(MIP_final_model.OBJ), value(MIP_final_model.x['merit']),1 / (1 + math.exp(-value(MIP_final_model.OBJ)))

Generating constraints for the trust region using 20000 samples.
... Trust region defined.
Embedding objective function for offer_accepted


(0.9307728045853338, 1.0, 0.7172320444840796)