In [None]:
import os
import sys
import pandas as pd
import numpy as np

# Column names, in order, used to build the verification DataFrame in this
# notebook. 'RESULT_ID' is part of the frame but is dropped again before the
# frame is handed to the model pipeline.
# NOTE(review): 'ELEMENTARY_GRADE_9' is listed before 'ELEMENTARY_GRADE_1'..'_4';
# presumably this mirrors the column order the pipeline was trained with --
# confirm before reordering.
DATAFRAME_COLUMNS = [
    'RESULT_ID',
    'GENDER',
    'NATIONALITY',
    'ELEMENTARY_SCHOOL',
    'AGE_WHEN_STARTED',
    'CLASSES_LAST_SEMESTER',
    'CREDITS_LAST_SEMESTER',
    'CREDITS_COMPLETED',
    'COURSE_OF_STUDY',
    'CLASS_BASED_SCHOOL',
    'SCHOOL',
    'ELEMENTARY_GRADE_9',
    'ELEMENTARY_GRADE_1',
    'ELEMENTARY_GRADE_2',
    'ELEMENTARY_GRADE_3',
    'ELEMENTARY_GRADE_4',
    'ELEMENTARY_GRADE_AVG',
    'SMALL_PERIOD_ON_TIME',
    'SMALL_PERIOD_ABSENT',
    'SMALL_PERIOD_LATE',
    'LARGE_PERIOD_ON_TIME',
    'LARGE_PERIOD_ABSENT',
    'LARGE_PERIOD_LATE',
    'LARGE_PERIOD_AVG_GRADE',
    'SMALL_PERIOD_AVG_ASSIGNMENT_GRADE',
    'LARGE_PERIOD_AVG_ASSIGNMENT_GRADE',
    'LATEST_GRADE_VARIATION',
    'SUPERVISOR_GROUP_SIZE',
    'SEMESTER_COUNT',
]

#
# Import the persisted model pipeline.
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations where `joblib` is not importable on its own.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
full_pipeline_with_predictor = joblib.load("my_model.pkl")

In [None]:
#
# Test and verify functionality of classifier.
# 

# Feature values shared by every verification case.
test_base = {
    'NATIONALITY': 'IS',
    'ELEMENTARY_SCHOOL': None,
    'CLASSES_LAST_SEMESTER': 6,
    'CREDITS_LAST_SEMESTER': 20,
    'COURSE_OF_STUDY': 'Félagsfræðabraut',
    'CLASS_BASED_SCHOOL': False,
    'SCHOOL': 'Fjölbrautaskólinn í Garðabæ',
}

# Verification cases. 'GRADES', 'ON_TIME' and 'ABSENT' are shorthand values
# that the loop below copies into every grade / attendance column.
# 'THE_RESULT' is the expected binary prediction and 'THE_PROBABILITY' is a
# predicate that the predicted probability must satisfy; both are checked
# after the DataFrame has been built.
test_data_list = [
    {
        'GENDER': 1, 'AGE_WHEN_STARTED': 20, 'CREDITS_COMPLETED': 20, 'GRADES': 2.0,
        'ON_TIME': 0.1, 'ABSENT': 0.9, 'LATEST_GRADE_VARIATION': -2.0,
        'SUPERVISOR_GROUP_SIZE': 100, 'SEMESTER_COUNT': 2,
        'THE_RESULT': True, 'THE_PROBABILITY': lambda x: x > 0.90
    },
    {
        'GENDER': 1, 'AGE_WHEN_STARTED': 16, 'CREDITS_COMPLETED': 200, 'GRADES': 8.0,
        'ON_TIME': 0.9, 'ABSENT': 0.1, 'LATEST_GRADE_VARIATION': 0.0,
        'SUPERVISOR_GROUP_SIZE': 20, 'SEMESTER_COUNT': 6,
        'THE_RESULT': False, 'THE_PROBABILITY': lambda x: x < 0.10
    },
    {
        'GENDER': 2, 'AGE_WHEN_STARTED': 20, 'CREDITS_COMPLETED': 20, 'GRADES': 2.0,
        'ON_TIME': 0.1, 'ABSENT': 0.9, 'LATEST_GRADE_VARIATION': -2.0,
        'SUPERVISOR_GROUP_SIZE': 100, 'SEMESTER_COUNT': 2,
        'THE_RESULT': True, 'THE_PROBABILITY': lambda x: x > 0.90
    },
    {
        'GENDER': 2, 'AGE_WHEN_STARTED': 16, 'CREDITS_COMPLETED': 200, 'GRADES': 8.0,
        'ON_TIME': 0.9, 'ABSENT': 0.1, 'LATEST_GRADE_VARIATION': 0.0,
        'SUPERVISOR_GROUP_SIZE': 20, 'SEMESTER_COUNT': 6,
        'THE_RESULT': False, 'THE_PROBABILITY': lambda x: x < 0.10
    },
]

# Collect one record per case and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# growing a frame row by row is quadratic anyway.
test_records = []
for idx, test_data in enumerate(test_data_list):
    # pop() is deliberate: every feature key is removed as it is consumed, so
    # the assertion at the end of the loop body catches any feature that was
    # specified in a case but never copied into the record.
    _GRADES = test_data.pop('GRADES')
    _ON_TIME = test_data.pop('ON_TIME')
    _ABSENT = test_data.pop('ABSENT')

    # One synthetic student per case (weak and strong profiles alike).
    test_records.append({
        'RESULT_ID': -1 - idx,  # negative placeholder id, dropped below
        'GENDER': test_data.pop('GENDER'),
        'NATIONALITY': test_base['NATIONALITY'],
        'ELEMENTARY_SCHOOL': test_base['ELEMENTARY_SCHOOL'],
        'AGE_WHEN_STARTED': test_data.pop('AGE_WHEN_STARTED'),
        'CLASSES_LAST_SEMESTER': test_base['CLASSES_LAST_SEMESTER'],
        'CREDITS_LAST_SEMESTER': test_base['CREDITS_LAST_SEMESTER'],
        'CREDITS_COMPLETED': test_data.pop('CREDITS_COMPLETED'),
        'COURSE_OF_STUDY': test_base['COURSE_OF_STUDY'],
        'CLASS_BASED_SCHOOL': test_base['CLASS_BASED_SCHOOL'],
        'SCHOOL': test_base['SCHOOL'],
        'ELEMENTARY_GRADE_9': _GRADES,
        'ELEMENTARY_GRADE_1': _GRADES,
        'ELEMENTARY_GRADE_2': _GRADES,
        'ELEMENTARY_GRADE_3': _GRADES,
        'ELEMENTARY_GRADE_4': _GRADES,
        'ELEMENTARY_GRADE_AVG': _GRADES,
        'SMALL_PERIOD_ON_TIME': _ON_TIME,
        'SMALL_PERIOD_ABSENT': _ABSENT,
        'SMALL_PERIOD_LATE': 0.0,
        'LARGE_PERIOD_ON_TIME': _ON_TIME,
        'LARGE_PERIOD_ABSENT': _ABSENT,
        'LARGE_PERIOD_LATE': 0.0,
        'LARGE_PERIOD_AVG_GRADE': _GRADES,
        'SMALL_PERIOD_AVG_ASSIGNMENT_GRADE': _GRADES,
        'LARGE_PERIOD_AVG_ASSIGNMENT_GRADE': _GRADES,
        'LATEST_GRADE_VARIATION': test_data.pop('LATEST_GRADE_VARIATION'),
        'SUPERVISOR_GROUP_SIZE': test_data.pop('SUPERVISOR_GROUP_SIZE'),
        'SEMESTER_COUNT': test_data.pop('SEMESTER_COUNT'),
    })

    # Only the expectation keys may remain; compared as a set so the check
    # does not depend on the key order inside the case dicts.
    assert set(test_data) == {'THE_RESULT', 'THE_PROBABILITY'}, \
      'All feature data should have been added to the DataFrame.'

# columns= pins the column order to DATAFRAME_COLUMNS.
# NOTE(review): column dtypes are now inferred from the records instead of
# starting from an empty object-dtype frame -- confirm the pipeline does not
# select columns by dtype.
test_df = pd.DataFrame(test_records, columns=DATAFRAME_COLUMNS)

# RESULT_ID is only an identifier and is not passed to the pipeline.
test_df = test_df.drop("RESULT_ID", axis=1)

# Hard (binary) predictions must match the expected label of every case.
test_predictions = full_pipeline_with_predictor.predict(test_df)
assert [case['THE_RESULT'] for case in test_data_list] == test_predictions.tolist()

# Column 1 of predict_proba is the probability each case's
# 'THE_PROBABILITY' predicate is evaluated against.
test_probability = full_pipeline_with_predictor.predict_proba(test_df)[:, 1]

# Every case's predicate has to accept the probability predicted for it.
assert all(
    case['THE_PROBABILITY'](test_probability[position])
    for position, case in enumerate(test_data_list)
)