In [None]:
# Constants and imports.

BASE_NUM = 1       # Selects the input file datasets/base_<BASE_NUM>.csv.
RANDOM_STATE = 42  # Fixed seed: with None the split and the model change on every run.
CV = 5             # NOTE(review): not referenced in the cells visible here - presumably CV folds; confirm.
TEST_SIZE = 0.2    # Fraction of rows held out as the test set.

import os
import itertools
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Load data.

data = pd.read_csv(os.path.join('datasets', 'base_{}.csv'.format(BASE_NUM)), sep=';')

data.head()

In [None]:
# Split train / test

from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on the label so both subsets keep the class proportions of the full data set.
split = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# split() yields *positional* row indices, so rows must be selected with .iloc.
# Label-based .loc only gives the same rows while the frame carries the default
# 0..n-1 index produced by read_csv; it breaks as soon as the index is changed.
# n_splits=1, so the generator produces exactly one (train, test) pair.
train_index, test_index = next(split.split(data, data['DROPPED_OUT']))
train_set = data.iloc[train_index]
test_set = data.iloc[test_index]

In [None]:
# Class balance check: fraction of training rows per DROPPED_OUT value.
# The split is stratified on this column, so these fractions should closely
# match the ones reported for the test set.

train_set['DROPPED_OUT'].value_counts().div(len(train_set))

In [None]:
# Class balance check: fraction of test rows per DROPPED_OUT value.
# The split is stratified on this column, so these fractions should closely
# match the ones reported for the training set.

test_set['DROPPED_OUT'].value_counts().div(len(test_set))

In [None]:
# Bind an independent (deep) copy of the training set to the short name 'data'.
# NOTE(review): the following cell rebinds 'data' to the label-free training
# features, so this copy is only visible between the two cells.

data = train_set.copy(deep=True)

In [None]:
# Separate the feature columns from the DROPPED_OUT target for both subsets.

# Training subset.
data = train_set.drop(["DROPPED_OUT"], axis=1)
data_labels = train_set.loc[:, "DROPPED_OUT"].copy()

# Testing subset.
test_data = test_set.drop(["DROPPED_OUT"], axis=1)
test_labels = test_set.loc[:, "DROPPED_OUT"].copy()

In [None]:
# Pipelines for scaling numerical attributes and translating categories to numbers.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22 in favour of `sklearn.impute.SimpleImputer`. Prefer the new class but
# keep the name `Imputer` bound so this cell runs on old and new releases alike.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer

from msc_preprocessing import CourseOfStudyNamer, CategoricalEncoder, DataFrameSelector
from msc_preprocessing import ElementaryNameFixer, ElementarySchoolDistance
from msc_preprocessing import NationalitySelector

# Columns handled by the categorical pipeline.
cat_attribs = ['COURSE_OF_STUDY', 'SCHOOL', 'NATIONALITY', ]

# Every remaining column except the raw ELEMENTARY_SCHOOL one, plus
# ELEMENTARY_SCHOOL_DISTANCE, which is not a column of `data` at this point.
# NOTE(review): presumably ElementarySchoolDistance adds that column - confirm in msc_preprocessing.
num_attribs = list(data.drop(cat_attribs + ['ELEMENTARY_SCHOOL'], axis=1)) + ['ELEMENTARY_SCHOOL_DISTANCE']

# A pipeline for numerical attributes.
num_pipeline = Pipeline([
        ('elementary_school_fix_names', ElementaryNameFixer()),
        ('elementary_school_distance', ElementarySchoolDistance()),
        ('selector', DataFrameSelector(num_attribs)), # Select only data that has numbers.
        ('imputer', Imputer(strategy="median")), # Replace missing values with the column median.
        # The step is named 'std_scaler' but holds a RobustScaler; the name is kept
        # because step names are part of the pipeline's lookup interface.
        ('std_scaler', RobustScaler()), # Scale all numerical values to the same scale.
    ])

# A pipeline for categorical attributes.
cat_pipeline = Pipeline([
        ('course_of_study_fix_names', CourseOfStudyNamer()),
        ('nationality_selector', NationalitySelector()),
        ('selector', DataFrameSelector(cat_attribs)), # Select only data that has categories.
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense", handle_unknown='ignore')), # Translate categories to numbers.
    ])

In [None]:
from sklearn.pipeline import FeatureUnion

# Apply both sub-pipelines to the same input and join their outputs:
# the numerical features first, then the encoded categorical ones.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [None]:
# Preprocessing followed by a GradientBoostingClassifier using the optimized parameters.

from sklearn.ensemble import GradientBoostingClassifier

# The parameter search that produced these values is not part of this cell.
best_params = dict(learning_rate=0.1, max_depth=4, n_estimators=800)

full_pipeline_with_predictor = Pipeline(steps=[
    ("preparation", full_pipeline),
    ("cls", GradientBoostingClassifier(random_state=RANDOM_STATE, **best_params)),
])

In [None]:
# Fit the preprocessing steps and the classifier on the training features and labels.
full_pipeline_with_predictor.fit(X=data, y=data_labels)

In [None]:
# Look the classifier step up by its name and display it.
cls = dict(full_pipeline_with_predictor.steps)['cls']

cls

In [None]:
#
# Run test data (20% split).
#

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_curve

In [None]:
#
#
# Performance of the hard (binary) class predictions on test data (20% split).
#
#

final_predictions = full_pipeline_with_predictor.predict(test_data)

print('')
# One line per metric: short tag followed by the score on the test labels.
# NOTE(review): 'ro' is ROC AUC computed from hard labels, i.e. from a single
# operating point; the probability-based AUC is computed in the next cell.
for metric_tag, metric_fn in (
        ('ac', accuracy_score),
        ('f1', f1_score),
        ('pr', precision_score),
        ('re', recall_score),
        ('ro', roc_auc_score),
):
    print(metric_tag, metric_fn(test_labels, final_predictions))

In [None]:
#
#
# Performance of the predicted probabilities on test data (20% split).
#
#

# Keep only the second column of predict_proba (index 1).
final_probability = full_pipeline_with_predictor.predict_proba(test_data)[:, 1]

# ROC AUC from continuous scores rather than hard labels.
roc_auc_score(y_true=test_labels, y_score=final_probability)

In [None]:
# ROC curve of the probability scores on the test set.
fpr, tpr, thresholds = roc_curve(test_labels, final_probability)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, linewidth=2)   # classifier
ax.plot([0, 1], [0, 1], 'k--')   # diagonal reference line
ax.axis([0, 1, 0, 1])
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')

plt.show()

In [None]:
#
# Export
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone `joblib` package is its replacement. Fall back to the
# vendored copy only on old installations without the standalone package.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the whole pipeline (preprocessing + classifier) as a single file.
joblib.dump(full_pipeline_with_predictor, "my_model.pkl")