In [None]:
# Constants and imports.

# Number of the dataset file to load (datasets/base_<BASE_NUM>.csv).
BASE_NUM = 1
# Seed shared by the train/test split and the random forest. A fixed integer
# makes "Restart & Run All" reproduce the same split, grid-search scores and
# test metrics; set to None to draw a fresh random split on every run.
RANDOM_STATE = 42
# Number of cross-validation folds.
CV = 5
# Fraction of the rows held out as the test set.
TEST_SIZE = 0.2

import os
import itertools
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Read the semicolon-separated dataset selected by BASE_NUM.

dataset_path = os.path.join('datasets', 'base_{}.csv'.format(BASE_NUM))
data = pd.read_csv(dataset_path, sep=';')

# Preview the first rows.
data.head()

In [None]:
# Split train / test, stratified on the DROPPED_OUT label so that both subsets
# keep the class proportions of the full dataset.

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# n_splits=1, so there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(data, data['DROPPED_OUT']))

# split() yields positional row indices, so rows are selected with .iloc.
# Label-based .loc only returns the same rows while the frame still has its
# default 0..n-1 index.
train_set = data.iloc[train_index]
test_set = data.iloc[test_index]

In [None]:
# Share of training rows per DROPPED_OUT value, to check the class balance of the training set.

train_set['DROPPED_OUT'].value_counts().div(len(train_set))

In [None]:
# Share of testing rows per DROPPED_OUT value, to check the class balance of the testing set.

test_set['DROPPED_OUT'].value_counts().div(len(test_set))

In [None]:
# Rename the training set to 'data' for convenience.
# From this point on 'data' no longer refers to the full dataset read from CSV.
# NOTE(review): the next cell assigns 'data' again (training set without the
# DROPPED_OUT column), so this copy is overwritten before it is used.

data = train_set.copy()

In [None]:
# Separate the DROPPED_OUT label from the features, for training and testing alike.

# Training set: labels first, then the features without the label column.
data_labels = train_set["DROPPED_OUT"].copy()
data = train_set.drop("DROPPED_OUT", axis=1)

# Testing set: same separation.
test_labels = test_set["DROPPED_OUT"].copy()
test_data = test_set.drop("DROPPED_OUT", axis=1)

In [None]:
# Preprocessing pipelines: numerical attributes are imputed and scaled,
# categorical attributes are one-hot encoded.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, Imputer

from msc_preprocessing import CourseOfStudyNamer, CategoricalEncoder, DataFrameSelector
from msc_preprocessing import ElementaryNameFixer, ElementarySchoolDistance
from msc_preprocessing import NationalitySelector

# Columns treated as categories.
cat_attribs = ['COURSE_OF_STUDY', 'SCHOOL', 'NATIONALITY', ]

# Every remaining column except ELEMENTARY_SCHOOL, plus ELEMENTARY_SCHOOL_DISTANCE.
# NOTE(review): ELEMENTARY_SCHOOL_DISTANCE is presumably added by the
# ElementarySchoolDistance step below - confirm in msc_preprocessing.
num_attribs = list(data.drop(cat_attribs + ['ELEMENTARY_SCHOOL'], axis=1)) + ['ELEMENTARY_SCHOOL_DISTANCE']

# A pipeline for numerical attributes.
# The two elementary-school steps run before the selector, so they see the full frame.
num_pipeline = Pipeline([
        ('elementary_school_fix_names', ElementaryNameFixer()),
        ('elementary_school_distance', ElementarySchoolDistance()),
        ('selector', DataFrameSelector(num_attribs)), # Keep only the numerical columns listed in num_attribs.
        ('imputer', Imputer(strategy="median")), # Replace missing values with the column median.
        ('std_scaler', RobustScaler()), # Scale numerical values; the step is named 'std_scaler' but uses RobustScaler.
    ])

# A pipeline for categorical attributes.
cat_pipeline = Pipeline([
        ('course_of_study_fix_names', CourseOfStudyNamer()),
        ('nationality_selector', NationalitySelector()),
        ('selector', DataFrameSelector(cat_attribs)), # Keep only the categorical columns listed in cat_attribs.
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense", handle_unknown='ignore')), # One-hot encode the categories.
    ])

In [None]:
from sklearn.pipeline import FeatureUnion

# Combine both pipelines into one transformer: numerical pipeline first, categorical second.
pipeline_steps = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
]
full_pipeline = FeatureUnion(pipeline_steps)

In [None]:
# Fit and transform the data for training.
# The pipeline is fitted on the training features only; the test features are
# later passed through transform() without refitting.
data_prepared = full_pipeline.fit_transform(data)
data_prepared

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Exhaustive hyper-parameter search for a RandomForestClassifier, scored by F1.

# Search space: every combination of these values is evaluated.
param_grid = dict(
    n_estimators=[5, 15, 30, 50, 150, 200, 400, 600, 800, 1000, 1500],
    max_features=[None, 'sqrt', 'log2'],
    bootstrap=[True, False],
)

# Unfitted template estimator handed to the search.
cls = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)

grid_search = GridSearchCV(
    estimator=cls,
    param_grid=param_grid,
    scoring='f1',
    cv=CV,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(data_prepared, data_labels)

print(grid_search.best_params_)

# From here on 'cls' is the best estimator found by the search.
cls = grid_search.best_estimator_

In [None]:
# Visualize the Grid Search: one curve per (bootstrap, max_features) pair,
# mean CV score plotted against n_estimators.

import itertools

cv_results = grid_search.cv_results_

# Parameters and names.
grid_param_1 = param_grid['n_estimators']
grid_param_2 = param_grid['max_features']
grid_param_3 = param_grid['bootstrap']
name_param_1 = 'N Estimators'
name_param_2 = 'Max Features'
name_param_3 = 'Bootstrap'

# Look every mean score up by its own parameter combination. Reshaping the
# flat 'mean_test_score' array instead would only label the curves correctly
# while the grid's internal iteration order happens to match the order used
# for plotting.
score_by_params = {
    (params['bootstrap'], params['max_features'], params['n_estimators']): mean_score
    for mean_score, params in zip(cv_results['mean_test_score'], cv_results['params'])
}

# One row per curve, one column per n_estimators value.
curves = list(itertools.product(grid_param_3, grid_param_2))
scores_mean = np.array([
    [score_by_params[(bootstrap, max_features, n_estimators)] for n_estimators in grid_param_1]
    for bootstrap, max_features in curves
])

# Plot scores
plt.rcParams['figure.facecolor'] = '#FFFFFF'
_, ax = plt.subplots(1, 1, figsize=(15, 15))

# n_estimators is the X-axis; each (bootstrap, max_features) pair gets its own line.
for curve_scores, (bootstrap, max_features) in zip(scores_mean, curves):
    ax.plot(
        grid_param_1,
        curve_scores,
        '-o',
        label='{}: {}, {}: {}'.format(name_param_3, bootstrap, name_param_2, max_features)
    )

# Format plot
ax.set_title('RandomForestClassifier')
ax.set_xlabel(name_param_1)
ax.set_ylabel('CV Average Score')
ax.legend(loc='best', fontsize=12)
ax.grid(True)

plt.show()

In [None]:
# Print the mean CV score next to the parameter combination that produced it.

for position, candidate_params in enumerate(cv_results["params"]):
    print(cv_results["mean_test_score"][position], candidate_params)

In [None]:
# Print feature importances, highest first, each paired with its column name.

feature_importances = grid_search.best_estimator_.feature_importances_

extra_attribs = [] # Not needed now. Keep for future possibilities.
cat_encoder = cat_pipeline.named_steps["cat_encoder"]

# categories_ holds one array of categories per categorical attribute. All of
# them are needed: taking only the first would leave the SCHOOL and
# NATIONALITY columns unnamed, and zip() below would silently drop their
# importances. The attribute name is prefixed so equal category values from
# different attributes stay distinguishable.
cat_one_hot_attribs = [
    '{}={}'.format(attrib, category)
    for attrib, categories in zip(cat_attribs, cat_encoder.categories_)
    for category in categories
]

# Same column order as full_pipeline: numerical columns, then one-hot columns.
attributes = num_attribs + extra_attribs + cat_one_hot_attribs

# A length mismatch means names and importances are misaligned; fail loudly
# instead of showing a wrong or truncated ranking.
assert len(attributes) == len(feature_importances), (
    'expected {} feature names, built {}'.format(len(feature_importances), len(attributes))
)

sorted(zip(feature_importances, attributes), reverse=True)

In [None]:
# k-fold cross-validation with k=CV

from sklearn.model_selection import cross_val_predict

# Binary class prediction for every training row, obtained through cross-validation.
y_train_pred_binary = cross_val_predict(
    estimator=cls,
    X=data_prepared,
    y=data_labels,
    cv=CV,
    method='predict',
)

y_train_pred_binary

In [None]:
#
#
# Performance of binary classifier
#
#

from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

In [None]:
# Confusion matrix of the cross-validated predictions on the training set
# (rows: true labels, columns: predicted labels).
confusion_matrix(data_labels, y_train_pred_binary)

In [None]:
# Accuracy, F1, precision and recall of the cross-validated training predictions.
for metric_tag, metric_fn in (
    ('ac', accuracy_score),
    ('f1', f1_score),
    ('pr', precision_score),
    ('re', recall_score),
):
    print(metric_tag, metric_fn(data_labels, y_train_pred_binary))

In [None]:
#
# Run test data (the held-out TEST_SIZE fraction of the dataset).
#
# Only transform() is called here: the pipeline was fitted on the training
# features and is not refitted on the test features.

test_data_prepared = full_pipeline.transform(test_data)

In [None]:
#
#
# Performance of the binary classifier on test data (the held-out TEST_SIZE fraction).
#
#

# 'cls' is the best estimator selected by the grid search.
final_predictions = cls.predict(test_data_prepared)

In [None]:
# Confusion matrix of the predictions on the test set
# (rows: true labels, columns: predicted labels).
confusion_matrix(test_labels, final_predictions)

In [None]:
# Accuracy, F1, precision and recall of the predictions on the test set.
for metric_tag, metric_fn in (
    ('ac', accuracy_score),
    ('f1', f1_score),
    ('pr', precision_score),
    ('re', recall_score),
):
    print(metric_tag, metric_fn(test_labels, final_predictions))