In [None]:
# Notebook configuration (the imports follow below).

# Number of the dataset to evaluate: selects datasets/base_<BASE_NUM>.csv and tags the saved result files.
BASE_NUM = 1
# Fraction of the rows held out as the test set by the stratified split.
TEST_SIZE = 0.2
# Value passed as `cv` to cross_val_predict.
CV = 5
# Seed handed to the split and to the classifiers that accept `random_state`.
# NOTE(review): None leaves every run unseeded, so results differ between runs; set an int to reproduce a run.
RANDOM_STATE = None

import os
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler, RobustScaler, Imputer

from msc_preprocessing import CourseOfStudyNamer, CategoricalEncoder, DataFrameSelector
from msc_preprocessing import ElementaryNameFixer, ElementarySchoolDistance
from msc_preprocessing import NationalitySelector

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import precision_score, recall_score

from sklearn.model_selection import cross_val_predict

from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Read the semicolon-separated dataset selected by BASE_NUM.

dataset_path = os.path.join('datasets', 'base_{}.csv'.format(BASE_NUM))
data = pd.read_csv(dataset_path, sep=';')

# Preview the first rows.
data.head()

In [None]:
# Summarize the columns: dtypes and non-null counts.
data.info()

In [None]:
# Descriptive statistics of the dataset's columns.
data.describe()

In [None]:
# Draw one 50-bin histogram per column that DataFrame.hist can plot, on a large canvas.
data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Split train / test

# One stratified shuffle split on the label column.
split = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# n_splits=1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(split.split(data, data['DROPPED_OUT']))

# split() yields positional row indices, so select rows by position (iloc) rather than
# by index label (loc); the two only coincide when the index is the default 0..n-1 range.
train_set = data.iloc[train_index]
test_set = data.iloc[test_index]

In [None]:
# Show the class proportions of DROPPED_OUT in the training set.
# (The split is stratified on this column, so compare these shares with the testing set's.)

train_class_counts = train_set['DROPPED_OUT'].value_counts()
train_class_counts / len(train_set)

In [None]:
# Show the class proportions of DROPPED_OUT in the testing set.
# (The split is stratified on this column, so compare these shares with the training set's.)

test_class_counts = test_set['DROPPED_OUT'].value_counts()
test_class_counts / len(test_set)

In [None]:
# Rank the attributes of the correlation matrix by their correlation with the dropout label.

corr_matrix = train_set.corr()

dropout_correlations = corr_matrix["DROPPED_OUT"]
dropout_correlations.sort_values(ascending=False)

In [None]:
# From here on 'data' refers to the training features (label column removed).

label_column = "DROPPED_OUT"

# Training set: labels and features.
data_labels = train_set[label_column].copy()
data = train_set.drop(label_column, axis=1)

# Testing set: labels and features.
test_labels = test_set[label_column].copy()
test_data = test_set.drop(label_column, axis=1)

# Preprocessing: scale the numerical attributes and translate the categorical ones to numbers.

# Categorical attributes that need to be translated to numbers.
cat_attribs = ['COURSE_OF_STUDY', 'SCHOOL', 'NATIONALITY']

# Numerical attributes: every other column except the raw ELEMENTARY_SCHOOL column,
# plus ELEMENTARY_SCHOOL_DISTANCE (presumably added by ElementarySchoolDistance - confirm in msc_preprocessing).
non_numerical_columns = cat_attribs + ['ELEMENTARY_SCHOOL']
num_attribs = list(data.drop(non_numerical_columns, axis=1)) + ['ELEMENTARY_SCHOOL_DISTANCE']

# Steps applied to the numerical attributes.
numerical_steps = [
    ('elementary_school_fix_names', ElementaryNameFixer()),
    ('elementary_school_distance', ElementarySchoolDistance()),
    ('selector', DataFrameSelector(num_attribs)),  # Keep only the numerical columns.
    ('imputer', Imputer(strategy="median")),  # Fill missing values with the median.
    ('std_scaler', RobustScaler()),  # Scale with RobustScaler (the step name is historical).
]
num_pipeline = Pipeline(numerical_steps)

# Steps applied to the categorical attributes.
categorical_steps = [
    ('course_of_study_fix_names', CourseOfStudyNamer()),
    ('nationality_selector', NationalitySelector()),
    ('selector', DataFrameSelector(cat_attribs)),  # Keep only the categorical columns.
    ('cat_encoder', CategoricalEncoder(encoding="onehot-dense", handle_unknown='ignore')),  # Translate categories to numbers.
]
cat_pipeline = Pipeline(categorical_steps)

# Combine the outputs of both pipelines into one feature matrix.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

# Fit the preprocessing on the training data and transform it.
data_prepared = full_pipeline.fit_transform(data)
data_prepared

In [None]:
# Evaluate classifiers

def _score_predictions(y_true, y_pred):
    """Score hard class predictions against the true labels.

    Returns the tuple ``(accuracy, f1, precision, recall, roc_auc)``.
    """
    # NOTE(review): ROC AUC is computed from hard class predictions, not from decision
    # scores or probabilities, so it reflects a single operating point of each classifier.
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        roc_auc_score(y_true, y_pred),
    )

# A list of the classifiers to evaluate.
other_cls_list = [
    RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1), # With no parameter tuning.
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
    LogisticRegression(random_state=RANDOM_STATE),
    SVC(),
    LinearSVC(),
    NuSVC(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    SGDClassifier(random_state=RANDOM_STATE, n_jobs=-1, max_iter=5, tol=None),
    SGDClassifier(random_state=RANDOM_STATE, n_jobs=-1, max_iter=1000, tol=1e-3),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
]

n_classifiers = len(other_cls_list)

# Arrays to store cross validated scores, one entry per classifier.
accuracy_scores = np.zeros(n_classifiers)
f1_scores = np.zeros(n_classifiers)
precision_scores = np.zeros(n_classifiers)
recall_scores = np.zeros(n_classifiers)
roc_scores = np.zeros(n_classifiers)

# Arrays to store test data scores, one entry per classifier.
split_accuracy_scores = np.zeros(n_classifiers)
split_f1_scores = np.zeros(n_classifiers)
split_precision_scores = np.zeros(n_classifiers)
split_recall_scores = np.zeros(n_classifiers)
split_roc_scores = np.zeros(n_classifiers)

# The preprocessing was fitted on the training data above and is the same for every
# classifier, so the test data is transformed once here instead of once per classifier.
test_data_prepared = full_pipeline.transform(test_data)

for idx, other_cls in enumerate(other_cls_list):
    print(idx, other_cls)

    # Fit on the whole training set; this fitted model is used for the test predictions below.
    other_cls.fit(data_prepared, data_labels)

    # Predict the training labels using cross validation.
    y_train_pred_binary_other = cross_val_predict(other_cls, data_prepared, data_labels, cv=CV, method='predict')

    # Record cross validated scores.
    (accuracy_scores[idx], f1_scores[idx], precision_scores[idx],
     recall_scores[idx], roc_scores[idx]) = _score_predictions(data_labels, y_train_pred_binary_other)

    # Predict the held-out test data with the classifier fitted on the training set.
    y_test_pred_binary_split = other_cls.predict(test_data_prepared)

    # Record test data scores.
    (split_accuracy_scores[idx], split_f1_scores[idx], split_precision_scores[idx],
     split_recall_scores[idx], split_roc_scores[idx]) = _score_predictions(test_labels, y_test_pred_binary_split)

# Results

def _build_results_frame(classifiers, accuracy, f1, precision, recall, roc):
    """Assemble a results table with one row per classifier.

    Parameters
    ----------
    classifiers : sequence of estimator objects; the class name of each becomes 'Name'.
    accuracy, f1, precision, recall, roc : per-classifier score arrays, aligned with `classifiers`.

    Returns
    -------
    DataFrame with the columns Name, Accuracy, F1, Precision, Recall, ROC and Average,
    where Average is the plain mean of the five scores of the row.
    """
    metric_columns = ['Accuracy', 'F1', 'Precision', 'Recall', 'ROC']
    metric_values = [accuracy, f1, precision, recall, roc]

    # Build the whole table at once instead of enlarging an empty frame row by row.
    columns = {'Name': [cls.__class__.__name__ for cls in classifiers]}
    columns.update(zip(metric_columns, metric_values))
    frame = pd.DataFrame(columns, columns=['Name'] + metric_columns)

    # Mean over the five metrics for each classifier; computed with numpy so that a NaN
    # score propagates into the average instead of being silently skipped.
    frame['Average'] = np.mean(metric_values, axis=0)
    return frame

# Cross validated DataFrame.
other_df = _build_results_frame(
    other_cls_list,
    accuracy_scores,
    f1_scores,
    precision_scores,
    recall_scores,
    roc_scores,
)

# Test set DataFrame.
other_df_split = _build_results_frame(
    other_cls_list,
    split_accuracy_scores,
    split_f1_scores,
    split_precision_scores,
    split_recall_scores,
    split_roc_scores,
)

In [None]:
# Rank the classifiers by their average cross validated score (top 25 rows).

cv_ranking = other_df.sort_values(by='Average', ascending=False)
cv_ranking.head(25)

In [None]:
# Rank the classifiers by their average test set score (top 25 rows).

split_ranking = other_df_split.sort_values(by='Average', ascending=False)
split_ranking.head(25)

In [None]:
# Write both result tables to JSON files tagged with BASE_NUM, for later averaging.

cv_result_path = os.path.join('datasets', 'base_{}_evaluation_result_cv.json'.format(BASE_NUM))
split_result_path = os.path.join('datasets', 'base_{}_evaluation_result_split.json'.format(BASE_NUM))

other_df.to_json(cv_result_path)
other_df_split.to_json(split_result_path)