# Semi-Supervised Learning - Baseline Model

This notebook takes a supervised classifier that was already trained in assignment 1 and sets it up as the baseline model for semi-supervised learning. See the Gradient Boosting model handled in `CSI5155 Assignment 1 Modelling Part- Kelvin Mock 300453668.ipynb` (assignment 1).

## Import Necessary Packages

In [1]:
import numpy as np;
import os;
import sys;
import joblib;
import random;
# importing custom modules
sys.path.append("../Assignment 1 - Model Comparison/");
from fileOrganizer import unpack, organize;
import constants;

## Load the Column Names

In [2]:
# Build the path to the stored column names: one level above the working
# directory, inside the ASM1_DIR / MUSH_DIR / COLUMNS_DIR tree.
columns_path = os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.COLUMNS_DIR,
    constants.COLUMNS_FILENAME,
)
columns = joblib.load(columns_path)
print("Columns in a sample: ", columns)

Columns in a sample:  ['age' 'gender' 'education' 'country' 'ethnicity' 'nscore' 'escore'
 'oscore' 'ascore' 'cscore' 'impuslive' 'ss']


## Load the Data

Note: We load the original sample and label arrays of the training set and the test set. The split was stratified in assignment 1, and the test set holds 1/3 of the data.

In [3]:
# Load the original training samples from the ASM1_DIR / MUSH_DIR / TRAIN_DIR
# tree located one level above the working directory.
mush_X_train = joblib.load(os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.TRAIN_DIR,
    constants.X_TRAIN_FILENAME
))
print("Number of Samples in Training Set: ", len(mush_X_train))
# Spot-check the feature count on one randomly chosen sample.
# random.randrange(n) excludes n; random.randint(0, n) includes it and would
# occasionally index one past the last row (IndexError).
print("Number of Features in Training Set: ", len(mush_X_train[random.randrange(len(mush_X_train))]))
print("Value range of the original Training Set: ", np.min(mush_X_train), np.max(mush_X_train))

Number of Samples in Training Set:  1256
Number of Features in Training Set:  12
Value range of the original Training Set:  -3.46436 3.46436


In [4]:
# Resolve the training-label file inside the ASM1_DIR / MUSH_DIR / TRAIN_DIR
# tree (relative to the parent of the working directory), then load it.
y_train_path = os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.TRAIN_DIR,
    constants.Y_TRAIN_FILENAME,
)
mush_y_train = joblib.load(y_train_path)
# Report how many labels were read and which distinct classes occur.
print("Number of Samples in Training Set: ", len(mush_y_train))
print("Unique Labels in Training Set: ", np.unique(mush_y_train))

Number of Samples in Training Set:  1256
Unique Labels in Training Set:  ['non-user' 'user']


In [5]:
# Load the original test samples from the ASM1_DIR / MUSH_DIR / TEST_DIR tree
# located one level above the working directory.
mush_X_test = joblib.load(os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.TEST_DIR,
    constants.X_TEST_FILENAME
))
print("Number of Samples in Testing Set: ", len(mush_X_test))
# Spot-check the feature count on one randomly chosen sample.
# random.randrange(n) excludes n; random.randint(0, n) includes it and would
# occasionally index one past the last row (IndexError).
print("Number of Features in Testing Set: ", len(mush_X_test[random.randrange(len(mush_X_test))]))
print("Value range of the original Testing Set: ", np.min(mush_X_test), np.max(mush_X_test))

Number of Samples in Testing Set:  629
Number of Features in Testing Set:  12
Value range of the original Testing Set:  -3.27393 3.27393


In [6]:
# Resolve the test-label file inside the ASM1_DIR / MUSH_DIR / TEST_DIR tree
# (relative to the parent of the working directory), then load it.
y_test_path = os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.TEST_DIR,
    constants.Y_TEST_FILENAME,
)
mush_y_test = joblib.load(y_test_path)
# Report how many labels were read and which distinct classes occur.
print("Number of Samples in Testing Set: ", len(mush_y_test))
print("Unique Labels in Testing Set: ", np.unique(mush_y_test))

Number of Samples in Testing Set:  629
Unique Labels in Testing Set:  ['non-user' 'user']


### Load the Normalized Data as Well

In [7]:
# Load the normalized training samples from the ASM1_DIR / MUSH_DIR / TRAIN_DIR
# tree located one level above the working directory.
mush_X_train_norm = joblib.load(os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.TRAIN_DIR,
    constants.X_TRAIN_NORMALIZED_FILENAME
))
print("Number of Normalized Samples in Training Set: ", len(mush_X_train_norm))
# Spot-check the feature count on one randomly chosen sample.
# random.randrange(n) excludes n; random.randint(0, n) includes it and would
# occasionally index one past the last row (IndexError).
print("Number of Features in the Normalized Training Set: ", len(mush_X_train_norm[random.randrange(len(mush_X_train_norm))]))
# The label names the normalized array being measured (it previously said
# "original", which described a different array).
print("Value range of the Normalized Training Set: ", np.min(mush_X_train_norm), np.max(mush_X_train_norm))

Number of Normalized Samples in Training Set:  1256
Number of Features in the Normalized Training Set:  12
Value range of the original Training Set:  -4.827248760778992 13.457018899436779


In [8]:
# Resolve the label file that accompanies the normalized training samples
# (ASM1_DIR / MUSH_DIR / TRAIN_DIR, relative to the parent directory) and load it.
y_train_norm_path = os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.TRAIN_DIR,
    constants.Y_TRAIN_NORMALIZED_FILENAME,
)
mush_y_train_norm = joblib.load(y_train_norm_path)
# Report how many labels were read and which distinct classes occur.
print("Number of Samples in Training Set: ", len(mush_y_train_norm))
print("Unique Labels in Training Set: ", np.unique(mush_y_train_norm))

Number of Samples in Training Set:  1256
Unique Labels in Training Set:  [0 1]


In [9]:
# Load the normalized test samples from the ASM1_DIR / MUSH_DIR / TEST_DIR tree
# located one level above the working directory.
mush_X_test_norm = joblib.load(os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.TEST_DIR,
    constants.X_TEST_NORMALIZED_FILENAME
))
print("Number of Normalized Samples in Testing Set: ", len(mush_X_test_norm))
# Spot-check the feature count on one randomly chosen sample.
# random.randrange(n) excludes n; random.randint(0, n) includes it and would
# occasionally index one past the last row (IndexError).
print("Number of Features in the Normalized Testing Set: ", len(mush_X_test_norm[random.randrange(len(mush_X_test_norm))]))
print("Value range of the Normalized Testing Set: ", np.min(mush_X_test_norm), np.max(mush_X_test_norm))

Number of Normalized Samples in Testing Set:  629
Number of Features in the Normalized Testing Set:  12
Value range of the Normalized Testing Set:  -4.827248760778992 13.457018899436779


In [10]:
# Resolve the label file that accompanies the normalized test samples
# (ASM1_DIR / MUSH_DIR / TEST_DIR, relative to the parent directory) and load it.
y_test_norm_path = os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.TEST_DIR,
    constants.Y_TEST_NORMALIZED_FILENAME,
)
mush_y_test_norm = joblib.load(y_test_norm_path)
# Report how many labels were read and which distinct classes occur.
print("Number of Samples in Testing Set: ", len(mush_y_test_norm))
print("Unique Labels in Testing Set: ", np.unique(mush_y_test_norm))

Number of Samples in Testing Set:  629
Unique Labels in Testing Set:  [0 1]


## Load the Model

Note: We load the trained version of the model.

In [11]:
# Gradient Boost (Trained)
# Gradient Boost (Trained)
# The stored model lives in the ASM1_DIR / MUSH_DIR / POSTTRAINED_DIR tree,
# one level above the working directory.
model_path = os.path.join(
    os.pardir,
    constants.ASM1_DIR,
    constants.MUSH_DIR,
    constants.POSTTRAINED_DIR,
    constants.MODEL_FILENAME,
)
# `unpack` comes from the project's fileOrganizer module; presumably it
# deserializes the saved estimator — confirm against that module.
supervisedModel = unpack(model_path)
# Bare trailing expression so the notebook displays the loaded object.
supervisedModel

## Save the Baseline Model and Related Data

In [12]:
# save the model in the current project directory
# save the model in the current project directory
# os.makedirs(..., exist_ok=True) creates the folder only when it is missing,
# so the cell can be re-run safely. Unlike an os.listdir() membership test it
# also copes with a multi-component directory path.
os.makedirs(constants.MODEL_DIR, exist_ok=True)
# The trailing ';' keeps the notebook from echoing joblib.dump's return value.
joblib.dump(supervisedModel, os.path.join(
    constants.MODEL_DIR,
    constants.MODEL_FILENAME
));

In [16]:
# save the training set data in the current project directory
# save the training set data in the current project directory
# os.makedirs(..., exist_ok=True) creates the folder only when it is missing,
# so the cell can be re-run safely. Unlike an os.listdir() membership test it
# also copes with a multi-component directory path.
os.makedirs(constants.TRAIN_DIR, exist_ok=True)
# original training set
joblib.dump(mush_X_train, os.path.join(
    constants.TRAIN_DIR,
    constants.X_TRAIN_FILENAME
))
joblib.dump(mush_y_train, os.path.join(
    constants.TRAIN_DIR,
    constants.Y_TRAIN_FILENAME
))
# normalized training set
joblib.dump(mush_X_train_norm, os.path.join(
    constants.TRAIN_DIR,
    constants.X_TRAIN_NORMALIZED_FILENAME
))
# The trailing ';' keeps the notebook from echoing the last call's return value.
joblib.dump(mush_y_train_norm, os.path.join(
    constants.TRAIN_DIR,
    constants.Y_TRAIN_NORMALIZED_FILENAME
));

In [17]:
# save the testing set data in the current project directory
# save the testing set data in the current project directory
# os.makedirs(..., exist_ok=True) creates the folder only when it is missing,
# so the cell can be re-run safely. Unlike an os.listdir() membership test it
# also copes with a multi-component directory path.
os.makedirs(constants.TEST_DIR, exist_ok=True)
# original testing set
joblib.dump(mush_X_test, os.path.join(
    constants.TEST_DIR,
    constants.X_TEST_FILENAME
))
joblib.dump(mush_y_test, os.path.join(
    constants.TEST_DIR,
    constants.Y_TEST_FILENAME
))
# normalized testing set
joblib.dump(mush_X_test_norm, os.path.join(
    constants.TEST_DIR,
    constants.X_TEST_NORMALIZED_FILENAME
))
# The trailing ';' keeps the notebook from echoing the last call's return value.
joblib.dump(mush_y_test_norm, os.path.join(
    constants.TEST_DIR,
    constants.Y_TEST_NORMALIZED_FILENAME
));

In [15]:
# save the column names in the current project directory
# save the column names in the current project directory
# os.makedirs(..., exist_ok=True) creates the folder only when it is missing,
# so the cell can be re-run safely. Unlike an os.listdir() membership test it
# also copes with a multi-component directory path.
os.makedirs(constants.COLUMNS_DIR, exist_ok=True)
# The trailing ';' keeps the notebook from echoing joblib.dump's return value.
joblib.dump(columns, os.path.join(
    constants.COLUMNS_DIR,
    constants.COLUMNS_FILENAME
));