## Privileged Logistic Regression and Logistic Regression on Simulated Data

### 0. Install Necessary Packages

In [1]:
# !pip install --user -r requirements.txt

### 1. Load Model

In [1]:
# import the models
from privileged_lr import PrivilegedLogisticRegression
from cvxpy_lr import CvxpyLogisticRegression
from sklearn.linear_model import LogisticRegression

### 2. Prepare the Data for Learning Using Priviledged Information

In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [19]:
import numpy as np
np.random.seed(0)

In [27]:
n_total, n_informative = 12, 6

# create a simluated dataset
X, y = make_classification(n_samples=2000, n_features=n_total, 
                           n_informative=n_informative, 
                           n_redundant=0, random_state=0)

# split the dataset into train, validation and test set based on the ratio
train_ratio, validation_ratio, test_ratio = 0.4, 0.3, 0.3

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=(1-train_ratio), random_state=0)

# X_train, X_val, y_train, y_val = train_test_split(
#     X_train, y_train, test_size=test_ratio/(test_ratio+validation_ratio), random_state=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=0)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=validation_ratio/(train_ratio+validation_ratio), random_state=1)

# select out all the informtive columns as privileged information
x_train_star = X_train[:, :(n_informative)]

# the rest are used as base features
x_train = X_train[:, (n_informative):]

# in the val/test set, we only keep the base features
x_val = X_val[:, n_informative:]
x_test = X_test[:, n_informative:]

### Running with PLR Model (`cvxpy` implementation) - Training, Hyper-parameter Selection and Testing

Learning Using Privliged Information

In [28]:
import pandas as pd
import itertools
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score

In [29]:
# initialize the hyperparameters searching grid
param_grid_plr = {
    'lambda_base': [0.01, 0.1, 1, 10],
    'lambda_star': [0.01, 0.1, 1, 10],
    'alpha': [0.01, 0.1, 1, 10],
    'xi_link': [0.01, 0.1, 1, 10],
    'penalty': ['l1']
    }
all_hyperparam_combinations = list(itertools.product(*map(param_grid_plr.get, list(param_grid_plr))))
# initialize the dataframes to store the results
df_train_plr = pd.DataFrame(columns=['lambda_base', 'lambda_star', 'alpha', 'xi_link', 'penalty', 'auroc', 'f1'])

In [30]:
# create a dictionary for each hyperparameter combination and iterate over it
for i, hyper_param_values in enumerate(all_hyperparam_combinations):
    kwarg = dict(zip(list(param_grid_plr.keys()), hyper_param_values))

    # initialize the model with the hyperparameters
    plr_model = PrivilegedLogisticRegression(**kwarg)
    
    # fit the plr_model
    plr_model.fit(x_train, y_train, X_star=x_train_star, y_star=y_train)

    # obtain the prediction
    y_val_pred = plr_model.predict_proba(x_val)

    # calculate the AUROC
    auroc = roc_auc_score(y_val, y_val_pred[:, 1])
    f1 = f1_score(y_val, y_val_pred.argmax(axis=1))
    # store the validation results
    df_train_plr.loc[i] = list(hyper_param_values) + [auroc, f1]


In [31]:
# obtain the best hyperparameters
best_hyperparam = df_train_plr.sort_values(by='f1', ascending=False).iloc[1] 

# only keep the best hyperparameters in param_grid.keys()
best_hyperparam = best_hyperparam[list(param_grid_plr.keys())]

# apply the best hyperparameters to the best_plr_model
best_plr_model = PrivilegedLogisticRegression(**best_hyperparam.to_dict())

# fit the best_plr_model
best_plr_model.fit(x_train, y_train, X_star=x_train_star, y_star=y_train)

# obtain the prediction
y_test_pred = best_plr_model.predict_proba(x_test)

# calculate the AUROC, accuracy, f1, precision and recall
auroc = roc_auc_score(y_test, y_test_pred[:, 1])
acc = accuracy_score(y_test, y_test_pred.argmax(axis=1))
f1 = f1_score(y_test, y_test_pred.argmax(axis=1))
precision = precision_score(y_test, y_test_pred.argmax(axis=1))
recall = recall_score(y_test, y_test_pred.argmax(axis=1))

print('(PLR model) AUROC on test set: {}'.format(auroc))
print('(PLR model) Accuracy on test set: {}'.format(acc))
print('(PLR model) F1 score on test set: {}'.format(f1))
print('(PLR model) Precision on test set: {}'.format(precision))
print('(PLR model) Recall on test set: {}'.format(recall))

(PLR model) AUROC on test set: 0.8661017137646104
(PLR model) Accuracy on test set: 0.7933333333333333
(PLR model) F1 score on test set: 0.8154761904761906
(PLR model) Precision on test set: 0.7548209366391184
(PLR model) Recall on test set: 0.8867313915857605


### 4. LR Model (`sklearn` implementation) - Training, Hyper-parameter Selection and Testing

Classical Learning Paradigm w/o Privileged Information

In [32]:
# initialize the hyperparameters searching grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1']
    }
all_hyperparam_combinations = list(itertools.product(*map(param_grid.get, list(param_grid))))
# initialize the dataframes to store the results
df_train = pd.DataFrame(columns=['C', 'penalty', 'auroc', 'f1'])

# create a dictionary for each hyperparameter combination and iterate over it
for i, hyper_param_values in enumerate(all_hyperparam_combinations):
    kwarg = dict(zip(list(param_grid.keys()), hyper_param_values))

    # initialize the model with the hyperparameters
    lr_model = LogisticRegression(solver='saga', **kwarg)

    # fit the lr_model
    lr_model.fit(x_train, y_train)
    
    # obtain the prediction
    y_val_pred = lr_model.predict_proba(x_val)

    # calculate the AUROC
    auroc = roc_auc_score(y_val, y_val_pred[:, 1])
    f1 = f1_score(y_val, y_val_pred.argmax(axis=1))

    # store the validation results
    df_train.loc[i] = list(hyper_param_values) + [auroc, f1]

# obtain the best hyperparameters
best_hyperparam = df_train.sort_values(by='f1', ascending=False).iloc[0]


# only keep the best hyperparameters in param_grid.keys()
best_hyperparam = best_hyperparam[list(param_grid.keys())]

# apply the best hyperparameters to the best_lr_model
best_lr_model = LogisticRegression(solver='saga', **best_hyperparam.to_dict())

# fit the best_lr_model
best_lr_model.fit(x_train, y_train)

# obtain the prediction
y_test_pred = best_lr_model.predict_proba(x_test)

# calculate the AUROC, accuracy, f1, precision and recall
auroc = roc_auc_score(y_test, y_test_pred[:, 1])
acc = accuracy_score(y_test, y_test_pred.argmax(axis=1))
f1 = f1_score(y_test, y_test_pred.argmax(axis=1))
precision = precision_score(y_test, y_test_pred.argmax(axis=1))
recall = recall_score(y_test, y_test_pred.argmax(axis=1))

print('(LR model sklearn) AUROC on test set: {}'.format(auroc))
print('(LR model sklearn) Accuracy on test set: {}'.format(acc))
print('(LR model sklearn) F1 score on test set: {}'.format(f1))
print('(LR model sklearn) Precision on test set: {}'.format(precision))
print('(LR model sklearn) Recall on test set: {}'.format(recall))


(LR model sklearn) AUROC on test set: 0.861352995473704
(LR model sklearn) Accuracy on test set: 0.795
(LR model sklearn) F1 score on test set: 0.8025682182985555
(LR model sklearn) Precision on test set: 0.7961783439490446
(LR model sklearn) Recall on test set: 0.8090614886731392


### 5. LR Model (`cvxpy` implementation) - Training, Hyper-parameter Selection and Testing

Classical Learning Paradigm w/o Privileged Information

In [33]:
# initialize the hyperparameters searching grid
param_grid = {
    'lambda_': [0.01, 0.1, 1, 10],
    'penalty': ['l1']
    }
all_hyperparam_combinations = list(itertools.product(*map(param_grid.get, list(param_grid))))
# initialize the dataframes to store the results
df_train = pd.DataFrame(columns=['lambda_', 'penalty', 'auroc', 'f1'])

# create a dictionary for each hyperparameter combination and iterate over it
for i, hyper_param_values in enumerate(all_hyperparam_combinations):
    kwarg = dict(zip(list(param_grid.keys()), hyper_param_values))

    # initialize the model with the hyperparameters
    lr_model = CvxpyLogisticRegression(**kwarg)

    # fit the lr_model
    lr_model.fit(x_train, y_train)
    
    # obtain the prediction
    y_val_pred = lr_model.predict_proba(x_val)

    # calculate the AUROC
    auroc = roc_auc_score(y_val, y_val_pred[:, 1])
    f1 = f1_score(y_val, y_val_pred.argmax(axis=1))

    # store the validation results
    df_train.loc[i] = list(hyper_param_values) + [auroc, f1]

# obtain the best hyperparameters
best_hyperparam = df_train.sort_values(by='f1', ascending=False).iloc[0]

# only keep the best hyperparameters in param_grid.keys()
best_hyperparam = best_hyperparam[list(param_grid.keys())]

# apply the best hyperparameters to the best_lr_model
best_lr_model = CvxpyLogisticRegression(**best_hyperparam.to_dict())

# fit the best_lr_model
best_lr_model.fit(x_train, y_train)

# obtain the prediction
y_test_pred = best_lr_model.predict_proba(x_test)

# calculate the AUROC, accuracy, f1, precision and recall
auroc = roc_auc_score(y_test, y_test_pred[:, 1])
acc = accuracy_score(y_test, y_test_pred.argmax(axis=1))
f1 = f1_score(y_test, y_test_pred.argmax(axis=1))
precision = precision_score(y_test, y_test_pred.argmax(axis=1))
recall = recall_score(y_test, y_test_pred.argmax(axis=1))

print('(LR model cvxpy) AUROC on test set: {}'.format(auroc))
print('(LR model cvxpy) Accuracy on test set: {}'.format(acc))
print('(LR model cvxpy) F1 score on test set: {}'.format(f1))
print('(LR model cvxpy) Precision on test set: {}'.format(precision))
print('(LR model cvxpy) Recall on test set: {}'.format(recall))

(LR model cvxpy) AUROC on test set: 0.8655234155184112
(LR model cvxpy) Accuracy on test set: 0.795
(LR model cvxpy) F1 score on test set: 0.8098918083462133
(LR model cvxpy) Precision on test set: 0.7751479289940828
(LR model cvxpy) Recall on test set: 0.8478964401294499
