In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
# import KFold
from sklearn.model_selection import KFold

## Step 1: Generating Training Set

In part a, we will generate a training set with 2 label, each is drawn from 2 different normal distribution. The test set will be subjected to label-shift bias model, and the class prior will the different from the training set.

In [2]:
# generate the training dataset with two labels, 0 and 1
training_set = []

# first, generate the datapoint with label 0, which is drawn from 2 normal distributions with mean 0 and 1, variance 1 and 2, respectively
# the weight of the first normal distribution is 0.3 and the weight of the second normal distribution is 0.7
# the number of data points is 1000

mu_1, mu_2 = 0, 1
sigma_1, sigma_2 = 1, 2
label_0 = np.concatenate((np.random.normal(mu_1, sigma_1, 300), np.random.normal(mu_2, sigma_2, 700)))
for i in range(len(label_0)):
    training_set.append([label_0[i], 0])
    
# next, generate the datapoint with label 1, which is drawn from 2 normal distributions with mean 5 and 2, variance 7 and 4, respectively
# the weight of the first normal distribution is 0.5 and the weight of the second normal distribution is 0.5
# the number of data points is 1500

mu_3, mu_4 = 5, 2
sigma_3, sigma_4 = 7, 4
label_1 = np.concatenate((np.random.normal(mu_3, sigma_3, 750), np.random.normal(mu_4, sigma_4, 750)))
for i in range(len(label_1)):
    training_set.append([label_1[i], 1])
    
training_set = np.array(training_set)

X_train = training_set[:, 0].reshape(-1, 1)
y_train = training_set[:, 1]

print(X_train.shape)
print(y_train.shape)

(2500, 1)
(2500,)


## Step 2: Generating Testing Set

In part b, the test set will have different class prior, so we will generate 200 points from class 0 and 100 points from class 1.

In [3]:
# generate the test dataset with two labels, 0 and 1
testing_set = []

# first, generate the datapoint with label 0, which is drawn from 2 normal distributions with mean 0 and 1, variance 1 and 2, respectively
# the weight of the first normal distribution is 0.3 and the weight of the second normal distribution is 0.7
# the number of data points is 100

label_0 = np.concatenate((np.random.normal(mu_1, sigma_1, 60), np.random.normal(mu_2, sigma_2, 140)))
for i in range(len(label_0)):
    testing_set.append([label_0[i], 0])
    
# next, generate the datapoint with label 1, which is drawn from 2 normal distributions with mean 5 and 2, variance 7 and 4, respectively
# the weight of the first normal distribution is 0.5 and the weight of the second normal distribution is 0.5
# the number of data points is 150

label_1 = np.concatenate((np.random.normal(mu_3, sigma_3, 50), np.random.normal(mu_4, sigma_4, 50)))
for i in range(len(label_1)):
    testing_set.append([label_1[i], 1])
    
testing_set = np.array(testing_set)

X_test = testing_set[:, 0].reshape(-1, 1)
y_test = testing_set[:, 1]

print(X_test.shape)
print(y_test.shape)

(300, 1)
(300,)


## Step 3: Training and test the models on the generated datasets

The first model we will be looking at is the logistic regression model

In [4]:
# first, use logistic regression to classify the data
print("Training the Logistic Regression model...")
#initialize the 10 fold cross validation
cv = KFold(n_splits=10, random_state=42, shuffle=True)

auc_scores = []
# fit the model on the training set
for train_index, val_index in cv.split(X_train):
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]
    # initialize the model
    lr_clf = LogisticRegression()
    lr_clf.fit(X_train_cv, y_train_cv)
    
    # predict the labels on the validation set
    y_pred = lr_clf.predict_proba(X_val_cv)
    
    #report the AUC score
    auc_scores.append(roc_auc_score(y_val_cv, y_pred[:, 1]))
    
print("The average AUC score for 10 fold cross validation on the training set is: ", np.mean(auc_scores))

# train the model on the whole training set
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)

# predict the labels on the test set
y_pred = lr_clf.predict_proba(X_test)

# calculate the AUC score
auc = roc_auc_score(y_test, y_pred[:, 1])
print("The AUC score on the test set is: ", auc)


Training the Logistic Regression model...
The average AUC score for 10 fold cross validation on the training set is:  0.6741395050464556
The AUC score on the test set is:  0.70775


The second model we will look at is the Naive Bayes model

In [5]:
#initialize the 10 fold cross validation
print("Training the Naive Bayes model...")
cv = KFold(n_splits=10, random_state=42, shuffle=True)

auc_scores = []
# fit the model on the training set
for train_index, val_index in cv.split(X_train):
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]
    # initialize the naive bayes model
    nb_clf = GaussianNB()
    nb_clf.fit(X_train_cv, y_train_cv)
    
    # predict the labels on the validation set
    y_pred = nb_clf.predict_proba(X_val_cv)
    
    #report the AUC score
    auc_scores.append(roc_auc_score(y_val_cv, y_pred[:, 1]))
    
print("The average AUC score for 10 fold cross validation on the training set is: ", np.mean(auc_scores))

# train the model on the whole training set
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

# predict the labels on the test set
y_pred = nb_clf.predict_proba(X_test)

# calculate the AUC score
auc = roc_auc_score(y_test, y_pred[:, 1])
print("The AUC score on the test set is: ", auc)

Training the Naive Bayes model...
The average AUC score for 10 fold cross validation on the training set is:  0.8087365655788517
The AUC score on the test set is:  0.7888499999999999


The next model we will be looking at is the Multi Layer Perceptron with 3 hidden layer

In [6]:
#initialize the 10 fold cross validation
print("Training the MLPClassifier model...")
cv = KFold(n_splits=10, random_state=42, shuffle=True)

auc_scores = []
# fit the model on the training set
for train_index, val_index in cv.split(X_train):
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train[train_index], y_train[val_index]
    # initialize the mlp model with 2 hidden layers, each with 3 neurons
    # activation function is relu
    mlp = MLPClassifier(hidden_layer_sizes=(3, 3), max_iter=1000, activation='relu', solver='adam')
    mlp.fit(X_train_cv, y_train_cv)
    
    # predict the labels on the validation set
    y_pred = mlp.predict_proba(X_val_cv)
    
    #report the AUC score
    auc_scores.append(roc_auc_score(y_val_cv, y_pred[:, 1]))
    
print("The average AUC score for 10 fold cross validation on the training set is: ", np.mean(auc_scores))

# train the model on the whole training set
mlp = MLPClassifier(hidden_layer_sizes=(3, 3), max_iter=1000, activation='relu', solver='adam')
mlp.fit(X_train, y_train)

# predict the labels on the test set
y_pred = mlp.predict_proba(X_test)

# calculate the AUC score
auc = roc_auc_score(y_test, y_pred[:, 1])
print("The AUC score on the test set is: ", auc)

Training the MLPClassifier model...
The average AUC score for 10 fold cross validation on the training set is:  0.8125942961656956
The AUC score on the test set is:  0.7932
