# Import libraries

In [1]:
import numpy as np
import pandas as pd

# Load data

In [3]:
encoded_applications = pd.read_csv("../Dataset/processed_applications.csv")
credits = pd.read_csv("../Dataset/scores.csv")

# Adjust Data
Splitting of data for Semi-supervised learning

In [18]:
updated_applicants = pd.merge(encoded_applications, credits, on='id', how='left')
updated_applicants = updated_applicants.drop(['id'], axis=1)

In [19]:
applicants_w_records = updated_applicants.dropna(subset = ["total_score"])
applicants_no_records = updated_applicants.loc[updated_applicants["total_score"].isna()]


We split away around 20% of the applicants with records as test data.

In [20]:
removal_index_labeled = range(0, len(applicants_w_records), 5)
test_data = applicants_w_records.reset_index().filter(removal_index_labeled, axis=0)
train_data = applicants_w_records.reset_index().drop(removal_index_labeled, axis=0)

In [21]:
x_test_data = test_data.drop(['credit_history_score', 'duration_score', 'total_score'], axis=1).to_numpy(dtype=float)
y_test_data = test_data['total_score'].to_numpy(dtype=float)
x_train_data = train_data.drop(['credit_history_score', 'duration_score', 'total_score'], axis=1).to_numpy(dtype=float)
y_train_data = train_data['total_score'].to_numpy(dtype=float)

We split the unlabelled data into 3 sets.

In [22]:
removal_index_unlabelled1 = range(0, len(applicants_no_records), 3)
removal_index_unlabelled2 = range(1, len(applicants_no_records), 3)
removal_index_unlabelled3 = range(2, len(applicants_no_records), 3)
unlabelled_data_1 = applicants_no_records.reset_index().filter(removal_index_unlabelled1, axis=0)
unlabelled_data_2 = applicants_no_records.reset_index().filter(removal_index_unlabelled2, axis=0)
unlabelled_data_3 = applicants_no_records.reset_index().filter(removal_index_unlabelled3, axis=0)

x_unlabelled_data_1 = unlabelled_data_1.drop(['credit_history_score', 'duration_score', 'total_score'], axis=1).to_numpy(dtype=float)
x_unlabelled_data_2 = unlabelled_data_2.drop(['credit_history_score', 'duration_score', 'total_score'], axis=1).to_numpy(dtype=float)
x_unlabelled_data_3 = unlabelled_data_3.drop(['credit_history_score', 'duration_score', 'total_score'], axis=1).to_numpy(dtype=float)

We remove first column as they are indexes.

In [23]:
x_test_data = np.delete(x_test_data, 0, axis=1)
x_train_data = np.delete(x_train_data, 0, axis=1)
x_unlabelled_data_1 = np.delete(x_unlabelled_data_1, 0, axis=1)
x_unlabelled_data_2 = np.delete(x_unlabelled_data_2, 0, axis=1)
x_unlabelled_data_3 = np.delete(x_unlabelled_data_3, 0, axis=1)

# Semi-supervised Learning
We will now use the learning model to do logistic regression for classification

Datasets:
* x_test_data
* y_test_data
* x_train_data
* y_train_data
* x_unlabelled_data_1
* x_unlabelled_data_2
* x_unlabelled_data_3

Scores above 0.35 are classified as good.

In [24]:
y_test_data = (y_test_data > 0.35).astype(int)
y_train_data = (y_train_data > 0.35).astype(int)

from imblearn.over_sampling import SMOTE
oversample = SMOTE()
x_train_data, y_train_data = oversample.fit_resample(x_train_data, y_train_data)

# Logistic Regression

We train our first set of logistic regression model.

In [25]:
from sklearn.linear_model import LogisticRegression
log_model_one = LogisticRegression(max_iter = 10000)
log_model_one.fit(x_train_data, y_train_data)

First set of pseudo-labels for linear regression.

In [26]:
logreg_labels_1 = log_model_one.predict_proba(x_unlabelled_data_1)

Filter out confident pseudo-labels and add into current set of labelled data.

In [27]:
indexing = np.where((logreg_labels_1[:,1] >= 0.9) | (logreg_labels_1[:,1] <= 0.01))
logreg_labels_1 = logreg_labels_1[(logreg_labels_1[:,1]  >= 0.9) | (logreg_labels_1[:,1] <= 0.01)]
logreg_labels_1 = (logreg_labels_1[:,1] > 0.5).astype(int)
good_logreg_unlabelled_data_1 = x_unlabelled_data_1[indexing[0]]

new_x_train_data_logreg = np.vstack((x_train_data, good_logreg_unlabelled_data_1))
new_y_train_data_logreg = np.append(y_train_data, logreg_labels_1)

new_x_train_data_logreg, new_y_train_data_logreg = oversample.fit_resample(new_x_train_data_logreg, new_y_train_data_logreg)

### Second Linear Regression Training

In [28]:
log_model_two = LogisticRegression(max_iter = 10000)
log_model_two.fit(new_x_train_data_logreg, new_y_train_data_logreg)
logreg_labels_2 = log_model_two.predict_proba(x_unlabelled_data_2)

Adjustment to training data.

In [29]:
indexing = np.where((logreg_labels_2[:,1] >= 0.9) | (logreg_labels_2[:,1] <= 0.01))
logreg_labels_2 = logreg_labels_2[(logreg_labels_2[:,1]  >= 0.9) | (logreg_labels_2[:,1] <= 0.01)]
logreg_labels_2 = (logreg_labels_2[:,1] > 0.5).astype(int)
good_logreg_unlabelled_data_2 = x_unlabelled_data_2[indexing[0]]

new_x_train_data_logreg = np.vstack((new_x_train_data_logreg, good_logreg_unlabelled_data_2))
new_y_train_data_logreg = np.append(new_y_train_data_logreg, logreg_labels_2)

new_x_train_data_logreg, new_y_train_data_logreg = oversample.fit_resample(new_x_train_data_logreg, new_y_train_data_logreg)

### Third training

In [30]:
log_model_three = LogisticRegression(max_iter = 10000)
log_model_three.fit(new_x_train_data_logreg, new_y_train_data_logreg)
logreg_labels_3 = log_model_three.predict_proba(x_unlabelled_data_3)

Adjustment to training data.

In [31]:
indexing = np.where((logreg_labels_3[:,1] >= 0.9) | (logreg_labels_3[:,1] <= 0.01))
logreg_labels_3 = logreg_labels_3[(logreg_labels_3[:,1]  >= 0.9) | (logreg_labels_3[:,1] <= 0.01)]
logreg_labels_3 = (logreg_labels_3[:,1] > 0.5).astype(int)
good_logreg_unlabelled_data_3 = x_unlabelled_data_3[indexing[0]]

new_x_train_data_logreg = np.vstack((new_x_train_data_logreg, good_logreg_unlabelled_data_3))
new_y_train_data_logreg = np.append(new_y_train_data_logreg, logreg_labels_3)

new_x_train_data_logreg, new_y_train_data_logreg = oversample.fit_resample(new_x_train_data_logreg, new_y_train_data_logreg)

### Fourth training

In [32]:
log_model_four = LogisticRegression(max_iter = 10000)
log_model_four.fit(new_x_train_data_logreg, new_y_train_data_logreg)

### Testing accuracy of linear model
We will now test the accuracy of our linear model.

In [33]:
from sklearn.metrics import accuracy_score

logreg_accuracy = log_model_four.predict(x_test_data)
logreg_accuracy = (logreg_accuracy > 0.5).astype(int)
accuracy_score(y_test_data, logreg_accuracy)

0.48998902907295666

# How about simply using labelled data?
Logistic Regression's non-SSL model is basically the first training iteration.

In [34]:
labelled_data_accuracy = log_model_one.predict(x_test_data)
labelled_data_accuracy = (labelled_data_accuracy > 0.5).astype(int)
accuracy_score(y_test_data, labelled_data_accuracy)

0.5068568294020844

Training accuracy for each training

In [35]:
labelled_data_accuracy = log_model_one.predict(x_test_data)
labelled_data_accuracy = (labelled_data_accuracy > 0.5).astype(int)
print(accuracy_score(y_test_data, labelled_data_accuracy))

labelled_data_accuracy = log_model_two.predict(x_test_data)
labelled_data_accuracy = (labelled_data_accuracy > 0.5).astype(int)
print(accuracy_score(y_test_data, labelled_data_accuracy))

labelled_data_accuracy = log_model_three.predict(x_test_data)
labelled_data_accuracy = (labelled_data_accuracy > 0.5).astype(int)
print(accuracy_score(y_test_data, labelled_data_accuracy))

labelled_data_accuracy = log_model_four.predict(x_test_data)
labelled_data_accuracy = (labelled_data_accuracy > 0.5).astype(int)
print(accuracy_score(y_test_data, labelled_data_accuracy))

0.5068568294020844
0.502194185408667
0.49506308283049916
0.48998902907295666
