# Semi-supervised - label propagation

In [5]:
import pandas as pd

from sklearn.metrics import roc_curve,auc,precision_recall_curve,confusion_matrix,accuracy_score,classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
import statistics
from sklearn.multiclass import OneVsRestClassifier

data=pd.read_csv(r'C:\Users\ADMIN\Desktop\new_df.csv')
df = pd.DataFrame(data)
test_df = df

print(df.shape)


(67836, 22)


In [7]:
label_encoder = LabelEncoder()
df['readmitted']= label_encoder.fit_transform(df['readmitted']) 
print(df['readmitted'].shape)
print(df['readmitted'].head(5))

(67836,)
0    1
1    2
2    2
3    2
4    1
Name: readmitted, dtype: int32


In [8]:
y = data['readmitted']
x = data.drop('readmitted', axis=1)
print('target : ', y.shape)
print('features : ',x.shape)

target :  (67836,)
features :  (67836, 21)


In [9]:
x = pd.get_dummies(x,drop_first=True)
print(x.shape)




(67836, 36)


In [10]:
# Balanced sampling technique - SMOTEMEK - imbalanced data

from collections import Counter
from imblearn.under_sampling import TomekLinks 
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE

smote_sampler = SMOTE(sampling_strategy='minority')
tomek_sampler = TomekLinks(sampling_strategy='majority')

print('\nBalanced sampling using SMOTETomek:')

smt = SMOTETomek(random_state=52,smote=smote_sampler,tomek=tomek_sampler)
X, Y = smt.fit_resample(x, y)
print('SMOTETomek Resampled dataset shape %s' % Counter(Y))


Balanced sampling using SMOTETomek:
SMOTETomek Resampled dataset shape Counter({2: 39953, 0: 39598, 1: 21751})


In [11]:
# user-defined unlabelling function

import random

def maskfunc(true_target, percentage):
    if (percentage >=0 and percentage <= 100):
       
        n_total_samples = len(true_target)
        unlabel = (100-percentage)
        n_labeled_points = int((unlabel * n_total_samples)/100)
        indices = np.arange(n_total_samples)
        unlabeled_set = indices[n_labeled_points:]
        
        labels = true_target.copy()
        labels[unlabeled_set] = -1
        
        print('Total samples considered :\t\t',n_total_samples)
        print('Count of labelled points at ',percentage,'% :\t',n_labeled_points)
        print('Count of unlabelled points at ',percentage,'% :\t',(n_total_samples - n_labeled_points))
        return labels,unlabeled_set
    else: 
        return 'Percentage not in range of (0,100)'
    



In [12]:
# classification report and confusion matrix function

def report_conf(unlabeled_set_value, percen):
    unlabeled_set = unlabeled_set_value
    predicted_labels = label_prop_model.transduction_[unlabeled_set]
    true_labels = Y[unlabeled_set]
    print('Classifcation report at ',percen,'% :\n')
    print(classification_report(true_labels, predicted_labels))

    cm = confusion_matrix(true_labels, predicted_labels, labels=label_prop_model.classes_)
    print('Confusion matrix at ',percen,'% :\n',cm)

In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn import datasets
from sklearn.semi_supervised import LabelPropagation

score = {}
#RUN THE MODEL
label_prop_model = LabelPropagation(kernel='knn', n_neighbors=8, max_iter=100, gamma=20,n_jobs=None)

# 100% labelled data
ml,unlabeled_set_0 = maskfunc(Y, 0)
label_prop_model.fit(X, ml)
score_zero = label_prop_model.score(X, ml)
print('\nAccuracy with 0 % unlabelled data\t',round(score_zero,4))

print('\nMetrics:\n')
predicted_labels = label_prop_model.transduction_[unlabeled_set_0]
true_labels = Y[unlabeled_set_0]
cm = confusion_matrix(true_labels, predicted_labels, labels=label_prop_model.classes_)
print('Classifcation report at 0% :\n')
print(classification_report(true_labels, predicted_labels))
print('Confusion matrix at 0% :\n',cm)



Total samples considered : 101250
Count of labelled points at  0 % : 101250
Count of unlabelled points at  0 % : 0

Accuracy with 0 % unlabelled data 0.6733


In [264]:
# 90% labelled data
ml,unlabeled_set_10 = maskfunc(Y, 10)
label_prop_model.fit(X, ml)
score_ten = label_prop_model.score(X, ml)
print('\nAccuracy with 10 % unlabelled data\t',round(score_ten,4))
print('\nMetrics:\n')
report_conf(unlabeled_set_10,10)

Total samples considered :		 101250
Count of labelled points at  10 % :	 91125
Count of unlabelled points at  10 % :	 10125
Accuracy with 10 % unlabelled data	 0.5931
Classifcation report at  10 % :

              precision    recall  f1-score   support

           0       1.00      0.93      0.96     10125
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0

    accuracy                           0.93     10125
   macro avg       0.33      0.31      0.32     10125
weighted avg       1.00      0.93      0.96     10125

Confusion matrix at  10 % :
 [[9374  217  534]
 [   0    0    0]
 [   0    0    0]]


In [265]:
# 80% labelled data
ml,unlabeled_set_20 = maskfunc(Y, 20)
label_prop_model.fit(X, ml)
score_twenty = label_prop_model.score(X, ml)
print('\nAccuracy with 20 % unlabelled data\t',round(score_twenty,4))
print('\nMetrics:\n')
report_conf(unlabeled_set_20,20)

Total samples considered :		 101250
Count of labelled points at  20 % :	 81000
Count of unlabelled points at  20 % :	 20250
Accuracy with 20 % unlabelled data	 0.513

Metrics:

Classifcation report at  20 % :

              precision    recall  f1-score   support

           0       1.00      0.87      0.93     20250
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0

    accuracy                           0.87     20250
   macro avg       0.33      0.29      0.31     20250
weighted avg       1.00      0.87      0.93     20250

Confusion matrix at  20 % :
 [[17575   728  1947]
 [    0     0     0]
 [    0     0     0]]


In [266]:
# 50% labelled data
ml,unlabeled_set_50 = maskfunc(Y, 50)
label_prop_model.fit(X, ml)
score_fifty = label_prop_model.score(X, ml)
print('\nAccuracy with 50 % unlabelled data\t',round(score_fifty,4))
print('\nMetrics:\n')
report_conf(unlabeled_set_50,50)

Total samples considered :		 101250
Count of labelled points at  50 % :	 50625
Count of unlabelled points at  50 % :	 50625

Accuracy with 50 % unlabelled data	 0.1771

Metrics:

Classifcation report at  50 % :

              precision    recall  f1-score   support

           0       0.73      0.84      0.78     34807
           1       0.14      0.13      0.13      4193
           2       0.37      0.21      0.27     11625

    accuracy                           0.64     50625
   macro avg       0.41      0.39      0.39     50625
weighted avg       0.60      0.64      0.61     50625

Confusion matrix at  50 % :
 [[29356  1994  3457]
 [ 2946   534   713]
 [ 7918  1289  2418]]


In [267]:

# 10% labelled data
ml,unlabeled_set_90 = maskfunc(Y, 90)
label_prop_model.fit(X, ml)
score_ninety = label_prop_model.score(X, ml)
print('\nAccuracy with 90 % unlabelled data\t',round(score_ninety,4))
print('\nMetrics:\n')
report_conf(unlabeled_set_90,90)


Total samples considered :		 101250
Count of labelled points at  90 % :	 10125
Count of unlabelled points at  90 % :	 91125

Accuracy with 90 % unlabelled data	 0.016

Metrics:

Classifcation report at  90 % :

              precision    recall  f1-score   support

           0       0.43      0.98      0.59     38588
           1       0.21      0.01      0.02     18072
           2       0.48      0.02      0.04     34465

    accuracy                           0.42     91125
   macro avg       0.37      0.34      0.22     91125
weighted avg       0.40      0.42      0.27     91125

Confusion matrix at  90 % :
 [[37707   383   498]
 [17610   215   247]
 [33369   412   684]]


In [268]:
# 5% labelled data
ml,unlabeled_set_95 = maskfunc(Y, 95)
label_prop_model.fit(X, ml)
score_ninefive = label_prop_model.score(X, ml)
print('\nAccuracy with 95 % unlabelled data\t',round(score_ninefive,4))
print('\nMetrics:\n')
report_conf(unlabeled_set_95,95)

Total samples considered :		 101250
Count of labelled points at  95 % :	 5062
Count of unlabelled points at  95 % :	 96188

Accuracy with 95 % unlabelled data	 0.0072

Metrics:

Classifcation report at  95 % :

              precision    recall  f1-score   support

           0       0.41      0.99      0.58     39065
           1       0.21      0.01      0.01     19791
           2       0.54      0.01      0.02     37332

    accuracy                           0.41     96188
   macro avg       0.39      0.34      0.20     96188
weighted avg       0.42      0.41      0.25     96188

Confusion matrix at  95 % :
 [[38643   207   215]
 [19532   121   138]
 [36673   237   422]]


# Semisupervised learning - label spreading

In [13]:
from sklearn.semi_supervised import LabelSpreading

label_spread_model = LabelSpreading(kernel='knn',n_neighbors=7)



In [22]:
# classification report and confusion matrix function
from sklearn.metrics import roc_curve,auc,precision_recall_curve,confusion_matrix,accuracy_score,classification_report
import time
import numpy as np
def report_conf_spread(unlabeled_set_value, percen):
    unlabeled_set = unlabeled_set_value
    predicted_labels = label_spread_model.transduction_[unlabeled_set]
    true_labels = Y[unlabeled_set]
    
    cm = confusion_matrix(true_labels, predicted_labels, labels=label_spread_model.classes_)
    print('Classifcation report at ',percen,'% :\n')
    print(classification_report(true_labels, predicted_labels))

    print('Confusion matrix at ',percen,'% :\n',cm)

In [275]:
# 100% labelled data
start = time.time()
label_spread_model.fit(X, Y)
sp_score_zero = label_spread_model.score(X, Y)
print('Accuracy with 0 % unlabelled data\t',round(sp_score_zero,4))
end = time.time()
print('Run time :\t',round(end-start, 2),' seconds')

Accuracy with 0 % unlabelled data	 0.6856
Run time :	 490.89  seconds


In [277]:
# 90% labelled data
start = time.time()
ml,unlabeled_set_10 = maskfunc(Y, 10)
label_spread_model.fit(X, ml)
sp_score_ten = label_spread_model.score(X, ml)
print('\nAccuracy with 10 % unlabelled data\t',round(sp_score_ten,4))
end = time.time()
print('Run time :\t',round(end-start, 2),' seconds')
print('\nMetrics:\n')
report_conf_spread(unlabeled_set_10,10)

Total samples considered :		 101250
Count of labelled points at  10 % :	 91125
Count of unlabelled points at  10 % :	 10125

Accuracy with 10 % unlabelled data	 0.6002
Run time :	 501.21  seconds

Metrics:

Classifcation report at  10 % :

              precision    recall  f1-score   support

           0       1.00      0.90      0.95     10125
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0

    accuracy                           0.90     10125
   macro avg       0.33      0.30      0.32     10125
weighted avg       1.00      0.90      0.95     10125

Confusion matrix at  10 % :
 [[9105  291  729]
 [   0    0    0]
 [   0    0    0]]


In [23]:
import warnings
warnings.filterwarnings('ignore')

# 80% labelled data
start = time.time()
ml,unlabeled_set_20 = maskfunc(Y, 20)
label_spread_model.fit(X, ml)
sp_score_twenty = label_spread_model.score(X, ml)
print('\nAccuracy with 20 % unlabelled data\t',round(sp_score_twenty,4))
end = time.time()
print('Run time :\t',round(end-start, 2),' seconds')
print('\nMetrics:\n')
report_conf_spread(unlabeled_set_20,20)

Total samples considered :		 101302
Count of labelled points at  20 % :	81041
Count of unlabelled points at  20 % :	20261

Accuracy with 20 % unlabelled data	 0.5218
Run time :	349.12  seconds

Metrics:

Classifcation report at  20 % :

              precision    recall  f1-score   support

           0       1.00      0.81      0.90     20261
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0

    accuracy                           0.81     20261
   macro avg       0.33      0.27      0.30     20261
weighted avg       1.00      0.81      0.90     20261

Confusion matrix at  20 % :
 [[16482  1122  2657]
 [    0     0     0]
 [    0     0     0]]


In [24]:
# 50% labelled data
start = time.time()
ml,unlabeled_set_50 = maskfunc(Y, 50)
label_spread_model.fit(X, ml)
sp_score_fifty = label_spread_model.score(X, ml)
print('\nAccuracy with 50 % unlabelled data\t',round(sp_score_fifty,4))
end = time.time()
print('Run time :\t',round(end-start, 2),' seconds')
print('\nMetrics:\n')
report_conf_spread(unlabeled_set_50,50)

Total samples considered :		 101302
Count of labelled points at  50 % :	 50651
Count of unlabelled points at  50 % :	 50651

Accuracy with 50 % unlabelled data	 0.3342
Run time :	 363.44  seconds

Metrics:

Classifcation report at  50 % :

              precision    recall  f1-score   support

           0       0.94      0.55      0.69     34839
           1       0.14      0.38      0.20      4191
           2       0.36      0.58      0.45     11621

    accuracy                           0.54     50651
   macro avg       0.48      0.50      0.45     50651
weighted avg       0.74      0.54      0.60     50651

Confusion matrix at  50 % :
 [[19158  6127  9554]
 [  370  1612  2209]
 [  872  4054  6695]]


In [25]:
# 90% labelled data
start = time.time()
ml,unlabeled_set_90 = maskfunc(Y, 90)
label_spread_model.fit(X, ml)
sp_score_ninety = label_spread_model.score(X, ml)
print('\nAccuracy with 90 % unlabelled data\t',round(sp_score_ninety,4))
end = time.time()
print('Run time :\t',round(end-start, 2),' seconds')
print('\nMetrics:\n')
report_conf_spread(unlabeled_set_90,90)

Total samples considered :		 101302
Count of labelled points at  90 % :	 10130
Count of unlabelled points at  90 % :	 91172

Accuracy with 90 % unlabelled data	 0.0785
Run time :	 397.12  seconds

Metrics:

Classifcation report at  90 % :

              precision    recall  f1-score   support

           0       0.65      0.58      0.61     38644
           1       0.25      0.31      0.28     18066
           2       0.49      0.49      0.49     34462

    accuracy                           0.49     91172
   macro avg       0.46      0.46      0.46     91172
weighted avg       0.51      0.49      0.50     91172

Confusion matrix at  90 % :
 [[22257  6658  9729]
 [ 4519  5617  7930]
 [ 7713  9870 16879]]


In [None]:
# 95% labelled data
start = time.time()
ml,unlabeled_set_95 = maskfunc(Y, 95)
label_spread_model.fit(X, ml)
sp_score_ninetyfive = label_spread_model.score(X, ml)
print('\nAccuracy with 95 % unlabelled data\t',round(sp_score_ninetyfive,2))
end = time.time()
print('Run time :\t',round(end-start, 2),' seconds')
print('\nMetrics:\n')
report_conf_spread(unlabeled_set_95,95)

Total samples considered :		 101302
Count of labelled points at  95 % :	 5065
Count of unlabelled points at  95 % :	 96237


# semisupervised learning - particle competition and cooperation