In [1]:
import pandas as pd

data = pd.read_csv('../data/preprocessed_data_a.csv')

features = data.drop(data.iloc[:, 2:10], axis=1)
targets = data.iloc[:, 2:10]

X = features.to_numpy()
Y = targets.to_numpy()

ref: https://github.com/Seal-Li/process-method-for-label-imbalance

# MLSMOTE

In [2]:
# -*- coding: utf-8 -*-
"""
@article{charte2015MLSMOTE,
  title={MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation},
  author={Charte, Francisco and Rivera, Antonio J and del Jesus, Mar{\'\i}a J and Herrera, Francisco},
  journal={Knowledge-Based Systems},
  volume={89},
  pages={385--397},
  year={2015},
  publisher={Elsevier}
}
"""
import numpy as np
from sklearn.neighbors import NearestNeighbors


def IRLbl(Y):
    # imbalance ratio per label
    pos_nums_per_label = np.sum(Y, axis=0)
    max_pos_nums = np.max(pos_nums_per_label)
    return max_pos_nums / pos_nums_per_label


def MeanIR(Y):
    # average imbalance ratio
    IRLbl_VALUE = IRLbl(Y)
    return np.mean(IRLbl_VALUE)


def TailLabel(Y):
    n, m = Y.shape
    irlbl = IRLbl(Y)
    mean_ir = MeanIR(Y)
    return np.where(irlbl>=mean_ir)[0]


def MinBag(X, Y, label_index):
    pos = np.where(Y[:,label_index]==1)
    sample_index = list(set(pos[0]))
    X_minor, Y_minor = X[sample_index,:], Y[sample_index,:]
    return X_minor, Y_minor


def NN_index(X, k=5):
    # n_neighbors including the sample itself, 
    # so we take the number of n_neighbors as k+1 (as the following shows),
    # then delete itself from the neighbors.
    nn = NearestNeighbors(n_neighbors=k+1, metric='euclidean', 
                           algorithm='auto').fit(X)
    euclidean, index = nn.kneighbors(X)
    return index[:,1:]


def MLSMOTE(X_minor, Y_minor, k=5):
    n, p = X_minor.shape
    m = Y_minor.shape[1]
    X_synth = np.zeros((n,p))
    Y_synth = np.zeros((n,m))
    
    nn_index = NN_index(X_minor, k=5)
    for i in range(n):
        # generate sample feature, that is, X
        sample_X = X_minor[i,:]
        rand_ind = np.random.randint(0, k)
        ref_index = nn_index[i,rand_ind]
        refNeigh = X_minor[ref_index,:]
        diff = sample_X - refNeigh
        offset = diff*np.random.uniform(0, 1, (1,p))
        X_synth[i,:] = sample_X + offset
        
        # generate sample labels Y with the voting method
        sample_nn_index = nn_index[i,:]
        nn_label = Y_minor[sample_nn_index,:]
        Y_synth[i,:] = (np.sum(nn_label, axis=0)>=((k+1)/2))
    X_new = np.r_[X_minor, X_synth]
    Y_new = np.r_[Y_minor, Y_synth]
    return X_new, Y_new

In [3]:
label_index = TailLabel(Y)

X_minor, Y_minor = MinBag(X, Y, label_index)
X_new, Y_new = MLSMOTE(X_minor, Y_minor, k=5)

print(X_new.shape, Y_new.shape)

(14, 77) (14, 8)


In [4]:
mlsmote_data = pd.DataFrame(X_new, columns=data.columns[:2].tolist()+data.columns[10:].tolist())
mlsmote_data.describe()

Unnamed: 0,sex,age,height,weight,BMI,Time in Bed(min),Total sleep time(min),Sleep latency(min),N2 sleep latency(min),REM sleep latency(min),...,Lowest SaO2 (%),ODI,90%ODI,EtCO2 (>50mmHg),Total LMI =PLMI(/h),Total LM Arousal#,Total LM AI(/h),PLM Arousal#,PLM AI(/h),MAI(/h)
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,...,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0
mean,0.449512,31.223814,165.354785,59.458612,21.110492,447.656153,413.521667,6.434447,3.570324,81.446754,...,93.843842,1.172463,0.0,0.0,2.973592,1.65227,0.35266,4.08442,0.416332,0.706832
std,0.667332,14.38811,10.080765,13.931357,3.926252,36.348318,28.650919,6.102241,8.065626,30.834373,...,2.007903,1.380754,0.0,0.0,22.208194,5.761948,1.085948,17.254744,2.266397,4.359757
min,-0.473626,10.081052,139.518057,35.47478,13.255465,370.607446,363.80296,-1.19984,-14.341737,28.426269,...,89.623513,-0.407948,0.0,0.0,-33.884496,-5.627436,-0.895353,-22.655938,-4.072774,-5.295148
25%,0.0,25.864203,162.5,52.0,18.725,432.875,396.384131,2.657998,0.0,64.553454,...,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,29.145797,167.5,61.0,21.249024,458.971218,411.029395,4.75,4.741384,80.757222,...,94.5,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,35.635344,169.003192,67.742848,23.25,467.07244,440.401467,9.5,6.25,97.611556,...,95.0,1.940814,0.0,0.0,0.0,0.75,0.15,0.0,0.0,0.15
max,1.840754,59.39072,181.670086,83.165848,29.25443,493.815663,457.191742,21.567384,17.068392,152.845118,...,96.435244,3.87698,0.0,0.0,61.132561,15.22137,3.274424,52.966313,5.682046,12.597887


In [5]:
mlsmote_label = pd.DataFrame(Y_new, columns=data.columns[2:10].tolist())
mlsmote_label.describe()

Unnamed: 0,OSA,insomnia,"PLMD, RLS",parasomnia,circadian rhythm sleep-wake disorder,"Narcolepsy, EDS, Hypersomnia",Catathrenia,control
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0
mean,0.0,0.0,0.071429,0.0,0.0,0.0,1.0,0.0
std,0.0,0.0,0.267261,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


# MLeNN

In [6]:
# -*- coding: utf-8 -*-
"""
@inproceedings{charte2014MLeNN,
  title={MLeNN: a first approach to heuristic multilabel undersampling},
  author={Charte, Francisco and Rivera, Antonio J and del Jesus, Mar{\'\i}a J and Herrera, Francisco},
  booktitle={International Conference on Intelligent Data Engineering and Automated Learning},
  pages={1--9},
  year={2014},
  organization={Springer}
}
"""

import numpy as np
from sklearn.neighbors import NearestNeighbors


def caculate_IRLbl(Y):
    # imbalance ratio per label
    posNumsPerLabel = np.sum(Y, axis=0)
    maxPosNums = np.max(posNumsPerLabel)
    return maxPosNums / posNumsPerLabel


def caculate_meanIR(Y):
    # average imbalance ratio
    IRLbl = caculate_IRLbl(Y)
    return np.mean(IRLbl)


def get_minBag(Y):
    n, m = Y.shape
    IRLbl = caculate_IRLbl(Y)
    meanIR = caculate_meanIR(Y)
    return [i for i in range(m) if IRLbl[i] > meanIR]


def get_minMajInstInd(Y, minBag):
    n, m = Y.shape
    minInstInd = []
    majInstInd = []
    for i in range(n):
        if (Y[i, minBag]==1).any():
            minInstInd.append(i)
        else:
            majInstInd.append(i)
    return minInstInd, majInstInd


def adjust_hamming_distance(y1, y2):
    flag1 = np.sum(y1)
    flag2 = np.sum(y2)
    if flag := (flag1 and flag2):
        ele = np.sum((y1 + y2)==1)
        den = flag1 + flag2
        return ele / den
    else:
        return 1


def NN_index(X, k=5):
    # n_neighbors including the sample itself, 
    # so we take the number of n_neighbors as k+1 (as the following shows),
    # then delete itself from the neighbors.
    nn = NearestNeighbors(n_neighbors=k+1, metric='euclidean', 
                           algorithm='auto').fit(X)
    euclidean, index = nn.kneighbors(X)
    return index[:,1:]


def MLeNN(X, Y, NN=3, HT=0.75):
    # MLeNN (MultiLabel edited Nearest Neighbor)
    nnIndex = NN_index(X, NN)
    minBag = get_minBag(Y)
    minInstInd, majInstInd = get_minMajInstInd(Y, minBag)
    markForRemoving = []
    for sampleIndex in majInstInd:
        numDifferences = 0
        sampleNNIndexs = nnIndex[sampleIndex,:]
        for sampleNNIndex in sampleNNIndexs:
            adjustedHammingDist = adjust_hamming_distance(Y[sampleIndex,:],
                                                          Y[sampleNNIndex,:])
            if adjustedHammingDist > HT:
                numDifferences = numDifferences + 1
        if numDifferences >= (NN/2):
#             print("Remove:", sampleIndex)
            markForRemoving.append(sampleIndex)
    
    X_new = np.delete(X, markForRemoving, axis=0)
    Y_new = np.delete(Y, markForRemoving, axis=0)
    return X_new, Y_new

In [16]:
X_new, Y_new = MLeNN(X, Y, NN=5, HT=0.75)
print(X_new.shape, Y_new.shape)

(4149, 77) (4149, 8)


In [17]:
mlenn_label = pd.DataFrame(Y_new, columns=data.columns[2:10].tolist())
mlenn_label.describe()

Unnamed: 0,OSA,insomnia,"PLMD, RLS",parasomnia,circadian rhythm sleep-wake disorder,"Narcolepsy, EDS, Hypersomnia",Catathrenia,control
count,4149.0,4149.0,4149.0,4149.0,4149.0,4149.0,4149.0,4149.0
mean,0.899012,0.04242,0.103398,0.028923,0.007231,0.010123,0.001687,0.046517
std,0.301349,0.201569,0.304515,0.167609,0.084736,0.100114,0.041045,0.210628
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
X_new, Y_new = MLeNN(X, Y, NN=3, HT=0.75)
print(X_new.shape, Y_new.shape)

(4166, 77) (4166, 8)


In [8]:
mlenn_data = pd.DataFrame(X_new, columns=data.columns[:2].tolist()+data.columns[10:].tolist())
mlenn_data.describe()

Unnamed: 0,sex,age,height,weight,BMI,Time in Bed(min),Total sleep time(min),Sleep latency(min),N2 sleep latency(min),REM sleep latency(min),...,Lowest SaO2 (%),ODI,90%ODI,EtCO2 (>50mmHg),Total LMI =PLMI(/h),Total LM Arousal#,Total LM AI(/h),PLM Arousal#,PLM AI(/h),MAI(/h)
count,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,...,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0
mean,0.233797,52.109698,168.040927,73.826668,26.01788,428.199208,351.875636,11.557873,7.643063,103.592895,...,83.026164,22.390374,8.353865,0.020883,10.45144,1.545607,0.271003,3.911426,0.695199,0.967883
std,0.423296,14.353845,8.609825,14.718925,4.120377,52.329486,66.5604,17.899349,13.86142,63.741051,...,8.17213,22.985435,17.245257,0.953251,21.570983,4.024202,0.704712,13.199861,2.340126,2.677805
min,0.0,13.0,130.0,38.0,13.5,96.9,60.0,0.0,0.0,0.0,...,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,42.0,163.0,64.0,23.5,400.025,314.125,3.0,1.5,65.0,...,79.0,2.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,54.0,169.0,73.0,25.5,433.0,358.5,6.5,4.0,86.5,...,85.0,15.95,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,62.0,174.0,82.0,28.07,462.5,399.0,12.5,8.0,126.0,...,89.0,34.5,7.3,0.0,11.6,1.0,0.3,1.0,0.1,0.6
max,1.0,91.0,194.0,181.0,57.02,642.0,520.5,303.0,224.0,446.5,...,99.0,141.9,169.4,44.5,187.3,129.0,21.5,218.0,37.5,38.8


In [9]:
mlenn_label = pd.DataFrame(Y_new, columns=data.columns[2:10].tolist())
mlenn_label.describe()

Unnamed: 0,OSA,insomnia,"PLMD, RLS",parasomnia,circadian rhythm sleep-wake disorder,"Narcolepsy, EDS, Hypersomnia",Catathrenia,control
count,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0,4166.0
mean,0.887902,0.045847,0.105857,0.029525,0.006961,0.011522,0.00168,0.048008
std,0.315525,0.209179,0.307691,0.169292,0.083152,0.106732,0.040962,0.213808
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
