In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('../data/isi_only_train.csv')
test = pd.read_csv('../data/isi_only_test.csv')

data = pd.concat([train, test], axis=0)

In [28]:
data['ISI'] = data.ISI1a + data.ISI1b + data.ISI1c + data.ISI2 + data.ISI3 + data.ISI4 + data.ISI5
train['ISI'] = train.ISI1a + train.ISI1b + train.ISI1c + train.ISI2 + train.ISI3 + train.ISI4 + train.ISI5
test['ISI'] = test.ISI1a + test.ISI1b + test.ISI1c + test.ISI2 + test.ISI3 + test.ISI4 + test.ISI5
data.head()

Unnamed: 0,ISI1a,ISI1b,ISI1c,ISI2,ISI3,ISI4,ISI5,OSA,insomnia,"PLMD, RLS",parasomnia,circadian rhythm sleep-wake disorder,"Narcolepsy, EDS, Hypersomnia",Catathrenia,control,ISI,ISI cut
0,1,1,0,2,3,3,3,0,0,0,0,0,0,0,0,13,1
1,0,3,3,3,2,2,2,1,1,1,0,0,0,0,0,15,2
2,2,2,2,2,2,0,3,0,1,0,0,0,0,0,0,13,1
3,2,2,2,3,2,1,3,1,0,0,0,0,0,0,0,15,2
4,1,1,1,3,2,2,3,1,0,0,0,0,0,0,0,13,1


In [29]:
bins = [0, 8, 15, 22, train['ISI'].max()+1]
labels = ['none', 'mild', 'moderate', 'severe']
data['ISI cut'] = pd.cut(data['ISI'], bins, labels=labels, right=False)
train['ISI cut'] = pd.cut(train['ISI'], bins, labels=labels, right=False)
test['ISI cut'] = pd.cut(test['ISI'], bins, labels=labels, right=False)

In [5]:
data.head()

Unnamed: 0,ISI1a,ISI1b,ISI1c,ISI2,ISI3,ISI4,ISI5,OSA,insomnia,"PLMD, RLS",parasomnia,circadian rhythm sleep-wake disorder,"Narcolepsy, EDS, Hypersomnia",Catathrenia,control,ISI,ISI cut
0,1,1,0,2,3,3,3,0,0,0,0,0,0,0,0,13,mild
1,0,3,3,3,2,2,2,1,1,1,0,0,0,0,0,15,moderate
2,2,2,2,2,2,0,3,0,1,0,0,0,0,0,0,13,mild
3,2,2,2,3,2,1,3,1,0,0,0,0,0,0,0,15,moderate
4,1,1,1,3,2,2,3,1,0,0,0,0,0,0,0,13,mild


In [6]:
data['ISI cut'].value_counts()

mild        2127
none        1591
moderate    1406
severe       356
Name: ISI cut, dtype: int64

In [7]:
data.isnull().sum()

ISI1a                                   0
ISI1b                                   0
ISI1c                                   0
ISI2                                    0
ISI3                                    0
ISI4                                    0
ISI5                                    0
OSA                                     0
insomnia                                0
PLMD, RLS                               0
parasomnia                              0
circadian rhythm sleep-wake disorder    0
Narcolepsy, EDS, Hypersomnia            0
Catathrenia                             0
control                                 0
ISI                                     0
ISI cut                                 0
dtype: int64

In [30]:
data['ISI cut'] = data['ISI cut'].replace(labels, [0, 1, 2, 3])
train['ISI cut'] = train['ISI cut'].replace(labels, [0, 1, 2, 3])
test['ISI cut'] = test['ISI cut'].replace(labels, [0, 1, 2, 3])

In [9]:
data.iloc[:, 7:15]

Unnamed: 0,OSA,insomnia,"PLMD, RLS",parasomnia,circadian rhythm sleep-wake disorder,"Narcolepsy, EDS, Hypersomnia",Catathrenia,control
0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
1091,0,0,0,1,0,0,0,0
1092,0,0,0,0,0,0,0,1
1093,1,0,0,1,0,0,0,0
1094,1,0,0,0,0,0,0,0


In [33]:
test.to_csv('../data/sampled/isi_only_test_mlsmote.csv', index=False)

In [10]:
y_data = data.iloc[:, 7:15]
y_train = train.iloc[:, 7:15]

features = [x for x in data.columns if x not in y_data.columns]
targets = list(y_data.columns)

X_data = data.loc[:, features]
X_train = train.loc[:, features]

# MLSMOTE

In [11]:
# -*- coding: utf-8 -*-
"""
@article{charte2015MLSMOTE,
  title={MLSMOTE: Approaching imbalanced multilabel learning through synthetic instance generation},
  author={Charte, Francisco and Rivera, Antonio J and del Jesus, Mar{\'\i}a J and Herrera, Francisco},
  journal={Knowledge-Based Systems},
  volume={89},
  pages={385--397},
  year={2015},
  publisher={Elsevier}
}
"""
import numpy as np
from sklearn.neighbors import NearestNeighbors


def IRLbl(Y):
    # imbalance ratio per label
    pos_nums_per_label = np.sum(Y, axis=0)
    max_pos_nums = np.max(pos_nums_per_label)
    return max_pos_nums / pos_nums_per_label


def MeanIR(Y):
    # average imbalance ratio
    IRLbl_VALUE = IRLbl(Y)
    return np.mean(IRLbl_VALUE)


def TailLabel(Y):
    n, m = Y.shape
    irlbl = IRLbl(Y)
    mean_ir = MeanIR(Y)
    return np.where(irlbl>=mean_ir)[0]


def MinBag(X, Y, label_index):
    pos = np.where(Y[:,label_index]==1)
    sample_index = list(set(pos[0]))
    X_minor, Y_minor = X[sample_index,:], Y[sample_index,:]
    return X_minor, Y_minor


def NN_index(X, k=5):
    # n_neighbors including the sample itself, 
    # so we take the number of n_neighbors as k+1 (as the following shows),
    # then delete itself from the neighbors.
    nn = NearestNeighbors(n_neighbors=k+1, metric='euclidean', 
                           algorithm='auto').fit(X)
    euclidean, index = nn.kneighbors(X)
    return index[:,1:]


def MLSMOTE(X_minor, Y_minor, k=5):
    n, p = X_minor.shape
    m = Y_minor.shape[1]
    X_synth = np.zeros((n,p))
    Y_synth = np.zeros((n,m))
    
    nn_index = NN_index(X_minor, k=5)
    for i in range(n):
        # generate sample feature, that is, X
        sample_X = X_minor[i,:]
        rand_ind = np.random.randint(0, k)
        ref_index = nn_index[i,rand_ind]
        refNeigh = X_minor[ref_index,:]
        diff = sample_X - refNeigh
        offset = diff*np.random.uniform(0, 1, (1,p))
        X_synth[i,:] = sample_X + offset
        
        # generate sample labels Y with the voting method
        sample_nn_index = nn_index[i,:]
        nn_label = Y_minor[sample_nn_index,:]
        Y_synth[i,:] = (np.sum(nn_label, axis=0)>=((k+1)/2))
    X_new = np.r_[X_minor, X_synth]
    Y_new = np.r_[Y_minor, Y_synth]
    return X_new, Y_new

In [12]:
label_index = TailLabel(y_data.values)

X_minor, Y_minor = MinBag(X_data.values, y_data.values, label_index)
X_new, Y_new = MLSMOTE(X_minor, Y_minor, k=5)

print(X_new.shape, Y_new.shape)

(16, 9) (16, 8)


In [13]:
X = pd.DataFrame(X_new, columns=features)
y = pd.DataFrame(Y_new, columns=targets)

In [14]:
X_mlsmote = pd.concat([X_train, X], axis=0)
y_mlsmote = pd.concat([y_train, y], axis=0)

In [16]:
print(X_train.shape, X_mlsmote.shape)
print(y_train.shape, y_mlsmote.shape)

(4384, 9) (4400, 9)
(4384, 8) (4400, 8)


In [17]:
data_mlsmote = pd.concat([X_mlsmote, y_mlsmote], axis=1)
data_mlsmote.head()

Unnamed: 0,ISI1a,ISI1b,ISI1c,ISI2,ISI3,ISI4,ISI5,ISI,ISI cut,OSA,insomnia,"PLMD, RLS",parasomnia,circadian rhythm sleep-wake disorder,"Narcolepsy, EDS, Hypersomnia",Catathrenia,control
0,1.0,1.0,0.0,2.0,3.0,3.0,3.0,13.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,3.0,3.0,3.0,2.0,2.0,2.0,15.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2.0,2.0,2.0,2.0,2.0,0.0,3.0,13.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,2.0,2.0,3.0,2.0,1.0,3.0,15.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,3.0,2.0,2.0,3.0,13.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
data_mlsmote.to_csv('../data/isi_only_train_mlsmote.csv', index=False)

# MLeNN

In [19]:
# -*- coding: utf-8 -*-
"""
@inproceedings{charte2014MLeNN,
  title={MLeNN: a first approach to heuristic multilabel undersampling},
  author={Charte, Francisco and Rivera, Antonio J and del Jesus, Mar{\'\i}a J and Herrera, Francisco},
  booktitle={International Conference on Intelligent Data Engineering and Automated Learning},
  pages={1--9},
  year={2014},
  organization={Springer}
}
"""

import numpy as np
from sklearn.neighbors import NearestNeighbors


def caculate_IRLbl(Y):
    # imbalance ratio per label
    posNumsPerLabel = np.sum(Y, axis=0)
    maxPosNums = np.max(posNumsPerLabel)
    return maxPosNums / posNumsPerLabel


def caculate_meanIR(Y):
    # average imbalance ratio
    IRLbl = caculate_IRLbl(Y)
    return np.mean(IRLbl)


def get_minBag(Y):
    n, m = Y.shape
    IRLbl = caculate_IRLbl(Y)
    meanIR = caculate_meanIR(Y)
    return [i for i in range(m) if IRLbl[i] > meanIR]


def get_minMajInstInd(Y, minBag):
    n, m = Y.shape
    minInstInd = []
    majInstInd = []
    for i in range(n):
        if (Y[i, minBag]==1).any():
            minInstInd.append(i)
        else:
            majInstInd.append(i)
    return minInstInd, majInstInd


def adjust_hamming_distance(y1, y2):
    flag1 = np.sum(y1)
    flag2 = np.sum(y2)
    if flag := (flag1 and flag2):
        ele = np.sum((y1 + y2)==1)
        den = flag1 + flag2
        return ele / den
    else:
        return 1


def NN_index(X, k=5):
    # n_neighbors including the sample itself, 
    # so we take the number of n_neighbors as k+1 (as the following shows),
    # then delete itself from the neighbors.
    nn = NearestNeighbors(n_neighbors=k+1, metric='euclidean', 
                           algorithm='auto').fit(X)
    euclidean, index = nn.kneighbors(X)
    return index[:,1:]


def MLeNN(X, Y, NN=3, HT=0.75):
    # MLeNN (MultiLabel edited Nearest Neighbor)
    nnIndex = NN_index(X, NN)
    minBag = get_minBag(Y)
    minInstInd, majInstInd = get_minMajInstInd(Y, minBag)
    markForRemoving = []
    for sampleIndex in majInstInd:
        numDifferences = 0
        sampleNNIndexs = nnIndex[sampleIndex,:]
        for sampleNNIndex in sampleNNIndexs:
            adjustedHammingDist = adjust_hamming_distance(Y[sampleIndex,:],
                                                          Y[sampleNNIndex,:])
            if adjustedHammingDist > HT:
                numDifferences = numDifferences + 1
        if numDifferences >= (NN/2):
#             print("Remove:", sampleIndex)
            markForRemoving.append(sampleIndex)
    
    X_new = np.delete(X, markForRemoving, axis=0)
    Y_new = np.delete(Y, markForRemoving, axis=0)
    return X_new, Y_new

In [21]:
X_mlenn, Y_mlenn = MLeNN(X_data.values, y_data.values, NN=5, HT=0.75)
print(X_mlenn.shape, Y_mlenn.shape)

(3922, 9) (3922, 8)


In [23]:
X_mlenn_df = pd.DataFrame(X_mlenn, columns=features)
y_mlenn = pd.DataFrame(Y_mlenn, columns=targets)

data_mlenn = pd.concat([X_mlenn_df, y_mlenn], axis=1)
data_mlenn.head()

Unnamed: 0,ISI1a,ISI1b,ISI1c,ISI2,ISI3,ISI4,ISI5,ISI,ISI cut,OSA,insomnia,"PLMD, RLS",parasomnia,circadian rhythm sleep-wake disorder,"Narcolepsy, EDS, Hypersomnia",Catathrenia,control
0,0,3,3,3,2,2,2,15,2,1,1,1,0,0,0,0,0
1,2,2,2,3,2,1,3,15,2,1,0,0,0,0,0,0,0
2,1,1,1,3,2,2,3,13,1,1,0,0,0,0,0,0,0
3,0,0,0,1,1,0,1,3,0,1,0,0,0,0,0,0,0
4,3,0,0,4,3,1,3,14,1,1,0,0,0,0,0,0,0


In [25]:
target = data_mlenn.iloc[:, 9:17]
features = data_mlenn.drop(data_mlenn.iloc[:, 9:17], axis=1)

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

In [27]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

train.to_csv('../data/isi_only_train_mlenn.csv', index=False)
train.to_csv('../data/isi_only_test_mlenn.csv', index=False)