In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import WeightedRandomSampler, Sampler

In [3]:
# Read the training metadata, key the rows by `image_id`, and order them by that key.
df = (
    pd.read_csv('./data/train.csv')
    .set_index('image_id')
    .sort_index()
)
df.head()

Unnamed: 0_level_0,data_provider,isup_grade,gleason_score
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0005f7aaab2800f6170c399693a96917,karolinska,0,0+0
000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0
0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4
001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4
001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0


In [4]:
# First five `isup_grade` values -- a tiny sample for trying out the class-frequency call below.
sub_df = df['isup_grade'].iloc[:5]
sub_df

image_id
0005f7aaab2800f6170c399693a96917    0
000920ad0b612851f8e01bcc880d9b3d    0
0018ae58b01bdadc8e347995b69f99aa    4
001c62abd11fa4b57bf7a6c603a11bb9    4
001d865e65ef5d2579c190a0e0350d8f    0
Name: isup_grade, dtype: int64

In [5]:
# Relative frequency of each grade in the five-row sample (not re-ordered by frequency).
sub_class_freqs = sub_df.value_counts(sort=False, normalize=True)
sub_class_freqs

0    0.6
4    0.4
Name: isup_grade, dtype: float64

In [8]:
# The `isup_grade` column for every row of the training frame.
targets = df['isup_grade']
targets

image_id
0005f7aaab2800f6170c399693a96917    0
000920ad0b612851f8e01bcc880d9b3d    0
0018ae58b01bdadc8e347995b69f99aa    4
001c62abd11fa4b57bf7a6c603a11bb9    4
001d865e65ef5d2579c190a0e0350d8f    0
                                   ..
ffd2841373b39792ab0c84cccd066e31    0
ffdc59cd580a1468eac0e6a32dd1ff2d    5
ffe06afd66a93258f8fabdef6044e181    0
ffe236a25d4cbed59438220799920749    2
ffe9bcababc858e04840669e788065a1    4
Name: isup_grade, Length: 10616, dtype: int64

### WeightedRandomSampler

In [93]:
# Synthetic, imbalanced label vector: 1000 draws from classes 0/1/2 with probabilities 0.6/0.3/0.1.
labels = np.random.choice(a=[0, 1, 2], size=1000, replace=True, p=[0.6, 0.3, 0.1])

In [94]:
# One positional index per entry of `labels`.
idxs = np.arange(labels.shape[0])

In [95]:
# Occurrences of each distinct label; np.unique reports them in sorted-label order.
_distinct_labels, counts = np.unique(labels, return_counts=True)
counts

array([592, 316,  92])

In [97]:
# Per-sample weights over the whole dataset (here the dataset is the labels array):
# each sample gets the inverse of its class count.
# NOTE(review): looking up by raw label value assumes the labels are exactly 0..K-1 -- confirm.
w = (1 / counts)[labels]

In [100]:
# Draw len(labels) positions with replacement, each position weighted by `w`.
draws = torch.multinomial(torch.from_numpy(w), num_samples=len(labels), replacement=True)
sampled = [idxs[pos] for pos in draws]

# Labels of the drawn examples and how often each class was drawn.
sampled_labels = labels[sampled]
sampled_label_counts = np.unique(sampled_labels, return_counts=True)[1]

print('examples sampled: ', sampled[:10])
print('labels sampled: ', sampled_labels[:10])
print('sampled distribution: ', sampled_label_counts / len(sampled))

examples sampled:  [910, 123, 72, 98, 152, 854, 176, 990, 710, 455]
labels sampled:  [1 1 0 0 1 2 2 2 1 1]
sampled distribution:  [0.33  0.338 0.332]


### Multinomial resampling -- note: the classes are sampled uniformly, BUT only ~60% of the unique ids are drawn

In [6]:
# Resample the training frame with replacement, weighting every row by the inverse
# of its class count so that each `isup_grade` class is drawn about equally often.
labels = df.isup_grade.values
idxs = np.arange(len(df))

# `label_pos[i]` is the position of labels[i] among the sorted distinct labels, i.e. the
# correct index into `counts` for row i. Indexing `counts` with the raw label values
# would only be right when the labels are exactly 0..K-1 with no class missing.
label_values, label_pos, counts = np.unique(labels, return_inverse=True, return_counts=True)
print('counts by label: ', counts)
weights = torch.from_numpy(1 / counts[label_pos])

# Draw len(df) row positions with replacement according to `weights`.
sampled_idxs = [idxs[j] for j in torch.multinomial(weights, len(df), True)]
sampled = df.iloc[sampled_idxs,:]

# Summarise the draw: which classes appear, how many distinct rows were hit,
# and the share of each class in the resampled frame.
sampled_label_values, sampled_label_counts = np.unique(sampled.isup_grade.values, return_counts=True)
print('sampled unique labels: ', sampled_label_values)
print('sampled unique ids: ', len(np.unique(sampled.index)))
print('sampled classes: ', sampled_label_counts / len(sampled))

counts by label:  [2892 2666 1343 1242 1249 1224]
sampled unique labels:  [0 1 2 3 4 5]
sampled unique ids:  6417
sampled classes:  [0.16842502 0.16823662 0.17040317 0.16296157 0.16041824 0.16955539]


### Randomly under-sample the majority classes

In [136]:
# Under-sample the two majority classes (0 and 1) down to the mean size of the
# remaining classes (`counts[2:]`), and keep every row of those remaining classes.
target_size = int(np.mean(counts[2:]))

# Row positions of the class-0 / class-1 examples that are kept (drawn without replacement).
sampled_class_0_idxs = np.random.choice(np.flatnonzero(labels == 0), size=target_size, replace=False)
sampled_class_1_idxs = np.random.choice(np.flatnonzero(labels == 1), size=target_size, replace=False)

# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; build the result
# with a single `pd.concat`. Row order is unchanged: all rows of the other classes
# first, then the class-0 sample, then the class-1 sample.
resampled_df = pd.concat([
    df[~df.isup_grade.isin([0, 1])],
    df.iloc[sampled_class_0_idxs],
    df.iloc[sampled_class_1_idxs],
])

In [141]:
# Summarise the under-sampled frame: classes present, row count, and per-class share.
# NOTE(review): the row count is reported as the number of ids; that equals the number of
# *unique* ids only if `resampled_df` has no duplicated index entries -- confirm.
resampled_grades = resampled_df['isup_grade']
resampled_class_freqs = resampled_grades.value_counts(sort=False, normalize=True)

print('sampled unique labels: ', resampled_grades.unique())
print('sampled unique ids: ', len(resampled_df))
print('sampled classses: ', resampled_class_freqs.values)

sampled unique labels:  [4 3 5 2 0 1]
sampled unique ids:  7586
sampled classses:  [0.16662273 0.1646454  0.16662273 0.16134985 0.17703665 0.16372265]
