# Downsampling

<span>In supervised learning, many datasets are class-imbalanced: one class has far more observations than the other. A common remedy is to downsample the majority class so that it has the same number of observations as the minority class. Below, I walk through the steps and then wrap them in a function that downsamples NumPy data.</span>

### Import Preliminaries

In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Load iris and keep only the first 80 rows so the two remaining
# classes are imbalanced
iris = load_iris()
X = iris.data[:80]
y = iris.target[:80]

### View Class Imbalance

In [42]:
# Count how many observations carry each target value
pd.Series(y).value_counts()

0    50
1    30
dtype: int64

### Retrieve Downsampled Class Index

In [43]:
# Positions of the observations that belong to each class
class_0 = np.flatnonzero(y == 0)
class_1 = np.flatnonzero(y == 1)
(class_1[:5], class_0[:5])

(array([50, 51, 52, 53, 54]), array([0, 1, 2, 3, 4]))

In [45]:
# Randomly select, without replacement, as many indices from the
# majority class as there are observations in the minority class
class_0_downsampled = np.random.choice(class_0, len(class_1), replace=False)
class_0_downsampled

array([30, 36,  3, 15, 18, 39, 48, 47, 26, 23, 46, 22, 24, 19,  0, 20, 42,
       32, 14, 41, 11, 34, 33,  5,  9, 12, 35,  1, 25,  6])

In [47]:
# Join the downsampled majority class indices with the minority class indices
downsampled_index = np.concatenate((class_0_downsampled, class_1))
downsampled_index

array([30, 36,  3, 15, 18, 39, 48, 47, 26, 23, 46, 22, 24, 19,  0, 20, 42,
       32, 14, 41, 11, 34, 33,  5,  9, 12, 35,  1, 25,  6, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79])

In [48]:
# Put the index back in ascending order so rows keep their original order
downsampled_index = np.sort(downsampled_index)
downsampled_index

array([ 0,  1,  3,  5,  6,  9, 11, 12, 14, 15, 18, 19, 20, 22, 23, 24, 25,
       26, 30, 32, 33, 34, 35, 36, 39, 41, 42, 46, 47, 48, 50, 51, 52, 53,
       54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
       71, 72, 73, 74, 75, 76, 77, 78, 79])

### Downsampling Data

In [49]:
# Keep only the selected rows of the features and of the targets
X = X[downsampled_index]
y = y[downsampled_index]

# Show how many observations remain in each array
print(f'DataFrame Shape of X: {len(X)}')
print(f'DataFrame Shape of y: {len(y)}')

DataFrame Shape of X: 60
DataFrame Shape of y: 60


In [51]:
# View the class counts of y after downsampling; both classes should
# now have the same number of observations
pd.Series(y).value_counts()

1    30
0    30
dtype: int64

### Creating a Function for Downsampling

In [77]:
def binary_random_downsample(X, y, *, random_state=None):
    '''
    Randomly downsample the majority class of a binary (0/1)
    classification dataset so both classes keep the same number of
    observations.

    Rows labelled 0 and rows labelled 1 are located in y, the larger
    group is randomly subsampled (without replacement) down to the size
    of the smaller group, and the surviving rows are returned in their
    original relative order. Rows whose label is neither 0 nor 1 are
    dropped. If one of the two classes is absent the result is empty.

    Parameters
    ----------
    X: numpy array containing all the features of the dataset, one
       observation per row
    y: numpy 1D array containing all target values of the dataset
    random_state: optional, keyword-only. Seed (or numpy Generator) used
       to draw the majority class sample, for reproducible results.
       When None (default) numpy's global random state is used.

    Returns
    ----------
    X, y: the downsampled feature rows and their matching target values

    Raises
    ----------
    TypeError: if X or y is not a numpy array

    Examples
    ----------
    >>> X, y = binary_random_downsample(X, y)
    '''

    # Explicit raises instead of assert: assert statements are removed
    # when Python runs with -O, which would silently skip the checks.
    if not isinstance(X, np.ndarray):
        raise TypeError('Make sure X is a numpy array')
    if not isinstance(y, np.ndarray):
        raise TypeError("Make sure y is a numpy array")

    class_0 = np.where(y == 0)[0]
    class_1 = np.where(y == 1)[0]

    # On a tie class_1 is treated as the "majority"; sampling all of its
    # indices without replacement keeps every row, so nothing is lost.
    if len(class_0) > len(class_1):
        magc, minc = class_0, class_1
    else:
        magc, minc = class_1, class_0

    # Without a seed, keep using numpy's global random state so existing
    # callers (including ones that call np.random.seed) behave as before.
    if random_state is None:
        choose = np.random.choice
    else:
        choose = np.random.default_rng(random_state).choice

    mag_class_downsampled = choose(magc, size=len(minc), replace=False)

    # Sort so the returned rows keep their original relative order
    downsampled_index = np.sort(np.hstack((mag_class_downsampled, minc)))

    return X[downsampled_index], y[downsampled_index]

### Testing our Function

In [78]:
# Reload iris, take the first 70 rows as an imbalanced sample and
# balance it with the function
iris = load_iris()
X, y = binary_random_downsample(iris.data[:70], iris.target[:70])

# Both arrays should report the same number of observations
print(f'DataFrame Shape of X: {len(X)}')
print(f'DataFrame Shape of y: {len(y)}')

DataFrame Shape of X: 40
DataFrame Shape of y: 40


Author: Kavi Sekhon