## Handling Imbalanced Dataset
 1.Unsampling
 2.Down Sampling

In [1]:
## Classification -- Supervised ML

In [2]:
## output -- Categorical feature

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
# Set the random seed for reproducibility
np.random.seed(123)

# Create a dataframe with two classes
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_l = n_samples - n_class_0

In [6]:
n_class_0,n_class_l

(900, 100)

In [10]:
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0] * n_class_0
})

class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_l),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_l),
    'target': [1] * n_class_l
})

In [13]:
df=pd.concat([class_0,class_1]).reset_index(drop=True)

In [14]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-2.643425,-0.093436,0
1,-0.448991,1.650653,0
2,-0.358907,-0.845004,0
3,0.133194,0.395839,0
4,-0.644242,-1.342092,0


In [16]:
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

## Upsampling

In [22]:
df_minority=df[df['target']==1]
df_majority=df[df['target']==0]

In [19]:
df_minority.head()

Unnamed: 0,feature_1,feature_2,target
900,1.349116,0.821543,1
901,4.079573,1.198984,1
902,3.445297,1.269421,1
903,1.331946,2.984417,1
904,1.418799,0.944062,1


In [23]:
df_majority.head()

Unnamed: 0,feature_1,feature_2,target
0,-2.643425,-0.093436,0
1,-0.448991,1.650653,0
2,-0.358907,-0.845004,0
3,0.133194,0.395839,0
4,-0.644242,-1.342092,0


In [24]:
## Upsampling perform 
from sklearn.utils import resample

In [27]:
df_minority_upsample=resample(df_minority, 
                            replace=True, ## Sample with replacement
                            n_samples=len(df_majority),
                            random_state=42
                            )

In [28]:
df_minority_upsample.shape

(900, 3)

In [29]:
df_minority_upsample.value_counts()

feature_1  feature_2  target
 2.043900  2.289980   1         19
 2.544474  2.095308   1         18
-0.192571  0.606920   1         18
 1.157370  2.781408   1         16
 3.767493  2.373888   1         16
                                ..
 3.571824  2.806820   1          4
 3.529335  1.737718   1          4
 2.586068  2.406844   1          4
 4.049064  3.395138   1          4
 2.833016  2.096360   1          3
Length: 100, dtype: int64

In [30]:
df_upsampled=pd.concat([df_majority,df_minority_upsample])

In [31]:
df_upsampled['target'].value_counts()

0    900
1    900
Name: target, dtype: int64

In [33]:
df_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
0,-2.643425,-0.093436,0
1,-0.448991,1.650653,0
2,-0.358907,-0.845004,0
3,0.133194,0.395839,0
4,-0.644242,-1.342092,0


In [34]:
df_upsampled.tail()

Unnamed: 0,feature_1,feature_2,target
952,1.421568,1.825104,1
965,0.731857,3.333514,1
976,1.34461,0.993064,1
942,2.888538,2.257259,1
974,2.651029,0.867834,1


In [35]:
df_upsampled.shape

(1800, 3)

### Downsampling

In [36]:
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0] * n_class_0
})

class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_l),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_l),
    'target': [1] * n_class_l
})

In [37]:
df=pd.concat([class_0,class_1]).reset_index(drop=True)

In [39]:
df_minority=df[df['target']==1]
df_majority=df[df['target']==0]

In [41]:
df_majority_downsample=resample(df_minority, 
                            replace=False, ## Sample with replacement
                            n_samples=len(df_minority),
                            random_state=42
                            )

In [43]:
df_majority_downsample.shape

(100, 3)

In [49]:
df_downsample=pd.concat([df_minority, df_majority_downsample])

In [50]:
df_downsample['target'].value_counts()

1    200
Name: target, dtype: int64