## Balancing dataset with downsampling:
    to balance the dataset we use sampling 

In [9]:
import pandas as pd
from sklearn.utils import resample
df=pd.DataFrame({
'Age': [22,25,27,28,30,35,40,45,50,55,60,65,70],
'Income':[2000,2500,2700,3200,3500,3800,4000,4200,4300,4500,5000,5500,6000],
'Class':['High','Low','Low','High','High','Low','High','High','Low','Low','High','High','Low']
})

In [33]:
df_high=df[df['Class']=='High']
df_low=df[df['Class']=='Low']
print(df_high)
print('\n')
print(df_low)


    Age  Income Class
0    22    2000  High
3    28    3200  High
4    30    3500  High
6    40    4000  High
7    45    4200  High
10   60    5000  High
11   65    5500  High


    Age  Income Class
1    25    2500   Low
2    27    2700   Low
5    35    3800   Low
8    50    4300   Low
9    55    4500   Low
12   70    6000   Low


In [19]:
#down sample majority class
df_high_downsample=resample(df_high,replace=False,n_samples=len(df_low),random_state=42)

In [39]:
df_balanced=pd.concat([df_high_downsample,df_low])
df_balanced

Unnamed: 0,Age,Income,Class
0,22,2000,High
3,28,3200,High
10,60,5000,High
4,30,3500,High
7,45,4200,High
6,40,4000,High
1,25,2500,Low
2,27,2700,Low
5,35,3800,Low
8,50,4300,Low


In [37]:
df_balanced['Class'].value_counts()

Class
High    6
Low     6
Name: count, dtype: int64

## UPSAMPLING MINORITY CLASS

In [73]:
df=pd.DataFrame({
'Age': [22,25,27,28,30,35,40,45,50,55,60,65,70],
'Income':[2000,2500,2700,3200,3500,3800,4000,4200,4300,4500,5000,5500,6000],
'Class':['minority','majority','majority','majority','majority','minority','minority','minority','majority','majority','majority','majority','majority']
})

In [75]:
df_minority=df[df['Class']=='minority']
df_majority=df[df['Class']=='majority']
df2['Class'].value_counts()

Class
majority    9
minority    4
Name: count, dtype: int64

In [77]:
df_minority_upsampling=resample(df_minority,replace=True,n_samples=len(df_majority),random_state=42)

In [81]:
df_balanced=pd.concat([df_majority,df_minority_upsampling])


In [83]:
df_balanced['Class'].value_counts()

Class
majority    9
minority    9
Name: count, dtype: int64

In [88]:
df_balanced['Class']

1     majority
2     majority
3     majority
4     majority
8     majority
9     majority
10    majority
11    majority
12    majority
6     minority
7     minority
0     minority
6     minority
6     minority
7     minority
0     minority
0     minority
6     minority
Name: Class, dtype: object

## SMOTE:
**Synthetic Minority Over-Sampling Techniques**

1.it will generate own samples instead of duplicating existing ones

2.here we will convert categorical values into numeric

3.alpply SMOTE to balnace the dataset.

4.convert back to categorical labels.

5.balanced dataset is created.




In [96]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [135]:
import pandas as pd
from imblearn.over_sampling import SMOTE

df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Minority', 'Majority', 'Majority', 'Majority', 'Majority', 'Minority', 'Minority', 'Minority', 'Majority',  'Majority', 'Majority', 'Majority', 'Majority']
})
df['Class'].value_counts()
df['Class'] = df['Class'].map({'Majority': 0, 'Minority': 1})
X = df[['Age', 'Income']]
y = df['Class']
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)
X_resampled,y_resampled.map({'Majority': 0, 'Minority': 1})
df_balanced = pd.concat([pd.DataFrame(X_resampled, columns=['Age', 'Income']), pd.DataFrame(y_resampled, columns=['Class'])], axis=1)
print(df_balanced['Class'].value_counts())
print(df_balanced)

Class
1    9
0    9
Name: count, dtype: int64
    Age  Income  Class
0    22    2000      1
1    25    2500      0
2    27    2700      0
3    28    3200      0
4    30    3500      0
5    35    3800      1
6    40    4000      1
7    45    4200      1
8    50    4300      0
9    55    4500      0
10   60    5000      0
11   65    5500      0
12   70    6000      0
13   40    4031      1
14   35    3831      1
15   44    4176      1
16   35    3826      1
17   41    4040      1
