***Balancing a Dataset with Downsampling***

In [4]:
import pandas as pd
from sklearn.utils import resample

# Toy dataset for the downsampling demo: 13 rows with an age, an income and a
# 'High'/'Low' class label (7 'High' rows versus 6 'Low' rows).
df = pd.DataFrame(
    dict(
        Age=[22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
        Income=[2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
        Class=[
            'High', 'Low', 'Low', 'High', 'High', 'Low', 'High',
            'High', 'Low', 'Low', 'High', 'High', 'Low',
        ],
    )
)

The High class has 7 instances.

The Low class has 6 instances.

In [13]:
# Split the frame by class label; the original row labels are kept so each
# subset can still be traced back to `df`.
df_high = df.loc[df['Class'].eq('High')]
df_low = df.loc[df['Class'].eq('Low')]

# Trailing expression: render the 'High' subset as the cell output.
df_high

Unnamed: 0,Age,Income,Class
0,22,2000,High
3,28,3200,High
4,30,3500,High
6,40,4000,High
7,45,4200,High
10,60,5000,High
11,65,5500,High


In [14]:
# Render the 'Low' subset so it can be compared with the 'High' subset.
df_low

Unnamed: 0,Age,Income,Class
1,25,2500,Low
2,27,2700,Low
5,35,3800,Low
8,50,4300,Low
9,55,4500,Low
12,70,6000,Low


In [16]:
# Downsample the larger 'High' class: draw as many rows as the 'Low' class has,
# without replacement so no row is picked twice. The fixed seed makes the draw
# repeatable.
df_high_downsample = resample(
    df_high,
    n_samples=df_low.shape[0],
    replace=False,
    random_state=42,
)

In [19]:
# Stack the downsampled 'High' rows on top of all 'Low' rows to form the
# balanced frame (row-wise concatenation, original row labels retained).
df_balanced = pd.concat((df_high_downsample, df_low), axis=0)

In [20]:
# Verify the balance: both classes should now report the same row count.
print(df_balanced.loc[:, 'Class'].value_counts())

Class
High    6
Low     6
Name: count, dtype: int64


In [21]:
# Toy dataset for the upsampling demo: same ages and incomes as before, but the
# label is now imbalanced (4 'Minority' rows versus 9 'Majority' rows).
df = pd.DataFrame(
    dict(
        Age=[22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
        Income=[2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
        Class=[
            'Minority', 'Majority', 'Majority', 'Majority', 'Majority',
            'Minority', 'Minority', 'Minority', 'Majority', 'Majority',
            'Majority', 'Majority', 'Majority',
        ],
    )
)

In [22]:
# Split the frame by class label; the original row labels are kept so each
# subset can still be traced back to `df`.
df_majority = df.loc[df['Class'].eq('Majority')]
df_minority = df.loc[df['Class'].eq('Minority')]

# Trailing expression: render the 'Majority' subset as the cell output.
df_majority

Unnamed: 0,Age,Income,Class
1,25,2500,Majority
2,27,2700,Majority
3,28,3200,Majority
4,30,3500,Majority
8,50,4300,Majority
9,55,4500,Majority
10,60,5000,Majority
11,65,5500,Majority
12,70,6000,Majority


In [23]:
# Render the 'Minority' subset so it can be compared with the 'Majority' subset.
df_minority

Unnamed: 0,Age,Income,Class
0,22,2000,Minority
5,35,3800,Minority
6,40,4000,Minority
7,45,4200,Minority


In [28]:
# Upsample the 'Minority' class: draw rows with replacement until there are as
# many as in the 'Majority' class, so existing rows are repeated. The fixed
# seed makes the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    n_samples=df_majority.shape[0],
    replace=True,
    random_state=42,
)

In [31]:
# Stack all 'Majority' rows on top of the upsampled 'Minority' rows.
# The minority rows were drawn with replacement, so their original row labels
# repeat; ignore_index=True gives the balanced frame a fresh, unique 0..n-1
# index so label-based lookups (e.g. df_balanced.loc[0]) return a single row.
df_balanced = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

In [32]:
# Verify the balance: both classes should now report the same row count.
print(df_balanced.loc[:, 'Class'].value_counts())

Class
Majority    9
Minority    9
Name: count, dtype: int64


**SMOTE**

In [44]:
pip uninstall scikit-learn imbalanced-learn -y


Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Note: you may need to restart the kernel to use updated packages.


ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\CVR\anaconda3\Lib\site-packages\pip\_internal\cli\base_command.py", line 180, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "C:\Users\CVR\anaconda3\Lib\site-packages\pip\_internal\commands\uninstall.py", line 110, in run
    uninstall_pathset.commit()
  File "C:\Users\CVR\anaconda3\Lib\site-packages\pip\_internal\req\req_uninstall.py", line 432, in commit
    self._moved_paths.commit()
  File "C:\Users\CVR\anaconda3\Lib\site-packages\pip\_internal\req\req_uninstall.py", line 278, in commit
    save_dir.cleanup()
  File "C:\Users\CVR\anaconda3\Lib\site-packages\pip\_internal\utils\temp_dir.py", line 173, in cleanup
    rmtree(self._path)
  File "C:\Users\CVR\anaconda3\Lib\site-packages\pip\_vendor\tenacity\__init__.py", line 291, in wrapped_f
    return self(f, *args, **kw)
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\CVR\anaconda3\Lib\site-packages\pip\_vendor\tenacity\__

1. Use SMOTE to generate synthetic samples instead of duplicating existing ones

2. Convert categorical class labels into numeric form for SMOTE to work

3. Apply SMOTE to balance the dataset

4. Convert back to the original categorical labels

5. Combine the resampled data into a final balanced dataset

In [45]:
pip install -U scikit-learn imbalanced-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/a1/a6/c5b78606743a1f28eae8f11973de6613a5ee87366796583fb74c67d54939/scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-1.6.1
Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
from imblearn.over_sampling import SMOTE

# Sample dataset: 13 rows, 4 labelled 'Minority' and 9 labelled 'Majority'.
df = pd.DataFrame({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': ['Minority', 'Majority', 'Majority', 'Majority', 'Majority',
              'Minority', 'Minority', 'Minority', 'Majority', 'Majority',
              'Majority', 'Majority', 'Majority'],
})

# Single source of truth for the label encoding. The inverse used in Step 4 is
# derived from it so the two mappings cannot drift apart.
label_to_code = {'Majority': 0, 'Minority': 1}
code_to_label = {code: label for label, code in label_to_code.items()}

# Step 1: Convert categorical labels to numeric values
df['Class'] = df['Class'].map(label_to_code)
# .map() turns any label missing from the mapping into NaN without complaint;
# fail loudly here instead of passing NaN targets on to SMOTE.
if df['Class'].isna().any():
    raise ValueError("Unexpected class label: every label must be a key of label_to_code")

# Step 2: Split features (X) and target variable (y)
X = df[['Age', 'Income']]
y = df['Class']

# Step 3: Apply SMOTE with k_neighbors=3 (reducing from default 5).
# The minority class has only four rows, so fewer neighbours are requested
# than the default.
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 4: Convert numeric labels back to categorical
y_resampled = y_resampled.map(code_to_label)

# Step 5: Combine the resampled dataset.
# The resampled features and labels are already pandas objects (Step 4 relies
# on Series.map), so they are concatenated directly rather than re-wrapped in
# new DataFrames; the explicit rename guarantees the label column is 'Class'.
df_balanced = pd.concat([X_resampled, y_resampled.rename('Class')], axis=1)

# Step 6: Print class distribution
print(df_balanced['Class'].value_counts())

# Step 7: Display the upsampled dataset
print(df_balanced)

Class
Minority    9
Majority    9
Name: count, dtype: int64
    Age  Income     Class
0    22    2000  Minority
1    25    2500  Majority
2    27    2700  Majority
3    28    3200  Majority
4    30    3500  Majority
5    35    3800  Minority
6    40    4000  Minority
7    45    4200  Minority
8    50    4300  Majority
9    55    4500  Majority
10   60    5000  Majority
11   65    5500  Majority
12   70    6000  Majority
13   40    4031  Minority
14   35    3831  Minority
15   44    4176  Minority
16   35    3826  Minority
17   41    4040  Minority
