__<h1 style="text-align: center;font-size: 3rem">Data Rebalancing</h1><p style="text-align: center;font-size: 1.3rem">(Notebook III)</p>__

## Imports

Primarily importing from the `imblearn` module, using a mixture of over-sampling, under-sampling, and Imbalance-Learn's Pipeline which allows sampling methods to be mixed with SciKit-Learn's functionality.

In [32]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd


from dotenv import load_dotenv
import os
from typing import NamedTuple

In [33]:
load_dotenv()
RANDOM_STATE = int(os.getenv("RANDOM_STATE", 0))

print(f"{RANDOM_STATE=}")

RANDOM_STATE=4813


In [34]:
def is_fraud(entries: pd.DataFrame) -> pd.Series:
    return entries["Class"] == 1

DataSplit will help organize between X and y

In [35]:
class FeatureTarget(NamedTuple):
    X: pd.DataFrame
    y: pd.Series

In [36]:
train_data = pd.read_parquet("../data/processed/train.parquet")

In [37]:
print(train_data["Class"].value_counts())

Class
0    227451
1       394
Name: count, dtype: int64


In [38]:
print(train_data["Class"].value_counts(normalize=True).apply(lambda x: f"{x:.2%}"))

Class
0    99.83%
1     0.17%
Name: proportion, dtype: object


In [39]:
def boost_minority_by(target: pd.Series, proportion: float):
    if not 0 < proportion < 1:
        raise ValueError("Proportion must be between 0 and 1")
    class_counts = target.value_counts()

    return {
        i: int((1 + proportion) * count) if i == class_counts.idxmin() else count
        for i, count in class_counts.items()
    }


In [40]:
sm = SMOTE(
    sampling_strategy=boost_minority_by(is_fraud(train_data), 0.3),  # type: ignore (mypy)
    random_state=RANDOM_STATE,
)

In [41]:
rebal_train = pd.concat(
    sm.fit_resample(
        train_data.drop(columns=["Class"]),
        train_data["Class"],
    ),
    axis=1,
)

In [42]:
rebal_train.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,161919.0,1.946747,-0.752526,-1.35513,-0.66163,1.502822,4.024933,-1.479661,1.13988,1.406819,...,0.076197,0.297537,0.307915,0.69098,-0.350316,-0.388907,0.077641,-0.032248,7.32,0
1,124477.0,2.035149,-0.04888,-3.058693,0.247945,2.943487,3.298697,-0.002192,0.674782,0.045826,...,0.038628,0.228197,0.035542,0.70709,0.512885,-0.471198,0.00252,-0.069002,2.99,0
2,41191.0,-0.99192,0.603193,0.711976,-0.992425,-0.825838,1.956261,-2.212603,-5.037523,0.000772,...,-2.798352,0.109526,-0.43653,-0.932803,0.826684,0.913773,0.038049,0.18534,175.1,0
3,132624.0,2.285718,-1.500239,-0.747565,-1.668119,-1.394143,-0.350339,-1.427984,0.01001,-1.118447,...,-0.13967,0.077013,0.20831,-0.538236,-0.278032,-0.162068,0.018045,-0.063005,6.1,0
4,59359.0,-0.448747,-1.01144,0.115903,-3.454854,0.715771,-0.14749,0.504347,-0.113817,-0.044782,...,-0.243245,-0.173298,-0.006692,-1.362383,-0.292234,-0.144622,-0.03258,-0.064194,86.1,0
5,79404.0,1.320449,-1.560991,-0.154324,-1.806184,-0.831334,0.87782,-1.212515,0.24602,-1.747897,...,-0.346511,-0.941239,-0.104322,-1.790279,0.184198,-0.299018,0.025862,0.016056,125.29,0
6,111904.0,1.90218,0.158704,-0.210092,3.459251,0.346012,1.465813,-0.749323,0.432622,-0.39013,...,0.208545,0.64648,0.075336,-0.17678,-0.095253,0.090097,0.004346,-0.045702,1.51,0
7,151647.0,2.091991,-1.009477,-0.88566,-0.424255,-0.575602,0.486532,-1.033124,0.20155,0.263308,...,-0.158022,0.244872,0.083967,0.116089,0.032727,-0.123307,0.042437,-0.051537,11.99,0
8,164062.0,-3.309691,-5.662946,0.466119,-0.430087,3.664003,-3.036146,-2.84165,0.520501,1.485832,...,0.819604,0.742448,1.745293,-0.490856,-2.153907,0.173475,0.119893,0.467544,208.6,0
9,148508.0,1.973973,-0.404605,-0.222072,0.534479,-0.830112,-0.626462,-0.595997,-0.066845,1.465829,...,0.194876,0.822169,0.120061,0.072218,-0.097635,-0.215837,0.0419,-0.036105,9.99,0


In [43]:
min_count = rebal_train[is_fraud(rebal_train)].shape[0]
min_count

512

In [44]:
rus = RandomUnderSampler(
    sampling_strategy={0: int(min_count / 0.4), 1: min_count},  # type: ignore (mypy)
    random_state=RANDOM_STATE,
)

In [45]:
rebal_train = pd.concat(
    rus.fit_resample(
        rebal_train.drop(columns=["Class"]),
        rebal_train["Class"],
    ),
    axis=1,
)

In [46]:
rebal_train.value_counts("Class")

Class
0    1280
1     512
Name: count, dtype: int64

In [52]:
pl = Pipeline(
    [
        ("smote", sm),
        ("under sample", rus),
    ],
    verbose=True,
)

In [53]:
train: FeatureTarget = FeatureTarget(
    X=train_data.drop(columns=["Class"]),
    y=train_data["Class"],
)

In [55]:
pl.fit_resample(train.X, train.y)
train.X.shape, train.y.shape

[Pipeline] ............. (step 1 of 2) Processing smote, total=   0.2s
[Pipeline] ...... (step 2 of 2) Processing under sample, total=   0.0s


((227845, 30), (227845,))

In [50]:
train.X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,161919.0,1.946747,-0.752526,-1.35513,-0.66163,1.502822,4.024933,-1.479661,1.13988,1.406819,...,-0.134435,0.076197,0.297537,0.307915,0.69098,-0.350316,-0.388907,0.077641,-0.032248,7.32
1,124477.0,2.035149,-0.04888,-3.058693,0.247945,2.943487,3.298697,-0.002192,0.674782,0.045826,...,-0.227279,0.038628,0.228197,0.035542,0.70709,0.512885,-0.471198,0.00252,-0.069002,2.99
2,41191.0,-0.99192,0.603193,0.711976,-0.992425,-0.825838,1.956261,-2.212603,-5.037523,0.000772,...,1.280856,-2.798352,0.109526,-0.43653,-0.932803,0.826684,0.913773,0.038049,0.18534,175.1
3,132624.0,2.285718,-1.500239,-0.747565,-1.668119,-1.394143,-0.350339,-1.427984,0.01001,-1.118447,...,-0.490642,-0.13967,0.077013,0.20831,-0.538236,-0.278032,-0.162068,0.018045,-0.063005,6.1
4,59359.0,-0.448747,-1.01144,0.115903,-3.454854,0.715771,-0.14749,0.504347,-0.113817,-0.044782,...,-0.275297,-0.243245,-0.173298,-0.006692,-1.362383,-0.292234,-0.144622,-0.03258,-0.064194,86.1


In [51]:
train.y.head()

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64