In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import random

In [2]:
# Load the processed marriage dataset and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column next to "ID"; it looks
# like an index that was written out with the CSV -- confirm whether both
# columns are meant to stay in the feature set.
df = pd.read_csv(
    filepath_or_buffer="../../data/processed/TrainingData/marriage_data_india.csv"
)
df.head(n=5)

Unnamed: 0,ID,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Marital_Satisfaction,Divorce_Status,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion
0,0,23,Male,Graduate,Different,Hindu,Urban,Medium,Yes,Middle,No,No,No
1,1,28,Female,School,Same,Hindu,Rural,Low,No,Middle,No,No,Yes
2,2,39,Male,Postgraduate,Same,Muslim,Rural,Medium,No,High,No,No,No
3,3,26,Female,School,Different,Hindu,Urban,Low,No,High,No,Yes,No
4,4,32,Female,Graduate,Same,Hindu,Rural,Medium,No,Middle,No,No,Yes


In [3]:
# Distribution of the two target columns: how many rows fall into each
# (Marital_Satisfaction, Divorce_Status) pair.

target_columns = ["Marital_Satisfaction", "Divorce_Status"]

# Every possible pairing of the observed values. Nothing below reads this
# index; it is kept only as a reference of the full label grid.
combinations = pd.MultiIndex.from_product(
    [df[column].unique() for column in target_columns],
    names=target_columns,
)

# dict(Series) keeps the numpy integer scalars as values, keyed by the
# (satisfaction, divorce) tuples of the group index.
pair_sizes = df.groupby(target_columns).size()
count = dict(pair_sizes)

print(count)

{('High', 'No'): np.int64(2689), ('High', 'Yes'): np.int64(304), ('Low', 'No'): np.int64(1818), ('Low', 'Yes'): np.int64(188), ('Medium', 'No'): np.int64(4492), ('Medium', 'Yes'): np.int64(509)}


In [4]:
# Fuse the two label columns into a single "<satisfaction>_<divorce>" class
# label, then remove the originals so only the combined target remains.
# This mutates df in place: running the cell a second time fails because the
# source columns no longer exist.
satisfaction = df["Marital_Satisfaction"].astype(str)
divorce = df["Divorce_Status"].astype(str)
df["target"] = satisfaction.str.cat(divorce, sep="_")
df.drop(columns=["Marital_Satisfaction", "Divorce_Status"], inplace=True)
df.head()

Unnamed: 0,ID,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion,target
0,0,23,Male,Graduate,Different,Hindu,Urban,Middle,No,No,No,Medium_Yes
1,1,28,Female,School,Same,Hindu,Rural,Middle,No,No,Yes,Low_No
2,2,39,Male,Postgraduate,Same,Muslim,Rural,High,No,No,No,Medium_No
3,3,26,Female,School,Different,Hindu,Urban,High,No,Yes,No,Low_No
4,4,32,Female,Graduate,Same,Hindu,Rural,Middle,No,No,Yes,Medium_No


In [5]:
# Stratified train/test split followed by random resampling.
# Only the training split is resampled; the test split is left as drawn.

x = df.drop(columns=["target"])
y = df["target"]
# Median class size over the whole dataset, before splitting.
print(y.value_counts().median())

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("before sampling", Counter(y_train))

val_count = y_train.value_counts()
median_class_size = val_count.median()
print("median class size ", median_class_size)

# Strategy 1: grow every class except the largest up to the largest class size.
sampled_data = RandomOverSampler(sampling_strategy="not majority", random_state=42)
x_os, y_os = sampled_data.fit_resample(x_train, y_train)

print("After Random Over Sampling to the majority class size", Counter(y_os))

# Strategy 2: pull every class to the median class size. The median can be
# fractional (an even number of classes), so it is truncated to a whole row
# count. x_os / y_os from strategy 1 are overwritten here.
median_target = int(median_class_size)
small_classes = val_count[val_count < median_class_size].index
large_classes = val_count[val_count > median_class_size].index

# Step 2a: classes below the median are over-sampled up to it; classes that
# are not listed in the strategy keep their original size.
sampled_data = RandomOverSampler(
    sampling_strategy=dict.fromkeys(small_classes, median_target),
    random_state=42,
)
x_os, y_os = sampled_data.fit_resample(x_train, y_train)

# Step 2b: classes above the median are under-sampled down to it, starting
# from the output of step 2a.
sampled_data = RandomUnderSampler(
    sampling_strategy=dict.fromkeys(large_classes, median_target),
    random_state=42,
)
x_us, y_us = sampled_data.fit_resample(x_os, y_os)

print("After Random Over Sampling to median class size", Counter(y_os))
print("After Random Under Sampling to median class size", Counter(y_us))



1163.5
before sampling Counter({'Medium_No': 3594, 'High_No': 2151, 'Low_No': 1454, 'Medium_Yes': 407, 'High_Yes': 243, 'Low_Yes': 151})
median class size  930.5
After Random Over Sampling to the majority class size Counter({'Medium_No': 3594, 'High_Yes': 3594, 'Low_No': 3594, 'High_No': 3594, 'Medium_Yes': 3594, 'Low_Yes': 3594})
After Random Over Sampling to median class size Counter({'Medium_No': 3594, 'High_No': 2151, 'Low_No': 1454, 'High_Yes': 930, 'Medium_Yes': 930, 'Low_Yes': 930})
After Random Under Sampling to median class size Counter({'High_No': 930, 'High_Yes': 930, 'Low_No': 930, 'Low_Yes': 930, 'Medium_No': 930, 'Medium_Yes': 930})


In [14]:
# Roulette Wheel Technique for Resampling
#
# Each class gets a selection weight inversely proportional to its size in the
# training split, so rarer classes are picked more often. Rows are then drawn
# one at a time, with replacement, until every class holds as many rows as the
# largest class. The draws are collected in x_resample (one-row DataFrames)
# and y_resample (class labels) for the next cell to concatenate.

val_count = dict(y_train.value_counts())

weight_class = {cls: (1.0/count) for cls,count in val_count.items()}
print("weights ",weight_class)
total_weight = sum(weight_class.values())
print("Total weight",total_weight)
probabilities = {cls: (wg/total_weight) for cls,wg in weight_class.items()}
print("probabilteies ",probabilities)


count_max = max(val_count.values())

# Dedicated seeded generator: the draws are reproducible across kernel
# restarts (same seed as the samplers above) without touching the global
# `random` state.
rng = random.Random(42)

# Slice the training rows of each class once instead of re-evaluating the
# boolean mask on every draw.
rows_by_class = {cls: x_train[y_train == cls] for cls in val_count}

# Running tally of rows per class (original + resampled), updated per draw
# instead of recounting the whole label list on every iteration.
curr_count = Counter({cls: int(size) for cls, size in val_count.items()})

# Classes that still need rows. Spinning the wheel over only these classes is
# equivalent to spinning over all classes and retrying whenever a full class
# comes up, but it never wastes a draw.
open_classes = [cls for cls in probabilities if curr_count[cls] < count_max]
open_weights = [probabilities[cls] for cls in open_classes]

x_resample,y_resample = [],[]
while open_classes:
    choosen_class = rng.choices(population=open_classes, weights=open_weights)[0]

    # Pick one row of the chosen class uniformly at random (with replacement
    # across draws); .iloc[[i]] keeps it as a one-row DataFrame.
    class_rows = rows_by_class[choosen_class]
    record = class_rows.iloc[[rng.randrange(len(class_rows))]]
    x_resample.append(record)
    y_resample.append(choosen_class)

    curr_count[choosen_class] += 1
    if curr_count[choosen_class] >= count_max:
        # The class reached the target size: take it off the wheel.
        open_classes.remove(choosen_class)
        open_weights = [probabilities[cls] for cls in open_classes]



weights  {'Medium_No': np.float64(0.0002782415136338342), 'High_No': np.float64(0.00046490004649000463), 'Low_No': np.float64(0.000687757909215956), 'Medium_Yes': np.float64(0.002457002457002457), 'High_Yes': np.float64(0.00411522633744856), 'Low_Yes': np.float64(0.006622516556291391)}
Total weight 0.014625644820082204
probabilteies  {'Medium_No': np.float64(0.019024221978355845), 'High_No': np.float64(0.031786635885732634), 'Low_No': np.float64(0.047024108521465546), 'Medium_Yes': np.float64(0.16799276115530934), 'High_Yes': np.float64(0.281370591729263), 'Low_Yes': np.float64(0.45280168072987353)}


In [17]:
# Stack the original training rows with the roulette-wheel draws, and do the
# same for their labels; both get a fresh 0..n-1 index.
x_balanced = pd.concat([x_train, *x_resample], ignore_index=True)
drawn_labels = pd.Series(y_resample)
y_balanced = pd.concat([y_train, drawn_labels], ignore_index=True)

print("Count after appling roulette wheel", Counter(y_balanced))

Count after appling roulette wheel Counter({'Medium_No': 3594, 'High_Yes': 3594, 'Low_No': 3594, 'High_No': 3594, 'Medium_Yes': 3594, 'Low_Yes': 3594})
