In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler,ClusterCentroids
import random

In [3]:
df = pd.read_csv("../../data/processed/TrainingData/marriage_data_india.csv")
df.head()

Unnamed: 0,ID,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Marital_Satisfaction,Divorce_Status,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion
0,0,23,Male,Graduate,Different,Hindu,Urban,Medium,Yes,Middle,No,No,No
1,1,28,Female,School,Same,Hindu,Rural,Low,No,Middle,No,No,Yes
2,2,39,Male,Postgraduate,Same,Muslim,Rural,Medium,No,High,No,No,No
3,3,26,Female,School,Different,Hindu,Urban,Low,No,High,No,Yes,No
4,4,32,Female,Graduate,Same,Hindu,Rural,Medium,No,Middle,No,No,Yes


In [4]:
# Distribution of target data

combinations = pd.MultiIndex.from_product(
    [df["Marital_Satisfaction"].unique(),df["Divorce_Status"].unique()],
    names=["Marital_Satisfaction","Divorce_Status"]
)

count = dict(df.groupby(["Marital_Satisfaction","Divorce_Status"]).size())

print(count)

{('High', 'No'): np.int64(2689), ('High', 'Yes'): np.int64(304), ('Low', 'No'): np.int64(1818), ('Low', 'Yes'): np.int64(188), ('Medium', 'No'): np.int64(4492), ('Medium', 'Yes'): np.int64(509)}


In [5]:
df["target"] =  df["Marital_Satisfaction"].astype(str)+'_'+df["Divorce_Status"].astype(str)
df.drop(columns=["Marital_Satisfaction","Divorce_Status","ID"],inplace=True)
df.head()

Unnamed: 0,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion,target
0,23,Male,Graduate,Different,Hindu,Urban,Middle,No,No,No,Medium_Yes
1,28,Female,School,Same,Hindu,Rural,Middle,No,No,Yes,Low_No
2,39,Male,Postgraduate,Same,Muslim,Rural,High,No,No,No,Medium_No
3,26,Female,School,Different,Hindu,Urban,High,No,Yes,No,Low_No
4,32,Female,Graduate,Same,Hindu,Rural,Middle,No,No,Yes,Medium_No


In [6]:
df.shape
df["Spouse_Working"].value_counts()

Spouse_Working
Yes    5047
No     4953
Name: count, dtype: int64

In [40]:
# Stratified random over sampling t

x = df.drop(columns=["target"])
y = df["target"]
print((y.value_counts()).median())
x_train,x_test,y_train,y_test = train_test_split(
    x,y,test_size=0.3,stratify=y,random_state=42
)

print("before sampling", Counter(y_train))

val_count = y_train.value_counts()
median_class_size = (val_count).median()
print("median class size ", median_class_size)

sampled_data = RandomOverSampler(sampling_strategy="not majority", random_state=42)
x_os,y_os= sampled_data.fit_resample(x_train,y_train)

print("After Random Over Sampling to the majority class size", Counter(y_os))




sampled_data = RandomOverSampler(
    sampling_strategy={
        target_class: int(median_class_size) for target_class,count in val_count.items() if count < median_class_size
    },random_state=42)
x_os,y_os= sampled_data.fit_resample(x_train,y_train)

sampled_data = RandomUnderSampler(
    sampling_strategy={
        target_class: int(median_class_size) for target_class,count in val_count.items() if count > median_class_size
    },random_state=42)
x_us,y_us= sampled_data.fit_resample(x_os,y_os)

print("After Random Over Sampling to median class size", Counter(y_os))
print("After Random Under Sampling to median class size", Counter(y_us))



1163.5
before sampling Counter({'Medium_No': 3144, 'High_No': 1882, 'Low_No': 1273, 'Medium_Yes': 356, 'High_Yes': 213, 'Low_Yes': 132})
median class size  814.5
After Random Over Sampling to the majority class size Counter({'Medium_No': 3144, 'Medium_Yes': 3144, 'Low_No': 3144, 'High_No': 3144, 'Low_Yes': 3144, 'High_Yes': 3144})
After Random Over Sampling to median class size Counter({'Medium_No': 3144, 'High_No': 1882, 'Low_No': 1273, 'Medium_Yes': 814, 'Low_Yes': 814, 'High_Yes': 814})
After Random Under Sampling to median class size Counter({'High_No': 814, 'High_Yes': 814, 'Low_No': 814, 'Low_Yes': 814, 'Medium_No': 814, 'Medium_Yes': 814})


In [None]:
# Roulette Wheel Technique for Resampling

val_count = dict(y_train.value_counts())

weight_class = {cls: (1.0/count) for cls,count in val_count.items()}
print("weights ",weight_class)
total_weight = sum(weight_class.values())
print("Total weight",total_weight)
probabilities = {cls: (wg/total_weight) for cls,wg in weight_class.items()}
print("probabilteies ",probabilities)


count_max = max(val_count.values())

x_resample,y_resample = [],[]
while True:
    curr_count = Counter(y_train.to_list() + y_resample)
    if all(c >= count_max for c in curr_count.values()):
        break

    choosen_class = random.choices(
        population=list(probabilities.keys()),
        weights=list(probabilities.values())
    )[0]
    if curr_count[choosen_class] >= count_max:
        continue
    record = x_train[y_train==choosen_class].sample(n=1,replace=True)
    x_resample.append(record)
    y_resample.append(choosen_class) 



weights  {'Medium_No': np.float64(0.0003180661577608143), 'High_No': np.float64(0.0005313496280552603), 'Low_No': np.float64(0.0007855459544383347), 'Medium_Yes': np.float64(0.0028089887640449437), 'High_Yes': np.float64(0.004694835680751174), 'Low_Yes': np.float64(0.007575757575757576)}
Total weight 0.016714543760808102
probabilteies  {'Medium_No': np.float64(0.019029305394898596), 'High_No': np.float64(0.031789657896685), 'Low_No': np.float64(0.046997750323300225), 'Medium_Yes': np.float64(0.16805656225157634), 'High_Yes': np.float64(0.280883268364137), 'Low_Yes': np.float64(0.4532434557694029)}


In [57]:
x_balanced = pd.concat([x_train]+ x_resample, ignore_index=True)
y_balanced = pd.concat([y_train,pd.Series(y_resample)],ignore_index=True)

rws_df_train = pd.DataFrame(x_balanced,columns=x_balanced.columns)
rws_df_train["target"] = y_balanced
rws_df_train.to_csv("Training Data (RWS Oversampling).csv",index=False)

print(x_balanced)

# print("Count after appling roulette wheel",Counter(x_balanced),Counter(y_balanced))

       Age_at_Marriage  Gender Education_Level Caste_Match   Religion  \
0                   30    Male          School        Same      Hindu   
1                   37  Female        Graduate        Same  Christian   
2                   38  Female    Postgraduate        Same     Muslim   
3                   21  Female    Postgraduate        Same     Others   
4                   26  Female          School        Same     Others   
...                ...     ...             ...         ...        ...   
22859               33    Male        Graduate        Same      Hindu   
22860               33    Male        Graduate        Same     Muslim   
22861               33  Female        Graduate   Different     Muslim   
22862               38  Female    Postgraduate        Same      Hindu   
22863               35  Female        Graduate   Different      Hindu   

      Urban_Rural Income_Level Spouse_Working Inter-Caste Inter-Religion  
0           Rural       Middle            Yes   

In [55]:
val_count = dict(y_test.value_counts())

weight_class = {cls: (1.0/count) for cls,count in val_count.items()}
print("weights ",weight_class)
total_weight = sum(weight_class.values())
print("Total weight",total_weight)
probabilities = {cls: (wg/total_weight) for cls,wg in weight_class.items()}
print("probabilteies ",probabilities)
x_resample,y_resample = [],[]
while True:
    curr_count = Counter(y_test.to_list() + y_resample)
    if all(c >= count_max for c in curr_count.values()):
        break

    choosen_class = random.choices(
        population=list(probabilities.keys()),
        weights=list(probabilities.values())
    )[0]
    if curr_count[choosen_class] >= count_max:
        continue
    record = x_test[y_test==choosen_class].sample(n=1,replace=True)
    x_resample.append(record)
    y_resample.append(choosen_class) 

weights  {'Medium_No': np.float64(0.000741839762611276), 'High_No': np.float64(0.0012391573729863693), 'Low_No': np.float64(0.001834862385321101), 'Medium_Yes': np.float64(0.006535947712418301), 'High_Yes': np.float64(0.01098901098901099), 'Low_Yes': np.float64(0.017857142857142856)}
Total weight 0.03919796107949089
probabilteies  {'Medium_No': np.float64(0.01892546811572351), 'High_No': np.float64(0.03161280175959764), 'Low_No': np.float64(0.046810148660541825), 'Medium_Yes': np.float64(0.16674203281042677), 'High_Yes': np.float64(0.280346494725223), 'Low_Yes': np.float64(0.45556305392848734)}


In [58]:
x_balanced = pd.concat([x_test]+ x_resample, ignore_index=True)
y_balanced = pd.concat([y_test,pd.Series(y_resample)],ignore_index=True)

rws_df_train = pd.DataFrame(x_balanced,columns=x_balanced.columns)
rws_df_train["target"] = y_balanced
rws_df_train.to_csv("Testing Data (RWS Oversampling).csv",index=False)

print(x_balanced)

# print("Count after appling roulette wheel",Counter(x_balanced),Counter(y_balanced))

       Age_at_Marriage  Gender Education_Level Caste_Match Religion  \
0                   25  Female        Graduate        Same   Others   
1                   22    Male          School   Different   Muslim   
2                   22    Male          School        Same    Hindu   
3                   30    Male          School        Same    Hindu   
4                   25  Female          School   Different   Muslim   
...                ...     ...             ...         ...      ...   
18859               33    Male        Graduate        Same    Hindu   
18860               33    Male        Graduate        Same   Muslim   
18861               33  Female        Graduate   Different   Muslim   
18862               38  Female    Postgraduate        Same    Hindu   
18863               35  Female        Graduate   Different    Hindu   

      Urban_Rural Income_Level Spouse_Working Inter-Caste Inter-Religion  
0           Urban       Middle             No         Yes            Yes

In [49]:
X_train = x_train.copy()
X_test = x_test.copy()
encoders = {}
# Encode all categorical columns
for col in X_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.fit_transform(X_test[col].astype(str))

    encoders[col] =  le

print(encoders)


{'Gender': LabelEncoder(), 'Education_Level': LabelEncoder(), 'Caste_Match': LabelEncoder(), 'Religion': LabelEncoder(), 'Urban_Rural': LabelEncoder(), 'Income_Level': LabelEncoder(), 'Spouse_Working': LabelEncoder(), 'Inter-Caste': LabelEncoder(), 'Inter-Religion': LabelEncoder()}


In [50]:
# Cluster centroids Prototype undersampling
print(Counter(y_train))
cc = ClusterCentroids(random_state=42)
X_train,Y_train = cc.fit_resample(X_train,y_train)
    

X_test,Y_test = cc.fit_resample(X_test,y_test)
print(Counter(Y_train),"-",Counter(Y_test))

Counter({'Medium_No': 3144, 'High_No': 1882, 'Low_No': 1273, 'Medium_Yes': 356, 'High_Yes': 213, 'Low_Yes': 132})
Counter({'High_No': 132, 'High_Yes': 132, 'Low_No': 132, 'Low_Yes': 132, 'Medium_No': 132, 'Medium_Yes': 132}) - Counter({'High_No': 56, 'High_Yes': 56, 'Low_No': 56, 'Low_Yes': 56, 'Medium_No': 56, 'Medium_Yes': 56})


In [51]:
train_dataset = pd.DataFrame(X_train,columns=X_train.columns)
train_dataset["target"] = Y_train

train_dataset.to_csv("Training Data (CC undersampling).csv",index=False)
testing_dataset = pd.DataFrame(X_test,columns=X_test.columns)
testing_dataset["target"] = Y_test

testing_dataset.to_csv("Testing Data (CC undersampling).csv",index=False)
train_dataset.shape
testing_dataset["target"].value_counts()

target
High_No       56
High_Yes      56
Low_No        56
Low_Yes       56
Medium_No     56
Medium_Yes    56
Name: count, dtype: int64

In [52]:
# print(encoders)
decoded_X_train = X_train.copy()

for col, le in encoders.items():
    decoded_X_train[col] = le.inverse_transform(decoded_X_train[col])
    # print(decoded_X_train[col])

dataset = decoded_X_train
dataset["target"] = Y_train  

print(dataset["target"].value_counts())

# Save the decoded dataset, not the encoded one!
dataset.to_csv("Training Data (CC undersampling).csv", index=False)

# print("File saved successfully with decoded categorical values")


target
High_No       132
High_Yes      132
Low_No        132
Low_Yes       132
Medium_No     132
Medium_Yes    132
Name: count, dtype: int64


In [53]:
decoded_X_test = X_test.copy()

for col, le in encoders.items():
    decoded_X_test[col] = le.inverse_transform(decoded_X_test[col])
    # print(decoded_X_train[col])

dataset = decoded_X_test
dataset["target"] = Y_test  

print(dataset["target"].value_counts())

# Save the decoded dataset, not the encoded one!
dataset.to_csv("Testing Data (CC undersampling).csv", index=False)

target
High_No       56
High_Yes      56
Low_No        56
Low_Yes       56
Medium_No     56
Medium_Yes    56
Name: count, dtype: int64
