In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler,ClusterCentroids
import random
import pickle as p

In [42]:
# Load the marriage dataset from its CSV file and preview the first rows.
data_path = "../../data/processed/TrainingData/marriage_data_india.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,ID,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Marital_Satisfaction,Divorce_Status,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion
0,0,23,Male,Graduate,Different,Hindu,Urban,Medium,Yes,Middle,No,No,No
1,1,28,Female,School,Same,Hindu,Rural,Low,No,Middle,No,No,Yes
2,2,39,Male,Postgraduate,Same,Muslim,Rural,Medium,No,High,No,No,No
3,3,26,Female,School,Different,Hindu,Urban,Low,No,High,No,Yes,No
4,4,32,Female,Graduate,Same,Hindu,Rural,Medium,No,Middle,No,No,Yes


In [43]:
# Distribution of target data
#
# Count the rows for every (Marital_Satisfaction, Divorce_Status) pair.

# Every pair that can be formed from the values present in the two columns.
combinations = pd.MultiIndex.from_product(
    [df["Marital_Satisfaction"].unique(),df["Divorce_Status"].unique()],
    names=["Marital_Satisfaction","Divorce_Status"]
)

# groupby().size() only lists pairs that actually occur; reindexing over
# `combinations` makes a pair with no rows show up as 0 instead of being
# silently absent. sort_index() keeps the sorted key order that groupby yields.
pair_sizes = (
    df.groupby(["Marital_Satisfaction","Divorce_Status"])
    .size()
    .reindex(combinations, fill_value=0)
    .sort_index()
)
count = dict(pair_sizes)

print(count)

{('High', 'No'): np.int64(2689), ('High', 'Yes'): np.int64(304), ('Low', 'No'): np.int64(1818), ('Low', 'Yes'): np.int64(188), ('Medium', 'No'): np.int64(4492), ('Medium', 'Yes'): np.int64(509)}


In [44]:
# Combine the two label columns into one "<satisfaction>_<divorce>" class label,
# then drop the source columns and the ID column from the frame.
satisfaction_text = df["Marital_Satisfaction"].astype(str)
divorce_text = df["Divorce_Status"].astype(str)
df["target"] = satisfaction_text.str.cat(divorce_text, sep='_')

df.drop(columns=["Marital_Satisfaction","Divorce_Status","ID"],inplace=True)
df.head()

Unnamed: 0,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion,target
0,23,Male,Graduate,Different,Hindu,Urban,Middle,No,No,No,Medium_Yes
1,28,Female,School,Same,Hindu,Rural,Middle,No,No,Yes,Low_No
2,39,Male,Postgraduate,Same,Muslim,Rural,High,No,No,No,Medium_No
3,26,Female,School,Different,Hindu,Urban,High,No,Yes,No,Low_No
4,32,Female,Graduate,Same,Hindu,Rural,Middle,No,No,Yes,Medium_No


In [45]:
# Show the frame dimensions and the Spouse_Working value counts.
# Only the last expression of a cell is displayed, so the shape has to be
# printed explicitly; as a bare expression it was evaluated and discarded.
print(df.shape)
df["Spouse_Working"].value_counts()

Spouse_Working
Yes    5047
No     4953
Name: count, dtype: int64

In [46]:
# Stratified train/test split followed by random over- and under-sampling
# of the training part.

x = df.drop(columns=["target"])
y = df["target"]
print(y.value_counts().median())

# 70/30 split, stratified on the target.
x_train,x_test,y_train,y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print("before sampling", Counter(y_train))

val_count = y_train.value_counts()
median_class_size = val_count.median()
print("median class size ", median_class_size)

# Experiment 1: oversample every class except the majority one.
sampled_data = RandomOverSampler(sampling_strategy="not majority", random_state=42)
x_os,y_os = sampled_data.fit_resample(x_train,y_train)

print("After Random Over Sampling to the majority class size", Counter(y_os))

# Experiment 2: move every class to the (integer-truncated) median class size.
# Classes below the median are oversampled, classes above it are undersampled.
median_target = int(median_class_size)
classes_below_median = {}
classes_above_median = {}
for target_class, class_size in val_count.items():
    if class_size < median_class_size:
        classes_below_median[target_class] = median_target
    elif class_size > median_class_size:
        classes_above_median[target_class] = median_target

sampled_data = RandomOverSampler(sampling_strategy=classes_below_median, random_state=42)
x_os,y_os = sampled_data.fit_resample(x_train,y_train)

# The undersampler consumes the output of the oversampling step above.
sampled_data = RandomUnderSampler(sampling_strategy=classes_above_median, random_state=42)
x_us,y_us = sampled_data.fit_resample(x_os,y_os)

print("After Random Over Sampling to median class size", Counter(y_os))
print("After Random Under Sampling to median class size", Counter(y_us))



1163.5
before sampling Counter({'Medium_No': 3144, 'High_No': 1882, 'Low_No': 1273, 'Medium_Yes': 356, 'High_Yes': 213, 'Low_Yes': 132})
median class size  814.5
After Random Over Sampling to the majority class size Counter({'Medium_No': 3144, 'Medium_Yes': 3144, 'Low_No': 3144, 'High_No': 3144, 'Low_Yes': 3144, 'High_Yes': 3144})
After Random Over Sampling to median class size Counter({'Medium_No': 3144, 'High_No': 1882, 'Low_No': 1273, 'Medium_Yes': 814, 'Low_Yes': 814, 'High_Yes': 814})
After Random Under Sampling to median class size Counter({'High_No': 814, 'High_Yes': 814, 'Low_No': 814, 'Low_Yes': 814, 'Medium_No': 814, 'Medium_Yes': 814})


In [47]:
# Roulette Wheel Technique for Resampling
#
# Each class gets a selection weight inversely proportional to its size in
# y_train, normalised into probabilities. One row at a time is drawn (with
# replacement) from the chosen class until every class has as many rows as
# the largest class. The outputs are:
#   x_resample - list of one-row DataFrames taken from x_train
#   y_resample - list of the matching class labels

val_count = dict(y_train.value_counts())

weight_class = {cls: (1.0/count) for cls,count in val_count.items()}
print("weights ",weight_class)
total_weight = sum(weight_class.values())
print("Total weight",total_weight)
probabilities = {cls: (wg/total_weight) for cls,wg in weight_class.items()}
print("probabilteies ",probabilities)


count_max = max(val_count.values())

# Dedicated, seeded generator: the resampled set is identical on every run
# and the global `random` state is left untouched.
rws_rng = random.Random(42)

# Split the training rows by class once instead of re-filtering per draw.
rows_by_class = {cls: x_train[y_train == cls] for cls in val_count}

# Running class sizes, updated per draw instead of recounting every label.
curr_count = Counter({cls: int(size) for cls, size in val_count.items()})

# Classes that still need rows. Drawing only among these, with their original
# weights, gives the same selection distribution as drawing among all classes
# and rejecting the ones that are already full, without the wasted draws.
open_classes = [cls for cls in probabilities if curr_count[cls] < count_max]

x_resample,y_resample = [],[]
while open_classes:
    choosen_class = rws_rng.choices(
        population=open_classes,
        weights=[probabilities[cls] for cls in open_classes],
    )[0]

    # Uniformly pick one existing row of the chosen class (kept as a 1-row frame).
    class_rows = rows_by_class[choosen_class]
    record = class_rows.iloc[[rws_rng.randrange(len(class_rows))]]
    x_resample.append(record)
    y_resample.append(choosen_class)

    curr_count[choosen_class] += 1
    if curr_count[choosen_class] >= count_max:
        open_classes.remove(choosen_class)



weights  {'Medium_No': np.float64(0.0003180661577608143), 'High_No': np.float64(0.0005313496280552603), 'Low_No': np.float64(0.0007855459544383347), 'Medium_Yes': np.float64(0.0028089887640449437), 'High_Yes': np.float64(0.004694835680751174), 'Low_Yes': np.float64(0.007575757575757576)}
Total weight 0.016714543760808102
probabilteies  {'Medium_No': np.float64(0.019029305394898596), 'High_No': np.float64(0.031789657896685), 'Low_No': np.float64(0.046997750323300225), 'Medium_Yes': np.float64(0.16805656225157634), 'High_Yes': np.float64(0.280883268364137), 'Low_Yes': np.float64(0.4532434557694029)}


In [48]:
# Append the roulette-wheel draws to the original training split and save
# the balanced result as a single CSV with the label in a "target" column.
x_balanced = pd.concat([x_train, *x_resample], ignore_index=True)
y_balanced = pd.concat([y_train, pd.Series(y_resample)], ignore_index=True)

# Both pieces were re-indexed from 0, so the labels line up row by row.
rws_df_train = x_balanced.assign(target=y_balanced)
rws_df_train.to_csv("Training Data (RWS Oversampling).csv", index=False)

print(y_balanced.value_counts())

Medium_No     3144
Medium_Yes    3144
Low_No        3144
High_No       3144
Low_Yes       3144
High_Yes      3144
Name: count, dtype: int64


In [49]:
# Roulette-wheel oversampling of the test split.
#
# Each class gets a selection weight inversely proportional to its size in
# y_test, normalised into probabilities. One row at a time is drawn (with
# replacement) from the chosen class until every class has as many rows as
# the largest class. The outputs are:
#   x_resample - list of one-row DataFrames taken from x_test
#   y_resample - list of the matching class labels
#
# NOTE(review): this balances the held-out split by duplicating its rows, so
# metrics computed on the result will not reflect the original class
# distribution - confirm that this is intended.

val_count = dict(y_test.value_counts())

weight_class = {cls: (1.0/count) for cls,count in val_count.items()}
print("weights ",weight_class)
total_weight = sum(weight_class.values())
print("Total weight",total_weight)
probabilities = {cls: (wg/total_weight) for cls,wg in weight_class.items()}
print("probabilteies ",probabilities)

count_max = max(val_count.values())

# Dedicated, seeded generator: the resampled set is identical on every run
# and the global `random` state is left untouched.
rws_rng = random.Random(42)

# Split the test rows by class once instead of re-filtering per draw.
rows_by_class = {cls: x_test[y_test == cls] for cls in val_count}

# Running class sizes, updated per draw instead of recounting every label.
curr_count = Counter({cls: int(size) for cls, size in val_count.items()})

# Classes that still need rows. Drawing only among these, with their original
# weights, gives the same selection distribution as drawing among all classes
# and rejecting the ones that are already full, without the wasted draws.
open_classes = [cls for cls in probabilities if curr_count[cls] < count_max]

x_resample,y_resample = [],[]
while open_classes:
    choosen_class = rws_rng.choices(
        population=open_classes,
        weights=[probabilities[cls] for cls in open_classes],
    )[0]

    # Uniformly pick one existing row of the chosen class (kept as a 1-row frame).
    class_rows = rows_by_class[choosen_class]
    record = class_rows.iloc[[rws_rng.randrange(len(class_rows))]]
    x_resample.append(record)
    y_resample.append(choosen_class)

    curr_count[choosen_class] += 1
    if curr_count[choosen_class] >= count_max:
        open_classes.remove(choosen_class)

weights  {'Medium_No': np.float64(0.000741839762611276), 'High_No': np.float64(0.0012391573729863693), 'Low_No': np.float64(0.001834862385321101), 'Medium_Yes': np.float64(0.006535947712418301), 'High_Yes': np.float64(0.01098901098901099), 'Low_Yes': np.float64(0.017857142857142856)}
Total weight 0.03919796107949089
probabilteies  {'Medium_No': np.float64(0.01892546811572351), 'High_No': np.float64(0.03161280175959764), 'Low_No': np.float64(0.046810148660541825), 'Medium_Yes': np.float64(0.16674203281042677), 'High_Yes': np.float64(0.280346494725223), 'Low_Yes': np.float64(0.45556305392848734)}


In [50]:
# Append the roulette-wheel draws to the original test split and save the
# balanced result as a single CSV with the label in a "target" column.
x_balanced = pd.concat([x_test, *x_resample], ignore_index=True)
y_balanced = pd.concat([y_test, pd.Series(y_resample)], ignore_index=True)

# Both pieces were re-indexed from 0, so the labels line up row by row.
# The frame keeps the name `rws_df_train` although it holds the test rows here.
rws_df_train = x_balanced.assign(target=y_balanced)
rws_df_train.to_csv("Testing Data (RWS Oversampling).csv", index=False)

print(y_balanced.value_counts())

Medium_Yes    1348
Medium_No     1348
High_No       1348
Low_No        1348
Low_Yes       1348
High_Yes      1348
Name: count, dtype: int64


In [23]:
print(df.columns)

# Class sizes of the target and the size of the smallest class.
valueCounts = dict(df["target"].value_counts())
minority_Class_freq = min(valueCounts.values())

# Features / label split.
X = df.drop(columns=["target"])
Y = df["target"]

# Replace every object-typed feature column with integer codes, keeping the
# fitted encoder of each column so the codes can be mapped back later.
encoders = {}
object_columns = X.select_dtypes(include=['object']).columns
for column_name in object_columns:
    column_encoder = LabelEncoder()
    column_as_text = X[column_name].astype(str)
    X[column_name] = column_encoder.fit_transform(column_as_text)
    encoders[column_name] = column_encoder

print(encoders)


Index(['Age_at_Marriage', 'Gender', 'Education_Level', 'Caste_Match',
       'Religion', 'Urban_Rural', 'Income_Level', 'Spouse_Working',
       'Inter-Caste', 'Inter-Religion', 'target'],
      dtype='object')
{'Gender': LabelEncoder(), 'Education_Level': LabelEncoder(), 'Caste_Match': LabelEncoder(), 'Religion': LabelEncoder(), 'Urban_Rural': LabelEncoder(), 'Income_Level': LabelEncoder(), 'Spouse_Working': LabelEncoder(), 'Inter-Caste': LabelEncoder(), 'Inter-Religion': LabelEncoder()}


In [26]:

# Cluster centroids Prototype undersampling
#
# NOTE(review): expects X to be the label-encoded feature frame - confirm the
# encoding cell has been run before this one.
print(Counter(Y))

# voting="hard" returns, for every k-means centroid, its nearest real row of X.
# The default ("auto", which resolves to "soft" for dense input) returns the
# centroids themselves: synthetic rows whose averaged label codes do not
# correspond to any real category.
cc = ClusterCentroids(voting="hard", random_state=42)
X_us,Y_us = cc.fit_resample(X,Y)

print(Counter(Y_us))

Counter({'Medium_No': 4492, 'High_No': 2689, 'Low_No': 1818, 'Medium_Yes': 509, 'High_Yes': 304, 'Low_Yes': 188})
Counter({'High_No': 188, 'High_Yes': 188, 'Low_No': 188, 'Low_Yes': 188, 'Medium_No': 188, 'Medium_Yes': 188})


In [27]:
# Encoded (integer-coded) version of the undersampled data with its label.
cc_undersapled_df = pd.DataFrame(X_us,columns=X.columns).assign(target=Y_us)

# Map every encoded column back to its original category values using the
# encoder that was fitted for that column.
decoded_X = X_us.assign(**{
    feature: encoder.inverse_transform(X_us[feature])
    for feature, encoder in encoders.items()
})

# `dataset` is the same object as `decoded_X`; the label column is added to it.
dataset = decoded_X
dataset["target"] = Y_us

print(dataset["target"].value_counts())

# Write out the decoded frame (human-readable categories), not the encoded one.
dataset.to_csv("Testing Data CC undersampling.csv", index=False)


target
High_No       188
High_Yes      188
Low_No        188
Low_Yes       188
Medium_No     188
Medium_Yes    188
Name: count, dtype: int64


In [29]:
# Persist the column -> fitted encoder dictionary with pickle so the same
# category encoding can be reloaded later.
with open('category_encoder.pkl','wb') as encoder_file:
    p.dump(encoders, encoder_file)

In [5]:
# Reload the raw dataset, report the target distribution and rebuild the
# combined "target" label.
df = pd.read_csv("../../data/processed/TrainingData/marriage_data_india.csv")

# Distribution of target data

# Every pair that can be formed from the values present in the two columns.
combinations = pd.MultiIndex.from_product(
    [df["Marital_Satisfaction"].unique(),df["Divorce_Status"].unique()],
    names=["Marital_Satisfaction","Divorce_Status"]
)

# groupby().size() only lists pairs that actually occur; reindexing over
# `combinations` makes a pair with no rows show up as 0 instead of being
# silently absent. sort_index() keeps the sorted key order that groupby yields.
pair_sizes = (
    df.groupby(["Marital_Satisfaction","Divorce_Status"])
    .size()
    .reindex(combinations, fill_value=0)
    .sort_index()
)
count = dict(pair_sizes)

print(count)

# One "<satisfaction>_<divorce>" class label per row.
df["target"] =  df["Marital_Satisfaction"].astype(str)+'_'+df["Divorce_Status"].astype(str)

# Rebind instead of mutating in place; the frame was freshly loaded above,
# so this cell can be re-run on its own.
df = df.drop(columns=["Marital_Satisfaction","Divorce_Status","ID"])
df.head()


{('High', 'No'): np.int64(2689), ('High', 'Yes'): np.int64(304), ('Low', 'No'): np.int64(1818), ('Low', 'Yes'): np.int64(188), ('Medium', 'No'): np.int64(4492), ('Medium', 'Yes'): np.int64(509)}


Unnamed: 0,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion,target
0,23,Male,Graduate,Different,Hindu,Urban,Middle,No,No,No,Medium_Yes
1,28,Female,School,Same,Hindu,Rural,Middle,No,No,Yes,Low_No
2,39,Male,Postgraduate,Same,Muslim,Rural,High,No,No,No,Medium_No
3,26,Female,School,Different,Hindu,Urban,High,No,Yes,No,Low_No
4,32,Female,Graduate,Same,Hindu,Rural,Middle,No,No,Yes,Medium_No


In [8]:
# Random undersampling: shrink every class to the size of the smallest one.
valueCounts = dict(df["target"].value_counts())
minority_Class_freq = min(valueCounts.values())
print(minority_Class_freq)

X = df.drop(columns=["target"])
Y = df["target"]

# Same requested size for every class label.
per_class_size = dict.fromkeys(valueCounts, minority_Class_freq)
undersampled_data = RandomUnderSampler(
    sampling_strategy=per_class_size,
    random_state=42,
)

X_us,Y_us = undersampled_data.fit_resample(X,Y)
print(Counter(Y_us))

# Attach the label to the resampled features and write the frame to CSV.
undersampled_data_frame = X_us.assign(target=Y_us)
undersampled_data_frame.to_csv("Testing data Undersampled(All).csv",index=False)

188
Counter({'High_No': 188, 'High_Yes': 188, 'Low_No': 188, 'Low_Yes': 188, 'Medium_No': 188, 'Medium_Yes': 188})
