In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler,ClusterCentroids
import random
import pickle as pk

In [53]:
# Read the processed marriage dataset and show its (rows, columns) shape.
training_csv_path = "../../data/processed/TrainingData/marriage_data_india.csv"
df = pd.read_csv(training_csv_path)
df.shape

(10000, 11)

In [35]:
# # Distribution of target data
# NOTE(review): this whole cell is disabled. It groups by Marital_Satisfaction
# and Divorce_Status; the later cells read a combined "target" column instead,
# so presumably this was run once on the raw columns - confirm before
# re-enabling. `combinations` is built but not used by the lines below it.

# combinations = pd.MultiIndex.from_product(
#     [df["Marital_Satisfaction"].unique(),df["Divorce_Status"].unique()],
#     names=["Marital_Satisfaction","Divorce_Status"]
# )

# count = dict(df.groupby(["Marital_Satisfaction","Divorce_Status"]).size())

# print(count)

{('High', 'No'): np.int64(2689), ('High', 'Yes'): np.int64(304), ('Low', 'No'): np.int64(1818), ('Low', 'Yes'): np.int64(188), ('Medium', 'No'): np.int64(4492), ('Medium', 'Yes'): np.int64(509)}


In [36]:
# Disabled one-off preprocessing: joins Marital_Satisfaction and Divorce_Status
# with "_" into a single "target" label, then drops the two source columns and ID.
# NOTE(review): presumably already applied to the CSV on disk - confirm before re-enabling.
# df["target"] =  df["Marital_Satisfaction"].astype(str)+'_'+df["Divorce_Status"].astype(str)
# df.drop(columns=["Marital_Satisfaction","Divorce_Status","ID"],inplace=True)
# df.head()

Unnamed: 0,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion,target
0,23,Male,Graduate,Different,Hindu,Urban,Middle,No,No,No,Medium_Yes
1,28,Female,School,Same,Hindu,Rural,Middle,No,No,Yes,Low_No
2,39,Male,Postgraduate,Same,Muslim,Rural,High,No,No,No,Medium_No
3,26,Female,School,Different,Hindu,Urban,High,No,Yes,No,Low_No
4,32,Female,Graduate,Same,Hindu,Rural,Middle,No,No,Yes,Medium_No


In [37]:
# Disabled exploratory check: frame shape and the Spouse_Working value counts.
# df.shape
# df["Spouse_Working"].value_counts()

Spouse_Working
Yes    5047
No     4953
Name: count, dtype: int64

In [50]:
# Disabled on purpose: this would overwrite the same CSV that the load cell reads.
# df.to_csv("../../data/processed/TrainingData/marriage_data_india.csv", index=False)

In [19]:
# Stratified train/test split, then random over- and under-sampling of the
# training portion only.

y = df["target"]
x = df.drop(columns=["target"])
print(y.value_counts().median())

# Hold out 20% of the rows, keeping the class proportions of `y` in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=42
)
print("before sampling", Counter(y_train))

val_count = y_train.value_counts()
median_class_size = val_count.median()
print("median class size ", median_class_size)

# Variant 1: oversample every class except the largest one.
sampled_data = RandomOverSampler(random_state=42, sampling_strategy="not majority")
x_os, y_os = sampled_data.fit_resample(x_train, y_train)
print("After Random Over Sampling to the majority class size", Counter(y_os))

# Variant 2: move every class to the median class size (truncated to an int).
# x_os / y_os from variant 1 are replaced here.
median_target = int(median_class_size)
below_median = val_count[val_count < median_class_size].index
above_median = val_count[val_count > median_class_size].index

# Step 2a: grow the classes that sit below the median.
sampled_data = RandomOverSampler(
    sampling_strategy=dict.fromkeys(below_median, median_target),
    random_state=42,
)
x_os, y_os = sampled_data.fit_resample(x_train, y_train)

# Step 2b: shrink the classes that sit above the median, starting from the
# already-oversampled data.
sampled_data = RandomUnderSampler(
    sampling_strategy=dict.fromkeys(above_median, median_target),
    random_state=42,
)
x_us, y_us = sampled_data.fit_resample(x_os, y_os)

print("After Random Over Sampling to median class size", Counter(y_os))
print("After Random Under Sampling to median class size", Counter(y_us))



1163.5
before sampling Counter({'Medium_No': 3594, 'High_No': 2151, 'Low_No': 1454, 'Medium_Yes': 407, 'High_Yes': 243, 'Low_Yes': 151})
median class size  930.5
After Random Over Sampling to the majority class size Counter({'Medium_No': 3594, 'High_Yes': 3594, 'Low_No': 3594, 'High_No': 3594, 'Medium_Yes': 3594, 'Low_Yes': 3594})
After Random Over Sampling to median class size Counter({'Medium_No': 3594, 'High_No': 2151, 'Low_No': 1454, 'High_Yes': 930, 'Medium_Yes': 930, 'Low_Yes': 930})
After Random Under Sampling to median class size Counter({'High_No': 930, 'High_Yes': 930, 'Low_No': 930, 'Low_Yes': 930, 'Medium_No': 930, 'Medium_Yes': 930})


In [28]:
# Roulette Wheel Technique for Resampling
#
# Each class gets a wheel slice proportional to 1 / class_count, so rarer
# classes are drawn more often. Rows are drawn (with replacement) one at a time
# until every class has as many rows as the largest class.
#
# Outputs used by the next cell:
#   x_resample - list of one-row DataFrames (the drawn feature rows)
#   y_resample - list of class labels, aligned with x_resample
#
# Note: this rebinds x_train / y_train to the FULL dataset, not the train split
# produced by the stratified-sampling cell.
x_train = df.drop(columns=["target"])
y_train = df["target"]

val_count = dict(y_train.value_counts())

weight_class = {cls: (1.0/count) for cls,count in val_count.items()}
print("weights ",weight_class)
total_weight = sum(weight_class.values())
print("Total weight",total_weight)
probabilities = {cls: (wg/total_weight) for cls,wg in weight_class.items()}
print("probabilteies ",probabilities)


count_max = max(val_count.values())

# One seeded generator drives both the class draw and the row draw, so the
# cell gives the same result on every run (same seed value as the samplers in
# the other cells).
rws_rng = random.Random(42)

# Slice the feature frame once per class instead of re-masking on every draw.
rows_by_class = {cls: x_train[y_train == cls] for cls in val_count}

# Running per-class totals (original rows + draws so far); updated in place
# rather than recounting the whole label list on each iteration.
curr_count = Counter({cls: int(count) for cls, count in val_count.items()})

x_resample,y_resample = [],[]

# Only classes still below count_max stay on the wheel. Drawing among them with
# their original weights is distributionally the same as drawing from all
# classes and discarding hits on full ones, without the wasted iterations.
open_classes = [cls for cls in probabilities if curr_count[cls] < count_max]
while open_classes:
    choosen_class = rws_rng.choices(
        population=open_classes,
        weights=[probabilities[cls] for cls in open_classes]
    )[0]

    # Uniformly pick one existing row of the chosen class; kept as a one-row
    # DataFrame so it can be concatenated later.
    class_rows = rows_by_class[choosen_class]
    record = class_rows.iloc[[rws_rng.randrange(len(class_rows))]]
    x_resample.append(record)
    y_resample.append(choosen_class)

    curr_count[choosen_class] += 1
    if curr_count[choosen_class] >= count_max:
        open_classes.remove(choosen_class)



weights  {'Medium_No': np.float64(0.0002226179875333927), 'High_No': np.float64(0.0003718854592785422), 'Low_No': np.float64(0.00055005500550055), 'Medium_Yes': np.float64(0.0019646365422396855), 'High_Yes': np.float64(0.003289473684210526), 'Low_Yes': np.float64(0.005319148936170213)}
Total weight 0.011717817614932909
probabilteies  {'Medium_No': np.float64(0.01899824650365727), 'High_No': np.float64(0.031736750946235945), 'Low_No': np.float64(0.04694176198813446), 'Medium_Yes': np.float64(0.16766232474347437), 'High_Yes': np.float64(0.28072408978430413), 'Low_Yes': np.float64(0.4539368260341939)}


In [29]:
# Stack the original rows and the roulette-wheel draws into one balanced set,
# renumbering the index from 0.
y_balanced = pd.concat([y_train, pd.Series(y_resample)], ignore_index=True)
x_balanced = pd.concat([x_train, *x_resample], ignore_index=True)

# Attach the labels as a "target" column on a separate frame and write it out.
rws_df_train = x_balanced.assign(target=y_balanced)
rws_df_train.to_csv("Training Data (RWS Oversampling).csv", index=False)

print(x_balanced)

# print("Count after appling roulette wheel",Counter(x_balanced),Counter(y_balanced))

       Age_at_Marriage  Gender Education_Level Caste_Match Religion  \
0                   23    Male        Graduate   Different    Hindu   
1                   28  Female          School        Same    Hindu   
2                   39    Male    Postgraduate        Same   Muslim   
3                   26  Female          School   Different    Hindu   
4                   32  Female        Graduate        Same    Hindu   
...                ...     ...             ...         ...      ...   
26947               32  Female        Graduate        Same    Hindu   
26948               22  Female          School   Different     Sikh   
26949               33    Male          School        Same     Sikh   
26950               33  Female          School        Same   Muslim   
26951               33  Female        Graduate        Same     Sikh   

      Urban_Rural Income_Level Spouse_Working Inter-Caste Inter-Religion  
0           Urban       Middle             No          No             No

In [80]:
# Reload the dataset and label-encode every object-typed feature column.
df = pd.read_csv("../../data/processed/TrainingData/marriage_data_india.csv")

Y_train = df["target"]
X_train = df.drop(columns=["target"])

val_count = Y_train.value_counts()
median_class_size = val_count.median()

# One fitted LabelEncoder per categorical column, keyed by column name, so the
# integer codes can be mapped back to the original strings later.
encoders = {}
categorical_columns = X_train.select_dtypes(include=['object']).columns
for col in categorical_columns:
    encoders[col] = LabelEncoder()
    X_train[col] = encoders[col].fit_transform(X_train[col].astype(str))

print(encoders)


{'Gender': LabelEncoder(), 'Education_Level': LabelEncoder(), 'Caste_Match': LabelEncoder(), 'Religion': LabelEncoder(), 'Urban_Rural': LabelEncoder(), 'Income_Level': LabelEncoder(), 'Spouse_Working': LabelEncoder(), 'Inter-Caste': LabelEncoder(), 'Inter-Religion': LabelEncoder()}


In [81]:
# Cluster centroids Prototype undersampling
#
# Shrinks every class larger than the cap down to the cap; classes at or below
# the cap are not listed in the strategy. X_train / Y_train are rebound to the
# resampled data.
#
# NOTE(review): the features are label-encoded categoricals at this point, so
# cluster centroids are computed over arbitrary integer codes - confirm this is
# the intended way to synthesise prototype rows.

# Class distribution of the data that is about to be resampled (Y_train, the
# labels loaded together with X_train - not y_train from the earlier cells).
print(Counter(Y_train))

# Cap for the over-represented classes; 509 matches the Medium_Yes count in
# the distribution printed by this cell.
cc_target_size = 509
strategy = {
    target_class: cc_target_size
    for target_class, count in val_count.items()
    if count > cc_target_size
}
cc = ClusterCentroids(sampling_strategy=strategy,random_state=42)
X_train,Y_train = cc.fit_resample(X_train,Y_train)

print(Counter(Y_train))

Counter({'Medium_No': 4492, 'High_No': 2689, 'Low_No': 1818, 'Medium_Yes': 509, 'High_Yes': 304, 'Low_Yes': 188})
Counter({'High_No': 509, 'Low_No': 509, 'Medium_No': 509, 'Medium_Yes': 509, 'High_Yes': 304, 'Low_Yes': 188})


In [82]:
# Combine the undersampled (still label-encoded) features with their labels.
train_dataset = X_train.assign(target=Y_train)

# NOTE(review): the decoding cell further down writes to this same file name,
# so the encoded values saved here are replaced once that cell runs.
train_dataset.to_csv("Testing Data CC undersampling.csv", index=False)

# Displayed as the cell result: rows per class after undersampling.
train_dataset["target"].value_counts()

target
High_No       509
Low_No        509
Medium_Yes    509
Medium_No     509
High_Yes      304
Low_Yes       188
Name: count, dtype: int64

In [83]:
# Map the integer codes back to the original category strings, working on a
# copy so X_train keeps its encoded values.
decoded_X_train = X_train.copy()
for col in encoders:
    decoded_X_train[col] = encoders[col].inverse_transform(decoded_X_train[col])

# `dataset` is the same object as decoded_X_train, now carrying the labels too.
decoded_X_train["target"] = Y_train
dataset = decoded_X_train

print(dataset["target"].value_counts())

# Save the decoded dataset, not the encoded one!
dataset.to_csv("Testing Data CC undersampling.csv", index=False)

# print("File saved successfully with decoded categorical values")


target
High_No       509
Low_No        509
Medium_Yes    509
Medium_No     509
High_Yes      304
Low_Yes       188
Name: count, dtype: int64


In [87]:
# Persist the per-column label encoders so the same mapping can be reused later.
with open('category_encoder.pkl', mode='wb') as encoder_file:
    pk.dump(encoders, encoder_file)