In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import ttest_ind, chi2_contingency
import numpy as np
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler,ClusterCentroids
import random
import pickle as pk

In [53]:
# Load the processed marriage dataset and show its (rows, columns) dimensions.
dataset_csv = "../../data/processed/TrainingData/marriage_data_india.csv"
df = pd.read_csv(dataset_csv)
df.shape

(10000, 11)

In [35]:
# # Distribution of target data
# (Disabled cell.) Counts the rows for every (Marital_Satisfaction, Divorce_Status)
# pair; the dictionary printed by an earlier run is kept in the output below.
# NOTE(review): `combinations` is built but never used by the remaining lines.

# combinations = pd.MultiIndex.from_product(
#     [df["Marital_Satisfaction"].unique(),df["Divorce_Status"].unique()],
#     names=["Marital_Satisfaction","Divorce_Status"]
# )

# count = dict(df.groupby(["Marital_Satisfaction","Divorce_Status"]).size())

# print(count)

{('High', 'No'): np.int64(2689), ('High', 'Yes'): np.int64(304), ('Low', 'No'): np.int64(1818), ('Low', 'Yes'): np.int64(188), ('Medium', 'No'): np.int64(4492), ('Medium', 'Yes'): np.int64(509)}


In [36]:
# (Disabled cell.) Builds the combined label `target` as
# "<Marital_Satisfaction>_<Divorce_Status>" and drops the two source columns plus "ID".
# NOTE(review): the sampling cells read df["target"] straight after loading the CSV, so
# this step has presumably already been applied to the saved file - confirm before re-enabling.
# df["target"] =  df["Marital_Satisfaction"].astype(str)+'_'+df["Divorce_Status"].astype(str)
# df.drop(columns=["Marital_Satisfaction","Divorce_Status","ID"],inplace=True)
# df.head()

Unnamed: 0,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion,target
0,23,Male,Graduate,Different,Hindu,Urban,Middle,No,No,No,Medium_Yes
1,28,Female,School,Same,Hindu,Rural,Middle,No,No,Yes,Low_No
2,39,Male,Postgraduate,Same,Muslim,Rural,High,No,No,No,Medium_No
3,26,Female,School,Different,Hindu,Urban,High,No,Yes,No,Low_No
4,32,Female,Graduate,Same,Hindu,Rural,Middle,No,No,Yes,Medium_No


In [37]:
# (Disabled cell.) Quick checks: frame dimensions and the value counts of Spouse_Working.
# df.shape
# df["Spouse_Working"].value_counts()

Spouse_Working
Yes    5047
No     4953
Name: count, dtype: int64

In [50]:
# (Disabled cell.) Writes `df`, without the index column, to the same CSV path that the
# loading cell reads. NOTE(review): re-enabling this overwrites that file in place.
# df.to_csv("../../data/processed/TrainingData/marriage_data_india.csv", index=False)

In [19]:
# Stratified split followed by two resampling variants of the training part:
#   1. oversample every non-majority class up to the majority class size;
#   2. oversample the small classes and undersample the large ones so that
#      they meet at the (truncated) median class size.

x = df.drop(columns=["target"])
y = df["target"]
print((y.value_counts()).median())

# Hold out 20% of the rows; stratifying on y keeps the class mix in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("before sampling", Counter(y_train))

val_count = y_train.value_counts()
median_class_size = val_count.median()
print("median class size ", median_class_size)

# Variant 1: lift everything except the largest class to the largest class size.
sampled_data = RandomOverSampler(sampling_strategy="not majority", random_state=42)
x_os, y_os = sampled_data.fit_resample(x_train, y_train)

print("After Random Over Sampling to the majority class size", Counter(y_os))

# Variant 2: per-class targets. Classes sitting exactly on the median appear in
# neither mapping and are therefore left as they are.
median_target = int(median_class_size)
classes_below_median = val_count[val_count < median_class_size].index
classes_above_median = val_count[val_count > median_class_size].index
oversample_targets = dict.fromkeys(classes_below_median, median_target)
undersample_targets = dict.fromkeys(classes_above_median, median_target)

# Grow the small classes first (this replaces the variant-1 result in x_os / y_os) ...
sampled_data = RandomOverSampler(sampling_strategy=oversample_targets, random_state=42)
x_os, y_os = sampled_data.fit_resample(x_train, y_train)

# ... then shrink the large classes of that intermediate result.
sampled_data = RandomUnderSampler(sampling_strategy=undersample_targets, random_state=42)
x_us, y_us = sampled_data.fit_resample(x_os, y_os)

print("After Random Over Sampling to median class size", Counter(y_os))
print("After Random Under Sampling to median class size", Counter(y_us))



1163.5
before sampling Counter({'Medium_No': 3594, 'High_No': 2151, 'Low_No': 1454, 'Medium_Yes': 407, 'High_Yes': 243, 'Low_Yes': 151})
median class size  930.5
After Random Over Sampling to the majority class size Counter({'Medium_No': 3594, 'High_Yes': 3594, 'Low_No': 3594, 'High_No': 3594, 'Medium_Yes': 3594, 'Low_Yes': 3594})
After Random Over Sampling to median class size Counter({'Medium_No': 3594, 'High_No': 2151, 'Low_No': 1454, 'High_Yes': 930, 'Medium_Yes': 930, 'Low_Yes': 930})
After Random Under Sampling to median class size Counter({'High_No': 930, 'High_Yes': 930, 'Low_No': 930, 'Low_Yes': 930, 'Medium_No': 930, 'Medium_Yes': 930})


In [28]:
# Roulette Wheel Technique for Resampling
#
# Each class gets a slice of the wheel proportional to the inverse of its
# frequency, so rarer classes are drawn more often. Single rows are drawn (with
# replacement) until every class holds as many rows as the largest class.
# Results: `x_resample` (list of one-row DataFrames) and `y_resample` (list of
# the matching class labels).
# NOTE(review): the wheel is spun over the full `df`, not over a training split;
# a test set carved out of the same data afterwards will share rows with this
# output - confirm that this is intended.
x_train = df.drop(columns=["target"])
y_train = df["target"]

val_count = dict(y_train.value_counts())

weight_class = {cls: (1.0/count) for cls,count in val_count.items()}
print("weights ",weight_class)
total_weight = sum(weight_class.values())
print("Total weight",total_weight)
probabilities = {cls: (wg/total_weight) for cls,wg in weight_class.items()}
print("probabilteies ",probabilities)


count_max = max(val_count.values())

# Dedicated, seeded generator: a rerun reproduces the same resample and the
# global `random` state is left alone. 42 matches the seed used elsewhere here.
rng = random.Random(42)

# Filter the feature frame once per class instead of on every single draw.
rows_by_class = {cls: x_train[y_train == cls] for cls in val_count}

# Running class totals (original rows + rows drawn so far), updated per draw
# instead of recounting the whole label list on every iteration.
curr_count = Counter(val_count)

x_resample,y_resample = [],[]
while True:
    # Only classes still below the target stay on the wheel. This yields the
    # same draw distribution as spinning over all classes and discarding hits
    # on full ones, without the wasted iterations.
    open_classes = [cls for cls in probabilities if curr_count[cls] < count_max]
    if not open_classes:
        break

    choosen_class = rng.choices(
        population=open_classes,
        weights=[probabilities[cls] for cls in open_classes]
    )[0]
    record = rows_by_class[choosen_class].sample(
        n=1,
        replace=True,
        random_state=rng.randrange(2**32),  # per-draw seed derived from the seeded generator
    )
    x_resample.append(record)
    y_resample.append(choosen_class)
    curr_count[choosen_class] += 1



weights  {'Medium_No': np.float64(0.0002226179875333927), 'High_No': np.float64(0.0003718854592785422), 'Low_No': np.float64(0.00055005500550055), 'Medium_Yes': np.float64(0.0019646365422396855), 'High_Yes': np.float64(0.003289473684210526), 'Low_Yes': np.float64(0.005319148936170213)}
Total weight 0.011717817614932909
probabilteies  {'Medium_No': np.float64(0.01899824650365727), 'High_No': np.float64(0.031736750946235945), 'Low_No': np.float64(0.04694176198813446), 'Medium_Yes': np.float64(0.16766232474347437), 'High_Yes': np.float64(0.28072408978430413), 'Low_Yes': np.float64(0.4539368260341939)}


In [4]:
# Stack the original rows and the roulette-wheel draws into one balanced set.
feature_frames = [x_train, *x_resample]
x_balanced = pd.concat(feature_frames, ignore_index=True)

label_series = [y_train, pd.Series(y_resample)]
y_balanced = pd.concat(label_series, ignore_index=True)

# Re-attach the label column and write the balanced training set to disk.
rws_df_train = pd.DataFrame(x_balanced, columns=x_balanced.columns)
rws_df_train["target"] = y_balanced
rws_df_train.to_csv("Training Data (RWS Oversampling).csv", index=False)

# print(x_balanced)
# Cell output: class counts after balancing.
y_balanced.value_counts()

# # print("Count after appling roulette wheel",Counter(x_balanced),Counter(y_balanced))

In [3]:
# Compare the original data with the RWS-oversampled data, feature by feature:
# Welch's t-test (equal_var=False) for numeric columns and a chi-square test on
# the category-by-sample contingency table for categorical columns.
# NOTE(review): the oversampled file is written from the original rows plus
# resampled copies of them, so the two samples overlap, while both tests assume
# independent samples - treat the p-values as a rough drift indicator and
# confirm the methodology.
df_original = pd.read_csv("../../data/processed/TrainingData/marriage_data_india.csv")
# Relative path instead of a machine-specific absolute one; this is the file
# name the notebook writes its RWS output under in the working directory.
df_oversampled = pd.read_csv("Training Data (RWS Oversampling).csv")

# Identify column types
numerical_cols = df_original.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df_original.select_dtypes(include=['object', 'category']).columns.tolist()

print("--- T-Test Calculations (RWS vs. Original) ---")
for col in numerical_cols:
    data_orig = df_original[col].dropna()
    data_rws = df_oversampled[col].dropna()
    t_stat, p_value = ttest_ind(data_orig, data_rws, equal_var=False)
    print(f"Feature: {col}")
    print(f"  T-Statistic: {t_stat:.4f}")
    print(f"  P-Value: {p_value:.4f}")
    print(f"  Means: Original={data_orig.mean():.2f}, RWS={data_rws.mean():.2f}")

# `target` may already be among the object columns; de-duplicate (keeping the
# original order) so that it is tested and reported exactly once.
chi_square_cols = list(dict.fromkeys(categorical_cols + ['target']))

print("\n--- Chi-Square Calculations (RWS vs. Original) ---")
for col in chi_square_cols:
    # Long-format frame: one row per record, labelled with the sample it came from.
    combined_df = pd.DataFrame({
        col: pd.concat([df_original[col], df_oversampled[col]]),
        'Sample_Group': ['Original'] * len(df_original) + ['RWS Oversampled'] * len(df_oversampled)
    })

    # Rows = category values, columns = the two sample groups.
    contingency_table = pd.crosstab(combined_df[col], combined_df['Sample_Group'])

    # Handle zero counts by adding a tiny epsilon for chi2_contingency
    if contingency_table.min().min() == 0:
        contingency_table += 1e-10

    chi2, p_value, dof, expected = chi2_contingency(contingency_table)

    print(f"Feature: {col}")
    print(f"  Chi2-Statistic: {chi2:.4f}")
    print(f"  P-Value: {p_value:.4f}")
    print(f"  Degrees of Freedom (DOF): {dof}")

--- T-Test Calculations (RWS vs. Original) ---
Feature: Age_at_Marriage
  T-Statistic: -2.3490
  P-Value: 0.0188
  Means: Original=28.50, RWS=28.68

--- Chi-Square Calculations (RWS vs. Original) ---
Feature: Gender
  Chi2-Statistic: 2.2658
  P-Value: 0.1323
  Degrees of Freedom (DOF): 1
Feature: Education_Level
  Chi2-Statistic: 1.8226
  P-Value: 0.6100
  Degrees of Freedom (DOF): 3
Feature: Caste_Match
  Chi2-Statistic: 0.6957
  P-Value: 0.4042
  Degrees of Freedom (DOF): 1
Feature: Religion
  Chi2-Statistic: 4.9215
  P-Value: 0.2955
  Degrees of Freedom (DOF): 4
Feature: Urban_Rural
  Chi2-Statistic: 0.8140
  P-Value: 0.3670
  Degrees of Freedom (DOF): 1
Feature: Income_Level
  Chi2-Statistic: 7.8564
  P-Value: 0.0197
  Degrees of Freedom (DOF): 2
Feature: Spouse_Working
  Chi2-Statistic: 0.3923
  P-Value: 0.5311
  Degrees of Freedom (DOF): 1
Feature: Inter-Caste
  Chi2-Statistic: 3.5725
  P-Value: 0.0587
  Degrees of Freedom (DOF): 1
Feature: Inter-Religion
  Chi2-Statistic: 5.8018

In [1]:
# Same RWS-vs-original comparison, run against training_data_8000_v4.csv.
# Needs pd, np, ttest_ind and chi2_contingency from the notebook's import cell.
# NOTE(review): the NameError recorded below this cell presumably comes from
# running it on a fresh kernel before the import cell - confirm.


def _print_welch_t_test(original, resampled, col):
    """Print Welch's t-test (equal_var=False) and both means for one numeric column.

    Args:
        original: DataFrame holding the reference sample.
        resampled: DataFrame holding the oversampled sample.
        col: Name of a column present in both frames; missing values are dropped.
    """
    data_orig = original[col].dropna()
    data_rws = resampled[col].dropna()
    t_stat, p_value = ttest_ind(data_orig, data_rws, equal_var=False)
    print(f"Feature: {col}")
    print(f"  T-Statistic: {t_stat:.4f}")
    print(f"  P-Value: {p_value:.4f}")
    print(f"  Means: Original={data_orig.mean():.2f}, RWS={data_rws.mean():.2f}")


def _print_chi_square(original, resampled, col):
    """Print the chi-square test of category-vs-sample-group for one column.

    Args:
        original: DataFrame holding the reference sample.
        resampled: DataFrame holding the oversampled sample.
        col: Name of a categorical column present in both frames.
    """
    # Long-format frame: one row per record, labelled with the sample it came from.
    combined_df = pd.DataFrame({
        col: pd.concat([original[col], resampled[col]]),
        'Sample_Group': ['Original'] * len(original) + ['RWS Oversampled'] * len(resampled)
    })

    contingency_table = pd.crosstab(combined_df[col], combined_df['Sample_Group'])

    # Handle zero counts by adding a tiny epsilon for chi2_contingency
    if contingency_table.min().min() == 0:
        contingency_table += 1e-10

    chi2, p_value, dof, expected = chi2_contingency(contingency_table)

    print(f"Feature: {col}")
    print(f"  Chi2-Statistic: {chi2:.4f}")
    print(f"  P-Value: {p_value:.4f}")
    print(f"  Degrees of Freedom (DOF): {dof}")


df_original = pd.read_csv("../../data/processed/TrainingData/training_data_8000_v4.csv")
df_oversampled = pd.read_csv("Training Data (RWS Oversampling).csv")

# Identify column types
numerical_cols = df_original.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df_original.select_dtypes(include=['object', 'category']).columns.tolist()

print("--- T-Test Calculations (RWS vs. Original) ---")
for col in numerical_cols:
    _print_welch_t_test(df_original, df_oversampled, col)

# `target` may already be among the object columns; de-duplicate (keeping the
# original order) so that it is tested and reported exactly once.
chi_square_cols = list(dict.fromkeys(categorical_cols + ['target']))

print("\n--- Chi-Square Calculations (RWS vs. Original) ---")
for col in chi_square_cols:
    _print_chi_square(df_original, df_oversampled, col)

NameError: name 'pd' is not defined

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Visual comparison of the original and the RWS-oversampled data. Three figures
# are written as PNG files into the working directory; nothing is shown inline
# because every figure is closed right after saving.
# NOTE(review): this cell reads training_data_8000_v4.csv from the working
# directory, while the statistical-test cell reads a file of the same name from
# ../../data/processed/TrainingData/ - confirm both point at the same data.


def _percent_by_source(frame, column):
    """Return the per-`Source` percentage of each value of `column`.

    Args:
        frame: DataFrame with a 'Source' column and the column to summarise.
        column: Name of the categorical column whose shares are computed.

    Returns:
        Long-format DataFrame with the columns 'Source', `column` and
        'Proportion' (percentages that add up to 100 within each source).
    """
    return (
        frame.groupby('Source')[column]
        .value_counts(normalize=True)
        .mul(100)
        .rename('Proportion')
        .reset_index()
    )


df_original = pd.read_csv("training_data_8000_v4.csv")
df_oversampled = pd.read_csv("Training Data (RWS Oversampling).csv")

# Tag each row with its origin so that both samples can share one long frame.
df_original['Source'] = 'Original (Representative)'
df_oversampled['Source'] = 'RWS Oversampled (Balanced)'
df_combined = pd.concat([df_original, df_oversampled], ignore_index=True)

# --- Figure 1: age at marriage, box plot per source -------------------------
plt.figure(figsize=(8, 6))
sns.boxplot(
    data=df_combined,
    x='Source',
    y='Age_at_Marriage',
    # seaborn deprecated `palette` without `hue`; mapping hue to the x variable
    # with the legend switched off gives the same colouring without the warning.
    hue='Source',
    palette=['#4c72b0', '#55a868'],
    legend=False
)
plt.title('Age at Marriage Distribution: Original vs. RWS', fontsize=14)
plt.xlabel('Data Source', fontsize=12)
plt.ylabel('Age at Marriage (Years)', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.savefig("visual_justification_age_at_marriage.png")
plt.close()

# --- Figure 2: income level shares per source -------------------------------
income_counts = _percent_by_source(df_combined, 'Income_Level')

plt.figure(figsize=(10, 6))
sns.barplot(
    data=income_counts,
    x='Income_Level',
    y='Proportion',
    hue='Source',
    palette=['#4c72b0', '#55a868']
)
plt.title('Income Level Distribution: Original vs. RWS', fontsize=14)
plt.xlabel('Income Level', fontsize=12)
plt.ylabel('Proportion (%)', fontsize=12)
plt.legend(title='Data Source')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig("visual_justification_income_level.png")
plt.close()

# --- Figure 3: target class shares per source -------------------------------
target_counts = _percent_by_source(df_combined, 'target')

plt.figure(figsize=(12, 6))
sns.barplot(
    data=target_counts,
    x='target',
    y='Proportion',
    hue='Source',
    palette=['#4c72b0', '#55a868']
)
plt.title('Target Variable Distribution: Original vs. RWS Oversampled', fontsize=14)
plt.xlabel('Target Class', fontsize=12)
plt.ylabel('Proportion (%)', fontsize=12)
plt.legend(title='Data Source', loc='upper right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig("visual_justification_target.png")
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(


In [None]:
# Carve a stratified hold-out test set out of the original data and save it as CSV.
df_original = pd.read_csv("../../data/processed/TrainingData/marriage_data_india.csv")

# Features are every column except the label. `X` was not defined anywhere
# before this cell, so running it on a fresh kernel would raise NameError.
X = df_original.drop(columns=['target'])
y = df_original['target']

X_train_pool, X_new_testing, y_train_pool, y_new_testing = train_test_split(
    X, y,
    test_size=0.25,      # 25% of the rows go into the test set
    random_state=42,     # fixed seed so the same rows are selected on every run
    stratify=y           # Crucially, use stratification to preserve the original target distribution
)

# Put features and label back side by side for export.
df_new_testing = pd.concat([X_new_testing, y_new_testing], axis=1)

new_testing_file_name = "Testing Data (Representative).csv"
df_new_testing.to_csv(new_testing_file_name, index=False)

print(f"New Representative Testing Data saved as: {new_testing_file_name}")
print(f"New Testing Sample Size: {df_new_testing.shape[0]} rows")
# NOTE(review): the RWS training file recommended below is built from every row
# of this same CSV, so the rows saved here also occur in that training data.
# Evaluation on this test set is then not on unseen data - confirm, or rebuild
# the RWS file from X_train_pool / y_train_pool only.
print("\n--- Next Steps ---")
print("1. Use your existing 'Training Data (RWS Oversampling).csv' for training.")
print(f"2. Use the new '{new_testing_file_name}' for testing and final model evaluation.")