In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the full dataset that the held-out test set will be carved from.
df_original = pd.read_csv("../../data/processed/TrainingData/marriage_data_india.csv")

# The oversampled training file ("Training Data (RWS Oversampling).csv") is
# deliberately not loaded or modified by this cell.
# NOTE(review): if that oversampled file was produced from this same dataset,
# rows of the held-out set below may also be present in it -- confirm before
# treating this split as unseen data.

# 'target' is the label; every other column is a feature.
y = df_original['target']
X = df_original.drop(columns=['target'])

# Hold out 25% of the rows. Stratifying on the label keeps the held-out class
# proportions in line with the full dataset; the fixed seed makes the split
# repeatable. The train-pool halves are kept for later use.
split_options = dict(test_size=0.25, random_state=42, stratify=y)
X_train_pool, X_new_testing, y_train_pool, y_new_testing = train_test_split(
    X, y, **split_options
)

# Re-attach the label as the last column of the held-out frame.
df_new_testing = pd.concat([X_new_testing, y_new_testing], axis=1)

# Write the held-out set to disk without the index column.
new_testing_file_name = "Testing Data (Representative).csv"
df_new_testing.to_csv(new_testing_file_name, index=False)

# Report what was written and how it is meant to be used.
summary_lines = (
    f"New Representative Testing Data saved as: {new_testing_file_name}",
    f"New Testing Sample Size: {df_new_testing.shape[0]} rows",
    "\n--- Next Steps ---",
    "1. Use your existing 'Training Data (RWS Oversampling).csv' for training.",
    f"2. Use the new '{new_testing_file_name}' for testing and final model evaluation.",
)
for summary_line in summary_lines:
    print(summary_line)

New Representative Testing Data saved as: Testing Data (Representative).csv
New Testing Sample Size: 2500 rows

--- Next Steps ---
1. Use your existing 'Training Data (RWS Oversampling).csv' for training.
2. Use the new 'Testing Data (Representative).csv' for testing and final model evaluation.


# Define the order for ordinal features (important for CatBoost to understand ranking)
# --- 0. Configuration ---
# Define categorical features based on your data structure

In [74]:
# Seed reused by the train/test splitter and the CatBoost model.
RANDOM_SEED = 42

# Raw CSV that the loading cell reads.
FILE_PATH = '../../data/raw/TrainingData/marriage_data_india.csv'

# Feature groups by type.
NUMERIC_COLS = ['Age_at_Marriage']
ORDINAL_COLS = ['Education_Level', 'Income_Level']
# NOTE(review): 'Marriage_Type' is dropped from the frame in a later cell --
# confirm whether it should still be listed here.
NOMINAL_COLS = [
    'Marriage_Type',
    'Gender',
    'Caste_Match',
    'Religion',
    'Urban_Rural',
    'Spouse_Working',
    'Inter-Caste',
    'Inter-Religion',
]

# Category order for each ordinal feature; list position becomes the encoded rank.
EDUCATION_ORDER = ['School', 'Graduate', 'Postgraduate', 'PhD']
INCOME_ORDER = ['Low', 'Middle', 'High']

# One entry per column in ORDINAL_COLS, in the same order.
CATEGORIES_ORDER = [EDUCATION_ORDER, INCOME_ORDER]


# --- 1. Data Loading and Target Redefinition (Feature Engineering) ---
# Step 1: Create the new binary target 'Match_Success'
# Success (1): High/Medium Satisfaction AND No Divorce
# Failure (0): Low Satisfaction OR Yes Divorce

In [75]:
df = pd.read_csv(FILE_PATH)

# A row is a success (1) only when satisfaction is High or Medium AND there was
# no divorce; every other combination is a failure (0).
is_satisfied = df['Marital_Satisfaction'].isin(['High', 'Medium'])
is_not_divorced = df['Divorce_Status'].eq('No')
df['Match_Success'] = np.where(is_satisfied & is_not_divorced, 1, 0)

df.head()


Unnamed: 0,ID,Marriage_Type,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Parental_Approval,Urban_Rural,Dowry_Exchanged,Marital_Satisfaction,Divorce_Status,Children_Count,Income_Level,Years_Since_Marriage,Spouse_Working,Inter-Caste,Inter-Religion,Match_Success
0,1,Love,23,Male,Graduate,Different,Hindu,No,Urban,No,Medium,Yes,5,Middle,34,No,No,No,0
1,2,Love,28,Female,School,Same,Hindu,Yes,Rural,Yes,Low,No,3,Middle,42,No,No,Yes,0
2,3,Arranged,39,Male,Postgraduate,Same,Muslim,Yes,Rural,No,Medium,No,0,High,25,No,No,No,1
3,4,Arranged,26,Female,School,Different,Hindu,Yes,Urban,Yes,Low,No,0,High,12,No,Yes,No,0
4,5,Love,32,Female,Graduate,Same,Hindu,Partial,Rural,Yes,Medium,No,1,Middle,41,No,No,Yes,1


# Clean the target variable: Handle cases where the original 'target' might be corrupted
# This assumes the combined satisfaction/divorce status is the ultimate source.

# Drop features that cause data leakage or are irrelevant for prediction

In [76]:
# Columns removed before modelling. 'Marital_Satisfaction' and 'Divorce_Status'
# are the inputs the Match_Success label was built from, so keeping them would
# leak the label. errors='ignore' skips any listed name that is not in the frame.
columns_to_drop = [
    'ID',
    'Children_Count',
    'Years_Since_Marriage',
    'Parental_Approval',
    'Marriage_Type',
    'Dowry_Exchanged',
    'Marital_Satisfaction',
    'Divorce_Status',
    'target',
]
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()

Unnamed: 0,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion,Match_Success
0,23,Male,Graduate,Different,Hindu,Urban,Middle,No,No,No,0
1,28,Female,School,Same,Hindu,Rural,Middle,No,No,Yes,0
2,39,Male,Postgraduate,Same,Muslim,Rural,High,No,No,No,1
3,26,Female,School,Different,Hindu,Urban,High,No,Yes,No,0
4,32,Female,Graduate,Same,Hindu,Rural,Middle,No,No,Yes,1


# --- 2. Train/Test Split Generator and Statistical Check ---
# Use StratifiedShuffleSplit to create the split indices

# Note: StratifiedShuffleSplit stratifies on the target only, so it preserves the target
# class proportions in both splits; the distributions of the other features are not explicitly controlled.
# The printed balance confirms that the train and test target proportions match the original data.

In [77]:
# Preview the first rows of the cleaned frame again before splitting
# (same call as the last line of the previous cell).
df.head()

Unnamed: 0,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Income_Level,Spouse_Working,Inter-Caste,Inter-Religion,Match_Success
0,23,Male,Graduate,Different,Hindu,Urban,Middle,No,No,No,0
1,28,Female,School,Same,Hindu,Rural,Middle,No,No,Yes,0
2,39,Male,Postgraduate,Same,Muslim,Rural,High,No,No,No,1
3,26,Female,School,Different,Hindu,Urban,High,No,Yes,No,0
4,32,Female,Graduate,Same,Hindu,Rural,Middle,No,No,Yes,1


In [78]:
target_col = "Match_Success"
X = df.drop(columns=[target_col])
y = df[target_col]

# One stratified 80/20 split; stratifying on y keeps the class proportions of
# the target the same in the train and test partitions.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_SEED)
train_idx, test_idx = next(splitter.split(X, y))

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Inverse-frequency class weights computed from the training labels only:
#   weight(cls) = n_samples / (n_classes * n_samples_in_cls)
class_counts = Counter(y_train)
total_samples = len(y_train)
class_weights = {
    cls: total_samples / (len(class_counts) * count)
    for cls, count in class_counts.items()
}


def _target_balance_pct(labels):
    """Return ``{class: share of rows in percent}`` rounded to 2 decimals.

    Interpolating the raw Series into an f-string printed its multi-line repr
    and glued the trailing '%' onto the dtype footer; a plain dict keeps each
    report on a single readable line.
    """
    return labels.value_counts(normalize=True).mul(100).round(2).to_dict()


print("\n--- Split Verification ---")
print(f"Original Target Balance (%): {_target_balance_pct(y)}")
print(f"Train Target Balance (%):    {_target_balance_pct(y_train)}")
print(f"Test Target Balance (%):     {_target_balance_pct(y_test)}")
print(f"Training set size: {len(X_train)}, Testing set size: {len(X_test)}")
print(f"Class Weights for Model: {class_weights}")




--- Split Verification ---
Original Target Balance: Match_Success
1    71.81
0    28.19
Name: proportion, dtype: float64%
Train Target Balance:    Match_Success
1    71.81
0    28.19
Name: proportion, dtype: float64%
Test Target Balance:     Match_Success
1    71.8
0    28.2
Name: proportion, dtype: float64%
Training set size: 8000, Testing set size: 2000
Class Weights for Model: {0: 1.7738359201773837, 1: 0.6962576153176675}


# --- 3. Preprocessing Pipeline Definition ---    
# Ordinal Preprocessor: Encodes ordered categorical features    
# Nominal features: passed through unchanged so CatBoost can handle them as categorical features
# Numeric Preprocessor: Standard scales the continuous feature



In [80]:
# --- 4. Preprocessing Pipeline Definition ---

# Ordinal features are mapped to integer ranks following CATEGORIES_ORDER;
# categories not seen during fit are encoded as -1 instead of raising.
ordinal_preprocessor = OrdinalEncoder(
    categories=CATEGORIES_ORDER, handle_unknown='use_encoded_value', unknown_value=-1
)
# The numeric feature is standard-scaled.
numeric_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('ord', ordinal_preprocessor, ORDINAL_COLS),
        ('num', numeric_preprocessor, NUMERIC_COLS)
    ],
    remainder='passthrough',          # every other column is forwarded untouched
    verbose_feature_names_out=False   # keep the original column names in the output
)

# Fit once here so the output column layout can be inspected below; the
# pipeline fits the preprocessor again as part of its own fit.
preprocessor.fit(X_train)

n_transformed_cols = len(ORDINAL_COLS) + len(NUMERIC_COLS)
total_features_in_X_train = X_train.shape[1]

# Locate the passthrough columns by name in the transformer's actual output
# order rather than by positional arithmetic, so the indices stay correct if
# the transformer list is ever reordered or extended.
encoded_cols = set(ORDINAL_COLS) | set(NUMERIC_COLS)
output_feature_names = list(preprocessor.get_feature_names_out())
assert len(output_feature_names) == total_features_in_X_train, (
    "preprocessor output column count differs from the input column count"
)

# Every column that was not ordinal-encoded or scaled is handed to CatBoost as
# a categorical feature.
cat_feature_indices = [
    position
    for position, feature_name in enumerate(output_feature_names)
    if feature_name not in encoded_cols
]
actual_nominal_count = len(cat_feature_indices)

print(f"\n[DEBUG] Total columns found in X_train: {total_features_in_X_train}")
print(f"[DEBUG] CatBoost now expects categorical features at indices: {cat_feature_indices}")




[DEBUG] Total columns found in X_train: 10
[DEBUG] CatBoost now expects categorical features at indices: [3, 4, 5, 6, 7, 8, 9]


# THE CATBOOST : Uses the calculated indices to tell CatBoost where the text columns are

In [81]:
# Hyper-parameters are gathered in one mapping so they are easy to scan and tweak.
# NOTE(review): early stopping needs a validation set to monitor; the fit call
# in this notebook passes no eval_set, so early_stopping_rounds presumably has
# no effect here -- confirm against the CatBoost documentation.
catboost_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=RANDOM_SEED,
    verbose=0,
    n_estimators=500,
    early_stopping_rounds=50,
    class_weights=class_weights,        # inverse-frequency weights from the split cell
    cat_features=cat_feature_indices,   # positions of the passthrough text columns
)
model = CatBoostClassifier(**catboost_params)

In [82]:
# Chain the preprocessor and the classifier so one fit/predict call runs both.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Fit on the training partition only; evaluation happens in a later cell.
print("\n--- Training CatBoost Model (Final & Correct) ---")
full_pipeline.fit(X_train, y_train)
print("\nModel trained successfully! You can now proceed to evaluation.")



--- Training CatBoost Model (Final & Correct) ---

Model trained successfully! You can now proceed to evaluation.


In [85]:
# --- 1. Generate Predictions ---
# Hard class labels, plus the probability of class 1 (Match_Success), which is
# column 1 of predict_proba.
y_pred = full_pipeline.predict(X_test)
y_pred_proba = full_pipeline.predict_proba(X_test)[:, 1]

# --- 2. Calculate AUC Score ---
auc_score = roc_auc_score(y_test, y_pred_proba)

banner = "======================================================="
print("\n" + banner)
print(f"| Test Set AUC Score: {auc_score:.4f} |")
print(banner)

# --- 3. Print Classification Report and Confusion Matrix ---
# The per-class rows show how the less frequent class (0) is handled.
print("\nClassification Report (Accuracy, Precision, Recall, F1):")
print(classification_report(y_test, y_pred))

# Rows of the matrix are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
(true_neg, false_pos), (false_neg, true_pos) = cm

rule = "---------------------------------------"
print("Confusion Matrix:")
print(rule)
print(f"| True Negatives (0,0): {true_neg:<5} | False Positives (0,1): {false_pos:<5} |")
print(rule)
print(f"| False Negatives (1,0): {false_neg:<5} | True Positives (1,1): {true_pos:<5} |")
print(rule)



| Test Set AUC Score: 0.4796 |

Classification Report (Accuracy, Precision, Recall, F1):
              precision    recall  f1-score   support

           0       0.26      0.38      0.31       564
           1       0.70      0.58      0.64      1436

    accuracy                           0.52      2000
   macro avg       0.48      0.48      0.47      2000
weighted avg       0.58      0.52      0.54      2000

Confusion Matrix:
---------------------------------------
| True Negatives (0,0): 214   | False Positives (0,1): 350   |
---------------------------------------
| False Negatives (1,0): 603   | True Positives (1,1): 833   |
---------------------------------------
