In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [3]:
# Load the raw marriage dataset. The path is relative, so it is resolved
# against the kernel's current working directory.
RAW_DATA_PATH = "../../../data/raw/TrainingData/marriage_data_india.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [4]:
# Inspect the column names of the loaded frame before any feature engineering.
df.columns

Index(['ID', 'Marriage_Type', 'Age_at_Marriage', 'Gender', 'Education_Level',
       'Caste_Match', 'Religion', 'Parental_Approval', 'Urban_Rural',
       'Dowry_Exchanged', 'Marital_Satisfaction', 'Divorce_Status',
       'Children_Count', 'Income_Level', 'Years_Since_Marriage',
       'Spouse_Working', 'Inter-Caste', 'Inter-Religion'],
      dtype='object')

In [5]:
# Combine Marital_Satisfaction (Low/Medium/High) and Divorce_Status (Yes/No)
# into a single binary target column, Compatibility.
#
# Compatibility is 1 for satisfied couples (satisfaction Medium or High AND
# not divorced) and 0 otherwise.
target_source_columns = ["Marital_Satisfaction", "Divorce_Status"]

# Guard so the cell can be re-run on the same kernel: once the source columns
# have been dropped, the target already exists and must not be re-derived
# (indexing the missing columns would raise KeyError).
if set(target_source_columns).issubset(df.columns):
    is_satisfied = df["Marital_Satisfaction"].isin(["High", "Medium"])
    is_not_divorced = df["Divorce_Status"] == "No"
    df["Compatibility"] = np.where(is_satisfied & is_not_divorced, 1, 0)
    # Reassign rather than drop in place, so the statement reads as a plain
    # transformation of the frame.
    df = df.drop(columns=target_source_columns)

# Class balance of the derived target (last expression, so it is displayed).
df["Compatibility"].value_counts()

Compatibility
1    7181
0    2819
Name: count, dtype: int64

In [6]:
# Convert every object-dtype column (e.g. Religion, Gender, Education_Level)
# into integer codes. One fitted LabelEncoder per column is kept in `labels`
# so the same mapping can be reused later.
categorical_columns = df.select_dtypes(include=["object"]).columns
labels = {name: LabelEncoder() for name in categorical_columns}

for col, le in labels.items():
    print(col, df[col].unique())   # raw categories before encoding
    df[col] = le.fit_transform(df[col])
    print(col, df[col].unique())   # integer codes after encoding

Marriage_Type ['Love' 'Arranged']
Marriage_Type [1 0]
Gender ['Male' 'Female']
Gender [1 0]
Education_Level ['Graduate' 'School' 'Postgraduate' 'PhD']
Education_Level [0 3 2 1]
Caste_Match ['Different' 'Same']
Caste_Match [0 1]
Religion ['Hindu' 'Muslim' 'Sikh' 'Christian' 'Others']
Religion [1 2 4 0 3]
Parental_Approval ['No' 'Yes' 'Partial']
Parental_Approval [0 2 1]
Urban_Rural ['Urban' 'Rural']
Urban_Rural [1 0]
Dowry_Exchanged ['No' 'Yes' 'Not Disclosed']
Dowry_Exchanged [0 2 1]
Income_Level ['Middle' 'High' 'Low']
Income_Level [2 0 1]
Spouse_Working ['No' 'Yes']
Spouse_Working [0 1]
Inter-Caste ['No' 'Yes']
Inter-Caste [0 1]
Inter-Religion ['No' 'Yes']
Inter-Religion [0 1]


In [7]:
# Separate target and features; ID is dropped together with the target so it
# is not used as a feature.
non_feature_columns = ["Compatibility", "ID"]
y = df["Compatibility"]                    # Target
x = df.drop(columns=non_feature_columns)   # Features

# Hold out 20% of the rows for testing, stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# Random forest with a fixed seed and balanced class weights.
forest_params = {
    "n_estimators": 200,
    "random_state": 42,
    "class_weight": "balanced",
}
model = RandomForestClassifier(**forest_params)
model.fit(x_train, y_train)

In [8]:
# Evaluate the model on the held-out test split.
y_pred = model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy", test_accuracy)
print(test_report)

Accuracy 0.709
              precision    recall  f1-score   support

           0       0.18      0.01      0.02       564
           1       0.72      0.98      0.83      1436

    accuracy                           0.71      2000
   macro avg       0.45      0.50      0.42      2000
weighted avg       0.56      0.71      0.60      2000



In [42]:
# Column check on the processed frame: per the recorded output,
# Marital_Satisfaction and Divorce_Status are gone and Compatibility is present.
# NOTE(review): this cell's execution count (42) is out of sequence with its
# neighbours — confirm the notebook still runs top to bottom on a fresh kernel.
df.columns


Index(['ID', 'Marriage_Type', 'Age_at_Marriage', 'Gender', 'Education_Level',
       'Caste_Match', 'Religion', 'Parental_Approval', 'Urban_Rural',
       'Dowry_Exchanged', 'Children_Count', 'Income_Level',
       'Years_Since_Marriage', 'Spouse_Working', 'Inter-Caste',
       'Inter-Religion', 'Compatibility'],
      dtype='object')

In [9]:
# Persist the trained classifier and the per-column label encoders to the
# current working directory.
model_path = "compatibility_model.pkl"
encoders_path = "label_encoders.pkl"
joblib.dump(model, model_path)
joblib.dump(labels, encoders_path)

['label_encoders.pkl']

In [10]:
def parse_profile(profile_str):
    """
    Parse a raw comma-separated profile string into a dict with only the
    attributes required for compatibility scoring.

    Expected field order (fields after the 12th are ignored):
        ID, Religion, Caste, Mother_Tongue, Profession, Education_Level,
        Age, Height_cm, Height, Country, State, City

    Every field is stripped of surrounding whitespace; Age and Height_cm are
    converted to int. Gender is not part of the input: for demo purposes it is
    inferred from the (stripped) ID, where a leading "V" means "Male" and
    anything else means "Female".

    Raises:
        IndexError: if fewer than 12 comma-separated fields are present.
        ValueError: if Age or Height_cm cannot be converted to int.
    """
    # Strip once up front so every consumer, including the gender inference
    # below, sees the same cleaned values.
    parts = [field.strip() for field in profile_str.split(",")]

    if len(parts) < 12:
        raise IndexError(
            f"expected at least 12 comma-separated fields, got {len(parts)}"
        )

    return {
        "ID": parts[0],
        "Religion": parts[1],
        "Caste": parts[2],
        "Mother_Tongue": parts[3],
        "Profession": parts[4],
        "Education_Level": parts[5],
        "Age": int(parts[6]),
        "Height_cm": int(parts[7]),
        "Height": parts[8],
        "Country": parts[9],
        "State": parts[10],
        "City": parts[11],
        # infer Gender based on ID prefix just for demo
        "Gender": "Male" if parts[0].startswith("V") else "Female",
    }


In [11]:
def safe_encode(value, encoder):
    """
    Encode a single value with a fitted encoder, falling back to -1 when the
    value cannot be encoded.

    Args:
        value: the raw category to encode.
        encoder: an object with a ``transform`` method that accepts a list of
            values and returns an indexable result (e.g. a fitted
            LabelEncoder).

    Returns:
        The first element of ``encoder.transform([value])``, or -1 if the
        encoder raises ValueError.
    """
    try:
        return encoder.transform([value])[0]
    except ValueError:
        # LabelEncoder.transform raises ValueError for labels it did not see
        # during fit. Only that case maps to the -1 sentinel; any other error
        # (wrong encoder object, interrupts, ...) is allowed to propagate
        # instead of being silently turned into a feature value.
        return -1

In [12]:
def get_compatibility_score(profile1, profile2, model, encoders):
    """
    Score a pair of parsed profiles with the trained compatibility model.

    Args:
        profile1: dict with at least "Age", "Caste", "Religion", "Gender" and
            "Education_Level". Gender, Education_Level and Religion features
            are taken from this profile only.
        profile2: dict with at least "Age", "Caste" and "Religion".
        model: fitted classifier exposing ``feature_names_in_``, ``classes_``
            and ``predict_proba``.
        encoders: dict mapping column name to its fitted encoder; must contain
            "Marriage_Type", "Gender", "Education_Level" and "Religion".

    Returns:
        The predicted probability of class 1 (compatible), rounded to 3
        decimal places.

    Raises:
        ValueError: if the model has no class labelled 1.
    """
    # Feature engineering
    age_at_marriage = (profile1["Age"] + profile2["Age"]) // 2  # floor of the mean age
    same_caste = int(profile1["Caste"].lower() == profile2["Caste"].lower())
    same_religion = int(profile1["Religion"].lower() == profile2["Religion"].lower())

    # NOTE(review): the integer literals below are already-encoded category
    # codes, not raw values. Going by the encoder output printed earlier in
    # this notebook they decode to: Parental_Approval 1 = "Partial",
    # Urban_Rural 0 = "Rural", Dowry_Exchanged 0 = "No", Income_Level 1 = "Low",
    # Spouse_Working 1 = "Yes". Confirm these are the intended defaults.
    # Values the encoders have never seen are encoded as -1 by safe_encode.
    features = pd.DataFrame([{
        "Marriage_Type": safe_encode("Arranged", encoders["Marriage_Type"]),
        "Age_at_Marriage": age_at_marriage,
        "Gender": safe_encode(profile1["Gender"], encoders["Gender"]),
        "Education_Level": safe_encode(profile1["Education_Level"], encoders["Education_Level"]),
        "Caste_Match": same_caste,
        "Religion": safe_encode(profile1["Religion"], encoders["Religion"]),
        "Parental_Approval": 1,
        "Urban_Rural": 0,
        "Dowry_Exchanged": 0,
        "Children_Count": 0,
        "Income_Level": 1,
        "Years_Since_Marriage": 0,
        "Spouse_Working": 1,
        "Inter-Caste": 0 if same_caste else 1,
        "Inter-Religion": 0 if same_religion else 1
    }])

    # Ensure exact same column order as the one the model was fitted with
    features = features[model.feature_names_in_]

    # Locate the probability column for class 1 through model.classes_ rather
    # than assuming it sits at index 1 of predict_proba's output.
    class_labels = list(model.classes_)
    if 1 not in class_labels:
        raise ValueError("model has no class labelled 1; cannot compute a compatibility score")
    positive_column = class_labels.index(1)

    score = model.predict_proba(features)[0][positive_column]
    return round(score, 3)


In [13]:
# Reload the saved artifacts and score one sample pair of profiles.
# joblib.load unpickles the file, so only load artifacts produced by a trusted
# source (here: the files written by this notebook).
loaded_encoders = joblib.load("label_encoders.pkl")
loaded_model = joblib.load("compatibility_model.pkl")

raw_profile_1 = "VVZ5841,Hindu,maratha,english,scientist,masters in medicine,26,162,5ft 4in,india,maharashtra,navi mumbai"
raw_profile_2 = "ABX1234,Hindu,maratha,english,doctor,masters in medicine,28,170,5ft 7in,india,maharashtra,pune"
p1 = parse_profile(raw_profile_1)
p2 = parse_profile(raw_profile_2)

pair_score = get_compatibility_score(p1, p2, loaded_model, loaded_encoders)
print("Compatibility Score:", pair_score)

Compatibility Score: 0.65


In [14]:
# Show the feature names, in order, stored on the fitted model; scoring code
# reorders its input columns to match this.
print(model.feature_names_in_)

['Marriage_Type' 'Age_at_Marriage' 'Gender' 'Education_Level'
 'Caste_Match' 'Religion' 'Parental_Approval' 'Urban_Rural'
 'Dowry_Exchanged' 'Children_Count' 'Income_Level' 'Years_Since_Marriage'
 'Spouse_Working' 'Inter-Caste' 'Inter-Religion']
