In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblibl          

In [2]:
df = pd.read_csv("../../../data/raw/TrainingData/marriage_data_india.csv")

In [4]:
df.columns
df.head(4)
df.shape

(10000, 18)

In [3]:
# Needed for ealuation of compatibility = 'Marital_Satisfaction',(mean can be used) ,Divorce_Status
# Dont needed  = Marriage_Type,Parental_Approval,Dowry_Exchanged,'Children_Count',spouse workingYears_Since_Marriage'
df.drop(columns=['ID','Marriage_Type','Parental_Approval','Dowry_Exchanged','Children_Count','Spouse_Working','Years_Since_Marriage'],inplace=True)

In [4]:
# combiining the Marital_Satisfaction(Medium/Low/High) & Divorce Status(yes/No) into Compatibility
df["Marital_Satisfaction"].value_counts()
df["Divorce_Status"].value_counts()

# Setting compatibility 1 for couples staisfied (Medium or high and not divorced)
df["Compatibility"] = np.where(
    ((df["Marital_Satisfaction"].isin(["High","Medium"])) & (df["Divorce_Status"] == "No")),
    1,
    0
)
df.drop(columns=["Marital_Satisfaction","Divorce_Status"], inplace=True)
df["Compatibility"].value_counts()
df.to_csv("marriage_data_india(processed).csv",index=False)
for i in df.columns:
    print(i) 

Age_at_Marriage
Gender
Education_Level
Caste_Match
Religion
Urban_Rural
Income_Level
Inter-Caste
Inter-Religion
Compatibility


In [5]:
# converting categorical columns 
# like religion gender education level into numbers
categorical_columns = df.select_dtypes(include=["object"]).columns
labels = {}

for col in  categorical_columns:
    le = LabelEncoder()
    print(col,df[col].unique())
    df[col] = le.fit_transform(df[col])
    labels[col] = le
    print(col,df[col].unique())

Gender ['Male' 'Female']
Gender [1 0]
Education_Level ['Graduate' 'School' 'Postgraduate' 'PhD']
Education_Level [0 3 2 1]
Caste_Match ['Different' 'Same']
Caste_Match [0 1]
Religion ['Hindu' 'Muslim' 'Sikh' 'Christian' 'Others']
Religion [1 2 4 0 3]
Urban_Rural ['Urban' 'Rural']
Urban_Rural [1 0]
Income_Level ['Middle' 'High' 'Low']
Income_Level [2 0 1]
Inter-Caste ['No' 'Yes']
Inter-Caste [0 1]
Inter-Religion ['No' 'Yes']
Inter-Religion [0 1]


In [15]:
x = df.drop(columns=["Compatibility"])#Features
y = df["Compatibility"]#Target

x_train,x_test,y_train,y_test = train_test_split(
    x,y,test_size=0.3,random_state=42,stratify=y
)
model = RandomForestClassifier(
    n_estimators = 200,
    random_state = 42,
    class_weight = "balanced"
)
model.fit(x_train,y_train)

In [16]:
# Eavlute the model
y_pred = model.predict(x_test)
print("Accuracy", accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy 0.6233333333333333
              precision    recall  f1-score   support

           0       0.26      0.18      0.21       846
           1       0.71      0.80      0.75      2154

    accuracy                           0.62      3000
   macro avg       0.49      0.49      0.48      3000
weighted avg       0.59      0.62      0.60      3000



In [63]:
df.columns


Index(['Age_at_Marriage', 'Gender', 'Education_Level', 'Caste_Match',
       'Religion', 'Urban_Rural', 'Income_Level', 'Inter-Caste',
       'Inter-Religion', 'Compatibility'],
      dtype='object')

In [17]:
# Save model + encoders
joblib.dump(model, "compatibility_model.pkl")
joblib.dump(labels, "label_encoders.pkl")

['label_encoders.pkl']

In [24]:
def safe_encode(value, encoder):
    try:
        return encoder.transform([value])[0]
    except:
        return -1  # unknown category
        

def build_features(p1, p2, encoders):
    # Derived features
    age_at_marriage = (p1["Age_at_Marriage"] + p2["Age_at_Marriage"]) // 2
    caste_match = int(p1["Caste"].lower() == p2["Caste"].lower())
    religion_match = int(p1["Religion"].lower() == p2["Religion"].lower())

    # Encode categorical features safely
    gender = safe_encode(p1["Gender"], encoders["Gender"])
    edu = safe_encode(p1["Education_Level"], encoders["Education_Level"])
    religion = safe_encode(p1["Religion"], encoders["Religion"])
    urban_rural = safe_encode(p1.get("Urban_Rural", "Urban"), encoders["Urban_Rural"])
    income = safe_encode(p1.get("Income_Level", "Middle"), encoders["Income_Level"])

    # Build dataframe with **same feature names as model**
    features = pd.DataFrame([{
        "Age_at_Marriage": age_at_marriage,
        "Gender": gender,
        "Education_Level": edu,
        "Caste_Match": caste_match,
        "Religion": religion,
        "Urban_Rural": urban_rural,
        "Income_Level": income,
        "Inter-Caste": 0 if caste_match else 1,
        "Inter-Religion": 0 if religion_match else 1
    }])

    return features


In [25]:
def get_compatibility_score(p1, p2, model, encoders):
    features = build_features(p1, p2, encoders)
    score = model.predict_proba(features)[0][1]
    return round(score, 3)

In [26]:
def parse_profile(profile_str):
    """
    Parse raw CSV-like string into standardized dict for compatibility scoring.
    Expected input:
    "Name,Religion,Caste,Mother_Tongue,Profession,Education,Age,Height_cm,Height,Country,State,City,Urban_Rural,Income_Level"
    """

    parts = profile_str.split(",")

    # Extract fields
    name = parts[0].strip().lower()
    religion = parts[1].strip().capitalize()
    caste = parts[2].strip().lower()
    education_raw = parts[5].strip().lower()
    age = int(parts[6].strip())
    city = parts[11].strip().lower()
    urban_rural = parts[12].strip().capitalize()
    income = parts[13].strip().capitalize()

    # Normalize Education_Level
    if "phd" in education_raw or "doctorate" in education_raw:
        education = "PhD"
    elif "master" in education_raw or "postgraduate" in education_raw:
        education = "Postgraduate"
    elif "bachelor" in education_raw or "graduate" in education_raw:
        education = "Graduate"
    elif "school" in education_raw or "diploma" in education_raw:
        education = "School"
    else:
        education = "Other"

    # Infer Gender (fallback = Unknown so KeyError never occurs)
    if any(x in name for x in ["rahul", "rohit", "amit", "arjun", "raj"]):
        gender = "Male"
    elif any(x in name for x in ["priya", "neha", "pooja", "anita", "kavita"]):
        gender = "Female"
    else:
        gender = "Unknown"

    return {
        "Religion": religion,
        "Caste": caste,
        "Education_Level": education,
        "Age_at_Marriage": age,
        "Gender": gender,              
        "Urban_Rural": urban_rural,
        "Income_Level": income,
    }


In [28]:
# Load model + encoders
loaded_model = joblib.load("compatibility_model.pkl")
loaded_encoders = joblib.load("label_encoders.pkl")

# --- Profiles (raw user input) ---
p1 = parse_profile(
    "Rahul Sharma,Hindu,Brahmin,Hindi,Software Engineer,Postgraduate,27,175,5ft 9in,India,Maharashtra,Mumbai,Urban,High"
)

p2 = parse_profile(
    "Priya Iyer,Sikh,Brahmin,Tamil,Doctor,Graduate,25,162,5ft 4in,India,Tamil Nadu,Chennai,Urban,Medium"
)


# --- Predict compatibility ---
score = get_compatibility_score(p1, p2, loaded_model, loaded_encoders)

print("Compatibility Score:", score)


Compatibility Score: 0.975


In [14]:
print(model.feature_names_in_)

['Marriage_Type' 'Age_at_Marriage' 'Gender' 'Education_Level'
 'Caste_Match' 'Religion' 'Parental_Approval' 'Urban_Rural'
 'Dowry_Exchanged' 'Children_Count' 'Income_Level' 'Years_Since_Marriage'
 'Spouse_Working' 'Inter-Caste' 'Inter-Religion']


In [29]:
print("No of records ", df.shape[0],df.shape[1])
df = pd.read_csv("../../../data/processed/TrainingData/marriage_data_india.csv")
df.head(15)

No of records  10000 10


Unnamed: 0,Age_at_Marriage,Gender,Education_Level,Caste_Match,Religion,Urban_Rural,Income_Level,Inter-Caste,Inter-Religion,Compatibility
0,23,Male,Graduate,Different,Hindu,Urban,Middle,No,No,0
1,28,Female,School,Same,Hindu,Rural,Middle,No,Yes,0
2,39,Male,Postgraduate,Same,Muslim,Rural,High,No,No,1
3,26,Female,School,Different,Hindu,Urban,High,Yes,No,0
4,32,Female,Graduate,Same,Hindu,Rural,Middle,No,Yes,1
5,37,Female,School,Different,Hindu,Urban,Low,No,No,1
6,24,Male,Graduate,Different,Hindu,Rural,Middle,No,No,1
7,18,Male,Postgraduate,Same,Sikh,Rural,Middle,Yes,No,1
8,22,Female,Postgraduate,Same,Christian,Rural,Low,No,Yes,1
9,24,Male,School,Same,Hindu,Urban,High,Yes,No,1
