In [1]:
import pandas as pd

In [3]:
import pandas as pd

# Path to your file
file_path = r"D:\FinalYearProject\Matrimony_Matchmaker\data\DHS\IACR7EFL.DTA"

# Use the StataReader to get the data and the labels separately
with pd.io.stata.StataReader(file_path) as reader:
    # 1. Load the data without converting categoricals
    df = reader.read(convert_categoricals=False)
    
    # 2. Extract the value labels (dictionary) if you need them later
    # This stores what 1, 2, 3 mean for each column
    value_labels = reader.value_labels()

# Now the dataframe will load successfully
print("Data loaded successfully!")
print(df.head())

Data loaded successfully!
            caseid v000  v001  v002  v003  v004    v005  v006  v007  v008  \
0    0100103022 04  IA7   130    22     4   130  201224    12  2019  1440   
1    0100103033 02  IA7   130    33     2   130  201224    12  2019  1440   
2    0100100997 02  IA7   109    97     2   109  196628    11  2019  1439   
3    0100100966 02  IA7   109    66     2   109  196628    11  2019  1439   
4    0100100914 02  IA7   109    14     2   109  196628    11  2019  1439   

   ...  smb81  smb305  smb306  sm190s   sm191s  sm190us  sm191us  sm190rs  \
0  ...    0.0    70.1    77.2       5  1264770      NaN      NaN      5.0   
1  ...    0.0    92.5    96.3       2   144090      NaN      NaN      3.0   
2  ...    0.0    72.8    78.2       2   103120      NaN      NaN      3.0   
3  ...    0.0    81.2    84.4       1  -460030      NaN      NaN      2.0   
4  ...    0.0    81.4    84.5       3   626660      NaN      NaN      4.0   

     sm191rs  smcrefuse  
0  1690700.0        Na

In [15]:
df["v131"].unique()

array([991, 992, 993, 998], dtype=int16)

In [16]:
import pandas as pd
import numpy as np

# Load the data raw to avoid Stata label conflicts
file_path = r"D:\FinalYearProject\Matrimony_Matchmaker\data\DHS\IACR7EFL.DTA"
with pd.io.stata.StataReader(file_path) as reader:
    raw_df = reader.read(convert_categoricals=False)

# Standardize column names to lowercase to match the DCT file provided
raw_df.columns = [col.lower() for col in raw_df.columns]

# Helper function to map education years to your specific B.Tech/MBA format
def map_degree(years):
    if pd.isna(years) or years > 90: return "N/A"
    if years >= 17: return "Post-Graduate (MBA/MA)"
    if years >= 15: return "Graduate (B.Tech/B.Sc/BA)"
    if years >= 12: return "Higher Secondary"
    return "Secondary or below"

# Helper function to map profession codes from the labels provided
def map_occ(code):
    occ_map = {0: "Not Working", 1: "Professional/IT/Tech", 3: "Clerical", 4: "Sales", 
               5: "Services", 6: "Agricultural", 7: "Manual"}
    return occ_map.get(code, "Other")

processed_df = pd.DataFrame()

# 1. IDENTIFICATION
processed_df['ID'] = raw_df['v001'].astype(str) + "_" + raw_df['v002'].astype(str)

# 2. FEMALE FEATURES (Respondent)
processed_df['Wife_Age'] = raw_df['v012']
processed_df['Wife_Education'] = raw_df['v133'].apply(map_degree)
processed_df['Wife_Profession'] = raw_df['v717'].apply(map_occ)
processed_df['Wife_Caste'] = raw_df['v131'].map({991: 'Caste', 992: 'Tribe', 993: 'No Caste'})
processed_df['Wife_Religion'] = raw_df['v130'].map({1: 'Hindu', 2: 'Muslim', 3: 'Christian', 4: 'Sikh', 5: 'Buddhist', 6: 'Jain'})
processed_df['Wife_Residence'] = raw_df['v025'].map({1: 'Urban', 2: 'Rural'})

# 3. MALE FEATURES (Husband/Partner)
processed_df['Husband_Age'] = raw_df['v730'] # v730 is Husband's age in the Couple file
processed_df['Husband_Education'] = raw_df['mv133'].apply(map_degree)
processed_df['Husband_Profession'] = raw_df['mv717'].apply(map_occ)
processed_df['Husband_Caste'] = raw_df['mv131'].map({991: 'Caste', 992: 'Tribe', 993: 'No Caste'})
processed_df['Husband_Religion'] = raw_df['mv130'].map({1: 'Hindu', 2: 'Muslim', 3: 'Christian', 4: 'Sikh', 5: 'Buddhist', 6: 'Jain'})
processed_df['Husband_Residence'] = raw_df['mv025'].map({1: 'Urban', 2: 'Rural'})

# 4. SUCCESS METRIC (Domestic Conflict Proxy)
# d105a: "Ever been pushed, shook or had something thrown by husband"
processed_df['Conflict_Score'] = raw_df['d105a'].replace({0: 'No', 1: 'Yes', 2: 'Yes', 3: 'Yes', 4: 'Yes'})

# 5. SAVE
output_path = r"D:\FinalYearProject\Matrimony_Matchmaker\data\DHS\final_couples_profile.csv"
processed_df.to_csv(output_path, index=False)

print(f"Extraction successful! {len(processed_df)} couples processed.")
print(processed_df.head())

Extraction successful! 57693 couples processed.
       ID  Wife_Age      Wife_Education Wife_Profession Wife_Caste  \
0  130_22        29  Secondary or below        Services      Caste   
1  130_33        36  Secondary or below     Not Working      Caste   
2  109_97        48  Secondary or below     Not Working      Tribe   
3  109_66        39  Secondary or below     Not Working      Caste   
4  109_14        31  Secondary or below     Not Working      Tribe   

  Wife_Religion Wife_Residence  Husband_Age   Husband_Education  \
0        Muslim          Rural           31    Higher Secondary   
1        Muslim          Rural           42  Secondary or below   
2        Muslim          Rural           54  Secondary or below   
3        Muslim          Rural           38  Secondary or below   
4        Muslim          Rural           39    Higher Secondary   

  Husband_Profession Husband_Caste Husband_Religion Husband_Residence  \
0           Services         Caste           Muslim    

In [27]:
import pandas as pd
import numpy as np

# Load the raw dataset
# raw_df = pd.read_stata(r"D:\FinalYearProject\Matrimony_Matchmaker\data\DHS\IACR7EFL.DTA", convert_categoricals=False)
# raw_df.columns = [col.lower() for col in raw_df.columns]

# --- 1. Map Columns to their full Dictionary Descriptions ---
# These descriptions are taken exactly from your provided .txt and .dct files

satisfaction_mapping = {
    # Emotional/Verbal
    'd103a': "Ever_been_humiliated_by_husband_partner",
    'd103c': "Ever_been_insulted_or_made_to_feel_bad_by_husband_partner",
    
    # Physical Conflict
    'd105a': "Ever_been_pushed_shook_or_had_something_thrown_by_husband_partner",
    'd105b': "Ever_been_slapped_by_husband_partner",
    
    # Controlling Behavior
    'd101a': "Husband_partner_jealous_if_respondent_talks_with_other_men",
    'd101c': "Husband_partner_does_not_permit_respondent_to_meet_female_friends",
    'd101e': "Husband_partner_insists_on_knowing_where_respondent_is",
    
    # Agency / Decision Making
    'v743a': "Person_who_usually_decides_on_respondents_health_care",
    'v743b': "Person_who_usually_decides_on_large_household_purchases",
    
    # Fear
    'd129': "Respondent_afraid_of_husband_partner_most_of_the_time_sometimes_or_never"
}

# --- 2. Add these to your finalized dataframe ---

# We iterate through the mapping and add the raw values under the descriptive headers
for code, description in satisfaction_mapping.items():
    if code in raw_df.columns:
        processed_df[description] = raw_df[code]

# --- 3. Optional: Convert numeric codes to simple "Yes/No" or "Problem" text 
# for better readability in your final CSV

# For the Conflict/Violence variables (d series), 0 is 'No', >0 is 'Yes'
conflict_cols = [
    "Ever_been_humiliated_by_husband_partner",
    "Ever_been_insulted_or_made_to_feel_bad_by_husband_partner",
    "Ever_been_pushed_shook_or_had_something_thrown_by_husband_partner",
    "Ever_been_slapped_by_husband_partner",
    "Husband_partner_jealous_if_respondent_talks_with_other_men",
    "Husband_partner_does_not_permit_respondent_to_meet_female_friends",
    "Husband_partner_insists_on_knowing_where_respondent_is"
]

for col in conflict_cols:
    if col in processed_df.columns:
        processed_df[col] = np.where(processed_df[col] > 0, "Yes/Problem", "No/Never")

# For Decision Making (v743 series), keep numeric or map based on your labels:
# 1=Wife, 2=Joint, 4=Husband, 5=Other
# No categorization requested, so keeping as is.

# --- 4. Save the finalized file ---
output_path = r"D:\FinalYearProject\Matrimony_Matchmaker\data\DHS\finalized_descriptive_marriage_data.csv"
processed_df.to_csv(output_path, index=False)

print("Finalized dataframe created with descriptive column names.")
print(processed_df.columns.tolist())

Finalized dataframe created with descriptive column names.
['ID', 'Wife_Age', 'Wife_Education', 'Wife_Profession', 'Wife_Caste', 'Wife_Religion', 'Wife_Residence', 'Husband_Age', 'Husband_Education', 'Husband_Profession', 'Husband_Caste', 'Husband_Religion', 'Husband_Residence', 'Conflict_Score', 'Emotional_Conflict', 'Physical_Conflict', 'Controlling_Negativity', 'Lack_of_Agency', 'Total_Marital_Conflict_Score', 'Marriage_Successful', 'Ever_been_humiliated_by_husband_partner', 'Ever_been_insulted_or_made_to_feel_bad_by_husband_partner', 'Ever_been_pushed_shook_or_had_something_thrown_by_husband_partner', 'Ever_been_slapped_by_husband_partner', 'Husband_partner_jealous_if_respondent_talks_with_other_men', 'Husband_partner_does_not_permit_respondent_to_meet_female_friends', 'Husband_partner_insists_on_knowing_where_respondent_is', 'Person_who_usually_decides_on_respondents_health_care', 'Person_who_usually_decides_on_large_household_purchases', 'Respondent_afraid_of_husband_partner_most

In [21]:
processed_df["Wife_Education"].unique()

array(['Secondary or below', 'Higher Secondary',
       'Graduate (B.Tech/B.Sc/BA)', 'Post-Graduate (MBA/MA)'],
      dtype=object)

In [22]:
processed_df["Husband_Profession"].unique()

array(['Services', 'Sales', 'Agricultural', 'Professional/IT/Tech',
       'Manual', 'Other', 'Clerical', 'Not Working'], dtype=object)

In [23]:
processed_df["Wife_Religion"].unique()

array(['Muslim', 'Hindu', 'Sikh', nan, 'Buddhist', 'Jain', 'Christian'],
      dtype=object)

In [24]:
processed_df["Husband_Religion"].unique()

array(['Muslim', 'Hindu', 'Sikh', 'Christian', nan, 'Buddhist', 'Jain'],
      dtype=object)

In [25]:
processed_df["Husband_Residence"].unique()

array(['Rural', 'Urban'], dtype=object)

In [1]:
import pandas as pd
import re

# 1. Load the raw dataset
file_path = r"D:\FinalYearProject\Matrimony_Matchmaker\data\DHS\IACR7EFL.DTA"
with pd.io.stata.StataReader(file_path) as reader:
    raw_df = reader.read(convert_categoricals=False)

raw_df.columns = [col.lower() for col in raw_df.columns]
# 1. Load raw data (lowercase columns to match text file)
# raw_df = pd.read_stata('IACR7EFL.DTA', convert_categoricals=False)
raw_df.columns = [col.lower() for col in raw_df.columns]

# 2. Read the dictionary text
with open('Couple_data_meaning of coulmns.txt', 'r') as f:
    dict_content = f.read()

# --- PART A: MAP COLUMN HEADERS ---
name_map = {}
header_pattern = re.compile(r'label variable\s+(\w+)\s+"(.+?)"', re.IGNORECASE)
for match in header_pattern.finditer(dict_content):
    clean_desc = match.group(2).replace(" ", "_").replace("/", "_").replace("-", "_")
    name_map[match.group(1).lower()] = clean_desc

# --- PART B: MAP VALUE MEANINGS (THE ANSWERS) ---

# 1. Extract the Answer Keys (label define blocks)
# This captures: label define V025 1 "Urban" 2 "Rural"
value_labels = {}
define_pattern = re.compile(r'label define\s+(\w+)\s*(.*?);', re.DOTALL | re.IGNORECASE)

for match in define_pattern.finditer(dict_content):
    label_name = match.group(1).lower()
    mapping_text = match.group(2)
    
    # Parse individual pairs like: 1 "Urban"
    pairs = re.findall(r'(\d+)\s+"(.+?)"', mapping_text)
    value_labels[label_name] = {int(num): txt for num, txt in pairs}

# 2. Link Columns to Answer Keys (label values lines)
# This captures: label values v025 V025 (Linking column v025 to the Urban/Rural key)
column_to_label_link = {}
link_pattern = re.compile(r'label values\s+(\w+)\s+(\w+)', re.IGNORECASE)

for match in link_pattern.finditer(dict_content):
    col_name = match.group(1).lower()
    label_key = match.group(2).lower()
    column_to_label_link[col_name] = label_key

# 3. Apply the mappings to the dataframe
decoded_df = raw_df.copy()

for col in decoded_df.columns:
    if col in column_to_label_link:
        label_key = column_to_label_link[col]
        if label_key in value_labels:
            # Replace numbers with text using the extracted dictionary
            decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])

# --- PART C: ORDERING AND SAVING ---
h_list = ['v730', 'mv438', 'mv133', 'mv717', 'mv131', 'mv130']
w_list = ['v012', 'v438', 'v133', 'v717', 'v131', 'v130', 'v025', 'v024']
c_list = ['d101a', 'd101e', 'd103a', 'd103c', 'd105a', 'd105b', 'v743a', 'v743b', 'd129']

priority_cols = [c for c in (h_list + w_list + c_list) if c in decoded_df.columns]
other_cols = [c for c in decoded_df.columns if c not in priority_cols]

final_df = decoded_df[priority_cols + other_cols].copy()
final_df.rename(columns=name_map, inplace=True)

# Save
final_df.to_csv('Fully_Decoded_India_Dataset.csv', index=False)
print("Headers and Answers decoded for all 3000+ columns.")

  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = decoded_df[col].map(value_labels[label_key]).fillna(decoded_df[col])
  decoded_df[col] = d

Headers and Answers decoded for all 3000+ columns.


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("Fully_Decoded_India_Dataset.csv")

  df = pd.read_csv("Fully_Decoded_India_Dataset.csv")


In [8]:
# Define the exact column names you want to keep
features = [
    'Husband_age', 
    'Husband years_of_education', 
    'Husband Occupation_(grouped)', 
    'Husband Ethnicity', 
    'Husband Religion', 
    "Wife's_current_age", 
    "Wife's_height_in_centimeters_(1_decimal)", 
    "Wife's Education_in_years", 
    "Wife's_occupation_(grouped)", 
    "Wife's Ethnicity", 
    "Wife's Religion", 
    'residence', 
    'State'
]

targets = [
    'Husband_partner_jealous_if_Wife_talks_with_other_men', 
    'Husband_partner_insists_on_knowing_where_Wife_is', 
    'Ever_been_humiliated_by_husband_partner', 
    'Ever_been_insulted_or_made_to_feel_bad_by_husband_partner', 
    'Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner', 
    'Ever_been_slapped_by_husband_partner', 
    "Person_who_usually_decides_on_Wife's_health_care", 
    "Person_who_usually_decides_on_large_household_purchases", 
    "Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never"
]

# Create the new DataFrame with only these columns
# We use intersection() to avoid errors if a column name has a slight typo
cols_to_keep = [c for c in (features + targets) if c in df.columns]
final_df = df[cols_to_keep].copy()

# View the result
print(f"New DataFrame shape: {final_df.shape}")
print(final_df.head())

New DataFrame shape: (57693, 22)
   Husband_age  Husband years_of_education       Husband Occupation_(grouped)  \
0           31                          12  Services / household and domestic   
1           42                           5                              Sales   
2           54                           8                       Agricultural   
3           38                           7                       Agricultural   
4           39                          12                       Agricultural   

  Husband Ethnicity Husband Religion  Wife's_current_age  \
0             Caste           Muslim                  29   
1             Caste           Muslim                  36   
2             Tribe           Muslim                  48   
3             Caste           Muslim                  39   
4             Tribe           Muslim                  31   

  Wife's_height_in_centimeters_(1_decimal)  Wife's Education_in_years  \
0                                     1617    

In [None]:
# Helper function to map education years to your specific B.Tech/MBA format
def map_degree(years):
    if pd.isna(years) or years > 90: return "N/A"
    if years >= 17: return "Post-Graduate (MBA/MA)"
    if years >= 15: return "Graduate (B.Tech/B.Sc/BA)"
    if years >= 12: return "Higher Secondary"
    return "Secondary or below"
final_df["Husband years_of_education"] = final_df["Husband years_of_education"].apply(map_degree)
final_df["Wife's Education_in_years"] = final_df["Wife's Education_in_years"].apply(map_degree)
final_df.rename(columns={"Husband years_of_education" : "Husband's Education Level"}, inplace=True)
final_df.rename(columns={"Wife's Education_in_years" : "Wife's Education Level"}, inplace=True)
final_df.rename(columns={"Wife's_height_in_centimeters_(1_decimal)" : "Wife's height(centimeters)"}, inplace=True)



In [19]:
final_df["Wife's height(centimeters)"] = (
    pd.to_numeric(final_df["Wife's height(centimeters)"], errors="coerce") / 10
).round(2)


In [21]:
final_df.head()
final_df.to_csv("dataset_final_form.csv", index=False)

All Survey Questions Values

A. Person who usually decides on respondent's (Wife's) health care.
    1: Respondent alone (Wife decides)

    2: Husband/partner alone (Husband decides)

    3: Respondent and husband/partner jointly (Joint Decision - Ideal)

    4: Someone else

    5: Respondent and someone else

    6: Decision not made/not applicable

B.Person who usually decides on large household purchases
    1: Respondent alone

    2: Husband/partner alone

    3: Respondent and husband/partner jointly (Joint Decision - Ideal)

    4: Someone else

    5: Respondent and someone else

    6: Decision not made/not applicable

C.Husband/partner jealous if wife talks with other men.
    0: No

    1: Yes
    
    8: Don't know

D.Husband/partner insists on knowing where wife is at all times.

    0: No
    
    1: Yes
    
    8: Don't know

E.Ever been humiliated by husband/partner.

    0: Never

    1: Often

    2: Sometimes

    3: Yes, but not in the last 12 months

    4: Yes, but frequency missing

F.Ever been insulted or made to feel bad by husband/partner.

    0: Never

    1: Often

    2: Sometimes

    3: Yes, but not in the last 12 months

    4: Yes, but frequency missing

G.Ever been pushed, shook, or had something thrown by husband/partner.

    0: Never
    
    1: Often
    
    2: Sometimes
    
    3: Yes, but not in the last 12 months
    
    4: Yes, but frequency missing

H.Ever been slapped by husband/partner.

    0: Never

    1: Often

    2: Sometimes

    3: Yes, but not in the last 12 months

    4: Yes, but frequency missing

I.Wife afraid of husband/partner most of the time, sometimes, or never.

    0: Never
    
    1: Sometimes
    
    2: Most of the time

Based on the "Predicting Marital Stability" (Frank, 2024) paper, 
the "Relationship Compatibility" dimension was built on 
6 specific psychological questions.

In [10]:
df = pd.read_csv(r"/workspaces/FinalYearProject/Matrimony_Matchmaker/data/DHS_DATA.csv")
print(df.columns)

Index(['Husband_age', 'Husband's Education Level',
       'Husband Occupation_(grouped)', 'Husband Ethnicity', 'Husband Religion',
       'Wife's_current_age', 'Wife's height(centimeters)',
       'Wife's Education Level', 'Wife's_occupation_(grouped)',
       'Wife's Ethnicity', 'Wife's Religion', 'residence', 'State',
       'Husband_partner_jealous_if_Wife_talks_with_other_men',
       'Husband_partner_insists_on_knowing_where_Wife_is',
       'Ever_been_humiliated_by_husband_partner',
       'Ever_been_insulted_or_made_to_feel_bad_by_husband_partner',
       'Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner',
       'Ever_been_slapped_by_husband_partner',
       'Person_who_usually_decides_on_Wife's_health_care',
       'Person_who_usually_decides_on_large_household_purchases',
       'Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never'],
      dtype='object')


In [11]:
# 1. Load your dataset (Replace 'your_file.csv' with your actual filename)
# df = pd.read_csv('your_file.csv') 

# 2. Define the 9 Target Columns (Exact names from your list)
target_cols = [
    'Husband_partner_jealous_if_Wife_talks_with_other_men',
    'Husband_partner_insists_on_knowing_where_Wife_is',
    'Ever_been_humiliated_by_husband_partner',
    'Ever_been_insulted_or_made_to_feel_bad_by_husband_partner',
    'Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner',
    'Ever_been_slapped_by_husband_partner',
    "Person_who_usually_decides_on_Wife's_health_care",
    'Person_who_usually_decides_on_large_household_purchases',
    'Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never'
]

# 3. Check for "Missing" Values
# Note: In some datasets, "missing" might be represented as NaN, or blank strings, 
# or specific codes like "Missing", "Don't know".
# We assume standard NaNs here. If you have "Don't know" as a string, add it to the replace list.

# Optional: Convert "Don't know" or "Missing" strings to NaN for accurate filtering
# df[target_cols] = df[target_cols].replace(['Missing', 'Don\'t know', '.'], np.nan)

# 4. Create the Masks
# Mask for Complete Rows: All 9 columns must have a value (not NaN)
mask_complete = df[target_cols].notna().all(axis=1)

# Mask for Incomplete Rows: At least one column is NaN
mask_incomplete = ~mask_complete

# 5. Split the Data
df_complete = df[mask_complete].copy()
df_incomplete = df[mask_incomplete].copy()

# 6. Verify Counts
print(f"Total Rows: {len(df)}")
print(f"Complete Responses (To Keep): {len(df_complete)}")
print(f"Incomplete Responses (To Separate): {len(df_incomplete)}")

# 7. Save to Files
df_complete.to_csv('completeresponse_dhs.csv', index=False)
df_incomplete.to_csv('incompleteresponse_dhs.csv', index=False)

print("\nFiles saved successfully: 'completeresponse_dhs.csv' and 'incompleteresponse_dhs.csv'")

Total Rows: 57693
Complete Responses (To Keep): 46488
Incomplete Responses (To Separate): 11205

Files saved successfully: 'completeresponse_dhs.csv' and 'incompleteresponse_dhs.csv'


The exact Scoring Dictionary to map text-based values to the 0â€“4 Risk Scale used in "Predicting Marital Stability: An Approach for More Characteristics (Frank, 2024)".

The Logic:

    0: Perfect Compatibility (Safe, Joint Decisions, No Fear).

    4: High Incompatibility (Violence, Total Control, High Fear).

1. ROLES (Decision Making)
-Person_who_usually_decides_on_Wife's_health_careConcept: Egalitarian vs. Hierarchical roles.
Text Value to Score:
    "Respondent and husband/partner jointly" - > 0 (Ideal/Compatible)
    "Respondent alone" -> 1 (Autonomous/Safe)
    "Husband/partner alone" -> 4 (Hierarchical/Risk)
    "Someone else" / "Other" -> 4 (Lack of Couple Autonomy)
----------------------------------------------------------------

2. GOALS (Financial Shared Vision) - Person_who_usually_decides_on_large_household_purchases
Text Value to Score: 
    "Respondent and husband/partner jointly" -> 0 (Shared Goals)
    "Respondent alone" -> 1 (Safe)
    "Husband/partner alone" -> 4 (Mismatched Goals/Control)
-----------------------------------------------------------------

3. TRUST (Jealousy & Control) - 
Husband_partner_jealous_if_Wife_talks_with_other_men + 
Husband_partner_insists_on_knowing_where_Wife_is
Text Value to Score:
    "No" ->0 (High Trust)
    "Yes" -> 4 (Low Trust / High Risk)
    "Don't know" -> 2 (Uncertainty)
-----------------------------------------------------------------

4. VIEWS ON MARRIAGE (Boundaries/Violence) -
Ever_been_pushed,_shook_or_had_something_thrown + 
Ever_been_slapped_by_husband_partner
    "Never" -> 0 (Compatible Views)
    "Yes, but not in the last 12 months" -> 2 (Past Issues/Moderate Risk)
    "Sometimes" -> 3 (Ongoing Incompatibility)
    "Often" -> 4 (Severe Incompatibility)
-----------------------------------------------------------------

5. LIFE GOALS (Fear)
Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never
    "Never" -> 0 (Aligned)
    "Sometimes" -> 2 (Friction)
    "Most of the time" -> 4 (Divergent/Toxic)
-----------------------------------------------------------------

6. DREAMS (Emotional Support vs. Abuse)
Ever_been_humiliated_by_husband_partner + 
Ever_been_insulted_or_made_to_feel_bad...
    "Never" -> 0 (Supportive)
    "Yes, but not in the last 12 months" -> 2 (Past Risk)"Sometimes" -> 3 (Damaging)
    "Often" -> 4 (Dream-Killing/High Risk)
-----------------------------------------------------------------

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('completeresponse_dhs.csv')
# Load your Complete Data
# df = pd.read_csv('completeresponse_dhs.csv')

# --- 1. DEFINE SCALING FUNCTIONS ---

def score_decisions(val):
    s = str(val).lower()
    if 'joint' in s: return 0        # Best
    if 'respondent' in s: return 1   # Good
    if 'husband' in s: return 4      # Bad
    if 'someone else' in s: return 4 # Bad
    return 2 # Neutral/Unknown

def score_binary_risk(val):
    # For Jealousy columns
    s = str(val).lower()
    if 'no' in s: return 0
    if 'yes' in s: return 4
    return 2

def score_frequency_risk(val):
    # For Violence/Humiliation columns
    s = str(val).lower()
    if 'never' in s: return 0
    if 'not in the last' in s: return 2
    if 'sometimes' in s: return 3
    if 'often' in s: return 4
    return 2

def score_fear(val):
    s = str(val).lower()
    if 'never' in s: return 0
    if 'sometimes' in s: return 2
    if 'most' in s: return 4
    return 0

# --- 2. APPLY MAPPING TO CREATE THE 6 DIMENSIONS ---

# Dim 1: Roles
df['Score_Roles'] = df["Person_who_usually_decides_on_Wife's_health_care"].apply(score_decisions)

# Dim 2: Goals
df['Score_Goals'] = df["Person_who_usually_decides_on_large_household_purchases"].apply(score_decisions)

# Dim 3: Trust (Average of Jealousy + Control)
t1 = df['Husband_partner_jealous_if_Wife_talks_with_other_men'].apply(score_binary_risk)
t2 = df['Husband_partner_insists_on_knowing_where_Wife_is'].apply(score_binary_risk)
df['Score_Trust'] = (t1 + t2) / 2

# Dim 4: Views (Average of Push + Slap)
v1 = df['Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner'].apply(score_frequency_risk)
v2 = df['Ever_been_slapped_by_husband_partner'].apply(score_frequency_risk)
df['Score_Views'] = (v1 + v2) / 2

# Dim 5: Life Goals (Fear)
df['Score_LifeGoals'] = df["Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never"].apply(score_fear)

# Dim 6: Dreams (Average of Humiliation + Insult)
d1 = df['Ever_been_humiliated_by_husband_partner'].apply(score_frequency_risk)
d2 = df['Ever_been_insulted_or_made_to_feel_bad_by_husband_partner'].apply(score_frequency_risk)
df['Score_Dreams'] = (d1 + d2) / 2

# --- 3. FINAL AGGREGATION ---
# Average of the 6 dimensions
df['Frank_Compatibility_Score'] = df[['Score_Roles', 'Score_Goals', 'Score_Trust', 
                                      'Score_Views', 'Score_LifeGoals', 'Score_Dreams']].mean(axis=1)

# --- 4. DETERMINE TARGET CLASS ---
# Threshold = 1.227 (from Paper)
# <= 1.227 : Compatible (Class 1)
# > 1.227  : Incompatible (Class 0)
df['Is_Compatible'] = np.where(df['Frank_Compatibility_Score'] <= 1.227, 1, 0)

# Save
df.to_csv('Target_Variable_Created.csv', index=False)
print(df[['Frank_Compatibility_Score', 'Is_Compatible']].head(10))

   Frank_Compatibility_Score  Is_Compatible
0                   0.666667              1
1                   1.000000              1
2                   0.333333              1
3                   0.666667              1
4                   1.333333              0
5                   0.333333              1
6                   0.333333              1
7                   0.333333              1
8                   0.333333              1
9                   0.333333              1


In [None]:
df['Is_Compatible'].value_counts()

Is_Compatible
1    31108
0    15380
Name: count, dtype: int64

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve

# --- 1. LOAD & PREPARE DATA ---
df_features = pd.read_csv('completeresponse_dhs.csv')
df_target = pd.read_csv('Target_Variable_Created.csv')
df = pd.concat([df_features, df_target[['Is_Compatible']]], axis=1)

feature_cols = [
    'Husband_age', "Husband's Education Level", 'Husband Occupation_(grouped)', 
    'Husband Religion', 'Husband Ethnicity',
    "Wife's_current_age", "Wife's Education Level", "Wife's_occupation_(grouped)", 
    "Wife's Religion", "Wife's Ethnicity"
]

X = pd.get_dummies(df[feature_cols], drop_first=True)
y = df['Is_Compatible']

# --- 2. TRAIN/TEST SPLIT ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- 3. AGGRESSIVE MODEL TRAINING ---
# We deliberately OVER-weight the minority class to force the model to learn it.
# Standard ratio was ~2. We are pumping it to 10.
aggressive_weight = 10 

print(f"\nTraining with Aggressive Weight: {aggressive_weight}")
model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.03,    # Slower learning to find subtle patterns
    max_depth=6,           
    scale_pos_weight=aggressive_weight, # FORCE FOCUS ON CLASS 0
    subsample=0.8,         # Prevent overfitting
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)

# --- 4. THRESHOLD TUNING (The Magic Step) ---
# Instead of predict(), we get probabilities (0.0 to 1.0)
y_proba = model.predict_proba(X_test)[:, 1] # Probability of being Compatible (Class 1)

# We want to catch Class 0 (Incompatible).
# Standard logic: If Prob(Compatible) < 0.5, then Incompatible.
# NEW logic: We will be stricter. We only call it "Compatible" if Prob > 0.7.
# This makes it much easier to fall into the "Incompatible" bucket.

decision_threshold = 0.65  # <--- ADJUST THIS TO BALANCE RESULTS
y_pred_tuned = (y_proba > decision_threshold).astype(int)

# --- 5. EVALUATE ---
print(f"\n--- Performance at Threshold {decision_threshold} ---")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_tuned))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_tuned))

# --- 6. AUTOMATIC OPTIMAL THRESHOLD FINDER ---
# This loop finds the threshold that gives the best 'Macro F1' score
print("\n--- Searching for Best Threshold ---")
best_thresh = 0
best_score = 0
for thresh in np.arange(0.3, 0.9, 0.05):
    preds = (y_proba > thresh).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    # Focus on Class 0 Recall (Catching Violence)
    recall_0 = tn / (tn + fp) if (tn + fp) > 0 else 0
    # But don't destroy Precision
    precision_0 = tn / (tn + fn) if (tn + fn) > 0 else 0
    
    f1_0 = 2 * (precision_0 * recall_0) / (precision_0 + recall_0) if (precision_0 + recall_0) > 0 else 0
    
    if f1_0 > best_score:
        best_score = f1_0
        best_thresh = thresh

print(f"Optimal Threshold for Class 0 Detection: {best_thresh:.2f}")
print(f"Best F1-Score for Class 0: {best_score:.3f}")


Training with Aggressive Weight: 10


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Performance at Threshold 0.65 ---
Confusion Matrix:
[[   2 3074]
 [   2 6220]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.00      0.00      3076
           1       0.67      1.00      0.80      6222

    accuracy                           0.67      9298
   macro avg       0.58      0.50      0.40      9298
weighted avg       0.61      0.67      0.54      9298


--- Searching for Best Threshold ---
Optimal Threshold for Class 0 Detection: 0.90
Best F1-Score for Class 0: 0.015


In [2]:
import pandas as pd
df_features = pd.read_csv('completeresponse_dhs.csv')


XGBoost Model is used to predict the ans to all 9 Questions. Since, mapping 9 to 1 marital satisfaction gave poor accuracy and recall.
MultiOutPut variant of XGBoost is used.(All 9 questions are treated independently and no correlation among them is consdered)

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
target_cols = [
    'Husband_partner_jealous_if_Wife_talks_with_other_men',
    'Husband_partner_insists_on_knowing_where_Wife_is',
    'Ever_been_humiliated_by_husband_partner',
    'Ever_been_insulted_or_made_to_feel_bad_by_husband_partner',
    'Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner',
    'Ever_been_slapped_by_husband_partner',
    "Person_who_usually_decides_on_Wife's_health_care",
    'Person_who_usually_decides_on_large_household_purchases',
    'Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never'
]

feature = [c for c in df_features.columns if c not in target_cols]
X = df_features[feature].copy()
y = df_features[target_cols].copy()

# 6. Encode Categorical Features
X_encoded = pd.get_dummies(X, drop_first=True)

# 7. Encode the 9 Target Variables
label_encoders = {}
for col in target_cols:
    le = LabelEncoder()
    # XGBoost requires targets to be integer encoded (0, 1, 2...)
    y[col] = le.fit_transform(y[col].astype(str))
    label_encoders[col] = le

# 8. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# 9. Build and Train the Multi-Output XGBoost Model
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
multi_target_xgb = MultiOutputClassifier(xgb, n_jobs=-1)

print("Training multi-output XGBoost model...")
multi_target_xgb.fit(X_train, y_train)

# 10. Predict and Evaluate
y_pred_test = multi_target_xgb.predict(X_test)

print("\n--- Test Accuracy per Survey Question ---")
test_accuracies = []
for i, col in enumerate(target_cols):
    acc = accuracy_score(y_test.iloc[:, i], y_pred_test[:, i])
    test_accuracies.append(acc)
    print(f"{col}: {acc * 100:.2f}%")

print(f"\nAverage Overall Test Accuracy: {np.mean(test_accuracies) * 100:.2f}%")

Training multi-output XGBoost model...

--- Test Accuracy per Survey Question ---
Husband_partner_jealous_if_Wife_talks_with_other_men: 74.35%
Husband_partner_insists_on_knowing_where_Wife_is: 80.60%
Ever_been_humiliated_by_husband_partner: 91.85%
Ever_been_insulted_or_made_to_feel_bad_by_husband_partner: 92.77%
Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner: 89.17%
Ever_been_slapped_by_husband_partner: 76.44%
Person_who_usually_decides_on_Wife's_health_care: 73.60%
Person_who_usually_decides_on_large_household_purchases: 73.82%
Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never: 65.82%

Average Overall Test Accuracy: 79.82%


ClassiferChain variant(of XGBoost) however, sees the effect of one already predicted question on the other. Eg: 
if a husband has slapped his wife, the probability that she is afraid of him goes up significantly.

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import ClassifierChain
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# 1. Load your dataset (Replace with your actual dataframe if it's already loaded)
# df_features = pd.read_csv('your_actual_file.csv')

target_cols = [
    'Husband_partner_jealous_if_Wife_talks_with_other_men',
    'Husband_partner_insists_on_knowing_where_Wife_is',
    'Ever_been_humiliated_by_husband_partner',
    'Ever_been_insulted_or_made_to_feel_bad_by_husband_partner',
    'Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner',
    'Ever_been_slapped_by_husband_partner',
    "Person_who_usually_decides_on_Wife's_health_care",
    'Person_who_usually_decides_on_large_household_purchases',
    'Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never'
]

# Ensure no targets are missing
df_features = df_features.dropna(subset=target_cols).copy()

feature_cols = [c for c in df_features.columns if c not in target_cols]
X = df_features[feature_cols].copy()
y = df_features[target_cols].copy()

# ---------------------------------------------------------
# NEW: Robust Missing Value Handling
# ---------------------------------------------------------
# Separate numeric and categorical columns
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Impute numeric columns with median (less sensitive to outliers)
num_imputer = SimpleImputer(strategy='median')
if len(numeric_cols) > 0:
    X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

# Impute categorical columns with a placeholder 'Unknown'
cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
if len(categorical_cols) > 0:
    X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
# ---------------------------------------------------------

# Encode Categorical Features
X_encoded = pd.get_dummies(X, drop_first=True)

# Encode the 9 Target Variables
label_encoders = {}
for col in target_cols:
    le = LabelEncoder()
    y[col] = le.fit_transform(y[col].astype(str))
    label_encoders[col] = le

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Build and Train the ClassifierChain XGBoost Model
xgb = XGBClassifier(
    learning_rate=0.05,
    n_estimators=150,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss', 
    random_state=42
)

chain_xgb = ClassifierChain(xgb, order='random', random_state=42)

print("Training ClassifierChain XGBoost model...")
chain_xgb.fit(X_train, y_train)

# Predict and Evaluate
y_pred_test = chain_xgb.predict(X_test)

print("\n--- Test Accuracy per Survey Question ---")
# test_accuracies = []
# for i, col in enumerate(target_cols):
#     acc = accuracy_score(y_test.iloc[:, i], y_pred_test[:, i])
#     test_accuracies.append(acc)
#     print(f"{col}: {acc * 100:.2f}%")

# print(f"\n======================================")
# print(f"Average Overall Test Accuracy: {np.mean(test_accuracies) * 100:.2f}%")
# print(f"======================================")

from sklearn.metrics import classification_report

for col in target_cols:
    # ... (your model training code) ...
    
    print(f"\n=== Report for: {col} ===")
    
    # We use the label encoder to get the actual text labels (e.g., 'Never', 'Yes')
    actual_labels = label_encoders[col].classes_
    
    # This prints Precision, Recall, and F1 for EACH specific answer
    print(classification_report(y_test[col], y_pred, target_names=actual_labels))

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_cols = X.select_dtypes(include=['object', 'category']).columns


Training ClassifierChain XGBoost model...

--- Test Accuracy per Survey Question ---

=== Report for: Husband_partner_jealous_if_Wife_talks_with_other_men ===
              precision    recall  f1-score   support

  Don't know       0.00      0.00      0.00        31
          No       0.81      0.09      0.16      6968
         Yes       0.25      0.94      0.40      2299

    accuracy                           0.30      9298
   macro avg       0.35      0.34      0.19      9298
weighted avg       0.67      0.30      0.22      9298


=== Report for: Husband_partner_insists_on_knowing_where_Wife_is ===
              precision    recall  f1-score   support

  Don't know       0.00      0.00      0.00        12
          No       0.84      0.09      0.15      7501
         Yes       0.19      0.93      0.32      1785

    accuracy                           0.25      9298
   macro avg       0.34      0.34      0.16      9298
weighted avg       0.71      0.25      0.19      9298


=== Repo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])

Stragetic Chain Ordering

In [10]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import ClassifierChain
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore', category=FutureWarning)

# 1. Load Data (assuming df_features is already loaded in your environment)
# df_features = pd.read_csv('your_actual_file.csv')

target_cols = [
    'Husband_partner_jealous_if_Wife_talks_with_other_men',                # Index 0
    'Husband_partner_insists_on_knowing_where_Wife_is',                    # Index 1
    'Ever_been_humiliated_by_husband_partner',                             # Index 2
    'Ever_been_insulted_or_made_to_feel_bad_by_husband_partner',           # Index 3
    'Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner',  # Index 4
    'Ever_been_slapped_by_husband_partner',                                # Index 5
    "Person_who_usually_decides_on_Wife's_health_care",                    # Index 6
    'Person_who_usually_decides_on_large_household_purchases',             # Index 7
    'Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never'  # Index 8
]

# Drop rows missing targets
df_features = df_features.dropna(subset=target_cols).copy()

# Feature engineering: Age Difference
if 'Husband_age' in df_features.columns and "Wife's_current_age" in df_features.columns:
    df_features['Age_Difference'] = df_features['Husband_age'] - df_features["Wife's_current_age"]

feature_cols = [c for c in df_features.columns if c not in target_cols]
X = df_features[feature_cols].copy()
y = df_features[target_cols].copy()

# --- Robust Missing Value Handling ---
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category', 'str']).columns

num_imputer = SimpleImputer(strategy='median')
if len(numeric_cols) > 0:
    X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
if len(categorical_cols) > 0:
    X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

# Encode Categorical Features
X_encoded = pd.get_dummies(X, drop_first=True)

# Encode Targets
label_encoders = {}
for col in target_cols:
    le = LabelEncoder()
    y[col] = le.fit_transform(y[col].astype(str))
    label_encoders[col] = le

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# --- The Strategic XGBoost Setup ---
xgb = XGBClassifier(
    learning_rate=0.05,
    n_estimators=150,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss', 
    random_state=42,
    n_jobs=-1
)

# Strategic Chain Order based on your previous accuracy results:
# Easiest -> Hardest
# Index 3: Insulted (92.8%)
# Index 2: Humiliated (91.8%)
# Index 4: Pushed/Shook (89.2%)
# Index 1: Insists knowing where wife is (80.6%)
# Index 5: Slapped (76.5%)
# Index 0: Jealous (74.9%)
# Index 6: Health care decider (74.1%)
# Index 7: Household purchases decider (74.1%)
# Index 8: Wife afraid (66.2%)
optimal_order = [3, 2, 4, 1, 5, 0, 6, 7, 8]

# Apply the custom order to the ClassifierChain
chain_xgb = ClassifierChain(xgb, order=optimal_order, random_state=42)

print("Training Strategically Ordered ClassifierChain XGBoost model...")
chain_xgb.fit(X_train, y_train)

# Predict and Evaluate
y_pred_test = chain_xgb.predict(X_test)

print("\n--- Test Accuracy per Survey Question ---")
test_accuracies = []
for i, col in enumerate(target_cols):
    acc = accuracy_score(y_test.iloc[:, i], y_pred_test[:, i])
    test_accuracies.append(acc)
    print(f"{col}: {acc * 100:.2f}%")

print(f"\n======================================")
print(f"Average Overall Test Accuracy: {np.mean(test_accuracies) * 100:.2f}%")
print(f"======================================")

Training Strategically Ordered ClassifierChain XGBoost model...

--- Test Accuracy per Survey Question ---
Husband_partner_jealous_if_Wife_talks_with_other_men: 74.94%
Husband_partner_insists_on_knowing_where_Wife_is: 80.67%
Ever_been_humiliated_by_husband_partner: 91.85%
Ever_been_insulted_or_made_to_feel_bad_by_husband_partner: 92.80%
Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner: 89.22%
Ever_been_slapped_by_husband_partner: 76.59%
Person_who_usually_decides_on_Wife's_health_care: 74.14%
Person_who_usually_decides_on_large_household_purchases: 74.12%
Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never: 66.10%

Average Overall Test Accuracy: 80.05%


By using compute_sample_weight(class_weight='balanced'), we told the model: "Treat the rare answers (like 'Yes, I was slapped') as equally important to the common answers ('Never')." Because the model was suddenly terrified of missing a "Yes", it started over-predicting "Yes". Since the vast majority of the real answers are "Never", over-predicting "Yes" completely destroyed your overall accuracy. It traded accuracy for minority recall.

Since our specific goal right now is Overall Accuracy > 85%, we need to completely remove the weights. To maximize pure accuracy, the model should lean into the natural majority answers, but we will use smart rules to find the exceptions.

In [11]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.utils.class_weight import compute_sample_weight

# Suppress warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# 1. Load Data
# df_features = pd.read_csv('your_actual_file.csv')

target_cols = [
    'Husband_partner_jealous_if_Wife_talks_with_other_men',
    'Husband_partner_insists_on_knowing_where_Wife_is',
    'Ever_been_humiliated_by_husband_partner',
    'Ever_been_insulted_or_made_to_feel_bad_by_husband_partner',
    'Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner',
    'Ever_been_slapped_by_husband_partner',
    "Person_who_usually_decides_on_Wife's_health_care",
    'Person_who_usually_decides_on_large_household_purchases',
    'Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never'
]

df_features = df_features.dropna(subset=target_cols).copy()

# Feature Engineering
if 'Husband_age' in df_features.columns and "Wife's_current_age" in df_features.columns:
    df_features['Age_Difference'] = df_features['Husband_age'] - df_features["Wife's_current_age"]

feature_cols = [c for c in df_features.columns if c not in target_cols]
X = df_features[feature_cols].copy()
y = df_features[target_cols].copy()

# Robust Missing Value Handling
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category', 'str']).columns

num_imputer = SimpleImputer(strategy='median')
if len(numeric_cols) > 0:
    X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
if len(categorical_cols) > 0:
    X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

# Encode Categorical Features
X_encoded = pd.get_dummies(X, drop_first=True)

# Encode Targets
label_encoders = {}
for col in target_cols:
    le = LabelEncoder()
    y[col] = le.fit_transform(y[col].astype(str))
    label_encoders[col] = le

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# --- NEW STRATEGY: 9 INDEPENDENT WEIGHTED MODELS ---
print("Training 9 Independent XGBoost models with Sample Weights...\n")

test_accuracies = []
trained_models = {}

for col in target_cols:
    # 1. Calculate the exact weights for this specific survey question's imbalance
    weights = compute_sample_weight(class_weight='balanced', y=y_train[col])
    
    # 2. Build the model (Slightly deeper trees now that we have weights)
    xgb = XGBClassifier(
        learning_rate=0.05,
        n_estimators=200,          # Increased estimators
        max_depth=6,               # Slightly deeper to capture complex rules
        min_child_weight=2,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='mlogloss', 
        random_state=42,
        n_jobs=-1
    )
    
    # 3. Train the model using the calculated weights
    xgb.fit(X_train, y_train[col], sample_weight=weights)
    trained_models[col] = xgb
    
    # 4. Predict and Evaluate
    y_pred = xgb.predict(X_test)
    acc = accuracy_score(y_test[col], y_pred)
    test_accuracies.append(acc)
    
    print(f"{col}: {acc * 100:.2f}%")

print(f"\n======================================")
print(f"Average Overall Test Accuracy: {np.mean(test_accuracies) * 100:.2f}%")
print(f"======================================")

Training 9 Independent XGBoost models with Sample Weights...

Husband_partner_jealous_if_Wife_talks_with_other_men: 56.74%
Husband_partner_insists_on_knowing_where_Wife_is: 59.26%
Ever_been_humiliated_by_husband_partner: 54.28%
Ever_been_insulted_or_made_to_feel_bad_by_husband_partner: 56.28%
Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner: 49.96%
Ever_been_slapped_by_husband_partner: 45.33%
Person_who_usually_decides_on_Wife's_health_care: 36.19%
Person_who_usually_decides_on_large_household_purchases: 34.07%
Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never: 38.69%

Average Overall Test Accuracy: 47.87%


Going back to the independent model approach (which prevents error propagation), 
but this time we will tune XGBoost strictly 
for high accuracy (deeper trees, lightly more aggressive learning) and add some powerful interaction features.

In [13]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

warnings.filterwarnings('ignore', category=FutureWarning)

# 1. Load Data
# df_features = pd.read_csv('your_actual_file.csv')

target_cols = [
    'Husband_partner_jealous_if_Wife_talks_with_other_men',
    'Husband_partner_insists_on_knowing_where_Wife_is',
    'Ever_been_humiliated_by_husband_partner',
    'Ever_been_insulted_or_made_to_feel_bad_by_husband_partner',
    'Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner',
    'Ever_been_slapped_by_husband_partner',
    "Person_who_usually_decides_on_Wife's_health_care",
    'Person_who_usually_decides_on_large_household_purchases',
    'Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never'
]

df_features = df_features.dropna(subset=target_cols).copy()

# --- POWERFUL FEATURE ENGINEERING ---
# 1. Age Difference
if 'Husband_age' in df_features.columns and "Wife's_current_age" in df_features.columns:
    df_features['Age_Difference'] = df_features['Husband_age'] - df_features["Wife's_current_age"]

# 2. Education Match (Are they on the same educational level?)
if "Husband's Education Level" in df_features.columns and "Wife's Education Level" in df_features.columns:
    df_features['Education_Match'] = (df_features["Husband's Education Level"] == df_features["Wife's Education Level"]).astype(int)

feature_cols = [c for c in df_features.columns if c not in target_cols]
X = df_features[feature_cols].copy()
y = df_features[target_cols].copy()

# Robust Missing Value Handling
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category', 'str']).columns

num_imputer = SimpleImputer(strategy='median')
if len(numeric_cols) > 0:
    X[numeric_cols] = num_imputer.fit_transform(X[numeric_cols])

cat_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
if len(categorical_cols) > 0:
    X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])

# Encode Categorical Features
X_encoded = pd.get_dummies(X, drop_first=True)

# Encode Targets
label_encoders = {}
for col in target_cols:
    le = LabelEncoder()
    y[col] = le.fit_transform(y[col].astype(str))
    label_encoders[col] = le

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# --- INDEPENDENT MODELS TUNED FOR ACCURACY ---
print("Training 9 Independent Unweighted XGBoost models...\n")

test_accuracies = []

for col in target_cols:
    # Tuned for pure accuracy (No weights!)
    xgb = XGBClassifier(
        learning_rate=0.1,         # Faster learning rate to capture sharp splits
        n_estimators=250,          # More estimators
        max_depth=7,               # Deeper trees to learn specific combinations of features
        min_child_weight=1,        # Allow specific leaf nodes for higher accuracy
        gamma=0.1,                 # Prune branches that don't improve accuracy
        subsample=0.9,             # Use 90% of data (less random)
        colsample_bytree=0.9,      
        eval_metric='mlogloss', 
        random_state=42,
        n_jobs=-1
    )
    
    # Train the model normally (letting it naturally favor the majority classes)
    xgb.fit(X_train, y_train[col])
    
    # Predict and Evaluate
    y_pred = xgb.predict(X_test)
    acc = accuracy_score(y_test[col], y_pred)
    test_accuracies.append(acc)
    
    print(f"{col}: {acc * 100:.2f}%")

print(f"\n======================================")
print(f"Average Overall Test Accuracy: {np.mean(test_accuracies) * 100:.2f}%")
print(f"======================================")

from sklearn.metrics import classification_report

for col in target_cols:
    # ... (your model training code) ...
    y_pred = xgb.predict(X_test)
    
    print(f"\n=== Report for: {col} ===")
    
    # We use the label encoder to get the actual text labels (e.g., 'Never', 'Yes')
    actual_labels = label_encoders[col].classes_
    
    # This prints Precision, Recall, and F1 for EACH specific answer
    print(classification_report(y_test[col], y_pred, target_names=actual_labels))

Training 9 Independent Unweighted XGBoost models...

Husband_partner_jealous_if_Wife_talks_with_other_men: 74.60%
Husband_partner_insists_on_knowing_where_Wife_is: 80.54%
Ever_been_humiliated_by_husband_partner: 91.84%
Ever_been_insulted_or_made_to_feel_bad_by_husband_partner: 92.78%
Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner: 89.21%
Ever_been_slapped_by_husband_partner: 76.38%
Person_who_usually_decides_on_Wife's_health_care: 73.76%
Person_who_usually_decides_on_large_household_purchases: 73.96%
Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never: 65.77%

Average Overall Test Accuracy: 79.87%

=== Report for: Husband_partner_jealous_if_Wife_talks_with_other_men ===
              precision    recall  f1-score   support

  Don't know       0.00      0.00      0.00        31
          No       0.81      0.09      0.16      6968
         Yes       0.25      0.94      0.40      2299

    accuracy                           0.30      9298
   macro avg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



=== Report for: Ever_been_insulted_or_made_to_feel_bad_by_husband_partner ===
                                    precision    recall  f1-score   support

                             Never       1.00      0.00      0.00      8629
                             Often       0.01      0.08      0.02       114
                         Sometimes       0.05      0.96      0.10       478
Yes, but not in the last 12 months       0.00      0.00      0.00        77

                          accuracy                           0.05      9298
                         macro avg       0.27      0.26      0.03      9298
                      weighted avg       0.93      0.05      0.01      9298


=== Report for: Ever_been_pushed,_shook_or_had_something_thrown_by_husband_partner ===
                                    precision    recall  f1-score   support

                             Never       0.89      0.00      0.00      8296
                             Often       0.00      0.03      0.01    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



=== Report for: Ever_been_slapped_by_husband_partner ===
                                    precision    recall  f1-score   support

                             Never       0.78      0.00      0.00      7121
                             Often       0.01      0.02      0.01       163
                         Sometimes       0.18      0.98      0.30      1531
Yes, but not in the last 12 months       0.00      0.00      0.00       483

                          accuracy                           0.16      9298
                         macro avg       0.24      0.25      0.08      9298
                      weighted avg       0.62      0.16      0.05      9298


=== Report for: Person_who_usually_decides_on_Wife's_health_care ===
                                precision    recall  f1-score   support

         Husband/partner alone       0.00      0.00      0.00      1552
                         Other       0.00      0.04      0.00        27
              Respondent alone       0.08   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



=== Report for: Person_who_usually_decides_on_large_household_purchases ===
                                precision    recall  f1-score   support

         Husband/partner alone       0.22      0.00      0.00      1593
                         Other       0.01      0.07      0.01        54
              Respondent alone       0.06      0.87      0.11       564
Respondent and husband/partner       0.00      0.00      0.00      6892
                  Someone else       0.00      0.00      0.00       195

                      accuracy                           0.05      9298
                     macro avg       0.06      0.19      0.02      9298
                  weighted avg       0.04      0.05      0.01      9298


=== Report for: Wife_afraid_of_husband_partner_most_of_the_time,_sometimes_or_never ===
                         precision    recall  f1-score   support

Most of the time afraid       0.11      0.00      0.00       986
           Never afraid       0.46      0.16      0.