# Marriage Perception
## Exploratory Analysis

In [None]:
# Import relevant Python libraries here
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [None]:
# Load the survey responses CSV into a DataFrame (path is relative to the notebook's working directory)
df = pd.read_csv('Marriage Perception (Responses) - Form Responses 1.csv')

# Report the dataset dimensions as a (rows, columns) tuple
print(f"Size of Dataset (Rows, Cols): {df.shape}\n")

# Show the first rows; df.head() is left as the cell's final expression so the notebook renders it
print("Preview: ")
df.head()

In [None]:
# Check info
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray "None".
df.info()

In [None]:
# Number of rows that are exact duplicates of an earlier row
print(f"Duplicates: {df.duplicated().sum()}\n")

# Missing-value tallies: per column, per row, and overall
# (the overall total is simply the sum of the per-column tallies)
col_missing = df.isna().sum(axis=0)
row_missing = df.isna().sum(axis=1)
total_missing = col_missing.sum()

print(f"Total missing values: {total_missing}\n")
print(f"Missing per row:\n{row_missing}\n")
print(f"Missing per column:\n{col_missing}")


## Manipulating and Cleansing Initial Dataset

In [None]:
## Function to extract numerical digits from Likert Scale strings
def clean_ratings(val):
    """Reduce a Likert-scale answer to its numeric rating.

    Missing values are passed through untouched. Any other value is converted
    to text and the first digit character found is returned as an ``int``.
    When the text contains no digit, the original value is returned unchanged.
    """
    if not pd.isna(val):
        first_digit = re.search(r'(\d)', str(val))
        if first_digit is not None:
            return int(first_digit.group(1))

    # Missing value, or nothing numeric to extract
    return val

# Identify the rating columns (positions 1-21) and reduce each answer to its numeric rating
col_review = df.columns[1:22]
for col in col_review:
    df[col] = df[col].apply(clean_ratings)

## Shorten column names for simplicity
# Short names are listed in the same order as the original columns at positions 1-27
short_names = [
    'want_married',
    'unique_benefits',
    'everyone_married',
    'more_than_legal',
    'outdated_idea',
    'happy_unmarried',
    'goal_is_marriage',
    'unique_challenges',
    'highest_level_rel',
    'do_not_want_marry',
    'marriage_important',
    'money_success_ind',
    'money_measure_success',
    'rich_achievement',
    'anxious_finances',
    'good_budgeting',
    'hesitant_change',
    'uncertainty_nerve_wracking',
    'variety_spice_life',
    'map_out_career',
    'taking_chances',
    'age',
    'gender',
    'orientation',
    'is_married',
    'religion_affiliation',
    'religiosity',
]
column_mapping = {
    df.columns[position]: short_name
    for position, short_name in enumerate(short_names, start=1)
}

df = df.rename(columns=column_mapping)

## Remove empty questions column
# Only dropped when the trailing column really is the free-text comments question
last_col = df.columns[-1]
if last_col.startswith('If you have any questions or comments'):
    df = df.drop(columns=[last_col])

## Better categorize "Are you married" column
# Trim stray whitespace, then fold the rarer statuses into a single 'Other' bucket
df['is_married'] = (
    df['is_married']
    .str.strip()
    .replace({'Complicated': 'Other', 'Engaged': 'Other'})
)

## Categorize religions
# Lower-case and trim the free-text answers for keyword matching, but keep
# genuinely missing answers as NaN: a bare astype(str) would turn them into the
# literal string 'nan', which can no longer be recognised as missing afterwards.
df['religion_affiliation'] = (
    df['religion_affiliation']
    .astype(str)
    .str.lower()
    .str.strip()
    .where(df['religion_affiliation'].notna())
)

# Place religions into separate categories
def categorize_religion(rel_string):
    """Map a free-text religious affiliation onto a small set of categories.

    Parameters
    ----------
    rel_string : str or missing value
        The respondent's answer. It is lower-cased and stripped here, so
        already-normalised input is accepted unchanged.

    Returns
    -------
    str
        One of 'Unknown', 'Agnostic/Atheist/None', 'Catholic', 'Protestant',
        'Other Christian', 'Buddhist/Hindu', 'Muslim' or 'Other/Spiritual'.
    """
    if pd.isna(rel_string):
        return 'Unknown'

    text = str(rel_string).lower().strip()

    # A missing answer that was cast to text arrives as the literal 'nan';
    # treat it (and an empty answer) as unknown rather than as a religion.
    if text in ('', 'nan'):
        return 'Unknown'

    # Agnostic, Atheist, or Non-religious.
    # Short negations must match as whole words: a plain substring test for
    # 'no' also hits "nondenominational", "monotheist", etc.
    non_religious = (
        r'\b(?:no|not|none|nah|non[\s-]?religious)\b'
        r'|n/a|agnostic|atheist|undecided'
    )
    if re.search(non_religious, text):
        return 'Agnostic/Atheist/None'

    # Catholic
    if 'catholic' in text:
        return 'Catholic'

    # Protestant
    if 'protestant' in text:
        return 'Protestant'

    # General Christian ('denominational' covers both "nondenominational"
    # and "non-denominational")
    if any(x in text for x in ['christian', 'episcopalian', 'presbyterian', 'denominational']):
        return 'Other Christian'

    # Eastern Religions
    if any(x in text for x in ['buddhist', 'buddhism', 'hindu']):
        return 'Buddhist/Hindu'

    # Islamic
    if 'islam' in text or 'muslim' in text:
        return 'Muslim'

    # Catch-all for other spiritualities or unique inputs like "God"
    return 'Other/Spiritual'
    
# Derive the coarse religion grouping from each respondent's free-text answer
religion_answers = df['religion_affiliation']
df['religion_category'] = religion_answers.apply(categorize_religion)

## Create aggregate score for valuation of money (OPTIONAL?)
# Row-wise mean of the three money-related items
money_items = ['money_success_ind', 'money_measure_success', 'rich_achievement']
df['money_value_score'] = df[money_items].mean(axis=1)

### Removing Biased Entries
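Some mobile users may have seen only the first 3–4 of the 7 Likert-scale options, so their ratings never go above 3 or 4. Respondents whose highest rating across all Likert items is 4 or lower are therefore flagged as suspect and excluded from the cleaned dataset.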

In [None]:
## All 21 Likert-scale items, by their shortened column names
likert_columns = [
    # marriage items
    'want_married', 'unique_benefits', 'everyone_married', 'more_than_legal',
    'outdated_idea', 'happy_unmarried', 'goal_is_marriage', 'unique_challenges',
    'highest_level_rel', 'do_not_want_marry', 'marriage_important',
    # money items
    'money_success_ind', 'money_measure_success', 'rich_achievement',
    'anxious_finances', 'good_budgeting',
    # change / uncertainty items
    'hesitant_change', 'uncertainty_nerve_wracking', 'variety_spice_life',
    'map_out_career', 'taking_chances',
]

## Highest rating each respondent gave across every Likert item
df['row_max'] = df[likert_columns].max(axis=1)

## Flag any potential mobile bias
# Suspicious        = never rated above 4
# Highly Suspicious = never rated above 3
never_above_four = df['row_max'] <= 4
never_above_three = df['row_max'] <= 3

df['bias_flag'] = 'None'
# Applied from the loosest threshold to the strictest so the stricter label wins
df.loc[never_above_four, 'bias_flag'] = 'Suspicious (Max 4)'
df.loc[never_above_three, 'bias_flag'] = 'Highly Suspicious (Max 3)'

## Keep the flagged respondents in their own dataframe for inspection
is_flagged = df['bias_flag'] != 'None'
df_suspects = df[is_flagged]

## Report how many entries were flagged, then preview them
print(f"Total entries: {len(df)}")
print(f"Suspect entries identified: {len(df_suspects)}")
df_suspects.head()

In [None]:
## Build the cleaned dataset: only respondents that were not flagged for mobile bias
# .copy() makes df_clean independent of df, so later edits do not touch the original
is_unflagged = df['bias_flag'] == 'None'
df_clean = df[is_unflagged].copy()

print("Data cleaning complete.\n")
print(f"New shape: {df_clean.shape}")

# Final expression so the notebook renders a preview
df_clean.head()

In [None]:
# Report how many responses remain once the flagged entries are excluded.
# No filtering happens here: df_clean was already built from the unflagged rows of df.
print(f"Original entries: {len(df)}")
print(f"Clean entries for visualization: {len(df_clean)}")

## Visualizing Data
### Demographics Visualization

In [None]:
## Categorical demographics to plot (age is numeric and gets its own histogram)
demo_cat_cols = ['gender', 'orientation', 'is_married', 'religion_category', 'religiosity']

# 2 x 3 grid: one panel for age plus one per categorical column
fig, axes = plt.subplots(2, 3, figsize=(18,12))
axes = axes.flatten()

## Age: histogram with a KDE overlay in the first panel
age_panel = axes[0]
sns.histplot(data=df_clean, x='age', bins=10, ax=age_panel, color='skyblue', kde=True)
age_panel.set_xlabel('Age')
age_panel.set_ylabel('Count')
age_panel.set_title('Age Distribution')

## Categorical variables: one horizontal count plot per remaining panel
for i, col in enumerate(demo_cat_cols, start=1):
    panel = axes[i]

    # Religiosity is ordinal, so keep its natural sorted order;
    # every other variable is ordered from most to least frequent for readability
    order = (
        sorted(df_clean[col].dropna().unique())
        if col == 'religiosity'
        else df_clean[col].value_counts().index
    )

    # NOTE(review): newer seaborn releases may warn when `palette` is passed without `hue` — confirm against the installed version
    sns.countplot(data=df_clean, y=col, ax=panel, palette='viridis', order=order)

    # Turn the column name into a readable title (e.g., 'religion_category' -> 'Religion Category')
    clean_title = col.replace('_', ' ').title()
    panel.set_title(f'{clean_title} Distribution')
    panel.set_xlabel('Count')
    panel.set_ylabel('') # Tick labels already name the categories

# Keep titles and labels from overlapping
plt.tight_layout()
plt.show()

### Heatmap Visualization

In [None]:
# Items to correlate: the marriage questions, the aggregated money score,
# the remaining money questions, and the change/uncertainty questions
marriage_cols = [
    'want_married', 'unique_benefits', 'everyone_married', 'more_than_legal', 
    'outdated_idea', 'happy_unmarried', 'goal_is_marriage', 'unique_challenges', 
    'highest_level_rel', 'do_not_want_marry', 'marriage_important', 'money_value_score', 'anxious_finances', 'good_budgeting', 
    'hesitant_change', 'uncertainty_nerve_wracking', 'variety_spice_life', 
    'map_out_career', 'taking_chances'
    ]

# Coerce every item to numeric first: an answer that could not be reduced to a
# rating leaves its column non-numeric, which would make .corr() fail; coercing
# turns such answers into NaN so they are simply left out of the correlation.
corr = df_clean[marriage_cols].apply(pd.to_numeric, errors='coerce').corr()

# Create a mask to hide the top triangle (and the diagonal) to make it easier to read
mask = np.triu(np.ones_like(corr, dtype=bool))

# 19 annotated variables need a larger canvas and smaller annotation text to stay legible
plt.figure(figsize=(16, 13))

# Plot heatmap
sns.heatmap(corr, mask=mask, annot=True, annot_kws={'size': 8}, cmap='RdBu_r', center=0, fmt=".2f")

# NOTE(review): religiosity is not among the plotted columns, so the title no
# longer claims it; add a numerically encoded religiosity column to
# marriage_cols if that comparison is wanted.
plt.title('Correlation Between Marriage, Money, and Attitudes Toward Change')
plt.tight_layout()
plt.show()

### Relationship between Valuation of Money and Marriage Importance

In [None]:
# TODO: Plot this relationship

### Relationship between Religiosity and Desire to Get Married

In [None]:
# TODO: Plot this relationship

### Relationship between Age and Marriage Importance

In [None]:
# TODO: Plot this relationship