In [15]:
import pandas as pd

data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")

edu_categories = ['High School', 'Graduate', 'Post Graduate', 'Professional']
company_categories = ['Startup', 'Small', 'Mid-size', 'Large Indian', 'MNC']

data['age'] = (
    data['age'].astype(str)
    .str.strip()                              # remove spaces
    .str.replace(r'[^0-9.]', '', regex=True)  # keep only digits and dots
    .str.replace(r'(\.\d*)\..*', r'\1', regex=True)  # keep only first decimal part
    .astype(float)
)

data['gender'] = data['gender'].astype(str).str.strip().str.lower()
data['gender'] = data['gender'].replace({'^m$': 'male', '^f$': 'female'}, regex=True)
data['gender'] = data['gender'].replace(r'^\s*$', pd.NA, regex=True)
data['gender'] = data['gender'].map({'male': 0, 'female': 1})

data['marital_status'] = data['marital_status'].astype(str).str.strip().str.lower()
data['marital_status'] = data['marital_status'].replace(r'^\s*$', pd.NA, regex=True)
data['marital_status'] = data['marital_status'].map({'single': 0, 'married': 1})

data["education"] = pd.Series(
    pd.Categorical(data["education"], categories=edu_categories, ordered=True).codes
).replace({-1: pd.NA})

data["monthly_salary"] = pd.to_numeric(data["monthly_salary"], errors='coerce')

# data["employment_type"] = pd.get_dummies(data["employment_type"]).add_suffix("_job")

data = pd.get_dummies(data, columns=["employment_type"], dtype="int") 


data['company_type'] = pd.Series(
    pd.Categorical(data["company_type"], categories=company_categories, ordered=True).codes
).replace({-1: pd.NA})


data["bank_balance"] = pd.to_numeric(data["bank_balance"], errors="coerce")


# data['existing_loans'] = data['existing_loans'].astype(str).str.strip().str.lower()
# data['existing_loans'] = data['existing_loans'].replace(r'^\s*$', pd.NA, regex=True)
# data['existing_loans'] = data['existing_loans'].map({'no': 0, 'yes': 1})

  data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")


In [16]:
data["existing_loans"].unique()

array(['Yes', 'No'], dtype=object)

In [27]:
data.dtypes

age                       float64
gender                      int64
marital_status              int64
education                  object
monthly_salary             object
employment_type            object
years_of_employment       float64
company_type               object
house_type                 object
monthly_rent              float64
family_size                 int64
dependents                  int64
school_fees               float64
college_fees              float64
travel_expenses           float64
groceries_utilities       float64
other_monthly_expenses    float64
existing_loans              int64
current_emi_amount        float64
credit_score              float64
bank_balance               object
emergency_fund            float64
emi_scenario               object
requested_amount          float64
requested_tenure            int64
emi_eligibility            object
max_monthly_emi           float64
dtype: object

In [26]:
data["age"].unique()

array([38., 58., 48., 32., 27., 47., 37., 31., 59., 49., 33., 26., 39.,
       57., 28.])

In [2]:
data.head()

Unnamed: 0,age,gender,marital_status,education,monthly_salary,employment_type,years_of_employment,company_type,house_type,monthly_rent,...,existing_loans,current_emi_amount,credit_score,bank_balance,emergency_fund,emi_scenario,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi
0,38.0,Female,Married,Professional,82600.0,Private,0.9,Mid-size,Rented,20000.0,...,Yes,23700.0,660.0,303200.0,70200.0,Personal Loan EMI,850000.0,15,Not_Eligible,500.0
1,38.0,Female,Married,Graduate,21500.0,Private,7.0,MNC,Family,0.0,...,Yes,4100.0,714.0,92500.0,26900.0,E-commerce Shopping EMI,128000.0,19,Not_Eligible,700.0
2,38.0,Male,Married,Professional,86100.0,Private,5.8,Startup,Own,0.0,...,No,0.0,650.0,672100.0,324200.0,Education EMI,306000.0,16,Eligible,27775.0
3,58.0,Female,Married,High School,66800.0,Private,2.2,Mid-size,Own,0.0,...,No,0.0,685.0,440900.0,178100.0,Vehicle EMI,304000.0,83,Eligible,16170.0
4,48.0,Female,Married,Professional,57300.0,Private,3.4,Mid-size,Family,0.0,...,No,0.0,770.0,97300.0,28200.0,Home Appliances EMI,252000.0,7,Not_Eligible,500.0


In [3]:
data.count

<bound method DataFrame.count of          age  gender marital_status      education monthly_salary  \
0       38.0  Female        Married   Professional        82600.0   
1       38.0  Female        Married       Graduate        21500.0   
2       38.0    Male        Married   Professional        86100.0   
3       58.0  Female        Married    High School        66800.0   
4       48.0  Female        Married   Professional        57300.0   
...      ...     ...            ...            ...            ...   
404795  27.0    Male        Married       Graduate        32400.0   
404796  38.0    Male        Married  Post Graduate        49200.0   
404797  32.0    Male         Single       Graduate        25700.0   
404798  48.0    Male        Married       Graduate        47200.0   
404799  38.0  FEMALE        Married       Graduate        34900.0   

       employment_type  years_of_employment  company_type house_type  \
0              Private                  0.9      Mid-size     Rent

In [2]:
data.columns

Index(['age', 'gender', 'marital_status', 'education', 'monthly_salary',
       'years_of_employment', 'company_type', 'house_type', 'monthly_rent',
       'family_size', 'dependents', 'school_fees', 'college_fees',
       'travel_expenses', 'groceries_utilities', 'other_monthly_expenses',
       'existing_loans', 'current_emi_amount', 'credit_score', 'bank_balance',
       'emergency_fund', 'emi_scenario', 'requested_amount',
       'requested_tenure', 'emi_eligibility', 'max_monthly_emi',
       'employment_type_Government', 'employment_type_Private',
       'employment_type_Self-employed'],
      dtype='object')

In [5]:
data["gender"].unique()

array(['Female', 'Male', 'female', 'male', 'M', 'MALE', 'F', 'FEMALE'],
      dtype=object)

In [8]:
data['gender'] = data['gender'].astype(str).str.strip().str.lower()
data['gender'] = data['gender'].replace({'^m$': 'male', '^f$': 'female'}, regex=True)
data.loc[~data['gender'].isin(['male', 'female']), 'gender'] = pd.NA

In [11]:
data["gender"].unique()

array([1, 0])

In [10]:
data['gender'] = data['gender'].map({'male': 0, 'female': 1})

In [16]:
data["marital_status"].unique()

array([1, 0])

In [13]:
data['marital_status'] = data['marital_status'].astype(str).str.strip().str.lower()

In [15]:
data['marital_status'] = data['marital_status'].map({'single': 0, 'married': 1})

In [20]:
data["existing_loans"].unique()

array([1, 0])

In [19]:
data['existing_loans'] = data['existing_loans'].astype(str).str.strip().str.lower()
data['existing_loans'] = data['existing_loans'].map({'no': 0, 'yes': 1})

# pre-processing

In [21]:
import pandas as pd

data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")

# Columns
binary_cols = ['existing_loans']
categorical_cols = ['gender', 'marital_status', 'education', 'employment_type', 'company_type', 'house_type']
target_col = 'emi_eligibility'

# Example ordinal mapping for target to one-hot later
# We'll let get_dummies handle it for multi-class one-hot
data_tree = data.copy()

# Binary column kept as-is
# One-hot encode categorical + target
data_tree = pd.get_dummies(data_tree, columns=categorical_cols + [target_col], dtype=int)

# Add existing_loans as binary
data_tree['existing_loans'] = data['existing_loans'].map({'No': 0, 'Yes': 1})

# Final tree model features (excluding target one-hot columns)
tree_features = [col for col in data_tree.columns if col not in [c for c in data_tree.columns if target_col in c]]

print("Tree model data (one-hot):")
print(data_tree.head())


  data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")


Tree model data (one-hot):
    age monthly_salary  years_of_employment  monthly_rent  family_size  \
0  38.0        82600.0                  0.9       20000.0            3   
1  38.0        21500.0                  7.0           0.0            2   
2  38.0        86100.0                  5.8           0.0            4   
3  58.0        66800.0                  2.2           0.0            5   
4  48.0        57300.0                  3.4           0.0            4   

   dependents  school_fees  college_fees  travel_expenses  \
0           2          0.0           0.0           7200.0   
1           1       5100.0           0.0           1400.0   
2           3          0.0           0.0          10200.0   
3           4      11400.0           0.0           6200.0   
4           3       9400.0       21300.0           3600.0   

   groceries_utilities  ...  company_type_MNC  company_type_Mid-size  \
0              19500.0  ...                 0                      1   
1               5

In [22]:
import pandas as pd

data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")

# Define ordinal mappings
gender_order = {'male': 0, 'female': 1}
marital_order = {'Single': 0, 'Married': 1, 'Divorced': 2}
education_order = {'High School': 0, 'Graduate': 1, 'Post Graduate': 2, 'Professional': 3}
employment_order = {'Private': 0, 'Government': 1, 'Self-employed': 2}
company_order = {'Startup': 0, 'Small': 1, 'Mid-size': 2, 'Large Indian': 3, 'MNC': 4}
house_order = {'Rented': 0, 'Family': 1, 'Own': 2}
emi_order = {'Not_Eligible': 0, 'Eligible': 1, 'High_Risk': 2}

# Copy data
data_linear = data.copy()

# Encode categorical columns
data_linear['gender_ord'] = data_linear['gender'].map(gender_order)
data_linear['marital_status_ord'] = data_linear['marital_status'].map(marital_order)
data_linear['education_ord'] = data_linear['education'].map(education_order)
data_linear['employment_type_ord'] = data_linear['employment_type'].map(employment_order)
data_linear['company_type_ord'] = data_linear['company_type'].map(company_order)
data_linear['house_type_ord'] = data_linear['house_type'].map(house_order)

# Binary column
data_linear['existing_loans'] = data_linear['existing_loans'].map({'No': 0, 'Yes': 1})

# Encode target
data_linear['emi_eligibility_ord'] = data_linear['emi_eligibility'].map(emi_order)

# Keep binary column as-is
# You can drop original categorical columns if you want
linear_features = ['existing_loans', 'gender_ord', 'marital_status_ord', 'education_ord',
                   'employment_type_ord', 'company_type_ord', 'house_type_ord']

print("Linear model data (ordinal):")
print(data_linear[linear_features + ['emi_eligibility_ord']].head())

  data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")


Linear model data (ordinal):
   existing_loans  gender_ord  marital_status_ord  education_ord  \
0               1         NaN                   1            3.0   
1               1         NaN                   1            1.0   
2               0         NaN                   1            3.0   
3               0         NaN                   1            0.0   
4               0         NaN                   1            3.0   

   employment_type_ord  company_type_ord  house_type_ord  emi_eligibility_ord  
0                    0                 2               0                    0  
1                    0                 4               1                    0  
2                    0                 0               2                    1  
3                    0                 2               2                    1  
4                    0                 2               1                    0  


In [None]:
import pandas as pd

def encode_columns_dynamic(df, columns, encoding_type="one-hot", ordinal_maps=None, binary_maps=None, target_col=None):
    """
    Dynamically encode specified columns based on encoding type.

    Parameters:
        df (pd.DataFrame): Input DataFrame
        columns (list): Columns to encode
        encoding_type (str): 'one-hot', 'ordinal', or 'binary'
        ordinal_maps (dict, optional): Dictionary of column-wise mapping for ordinal encoding
        target_col (str, optional): Target column to encode

    Returns:
        pd.DataFrame: Encoded DataFrame
    """
    df_encoded = df.copy()
    
    # Apply encoding per type
    if encoding_type == "binary":
        if not binary_maps:
            raise ValueError("binary_maps must be provided for binary encoding")
        for col in columns:
            if col in binary_maps:
                df_encoded[col] = df_encoded[col].map(binary_maps[col])
    
    elif encoding_type == "ordinal":
        if not ordinal_maps:
            raise ValueError("ordinal_maps must be provided for ordinal encoding")
        for col in columns:
            if col in ordinal_maps:
                df_encoded[col] = df_encoded[col].map(ordinal_maps[col])
        if target_col and target_col in ordinal_maps:
            df_encoded[target_col] = df_encoded[target_col].map(ordinal_maps[target_col])
    
    elif encoding_type == "one-hot":
        all_cols = columns.copy()
        if target_col:
            all_cols.append(target_col)
        df_encoded = pd.get_dummies(df_encoded, columns=all_cols, dtype=int)
    
    else:
        raise ValueError("encoding_type must be 'one-hot', 'ordinal', or 'binary'")
    
    return df_encoded

ordinal_maps = {
    'gender': {'male':0, 'female':1},
    'marital_status': {'Single':0, 'Married':1, 'Divorced':2},
    'education': {'High School':0, 'Graduate':1, 'Post Graduate':2, 'Professional':3},
    'employment_type': {'Private':0, 'Government':1, 'Self-employed':2},
    'company_type': {'Startup':0, 'Small':1, 'Mid-size':2, 'Large Indian':3, 'MNC':4},
    'house_type': {'Rented':0, 'Family':1, 'Own':2},
    'emi_eligibility': {'Not_Eligible':0, 'Eligible':1, 'High_Risk':2}
}

binary_maps = {'existing_loans': {'No': 0, 'Yes': 1}}


In [None]:
import pandas as pd

def encode_columns_fully_dynamic(df, columns, encoding_type="one-hot",
                                 ordinal_maps=None, binary_maps=None, target_col=None, to_numeric_cols=None):
    """
    Encode specified columns dynamically using one-hot, ordinal, or binary encoding.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame
        columns (list): List of columns to encode
        encoding_type (str): 'one-hot', 'ordinal', or 'binary'
        ordinal_maps (dict, optional): Dictionary of column-wise mappings for ordinal encoding
        binary_maps (dict, optional): Dictionary of column-wise mappings for binary encoding
        target_col (str, optional): Target column to encode similarly
        
    Returns:
        pd.DataFrame: Encoded DataFrame
    """
    df_encoded = df.copy()

    # Convert to numeric first if specified
    if to_numeric_cols:
        for col in to_numeric_cols:
            df_encoded[col] = pd.to_numeric(df_encoded[col], errors='coerce')
    
    if encoding_type == "binary":
        if not binary_maps:
            raise ValueError("binary_maps must be provided for binary encoding")
        for col in columns:
            if col in binary_maps:
                df_encoded[col] = df_encoded[col].map(binary_maps[col])
                
    elif encoding_type == "ordinal":
        if not ordinal_maps:
            raise ValueError("ordinal_maps must be provided for ordinal encoding")
        for col in columns:
            if col in ordinal_maps:
                df_encoded[col] = df_encoded[col].map(ordinal_maps[col])
        if target_col and target_col in ordinal_maps:
            df_encoded[target_col] = df_encoded[target_col].map(ordinal_maps[target_col])
    
    elif encoding_type == "one-hot":
        all_cols = columns.copy()
        if target_col:
            all_cols.append(target_col)
        df_encoded = pd.get_dummies(df_encoded, columns=all_cols, dtype=int)
    
    else:
        raise ValueError("encoding_type must be 'one-hot', 'ordinal', or 'binary'")
    
    return df_encoded


# Dynamic ordinal maps
ordinal_maps = {
    'gender': {'male':0, 'female':1},
    'marital_status': {'Single':0, 'Married':1, 'Divorced':2},
    'education': {'High School':0, 'Graduate':1, 'Post Graduate':2, 'Professional':3},
    'employment_type': {'Private':0, 'Government':1, 'Self-employed':2},
    'company_type': {'Startup':0, 'Small':1, 'Mid-size':2, 'Large Indian':3, 'MNC':4},
    'house_type': {'Rented':0, 'Family':1, 'Own':2},
    'emi_eligibility': {'Not_Eligible':0, 'Eligible':1, 'High_Risk':2}
}

# Binary maps
binary_maps = {'existing_loans': {'No': 0, 'Yes': 1}}

cols = ['gender', 'marital_status', 'education', 'employment_type', 'company_type', 'house_type']

# Binary encoding for existing_loans
data_binary = encode_columns_fully_dynamic(
    data, columns=['existing_loans'], encoding_type="binary", binary_maps=binary_maps
)

# Ordinal encoding for selected columns + target
data_linear = encode_columns_fully_dynamic(
    data, columns=cols, encoding_type="ordinal", ordinal_maps=ordinal_maps, target_col='emi_eligibility'
)

# One-hot encoding for selected columns + target
data_tree = encode_columns_fully_dynamic(
    data, columns=cols, encoding_type="one-hot", target_col='emi_eligibility'
)



def preprocess_columns(df, preprocess_map):
    """
    Dynamically preprocess specified columns before encoding.

    Parameters:
        df (pd.DataFrame): Input DataFrame
        preprocess_map (dict): Dictionary specifying preprocessing type for each column
            Example:
            {
                'age': 'numeric_clean',
                'gender': 'lower_strip_replace',
                'marital_status': 'lower_strip'
            }

    Returns:
        pd.DataFrame: Preprocessed DataFrame
    """
    df_clean = df.copy()

    for col, ptype in preprocess_map.items():
        if col not in df_clean.columns:
            continue

        if ptype == "numeric_clean":
            df_clean[col] = (
                df_clean[col].astype(str)
                .str.strip()                                  # remove spaces
                .str.replace(r'[^0-9.]', '', regex=True)     # keep only digits and dots
                .str.replace(r'(\.\d*)\..*', r'\1', regex=True)  # keep only first decimal
            )
            df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

        elif ptype == "lower_strip":
            df_clean[col] = df_clean[col].astype(str).str.strip().str.lower()
            df_clean[col] = df_clean[col].replace(r'^\s*$', pd.NA, regex=True)

        elif ptype == "lower_strip_replace":
            df_clean[col] = df_clean[col].astype(str).str.strip().str.lower()
            df_clean[col] = df_clean[col].replace(r'^\s*$', pd.NA, regex=True)
            # Example for gender: convert single letters to full
            if col == "gender":
                df_clean[col] = df_clean[col].replace({'^m$': 'male', '^f$': 'female'}, regex=True)

        elif ptype == "custom_regex":
            # Placeholder for any other regex replacements
            pass

        else:
            # Default: just strip spaces
            df_clean[col] = df_clean[col].astype(str).str.strip()

    return df_clean


preprocess_map = {
    'age': 'numeric_clean',
    'gender': 'lower_strip_replace',
    'marital_status': 'lower_strip'
}

data_clean = preprocess_columns(data, preprocess_map)

# Then pass the cleaned data to your dynamic encoding function
cols = ['gender', 'marital_status', 'education', 'employment_type', 'company_type', 'house_type']
data_linear = encode_columns_fully_dynamic(data_clean, columns=cols, encoding_type="ordinal",
                                 ordinal_maps=ordinal_maps, target_col='emi_eligibility')


In [None]:
def preprocess_and_encode(df, preprocess_map=None, numeric_cols=None, encode_cols=None,
                          encoding_type=None, ordinal_maps=None, binary_maps=None, target_col=None):
    """
    Wrapper function to run preprocessing and encoding in one line while keeping modularity.
    """
    df_clean = df.copy()
    if preprocess_map:
        df_clean = preprocess_columns(df_clean, preprocess_map)
    df_encoded = encode_columns_fully_dynamic(df_clean, columns=encode_cols, encoding_type=encoding_type,
                                    ordinal_maps=ordinal_maps, binary_maps=binary_maps,
                                    target_col=target_col, to_numeric_cols=numeric_cols)
    return df_encoded

In [23]:
data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")

  data = pd.read_csv(r"../dataset/emi_prediction_dataset.csv")


In [24]:
data.columns

Index(['age', 'gender', 'marital_status', 'education', 'monthly_salary',
       'employment_type', 'years_of_employment', 'company_type', 'house_type',
       'monthly_rent', 'family_size', 'dependents', 'school_fees',
       'college_fees', 'travel_expenses', 'groceries_utilities',
       'other_monthly_expenses', 'existing_loans', 'current_emi_amount',
       'credit_score', 'bank_balance', 'emergency_fund', 'emi_scenario',
       'requested_amount', 'requested_tenure', 'emi_eligibility',
       'max_monthly_emi'],
      dtype='object')

In [26]:
data["age"].unique()

array([38.0, 58.0, 48.0, 32.0, 27.0, 47.0, 37.0, 31.0, 59.0, 49.0, 33.0,
       26.0, 39.0, 57.0, 28.0, '58', '38', '48', '32', '27', '37', '48.0',
       '33', '38.0', '49', '27.0', '31', '39', '47', '59', '32.0', '58.0',
       '57', '26', '28', '58.0.0', '39.0', '26.0', '37.0', '38.0.0',
       '32.0.0'], dtype=object)

In [25]:
data.dtypes

age                        object
gender                     object
marital_status             object
education                  object
monthly_salary             object
employment_type            object
years_of_employment       float64
company_type               object
house_type                 object
monthly_rent              float64
family_size                 int64
dependents                  int64
school_fees               float64
college_fees              float64
travel_expenses           float64
groceries_utilities       float64
other_monthly_expenses    float64
existing_loans             object
current_emi_amount        float64
credit_score              float64
bank_balance               object
emergency_fund            float64
emi_scenario               object
requested_amount          float64
requested_tenure            int64
emi_eligibility            object
max_monthly_emi           float64
dtype: object