In [1]:
import pandas as pd
# Load the raw financial ratios (JSON Lines) and align the key column name/dtype
# with the other tables so they can be joined on `customer_id`.
f_ratios = pd.read_json("data/financial_ratios.jsonl", lines=True)
f_ratios = f_ratios.rename(columns={"cust_num": "customer_id"}).astype({"customer_id": int})


def _clean_currency(values):
    """Convert a Series of currency-formatted values to floats rounded to 2 decimals.

    Every value is stringified, `$` and `,` characters are removed, strings that
    end up empty are treated as '0', and the result is cast to float.

    Parameters
    ----------
    values : pandas.Series
        Column holding amounts such as "$1,234.50" or plain numbers.

    Returns
    -------
    pandas.Series
        Float column rounded to two decimal places.

    Raises
    ------
    ValueError
        If a value still cannot be parsed as a float after stripping.
    """
    return (
        values
        .astype(str)
        .str.replace(r'[\$,]', '', regex=True)
        .replace('', '0')
        .astype(float)
        .round(2)
    )


# Monetary columns that all receive the same currency clean-up.
CURRENCY_COLUMNS = [
    "monthly_income",
    "existing_monthly_debt",
    "monthly_payment",
    "revolving_balance",
    "credit_usage_amount",
    "available_credit",
    "total_monthly_debt_payment",
    "total_debt_amount",
    "monthly_free_cash_flow",
    "annual_debt_payment",
]

# Ratio columns that are only rounded to two decimals.
RATIO_COLUMNS = [
    "debt_to_income_ratio",
    "debt_service_ratio",
    "payment_to_income_ratio",
    "credit_utilization",
    "loan_to_annual_income",
]

# revolving_balance is the only column whose missing values are replaced with 0
# before parsing; the other columns are parsed as they are.
f_ratios["revolving_balance"] = f_ratios["revolving_balance"].fillna(0)

for col in CURRENCY_COLUMNS:
    f_ratios[col] = _clean_currency(f_ratios[col])

for col in RATIO_COLUMNS:
    f_ratios[col] = f_ratios[col].round(2)

f_ratios.to_csv("data/cleaned_financial_ratios.csv", index=False)
f_ratios


Unnamed: 0,customer_id,monthly_income,existing_monthly_debt,monthly_payment,debt_to_income_ratio,debt_service_ratio,payment_to_income_ratio,credit_utilization,revolving_balance,credit_usage_amount,available_credit,total_monthly_debt_payment,annual_debt_payment,loan_to_annual_income,total_debt_amount,monthly_free_cash_flow
0,10000,5150.00,738.64,592.13,0.26,0.26,0.12,0.84,142213.1,142213.1,26886.9,1330.77,15969.24,0.29,159913.1,3819.23
1,10001,2383.33,392.21,1013.86,0.59,0.59,0.42,0.97,75932.2,75932.2,2267.8,1406.07,16872.84,3.99,189932.2,977.26
2,10002,1725.00,204.07,317.81,0.30,0.30,0.18,0.54,22314.6,22314.6,19085.4,521.88,6262.56,0.45,31614.6,1203.12
3,10003,2616.67,288.71,234.52,0.20,0.20,0.09,0.15,8820.0,8820.0,51180.0,523.23,6278.76,0.28,17520.0,2093.44
4,10004,2050.00,248.77,334.81,0.29,0.28,0.16,0.49,24253.6,24253.6,25446.4,583.58,7002.96,0.29,31453.6,1466.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89994,99994,6025.00,2120.97,519.87,0.44,0.44,0.09,0.36,48680.4,48680.4,86919.6,2640.84,31690.08,0.16,59980.4,3384.16
89995,99995,1666.67,138.97,545.44,0.41,0.41,0.33,0.36,15833.0,15833.0,28767.0,684.41,8212.92,0.82,32333.0,982.26
89996,99996,1666.67,129.90,616.96,0.45,0.45,0.37,0.67,12242.7,12242.7,6057.3,746.86,8962.32,0.89,30042.7,919.81
89997,99997,1666.67,162.11,351.00,0.31,0.31,0.21,0.56,30516.6,30516.6,23783.4,513.11,6157.32,0.58,42216.6,1153.56


In [2]:
# Clean the credit-history table and write it out as CSV.
df = pd.read_parquet('data/credit_history.parquet')


# 1. Remove redundant index column
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

# 2. Fill missing delinquency values with 0
if "num_delinquencies_2yrs" in df.columns:
    df['num_delinquencies_2yrs'] = df['num_delinquencies_2yrs'].fillna(0)
else:
    print("Column 'num_delinquencies_2yrs' not found!")

# 3. Rename customer_number → customer_id
if "customer_number" in df.columns:
    df = df.rename(columns={"customer_number": "customer_id"})
else:
    print("Column 'customer_number' not found!")

# 4. Replace single/double quote characters with spaces and trim the result.
#    regex=False makes the replacement literal regardless of the pandas
#    version's default for `regex`.
df['oldest_account_age_months'] = (
    df['oldest_account_age_months']
        .astype(str)
        .str.replace("'", " ", regex=False)
        .str.replace('"', " ", regex=False)
        .str.strip()
)

# 5. Parse as numbers; anything that cannot be parsed becomes NaN and is then set to 0.
df['oldest_account_age_months'] = pd.to_numeric(df['oldest_account_age_months'], errors='coerce').fillna(0)

df.to_csv("data/cleaned_credit_history.csv", index=False)





# Clean the geographic table and write it out as CSV.
df1 = pd.read_csv("data/geographic_data.csv")

# 1. Remove every column whose name contains "unnamed" (case-insensitive)
unnamed_cols = [col for col in df1.columns if "unnamed" in col.lower()]
df1 = df1.drop(columns=unnamed_cols)

# 2. Rename id → customer_id (case-insensitive match on the column name)
rename_map = {col: "customer_id" for col in df1.columns if col.lower() == "id"}
df1 = df1.rename(columns=rename_map)


def _strip_and_unquote(value):
    """Trim surrounding whitespace and drop apostrophes from strings; pass other values through."""
    if isinstance(value, str):
        return value.strip().replace("'", "")
    return value


# 3 + 4. Clean string values in every column, then convert the column to a
# numeric dtype when the whole column parses; otherwise keep the cleaned values.
# Series.map and the explicit try/except replace DataFrame.applymap and
# pd.to_numeric(errors="ignore"), both of which emit deprecation warnings.
for col in df1.columns:
    cleaned = df1[col].map(_strip_and_unquote)
    try:
        df1[col] = pd.to_numeric(cleaned)
    except (ValueError, TypeError):
        df1[col] = cleaned

# 5. Fill existing NaN values with 0
df1 = df1.fillna(0)
df1.to_csv("data/cleaned_geographic_data.csv", index=False)

  df1 = df1.applymap(lambda x: x.strip().replace("'", "") if isinstance(x, str) else x)
  df1[col] = pd.to_numeric(df1[col], errors="ignore")


In [3]:
# Clean and label-encode the loan details table, then write the selected columns to CSV.
loan_details = pd.read_excel("data/loan_details.xlsx")

# Stripped, lower-cased raw loan type -> canonical label.
# Raw values missing from this map become NaN after .map().
loan_type_clean_map = {
    'personal': 'Personal Loan',
    'personal loan': 'Personal Loan',

    'creditcard': 'Credit Card',
    'credit card': 'Credit Card',
    'cc': 'Credit Card',

    'mortgage': 'Mortgage',
    'home loan': 'Mortgage'
}

# Trim surrounding whitespace and lower-case before looking up the canonical label
loan_details['loan_type_cleaned'] = (
    loan_details['loan_type']
    .str.strip()
    .str.lower()
    .map(loan_type_clean_map)
)

# Strip "$" and "," (literal matches) and parse the amount as float
loan_details['loan_amount_clean'] = (
    loan_details['loan_amount']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Divide by 100 — presumably converts a percentage into a fraction; confirm the raw units
loan_details['interest_rate'] = loan_details['interest_rate'].astype(float) / 100

# Drop the raw columns. The cleaned columns keep their *_cleaned / *_clean
# names here because the code below refers to them by those names; the final
# renaming happens on loan_final.
loan_details = loan_details.drop(columns=['loan_amount', 'loan_type'])

loan_mapping = {
    'Debt Consolidation': 0,
    'Refinance': 1,
    'Major Purchase': 2,
    'Medical': 3,
    'Revolving Credit': 4,
    'Home Improvement': 5,
    'Home Purchase': 6,
    'Other': 7
}

# Label-encode the loan_purpose column
loan_details['loan_purpose_encoded'] = loan_details['loan_purpose'].map(loan_mapping)

# Check the result
print(loan_details[['loan_purpose', 'loan_purpose_encoded']])

# Label encoding for origination_channel
channel_mapping = {
    'Direct Mail': 0,
    'Branch': 1,
    'Online': 2,
    'Broker': 3
}

# Convert the column to numeric codes
loan_details['origination_channel_encoded'] = loan_details['origination_channel'].map(channel_mapping)

# Check the result
print(loan_details[['origination_channel', 'origination_channel_encoded']].head())


# Label encoding for loan_type_cleaned
loan_type_mapping = {
    'Personal Loan': 0,
    'Mortgage': 1,
    'Credit Card': 2
}

# Convert the column to numeric codes
loan_details['loan_type_encoded'] = loan_details['loan_type_cleaned'].map(loan_type_mapping)

# Check the result
print(loan_details[['loan_type_cleaned', 'loan_type_encoded']].head())


# Select the columns that go into the cleaned output
columns_needed = [
    'customer_id',
    'loan_term',
    'interest_rate',
    'loan_to_value_ratio',
    'loan_officer_id',
    'marketing_campaign',
    'loan_amount_clean',
    'loan_purpose_encoded',
    'origination_channel_encoded',
    'loan_type_encoded'
]

# Rename the cleaned/encoded columns back to their plain names
loan_final = loan_details[columns_needed].rename(columns={
    'loan_amount_clean': 'loan_amount',
    'loan_purpose_encoded': 'loan_purpose',
    'origination_channel_encoded': 'origination_channel',
    'loan_type_encoded': 'loan_type'
})

# Check the result
print(loan_final.head())


loan_final.to_csv('data/cleaned_loan_details.csv', index=False)





# Encode the categorical columns of the application metadata and write it to CSV.
application_metadata = pd.read_csv("data/application_metadata.csv")

# Integer codes for preferred_contact.
# NOTE(review): 'Mail' and 'Email' share code 0, so they are indistinguishable
# after encoding — confirm this grouping is intentional.
contact_mapping = {'Mail': 0, 'Email': 0, 'Phone': 1}
application_metadata['preferred_contact_encoded'] = (
    application_metadata['preferred_contact'].map(contact_mapping)
)

# Check the result
print(application_metadata[['preferred_contact', 'preferred_contact_encoded']].head())


# Integer codes for account_status_code, numbered in listing order
account_status_mapping = dict(zip(['ACT-1', 'ACT-2', 'ACT-3', 'A01', 'ACTIVE'], range(5)))
application_metadata['account_status_encoded'] = (
    application_metadata['account_status_code'].map(account_status_mapping)
)

# Check the result
print(application_metadata[['account_status_code', 'account_status_encoded']].head())

# Drop the raw categorical columns and rename the key column to customer_id
application_metadata = (
    application_metadata
    .drop(columns=['preferred_contact', 'account_status_code'])
    .rename(columns={'customer_ref': 'customer_id'})
)

application_metadata.to_csv('data/cleaned_application_meta.csv', index=False)

             loan_purpose  loan_purpose_encoded
0      Debt Consolidation                     0
1               Refinance                     1
2          Major Purchase                     2
3                 Medical                     3
4      Debt Consolidation                     0
...                   ...                   ...
89994    Home Improvement                     5
89995               Other                     7
89996      Major Purchase                     2
89997    Revolving Credit                     4
89998           Refinance                     1

[89999 rows x 2 columns]
  origination_channel  origination_channel_encoded
0         Direct Mail                            0
1              Branch                            1
2              Online                            2
3              Online                            2
4              Branch                            1
  loan_type_cleaned  loan_type_encoded
0     Personal Loan                  0
1          Mor

In [4]:
# Clean and encode the demographics table, then write it to CSV.
# The key column is renamed to customer_id and the basic dtypes are fixed up front.
demographics = (
    pd.read_csv("data/demographics.csv")
    .rename(columns={"cust_id": "customer_id"})
    .astype({"customer_id": int, "age": int, "employment_length": float})
)

# Strip "$" and "," from annual_income, treat empty strings as '0', parse as
# float and round to two decimals.
demographics["annual_income"] = (
    demographics["annual_income"]
    .astype(str)
    .str.replace(r'[\$,]', '', regex=True)
    .replace('', '0')
    .astype(float).round(2)
)

demographics["employment_length"] = demographics["employment_length"].fillna(0)

# Normalise employment_type (lower-case, runs of whitespace/hyphens -> "_"),
# then map each known spelling to an integer code. Every pattern is anchored
# with ^...$ so only whole values are replaced; a value matching no pattern
# is left unchanged and makes the final astype(int) raise.
demographics["employment_type"] = (
    demographics["employment_type"]
    .str.lower()
    .str.replace(r'[\s\-]+', '_', regex=True)
    .replace(
        {
            r'^(ft|ft:|fulltime|full_time)$': '0',
            r'^(pt|pt:|parttime|part_time)$': '1',
            r'^(self[_\- ]?emp:?|self_employed|self_employed:?)$': '2',
            r'^(contractor|contract)$': '3',
        },
        regex=True,
    ).astype(int)
)

# Integer codes for education; lookup is on the lower-cased value.
# A value missing from the map becomes NaN, which makes astype(int) raise.
demographics["education"] = demographics["education"].str.lower().map(
        {
            "advanced": 0,
            "bachelor": 1,
            "graduate": 2,
            "high school": 3,
            "some college": 4,
        }
    ).astype(int)


# Integer codes for marital_status, same lookup/raise behaviour as education.
demographics["marital_status"] = demographics["marital_status"].str.lower().map(
        {
            "single": 0,
            "married": 1,
            "divorced": 2,
        }
    ).astype(int)
demographics.to_csv("data/cleaned_demographics.csv", index=False)
demographics.isna().sum()

customer_id          0
age                  0
annual_income        0
employment_length    0
employment_type      0
education            0
marital_status       0
num_dependents       0
dtype: int64

In [5]:
# Reload the cleaned tables that earlier cells wrote to disk.
# `f_ratios` and `demographics` are not reloaded here; the merge in the
# following cell uses the in-memory frames built by their own cleaning cells.
# The merge itself is not repeated here because the next cell rebuilds
# `merged` from the same inputs.
credit_history = pd.read_csv("data/cleaned_credit_history.csv")
geographic_data = pd.read_csv("data/cleaned_geographic_data.csv")
application_metadata = pd.read_csv("data/cleaned_application_meta.csv")
loan_details = pd.read_csv("data/cleaned_loan_details.csv")

In [6]:
# Left-join every other table onto application_metadata by customer_id, one
# table at a time. A left join keeps every application_metadata row, whether
# or not the other tables contain a matching customer_id.
merged = application_metadata
for right_table in (demographics, credit_history, geographic_data, loan_details, f_ratios):
    merged = merged.merge(right_table, on="customer_id", how="left")

print("Merged shape:", merged.shape)
merged.to_csv("data/merged_customer_data.csv", index=False)
merged


Merged shape: (89999, 63)


Unnamed: 0,customer_id,application_id,application_hour,application_day_of_week,account_open_year,referral_code,random_noise_1,num_login_sessions,num_customer_service_calls,has_mobile_app,...,payment_to_income_ratio,credit_utilization,revolving_balance,credit_usage_amount,available_credit,total_monthly_debt_payment,annual_debt_payment,loan_to_annual_income,total_debt_amount,monthly_free_cash_flow
0,10000,620515,5,6,2013,REF0000,1.137099,13,2,1,...,0.12,0.84,142213.1,142213.1,26886.9,1330.77,15969.24,0.29,159913.1,3819.23
1,10001,624978,4,2,2015,REF0000,-0.164932,6,1,1,...,0.42,0.97,75932.2,75932.2,2267.8,1406.07,16872.84,3.99,189932.2,977.26
2,10002,564658,10,3,2020,REF0000,0.526700,1,2,1,...,0.18,0.54,22314.6,22314.6,19085.4,521.88,6262.56,0.45,31614.6,1203.12
3,10003,621493,7,5,2010,REF0000,-0.709779,4,1,1,...,0.09,0.15,8820.0,8820.0,51180.0,523.23,6278.76,0.28,17520.0,2093.44
4,10004,637785,1,2,2020,REF0000,-0.603132,6,2,1,...,0.16,0.49,24253.6,24253.6,25446.4,583.58,7002.96,0.29,31453.6,1466.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89994,99994,585752,13,5,2016,REF0000,1.628917,10,3,0,...,0.09,0.36,48680.4,48680.4,86919.6,2640.84,31690.08,0.16,59980.4,3384.16
89995,99995,595205,13,5,2014,REF9754,-0.501960,12,1,1,...,0.33,0.36,15833.0,15833.0,28767.0,684.41,8212.92,0.82,32333.0,982.26
89996,99996,544796,7,5,2010,REF0000,-0.964956,4,2,1,...,0.37,0.67,12242.7,12242.7,6057.3,746.86,8962.32,0.89,30042.7,919.81
89997,99997,560885,8,3,2021,REF0000,0.328372,5,2,0,...,0.21,0.56,30516.6,30516.6,23783.4,513.11,6157.32,0.58,42216.6,1153.56
