In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# I need to answer the questions:

- Is there a relationship between the number of children and timely loan repayment?  
- Is there a relationship between marital status and timely loan repayment?  
- How do different loan purposes affect timely loan repayment?

## Preparing data for analysis

In [None]:
# Load the raw loan dataset and preview its first five rows.
df = pd.read_csv('./datasets/loan_reliability_project.csv')
print(df.head())

In [None]:
# --- days_employed -----------------------------------------------------------
# 'days_employed' has more than 2000 missing values (over 10% of all rows), so the
# gaps are filled instead of dropping the rows.
# The median is taken over positive values only, because non-positive values are
# treated as invalid further down and must not influence the filler.
median_days_employed = df.loc[df['days_employed'] > 0, 'days_employed'].median()
# The median is used as the filler because it is robust to outliers.
df['days_employed'] = df['days_employed'].fillna(value=median_days_employed)

# Non-positive values in 'days_employed' (18080 according to the original note)
# look incorrect, so they are replaced with the same median.
df.loc[df['days_employed'] <= 0, 'days_employed'] = median_days_employed

# Sanity check: count the rows that are still non-positive. Counting rows (rather
# than summing the values) also catches leftover zeros, which a sum cannot detect.
invalid_days_employed = (df['days_employed'] <= 0).sum()
print(f"days_employed rows <= 0 (should be 0): {invalid_days_employed}")

# Making sure the types are correct
print(f"Data types of num cells: {df['days_employed'].dtype, df['dob_years'].dtype, df['total_income'].dtype}")
# float64 int64 float64

# --- total_income ------------------------------------------------------------
# Missing incomes are filled with the median; the column is then cast to int
# because whole numbers are more comfortable to work with in the salary column.
# The cast must come after fillna: an int column cannot hold NaN.
median_income = df['total_income'].median()
df['total_income'] = df['total_income'].fillna(median_income).astype('int64')
print(f"Data types of num cells: {df['total_income'].dtype}")

# --- debt --------------------------------------------------------------------
# Removing NaN in debt
median_debt = df['debt'].median()
df['debt'] = df['debt'].fillna(median_debt)

# --- children ----------------------------------------------------------------
# Checking num of children
print()
print(f"children values: {df['children'].value_counts()}")

# A value of 20 children is highly unlikely and -1 is impossible; both are
# replaced with the median number of children.
median_children = df['children'].median()
df.loc[df['children'].isin([20, -1]), 'children'] = median_children

In [None]:
# Checking for duplicates in columns education, family_status, income_type, purpose.
# The f-strings use double quotes outside and single quotes inside: reusing the
# same quote type inside a replacement field is only valid from Python 3.12 on.

print(f"education: {df['education'].unique()}")
print(f"family_status: {df['family_status'].unique()}")
print(f"income_type: {df['income_type'].unique()}")
print(f"purpose: {df['purpose'].unique()}")

# There are duplicates in education and purpose columns

# Normalise 'education' to lowercase so values differing only by case collapse
df['education'] = df['education'].str.lower()

print()
print(f"education: {df['education'].unique()}")

In [None]:
# I'd like to create a new column to group the data from 'purpose' column

def categorize_purpose(purpose):
    """Map a free-text loan purpose to one of a few fixed categories.

    The text is lower-cased and stripped, then matched against keyword stems
    in a fixed order; the first stem found decides the category.

    Args:
        purpose: Raw purpose text. Non-string values (for example the float
            NaN that pandas uses for missing text, or None) are accepted.

    Returns:
        str: The category label, or 'прочее' when no stem matches or the
        value is not a string.
    """
    # Missing values are not strings and have no .lower(); route them to the
    # fallback category instead of raising AttributeError.
    if not isinstance(purpose, str):
        return 'прочее'

    normalized = purpose.lower().strip()

    # Order matters: the first matching stem wins.
    rules = (
        (('жиль', 'недвижимост'), 'операции с недвижимостью'),
        (('авто',), 'операции с автомобилем'),
        (('свадьб',), 'проведение свадьбы'),
        (('образован',), 'получение образования'),
    )
    for stems, category in rules:
        if any(stem in normalized for stem in stems):
            return category
    return 'прочее'

# Derive the grouped purpose column from the free-text 'purpose' column.
df['purpose_categories'] = df['purpose'].apply(categorize_purpose)

# Double quotes outside / single quotes inside keep the f-string valid on
# Python versions older than 3.12.
print(f"purpose_categories: {df['purpose_categories'].unique()}")

# Check if all categories are collected: any row left in the fallback category
# ('прочее') is printed for inspection.
uncategorized = df.loc[df['purpose_categories'] == 'прочее']
if len(uncategorized) == 0:
    print('All the purposes are categorized')
else:
    print(uncategorized)

## The original table is ready for analysis. The next step is decomposition: creating reference tables

In [None]:
# Preview the first five rows of the cleaned frame.
print(df.head())

In [None]:
# Creating reference tables: one row per distinct (label, id) pair, so the ids
# kept in the main frame can be translated back to their text labels.
education_df = df[['education', 'education_id']].drop_duplicates().reset_index(drop=True)
family_status_df = df[['family_status', 'family_status_id']].drop_duplicates().reset_index(drop=True)

# Removing extra data from the df: the text labels now live in the reference
# tables, so the main frame keeps only the *_id columns for both attributes.
df = df.drop(columns=['education', 'family_status'])

print(education_df)
print()
print(family_status_df)

In [None]:
# Preview the first five rows after the reference tables were split off.
print(df.head())

## Analysis:

## relationship between the number of children and timely loan repayment

In [None]:
# Clients and debtors per number of children.
# Named aggregation yields flat column names ('count', 'sum'), so the columns
# used below are plain Series. pivot_table with a list of aggfuncs would instead
# produce two-level column labels such as ('sum', 'debt').
pivot_children = df.groupby('children').agg(
    count=('debt', 'count'),
    sum=('debt', 'sum'),
).reset_index()
# Share of clients with a debt, in percent
pivot_children['debt_ratio'] = pivot_children['sum'] / pivot_children['count'] * 100

print(pivot_children)
print()
plt.barh(pivot_children['children'], pivot_children['debt_ratio'], color='#0088cc', edgecolor='black')
plt.title('Debt Ratio by number of children')
plt.xlabel('Debt Ratio %')
plt.ylabel('Number of children')
plt.tight_layout()
plt.show()

## relationship between marital status and timely loan repayment

In [None]:
# Clients and debtors per marital status id
marital_grouped = df.groupby('family_status_id').agg(
    total_clients=('debt', 'count'),
    total_debtors=('debt', 'sum')
).reset_index()
# Share of clients with a debt, in percent
marital_grouped['debt_ratio'] = marital_grouped['total_debtors'] / marital_grouped['total_clients'] * 100

# Attach the text label from the reference table so the table and the chart are
# readable without looking the ids up. Keeping one label per id guarantees the
# merge cannot duplicate rows.
status_labels = family_status_df.drop_duplicates(subset='family_status_id')
marital_grouped = marital_grouped.merge(status_labels, on='family_status_id', how='left')

print(marital_grouped)
print()
# Bar labels look like "<id>: <status name>", so the ids stay visible as well
bar_labels = marital_grouped['family_status_id'].astype(str) + ': ' + marital_grouped['family_status'].astype(str)
plt.barh(bar_labels, marital_grouped['debt_ratio'], color='#0088cc', edgecolor='black')
plt.title('Debt Ratio by marital status')
plt.xlabel('Debt Ratio %')
plt.ylabel('Status')
plt.tight_layout()
plt.show()

## how loan purposes affect timely loan repayment

In [None]:
# Loans and debts per purpose category, computed with a single groupby pass
# instead of repeating the same grouping for every column.
debt_analysis = (
    df.groupby('purpose_categories')
    .agg(
        total_credits=('debt', 'count'),
        total_debts=('debt', 'sum'),
    )
    # Share of loans with a debt, in percent
    .assign(debt_ratio=lambda stats: stats['total_debts'] / stats['total_credits'] * 100)
    .sort_values(by='debt_ratio', ascending=False)
    .reset_index()
)

print(debt_analysis)
print()


plt.barh(debt_analysis['purpose_categories'], debt_analysis['debt_ratio'], color='#0088cc', edgecolor='black')
plt.title('Debt Ratio by categories')
plt.xlabel('Debt Ratio %')
plt.ylabel('Purpose Categories')
plt.tight_layout()
plt.show()

# Overall:

### - Is there a relationship between the number of children and timely loan repayment?  
- Families with 0 children have the largest number of loans (14,272), but their percentage of debt is the lowest (7.51%).
- Families with 4 children make up the smallest group (41 clients), but they have the highest percentage of arrears (9.76%).
- Families with 1-3 children are in the middle range both in terms of the number of clients and the percentage of arrears.
 
Families without children are more likely to repay the loan.

### - Is there a relationship between marital status and timely loan repayment?  
- People in a civil marriage or unmarried (status 1, 4): the highest debt ratio. They probably have poorer financial stability or a higher risk of insecurity.
- Married (status 0): the lowest debt ratio. Marriage can increase financial stability.
- Widowed or divorced (status 2 or 3): a moderate debt ratio, but better than that of civil marriages.

Marriage has a positive effect on creditworthiness.

### - How do different loan purposes affect timely loan repayment?
- Real estate loans are the most stable loans with the lowest percentage of arrears.
- Car loans are more difficult to repay