In [6]:
import pandas as pd

In [7]:
# Load the loan dataset into `df`; every later cell in this notebook reads `df`.
file_path = 'Task 3 and 4_Loan_Data.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please check the path.")
    # Re-raise so execution stops here: without the file, `df` is never
    # defined and the following cells would fail with an unrelated NameError.
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    # Same reasoning as above: do not continue with `df` undefined.
    raise

In [8]:
# Plain-text preview of the first five rows (5 is the default for head()).
print(df.head(n=5))

   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  


In [9]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" to the output).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB
None


In [10]:
# Summary statistics for every numeric column; the quartiles listed here are
# describe()'s defaults, spelled out for the reader.
print(df.describe(percentiles=[0.25, 0.5, 0.75]))

        customer_id  credit_lines_outstanding  loan_amt_outstanding  \
count  1.000000e+04              10000.000000          10000.000000   
mean   4.974577e+06                  1.461200           4159.677034   
std    2.293890e+06                  1.743846           1421.399078   
min    1.000324e+06                  0.000000             46.783973   
25%    2.977661e+06                  0.000000           3154.235371   
50%    4.989502e+06                  1.000000           4052.377228   
75%    6.967210e+06                  2.000000           5052.898103   
max    8.999789e+06                  5.000000          10750.677810   

       total_debt_outstanding         income  years_employed    fico_score  \
count            10000.000000   10000.000000    10000.000000  10000.000000   
mean              8718.916797   70039.901401        4.552800    637.557700   
std               6627.164762   20072.214143        1.566862     60.657906   
min                 31.652732    1000.000000    

Since the outcome is either 0 (no default) or 1 (default), this is a binary logistic regression problem: the "y" value is "default", and the "x" values are the candidate features, each to be assessed for whether it affects the likelihood that a loan defaults.

In [20]:
import plotly.graph_objects as go
import plotly.express as px
# Scatter of the 0/1 default flag against FICO score, one marker per row of df.
fig = px.scatter(
    df,
    x='fico_score',
    y='default',
    title='Default on Loan vs. Fico Score',
)

fig.update_layout(
    xaxis_title='Fico Score',
    yaxis_title='Default on Loan',
    width=1200,
    height=600,
)

# fico_score is an integer column, so format the hover value as an integer
# ("605") instead of with two decimals ("605.00").
# The empty <extra></extra> tag hides the secondary hover box.
fig.update_traces(
    hovertemplate='<b>Fico Score:</b> %{x:d}<br>' +
                  '<extra></extra>'
)

fig.show()

In [None]:
import plotly.graph_objects as go
import plotly.express as px
# Scatter of the 0/1 default flag against years employed, one marker per row of df.
fig = px.scatter(
    df,
    x='years_employed',
    y='default',
    title='Default on Loan vs. Years Employed',
)

fig.update_layout(
    xaxis_title='Years Employed',
    yaxis_title='Default on Loan',
    width=1200,
    height=600,
)

# years_employed is an integer column, so format the hover value as an integer
# ("5") instead of with two decimals ("5.00").
# The empty <extra></extra> tag hides the secondary hover box.
fig.update_traces(
    hovertemplate='<b>Years Employed:</b> %{x:d}<br>' +
                  '<extra></extra>'
)

fig.show()

In [22]:
import plotly.graph_objects as go
import plotly.express as px
# Income on the x-axis against the 0/1 default flag on the y-axis.
fig = px.scatter(df, x='income', y='default', title='Default on Loan vs. Income')

# Axis titles, figure size and hover text are applied in one chained call;
# each update_* method returns the same figure object.
fig.update_layout(
    xaxis_title='Income',
    yaxis_title='Default on Loan',
    width=1200,
    height=600,
).update_traces(
    hovertemplate=(
        '<b>Income:</b> %{x:.2f}<br>'
        '<extra></extra>'
    )
)

fig.show()

In [23]:
import plotly.graph_objects as go
import plotly.express as px
# Scatter of the 0/1 default flag against the number of credit lines
# outstanding, one marker per row of df.
fig = px.scatter(
    df,
    x='credit_lines_outstanding',
    y='default',
    title='Default on Loan vs. Credit Lines Outstanding',
)

fig.update_layout(
    xaxis_title='Credit Lines Outstanding',
    yaxis_title='Default on Loan',
    width=1200,
    height=600,
)

# credit_lines_outstanding is an integer count, so format the hover value as an
# integer ("3") instead of with two decimals ("3.00").
# The empty <extra></extra> tag hides the secondary hover box.
fig.update_traces(
    hovertemplate='<b>Credit Lines Outstanding:</b> %{x:d}<br>' +
                  '<extra></extra>'
)

fig.show()

In [24]:
import plotly.graph_objects as go
import plotly.express as px
# Outstanding loan amount on the x-axis against the 0/1 default flag.
fig = px.scatter(
    df,
    x='loan_amt_outstanding',
    y='default',
    title='Default on Loan vs. Loan Amount Outstanding',
)

# Axis titles, figure size and hover text are applied in one chained call;
# each update_* method returns the same figure object.
fig.update_layout(
    xaxis_title='Loan Amount Outstanding',
    yaxis_title='Default on Loan',
    width=1200,
    height=600,
).update_traces(
    hovertemplate=(
        '<b>Loan Amount Outstanding:</b> %{x:.2f}<br>'
        '<extra></extra>'
    )
)

fig.show()

In [25]:
import plotly.graph_objects as go
import plotly.express as px
# Total outstanding debt on the x-axis against the 0/1 default flag.
fig = px.scatter(
    df,
    x='total_debt_outstanding',
    y='default',
    title='Default on Loan vs. Total Debt Outstanding',
)

# Axis titles, figure size and hover text are applied in one chained call;
# each update_* method returns the same figure object.
fig.update_layout(
    xaxis_title='Total Debt Outstanding',
    yaxis_title='Default on Loan',
    width=1200,
    height=600,
).update_traces(
    hovertemplate=(
        '<b>Total Debt Outstanding:</b> %{x:.2f}<br>'
        '<extra></extra>'
    )
)

fig.show()

The scatter plots as written do not give a perfect view, but they suggest some relationships that can be used as reference points when reviewing the model implemented later. Factors that appear to be tied to defaulting: lower FICO score, fewer years employed, more credit lines open, and larger total debt outstanding.

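Scatter plots are likely the wrong approach for a binary target (every point falls on y = 0 or y = 1 and the markers overlap), so the same features are shown as box plots below.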

In [26]:
import plotly.express as px

# One box per default status (x) showing the spread of total debt (y).
# update_layout returns the figure, so sizing is chained onto construction.
fig = px.box(
    df,
    x='default',
    y='total_debt_outstanding',
    title='Distribution of Total Debt Outstanding by Default Status',
    labels={
        'default': 'Default Status (0=No, 1=Yes)',
        'total_debt_outstanding': 'Total Debt Outstanding',
    },
).update_layout(width=1200, height=600)

fig.show()

In [27]:
import plotly.express as px

# One box per default status (x) showing the spread of loan amount (y).
# update_layout returns the figure, so sizing is chained onto construction.
fig = px.box(
    df,
    x='default',
    y='loan_amt_outstanding',
    title='Distribution of Loan Amount Outstanding by Default Status',
    labels={
        'default': 'Default Status (0=No, 1=Yes)',
        'loan_amt_outstanding': 'Loan Amount Outstanding',
    },
).update_layout(width=1200, height=600)

fig.show()

In [28]:
import plotly.express as px

# One box per default status (x) showing the spread of credit lines (y).
# update_layout returns the figure, so sizing is chained onto construction.
fig = px.box(
    df,
    x='default',
    y='credit_lines_outstanding',
    title='Distribution of Credit Lines Outstanding by Default Status',
    labels={
        'default': 'Default Status (0=No, 1=Yes)',
        'credit_lines_outstanding': 'Credit Lines Outstanding',
    },
).update_layout(width=1200, height=600)

fig.show()

In [29]:
import plotly.express as px

# One box per default status (x) showing the spread of income (y).
# update_layout returns the figure, so sizing is chained onto construction.
fig = px.box(
    df,
    x='default',
    y='income',
    title='Distribution of Income by Default Status',
    labels={
        'default': 'Default Status (0=No, 1=Yes)',
        'income': 'Income',
    },
).update_layout(width=1200, height=600)

fig.show()

In [30]:
import plotly.express as px

# One box per default status (x) showing the spread of years employed (y).
fig = px.box(df,
             x='default',
             y='years_employed',
             title='Distribution of Years Employed by Default Status',
             # `labels` maps column names to display text. The key must be the
             # plotted column ('years_employed'); the previous 'income' key
             # matched nothing here, leaving the raw column name on the y-axis.
             labels={'default': 'Default Status (0=No, 1=Yes)',
                     'years_employed': 'Years Employed'})

fig.update_layout(width=1200, height=600)
fig.show()

In [31]:
import plotly.express as px

# One box per default status (x) showing the spread of FICO score (y).
# update_layout returns the figure, so sizing is chained onto construction.
fig = px.box(
    df,
    x='default',
    y='fico_score',
    title='Distribution of Fico Score by Default Status',
    labels={
        'default': 'Default Status (0=No, 1=Yes)',
        'fico_score': 'Fico Score',
    },
).update_layout(width=1200, height=600)

fig.show()

The box plots show the results much more clearly. While confirming the previous assumptions, they indicate that the largest factors appear to be total debt outstanding and the number of credit lines outstanding.

sklearn will be used, since it worked well previously in the Gas Pricing project and has tools for this type of problem.