In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
# Define the file path
file_path = Path("Resources/lending_data.csv")

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Display the first few rows
print(df.head())

# Review the DataFrame
# Show the DataFrame's column names
print("\nColumn Names:")
print(df.columns)

# Get a concise summary of the DataFrame
print("\nDataFrame Info:")
print(df.info())

# Get basic statistical descriptions of numeric columns
print("\nSummary Statistics:")
print(df.describe())

   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  loan_status  
0                 1       22800            0  
1                 0       13600            0  
2                 0       16100            0  
3                 1       22700            0  
4                 1       23000            0  

Column Names:
Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')

DataFrame Info:
<class 'pandas.core.frame.D

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [4]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df['loan_status']  
print(y.shape)
print(y.head())
# Separate the X variable, the features
X = df.drop(columns=['loan_status'])

# Preview to confirm
print("Features (X):")
print(X.head())

print("\nTarget (y):")
print(y.head())

(77536,)
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64
Features (X):
   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  

Target (y):
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64


In [5]:
# Review the y variable Series
# View the first few values
print(y.head())

# Check the unique values in the target
print("\nUnique values:")
print(y.unique())

# View the distribution of target classes
print("\nValue counts:")
print(y.value_counts())

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

Unique values:
[0 1]

Value counts:
loan_status
0    75036
1     2500
Name: count, dtype: int64


In [6]:
# Review the X variable DataFrame
# Display the first few rows
print(X.head())

# Show column names and data types
print("\nColumn Info:")
print(X.dtypes)

# Check for missing values
print("\nMissing Values:")
print(X.isnull().sum())

# Get summary statistics for numeric columns
print("\nSummary Statistics:")
print(X.describe())

   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  

Column Info:
loan_size           float64
interest_rate       float64
borrower_income       int64
debt_to_income      float64
num_of_accounts       int64
derogatory_marks      int64
total_debt            int64
dtype: object

Missing Values:
loan_size           0
interest_rate       0
borrower_income     0
de

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (58152, 7)
X_test shape: (19384, 7)
y_train shape: (58152,)
y_test shape: (19384,)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [8]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
model = LogisticRegression(random_state=1)


# Fit the model using training data
# Fit the model with the training data
model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [10]:
# Make a prediction using the testing data
y_pred = model.predict(X_test)


print("Predictions:")
print(y_pred[:10])

Predictions:
[0 0 0 0 0 0 0 0 0 0]


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [13]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, y_pred)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[18655   110]
 [   36   583]]


In [12]:
# Print the classification report for the model
report = classification_report(y_test, y_pred, target_names=["Healthy Loan (0)", "High-Risk Loan (1)"])

# Display the report
print(report)

                    precision    recall  f1-score   support

  Healthy Loan (0)       1.00      0.99      1.00     18765
High-Risk Loan (1)       0.84      0.94      0.89       619

          accuracy                           0.99     19384
         macro avg       0.92      0.97      0.94     19384
      weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:**     The model is very strong overall, especially for detecting healthy loans.

    It's also quite good at identifying high-risk loans, though it errs on the side of caution, potentially over-flagging some healthy ones.



---