In [17]:
import pandas as pd
import numpy as np 

In [18]:
loan_data = pd.read_csv('/Users/donjay/desktop/data_analytics_portfolio/credit_risk_dataset.csv')
loan_data.head(20)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2
6,26,77100,RENT,8.0,EDUCATION,B,35000,12.42,1,0.45,N,3
7,24,78956,RENT,5.0,MEDICAL,B,35000,11.11,1,0.44,N,4
8,24,83000,RENT,8.0,PERSONAL,A,35000,8.9,1,0.42,N,2
9,21,10000,OWN,6.0,VENTURE,D,1600,14.74,1,0.16,N,3


In [19]:
#Drop Missing Values, Duplicates and Outliers 
loan_data_clean = loan_data.dropna()
loan_data_clean = loan_data_clean.drop_duplicates()
loan_data_clean.drop('loan_grade', axis = 1, inplace=True)
loan_data_clean.drop(0, axis = 0, inplace=True)

In [20]:
# loan_data_clean['person_home_ownership'].unique()
loan_data_clean['loan_intent'].unique()

array(['EDUCATION', 'MEDICAL', 'VENTURE', 'PERSONAL', 'HOMEIMPROVEMENT',
       'DEBTCONSOLIDATION'], dtype=object)

In [21]:
from sklearn.preprocessing import LabelEncoder

# Label encoding home ownership status (0 = Mortgage, 1 = Other, 2 = Own, 3 = Rent)
encoder = LabelEncoder()
loan_data_clean['person_home_ownership'] = encoder.fit_transform(loan_data_clean['person_home_ownership'])

#Label encoding loan inent information (0 = Debt Consolidation, 1 = Education, 2 = Home Improvement , 3 = Medical, 4 = Personal, 5 = Venture)
loan_data_clean['loan_intent'] = encoder.fit_transform(loan_data_clean['loan_intent'])

#Convert 'cb_person_default_on_file' column to binary 
default_map = {'N' : 0, 'Y': 1}
loan_data_clean['cb_person_default_on_file'] = loan_data_clean['cb_person_default_on_file'].map(default_map)

loan_data_clean.head()


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
1,21,9600,2,5.0,1,1000,11.14,0,0.1,0,2
2,25,9600,0,1.0,3,5500,12.87,1,0.57,0,3
3,23,65500,3,4.0,3,35000,15.23,1,0.53,0,2
4,24,54400,3,8.0,3,35000,14.27,1,0.55,1,4
5,21,9900,2,2.0,5,2500,7.14,1,0.25,0,2


In [22]:
# 
columns = ['person_age', 'person_income','person_home_ownership', 'person_emp_length',
       'loan_intent', 'loan_amnt', 'loan_percent_income','cb_person_default_on_file',
       'cb_person_cred_hist_length']

X = loan_data_clean[columns]
y = loan_data_clean['loan_status']

# Transform X
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 50)

In [23]:
# Dummy Classifier with baseline accuracy
from sklearn.dummy import DummyClassifier
dummy_classifier = DummyClassifier(strategy='most_frequent')
dummy_classifier.fit(X_train,y_train)
baseline_acc = dummy_classifier.score(X_train,y_train)

print("Baseline Accuracy = ", round((baseline_acc) * 100, 2), '%')

Baseline Accuracy =  78.08 %


# Logistic Regression Model 

In [24]:
# Create and fit the logistic regression model here:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)

# Save and print the predicted outcomes
y_pred = model.predict(X_test)
print('predicted classes: ', y_pred)

# Print out the true outcomes for the test data
print('true classes: ', y_test)

# Print out the confusion matrix
from sklearn.metrics import confusion_matrix
print('confusion matrix: ')
print(confusion_matrix(y_test, y_pred))

# Print accuracy here:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', round((accuracy), 2) * 100, '%')

predicted classes:  [0 0 0 ... 0 0 0]
true classes:  6318     1
9540     0
24257    0
3688     1
11448    0
        ..
31210    0
898      0
29797    0
11633    1
23674    0
Name: loan_status, Length: 8550, dtype: int64
confusion matrix: 
[[6486  250]
 [1162  652]]
Accuracy: 83.0 %


In [25]:
import pickle

# Assuming your trained model is stored in a variable called `model`
with open('credit_application_lg_model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved successfully!")


Model saved successfully!


In [26]:
# Save scaler (if applicable)
with open('credit_application_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

print("Scaler saved successfully!")


Scaler saved successfully!


In [56]:
#Sample array to approve or deny loans 

data = np.array([22, 50000, 3, 2, 0, 5000, .1, 1, 2]).reshape(1,-1)
data_scaled = scaler.transform(data)
predictions = model.predict(data_scaled)

print('predictions:', predictions)

predictions: [0]




In [57]:
if predictions == [0]:
    print("Congradulations, your loan has been approved!")
else:
    print("I am sorry, upon further review, your application has been denied.")

Congradulations, your loan has been approved!
