# Goals

1. How do you ensure that customers can/will pay their loans?
2. Can we approve customers with high certainty?

# Imports

In [None]:
#imports
#numpy, pandas, scipy, math, matplotlib
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt

#estimators
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.svm import LinearSVC
from sklearn import linear_model

#preprocessing and pipelines
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

#model metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

#cross validation
from sklearn.model_selection import train_test_split

#selection method
from sklearn.feature_selection import SelectKBest, f_classif

# Data

In [None]:
# Load the Credit One CSV and discard its first column, then preview the
# remaining columns.
# NOTE(review): presumably the first column is a row identifier - confirm
# against the CSV before relying on positional column indices below.
rawData = pd.read_csv('Credit_One_Data.csv')
rawData = rawData.drop(columns=rawData.columns[0])

rawData.head()

In [None]:
# Target: the 'default_payment_next_month' column.
y = rawData['default_payment_next_month']

# Features: all rows, the first 23 columns (positions 0 through 22).
X = rawData.iloc[:, :23]
print('Summary of feature sample')
X.head()

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and every score derived from it, reproducible on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
#validate feature selection using SelectKBest
# Score each feature against y_train with f_classif and keep the 10 best.
selector = SelectKBest(f_classif, k=10)
selector.fit(X_train, y_train)

# A p-value can underflow to exactly 0.0 for a very strong feature; -log10(0)
# would then be inf and the normalisation below would yield NaN/0 for every
# feature. Clip to the smallest positive normal float so the scores stay finite.
pvalues = np.clip(selector.pvalues_, np.finfo(float).tiny, None)
scores = -np.log10(pvalues)
scores /= scores.max()

In [None]:
# Bar chart of the normalised univariate scores, one bar per feature column.
# (matplotlib is imported once in the imports cell at the top of the notebook.)
X_indices = np.arange(X.shape[-1])
plt.figure(1)
plt.clf()
plt.bar(X_indices - 0.05, scores, width=0.2)
plt.title("Feature univariate score")
plt.xlabel("Feature number")
plt.ylabel(r"Univariate score ($-Log(p_{value})$)")
plt.show()

Features 5-10 appear to be the most significant predictors of default_payment_next_month: they have the highest scores under univariate feature selection.

In [None]:
#compare with SVMs
# Baseline: rescale the features with MinMaxScaler and fit a linear SVM on all
# of them. make_pipeline, MinMaxScaler and LinearSVC come from the imports
# cell at the top of the notebook.
clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)
print(
    "Classification accuracy without selecting features: {:.3f}".format(
        clf.score(X_test, y_test)
    )
)

# Per-feature weight: absolute SVM coefficients summed along axis 0 of coef_,
# then normalised so the weights sum to 1.
svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
svm_weights /= svm_weights.sum()

In [None]:
# Display the normalised per-feature SVM weights from the all-features fit.
svm_weights

In [None]:
#after univariate feature selection
# Same scaler + linear SVM, but preceded by a SelectKBest step that keeps
# only the 10 best-scoring features.
clf_selected = make_pipeline(
    SelectKBest(f_classif, k=10),
    MinMaxScaler(),
    LinearSVC(),
)
clf_selected.fit(X_train, y_train)
print("Classification accuracy after univariate feature selection: {:.3f}".format(clf_selected.score(X_test, y_test)))

# Normalised absolute coefficients of the final (SVM) step, one per kept feature.
svm_weights_selected = np.sum(np.abs(clf_selected[-1].coef_), axis=0)
svm_weights_selected = svm_weights_selected / svm_weights_selected.sum()

In [None]:
# Display the normalised SVM weights of the features kept by SelectKBest.
svm_weights_selected

In [None]:
# Side-by-side comparison of the three per-feature importance measures.
plt.bar(
    X_indices - 0.45, scores, width=0.2, label=r"Univariate score ($-Log(p_{value})$)"
)

plt.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight")

# Position the post-selection weights with the support mask of the selector
# step inside the fitted pipeline, i.e. the selector that actually produced
# svm_weights_selected. The separately fitted `selector` only matches it while
# both were fitted on exactly the same split.
selected_mask = clf_selected[0].get_support()
plt.bar(
    X_indices[selected_mask] - 0.05,
    svm_weights_selected,
    width=0.2,
    label="SVM weights after selection",
)
plt.title("Comparing feature selection")
plt.xlabel("Feature number")
plt.yticks(())
plt.axis("tight")
plt.legend(loc="upper right")
plt.show()

In [None]:
#redefining X based on SelectKBest
#features
# Keeps six columns of rawData by position.
# NOTE(review): these positions are hard-coded rather than read from
# selector.get_support(), and there are 6 of them while the selector above
# uses k=10 - confirm they still match the intended features.
X = rawData.iloc[:, [5,7,8,9,10,17]]
print('Summary of feature sample')
X.head()

In [None]:
#redefine train-test-split
# Re-split using the reduced feature set. The fixed random_state keeps the
# split, and the metrics computed from it, reproducible on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Candidate regressors to compare. random_state pins the random forest so the
# comparison gives the same numbers on every run.
algosClass = [
    ('Random Forest Regressor', RandomForestRegressor(random_state=42)),
    ('Linear Regression', LinearRegression()),
    ('Support Vector Regression', SVR()),
]

#regression
# Cross-validate on the training split only: X_test / y_test are used for the
# final evaluation further down, so they must not influence which model is
# chosen here.
names = [name for name, _ in algosClass]
results = [
    cross_val_score(model, X_train, y_train, cv=3, scoring='r2')
    for _, model in algosClass
]

# Mean r2 across the 3 folds for each candidate.
for name, result in zip(names, results):
    print(name, result.mean())

With the input data set, none of these models performs well enough to reliably predict whether customers will be able to pay their loans.

Of the 3 options, Linear Regression should perform the best, as it has the highest r2 value.

# Determine if customers can pay their loans

In [None]:
# Estimator used for the final fit in this section. The commented-out lines
# are the other two candidates from the comparison above.
#algo = RandomForestRegressor()
algo = LinearRegression()
#algo = SVR()

In [None]:
# Train the chosen estimator on the training split; `model` is what the
# prediction cell below calls .predict() on.
model = algo.fit(X_train,y_train)

In [None]:
# Predict on the held-out test split.
predictions = model.predict(X_test)

# Score the predictions against y_test: r2 and root-mean-squared error.
predRsquared = r2_score(y_true=y_test, y_pred=predictions)
rmse = sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))

print('R Squared: %.3f' % predRsquared)
print('RMSE: %.3f' % rmse)

In [None]:
# Scatter of predicted vs. actual values on the test split.
plt.scatter(y_test, predictions, alpha=0.5)
plt.xlabel('Ground Truth')
plt.ylabel('Predictions')

# Turn off scientific notation on both axes of the figure's first Axes.
for tick_axis in (plt.gcf().axes[0].xaxis, plt.gcf().axes[0].yaxis):
    tick_axis.get_major_formatter().set_scientific(False)

plt.show();

# Determine the Credit Limit

In [None]:
#reusing rawData from the above
# Preview the frame again before the target is switched to LIMIT_BAL below.

rawData.head()

In [None]:
# Target: the 'LIMIT_BAL' column.
y = rawData['LIMIT_BAL']

# Features: all rows, columns at positions 1 through 23.
X = rawData.iloc[:, 1:24]
print('Summary of feature sample')
X.head()

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and every score derived from it, reproducible on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
#validate feature selection using SelectKBest
# Score each feature against y_train with f_classif and keep the 10 best.
# NOTE(review): y is LIMIT_BAL here, which the rest of this section models
# with regressors; f_classif treats every distinct target value as a class
# label. A regression scorer may be more appropriate, but the comparison plot
# further down assumes this selector matches the one inside the pipeline, so
# both would have to change together.
selector = SelectKBest(f_classif, k=10)
selector.fit(X_train, y_train)

# A p-value can underflow to exactly 0.0 for a very strong feature; -log10(0)
# would then be inf and the normalisation below would yield NaN/0 for every
# feature. Clip to the smallest positive normal float so the scores stay finite.
pvalues = np.clip(selector.pvalues_, np.finfo(float).tiny, None)
scores = -np.log10(pvalues)
scores /= scores.max()

In [None]:
# Bar chart of the normalised univariate scores, one bar per feature column.
# (matplotlib is imported once in the imports cell at the top of the notebook.)
X_indices = np.arange(X.shape[-1])
plt.figure(1)
plt.clf()
plt.bar(X_indices - 0.05, scores, width=0.2)
plt.title("Feature univariate score")
plt.xlabel("Feature number")
plt.ylabel(r"Univariate score ($-Log(p_{value})$)")
plt.show()

Features 1 and 4-21 appear to be significant in predicting LIMIT_BAL; they have the highest scores under univariate feature selection.

In [None]:
#compare with SVMs
# Baseline: rescale the features with MinMaxScaler and fit a linear SVM on all
# of them. make_pipeline, MinMaxScaler and LinearSVC come from the imports
# cell at the top of the notebook, so they are not re-imported here.
# NOTE(review): LinearSVC is a classifier, so each distinct LIMIT_BAL value is
# treated as its own class - confirm this is intended.
clf = make_pipeline(MinMaxScaler(), LinearSVC())
clf.fit(X_train, y_train)
print(
    "Classification accuracy without selecting features: {:.3f}".format(
        clf.score(X_test, y_test)
    )
)

# Per-feature weight: absolute SVM coefficients summed along axis 0 of coef_,
# then normalised so the weights sum to 1.
svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
svm_weights /= svm_weights.sum()

In [None]:
# Display the normalised per-feature SVM weights from the all-features fit.
svm_weights

In [None]:
#after univariate feature selection
# Same scaler + linear SVM, but preceded by a SelectKBest step that keeps
# only the 10 best-scoring features.
clf_selected = make_pipeline(
    SelectKBest(f_classif, k=10),
    MinMaxScaler(),
    LinearSVC(),
)
clf_selected.fit(X_train, y_train)
print("Classification accuracy after univariate feature selection: {:.3f}".format(clf_selected.score(X_test, y_test)))

# Normalised absolute coefficients of the final (SVM) step, one per kept feature.
svm_weights_selected = np.sum(np.abs(clf_selected[-1].coef_), axis=0)
svm_weights_selected = svm_weights_selected / svm_weights_selected.sum()

In [None]:
# Display the normalised SVM weights of the features kept by SelectKBest.
svm_weights_selected

In [None]:
# Side-by-side comparison of the three per-feature importance measures.
plt.bar(
    X_indices - 0.45, scores, width=0.2, label=r"Univariate score ($-Log(p_{value})$)"
)

plt.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight")

# Position the post-selection weights with the support mask of the selector
# step inside the fitted pipeline, i.e. the selector that actually produced
# svm_weights_selected. The separately fitted `selector` only matches it while
# both were fitted on exactly the same split.
selected_mask = clf_selected[0].get_support()
plt.bar(
    X_indices[selected_mask] - 0.05,
    svm_weights_selected,
    width=0.2,
    label="SVM weights after selection",
)
plt.title("Comparing feature selection")
plt.xlabel("Feature number")
plt.yticks(())
plt.axis("tight")
plt.legend(loc="upper right")
plt.show()

The SVM weights after selection appear to show that features 11-15 and 20 are significant in predicting LIMIT_BAL.

In [None]:
#redefining X based on SelectKBest
#features
# Keeps ten columns of rawData by position.
# NOTE(review): these positions are hard-coded rather than read from
# selector.get_support(), and they index rawData, whereas the selector above
# was fitted on X = rawData.iloc[:, 1:24] (offset by one column) - confirm
# they refer to the intended features.
X = rawData.iloc[:, [6,10,11,12,13,14,15,17,18,20]]
print('Summary of feature sample')
X.head()

In [None]:
#redefine train-test-split
# Re-split using the reduced feature set. The fixed random_state keeps the
# split, and the metrics computed from it, reproducible on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Candidate regressors to compare. random_state pins the random forest so the
# comparison gives the same numbers on every run.
algosClass = [
    ('Random Forest Regressor', RandomForestRegressor(random_state=42)),
    ('Linear Regression', LinearRegression()),
    ('Support Vector Regression', SVR()),
]

#regression
# Cross-validate on the training split only: X_test / y_test are used for the
# final evaluation further down, so they must not influence which model is
# chosen here.
names = [name for name, _ in algosClass]
results = [
    cross_val_score(model, X_train, y_train, cv=3, scoring='r2')
    for _, model in algosClass
]

# Mean r2 across the 3 folds for each candidate.
for name, result in zip(names, results):
    print(name, result.mean())

With the input data set, none of these models performs well enough to reliably determine what a customer's credit limit should be.

Of the 3 options, Random Forest Regression should perform the best, as it has the highest r2 value.

In [None]:
# Estimator used for the final fit in this section. random_state is fixed so
# the reported R squared / RMSE are the same on every run. The commented-out
# lines are the other two candidates from the comparison above.
algo = RandomForestRegressor(random_state=42)
#algo = LinearRegression()
#algo = SVR()

In [None]:
# Train the chosen estimator on the training split; `model` is what the
# prediction cell below calls .predict() on.
model = algo.fit(X_train,y_train)

In [None]:
# Predict on the held-out test split.
predictions = model.predict(X_test)

# Score the predictions against y_test: r2 and root-mean-squared error.
predRsquared = r2_score(y_true=y_test, y_pred=predictions)
rmse = sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))

print('R Squared: %.3f' % predRsquared)
print('RMSE: %.3f' % rmse)

In [None]:
# Scatter of predicted vs. actual values on the test split.
plt.scatter(y_test, predictions, alpha=0.5)
plt.xlabel('Ground Truth')
plt.ylabel('Predictions')

# Turn off scientific notation on both axes of the figure's first Axes.
for tick_axis in (plt.gcf().axes[0].xaxis, plt.gcf().axes[0].yaxis):
    tick_axis.get_major_formatter().set_scientific(False)

plt.show();

### Regression Conclusions: The regression models are not an acceptable way to predict whether customers can make their payments next month, or what the credit limit should be.