In [140]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Load the bank data, discard the identifier columns, and one-hot encode
# Education. drop_first=True removes the first Education level, so only
# the remaining levels get indicator columns.
data = (
    pd.read_csv("UniversalBank.csv")
    .drop(columns=['ID', 'ZIP Code'])
    .pipe(pd.get_dummies, columns=['Education'], prefix='Education', drop_first=True)
)

# Column order observed after encoding:
# ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Mortgage', 'Personal Loan',
#  'Securities Account', 'CD Account', 'Online', 'CreditCard', 'Education_2', 'Education_3']

# One-row frame describing the customer to classify. The keys follow the
# feature column order above (everything except 'Personal Loan').
customer1 = pd.DataFrame([
    {
        'Age': 40,
        'Experience': 10,
        'Income': 84,
        'Family': 2,
        'CCAvg': 2,
        'Mortgage': 0,
        'Securities Account': 0,
        'CD Account': 0,
        'Online': 1,
        'CreditCard': 1,
        'Education_2': 1,
        'Education_3': 0,
    }
])

# Target is 'Personal Loan'; every other column is a feature.
y = data['Personal Loan']
x = data.drop(columns='Personal Loan')

# 60% training / 40% validation, seeded so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.4, random_state=1)

# Fit the scaler on the training partition only, then reuse its statistics
# for the validation partition and for the new customer.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val = scaler.transform(x_val)

# 1-nearest-neighbour classifier trained on the scaled training data.
knn = KNeighborsClassifier(n_neighbors=1).fit(x_train, y_train)

customer1Scaled = scaler.transform(customer1)
pred = knn.predict(customer1Scaled)

print(
    "Customer 1 would accept a loan, placed in Class 1"
    if pred[0] == 1
    else "Customer 1 could not accept a loan, placed in Class 0"
)

Customer 1 could not accept a loan, placed in Class 0


In [141]:
# 2
from sklearn.metrics import precision_score
# Maps str(k) -> (validation precision, validation predictions) for each
# candidate neighbour count.
kDict = {}

# Only odd neighbour counts from 1 through 15 are evaluated.
for k in range(1, 16, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_val)
    kDict[str(k)] = (precision_score(y_val, y_pred), y_pred)

# Pick the entry with the highest precision. The comparison is strict, so
# on a tie the k that was evaluated first is kept.
bestKey, bestValue, bestPred = 0, 0, None
for key, (precision, predictions) in kDict.items():
    if precision > bestValue:
        bestKey, bestValue, bestPred = key, precision, predictions

print("The best K: ", bestKey)
# int() truncates the percentage rather than rounding it.
print(f"The precision score of K={bestKey} is {int(100 * bestValue)}%")

The best K:  9
The precision score of K=9 is 98%


In [142]:
# 3
from sklearn.metrics import confusion_matrix
# Show the validation confusion matrix for the k chosen by the precision
# search; bestKey and bestPred come from that search.
bestK = bestKey
print(f"Best K:  {bestK} and confusion matrix: ")
print(confusion_matrix(y_val, bestPred))

Best K:  9 and confusion matrix: 
[[1806    1]
 [ 102   91]]


In [143]:
# 4
from sklearn.metrics import precision_score

# Profile of the customer to classify with the best-performing k.
customer2 = pd.DataFrame(
    {
        'Age': [40], 
        'Experience': [10], 
        'Income': [84], 
        'Family': [2], 
        'CCAvg': [2], 
        'Mortgage': [0], 
        'Securities Account': [0], 
        'CD Account': [0], 
        'Online': [1], 
        'CreditCard': [1],
        'Education_2': [1],
        'Education_3': [0]
    }
)
# Scale the customer with the scaler that was fitted on the training data.
customer2Scaled = scaler.transform(customer2)

# bestKey is the k selected by the validation-precision search earlier in
# the notebook. It is stored as a string dictionary key, so convert it back
# to an int before handing it to the classifier.
knn = KNeighborsClassifier(n_neighbors=int(bestKey))
knn.fit(x_train, y_train)

# Classify the customer itself. The earlier version inspected bestPred[0],
# which is the prediction for the first validation row, and never used
# customer2Scaled at all.
customer2Pred = knn.predict(customer2Scaled)

if customer2Pred[0] == 1:
    print("Customer 2 would accept a loan, placed in Class 1")
else:
    print("Customer 2 could not accept a loan, placed in Class 0")


Customer 2 could not accept a loan, placed in Class 0


In [144]:
#5
from sklearn.metrics import confusion_matrix, precision_score

# Three-way split: half of the rows for training, then the held-out half is
# divided 60/40 into validation (30% overall) and test (20% overall).
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.5, random_state=1)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.4, random_state=1)

# Report each partition's share of the full data set.
for label, part, whole in (
    ("x train:", x_train, x),
    ("y train:", y_train, y),
    ("x val:", x_val, x),
    ("y val:", y_val, y),
    ("x_test:", x_test, x),
    ("y_test:", y_test, y),
):
    print(label, str(part.shape[0]/whole.shape[0] * 100) + "%")

# Refit the scaler on the new training partition and apply it to the rest.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_val, x_test = scaler.transform(x_val), scaler.transform(x_test)

# best k from above is 9
knn = KNeighborsClassifier(n_neighbors=9).fit(x_train, y_train)

# Precision and confusion matrix for each partition, in the order
# train -> test -> validation.
y_pred = knn.predict(x_train)
trainPrec, trainMat = precision_score(y_train, y_pred), confusion_matrix(y_train, y_pred)

y_pred = knn.predict(x_test)
testPrec, testMat = precision_score(y_test, y_pred), confusion_matrix(y_test, y_pred)

y_pred = knn.predict(x_val)
valPrec, valMat = precision_score(y_val, y_pred), confusion_matrix(y_val, y_pred)

# Each report starts with a blank line; the percentage is truncated by int().
for setName, setPrec, setMat in (
    ("Training", trainPrec, trainMat),
    ("Testing", testPrec, testMat),
    ("Validation", valPrec, valMat),
):
    print(f'''
{setName} set precision score: {int(100 * setPrec)}%
{setMat}''')

# Written interpretation of the three partitions, printed verbatim.
for remark in (
    "The training set is used to train the model.  The precision score and confusion maxtrix both show very accurate predictions because the values have been seen before by the model.",
    "The testing set is used to introduce unseen data to the model.",
    "The validation set is used to assess the model for a general prediction.  The validation set and testing set both show similar results which would indicate the model is able to be applied to unseen data and it would have similar performance for different sets of data.",
):
    print(remark)

x train: 50.0%
y train: 50.0%
x val: 30.0%
y val: 30.0%
x_test: 20.0%
y_test: 20.0%

Training set precision score: 96%
[[2253    6]
 [  94  147]]

Testing set precision score: 91%
[[908   4]
 [ 43  45]]

Validation set precision score: 95%
[[1346    3]
 [  82   69]]
The training set is used to train the model.  The precision score and confusion maxtrix both show very accurate predictions because the values have been seen before by the model.
The testing set is used to introduce unseen data to the model.
The validation set is used to assess the model for a general prediction.  The validation set and testing set both show similar results which would indicate the model is able to be applied to unseen data and it would have similar performance for different sets of data.
