In [1]:
# Import libraries
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import re
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the PPP CSV into a DataFrame, report its dimensions, then preview it.
# NOTE(review): the preview shows pivot-table labels ('Count of LoanRange',
# 'Row Labels') and 'Unnamed: N' headers rather than loan-level columns --
# confirm this is the intended file.
data = pd.read_csv('ppp data.csv')
data_shape = data.shape
print(f"Shape of the data is: {data_shape}")
data.head()

Shape of the data is: (1680, 15)


Unnamed: 0,NAICSCode,(All),Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,,,,,,,,,,,,,,,
1,Count of LoanRange,Column Labels,,,,,,,,,,,,,
2,Row Labels,1,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,Grand Total,#VALUE!,
3,"JPMorgan Chase Bank, National Association",14431,6429.0,3091.0,1757.0,1214.0,770.0,2314.0,1109.0,266.0,155.0,20.0,31556,8%,
4,"Bank of America, National Association",16167,4794.0,2080.0,1122.0,657.0,450.0,1173.0,463.0,116.0,55.0,19.0,27096,6%,14%


In [3]:
# Show every row that is missing a value in at least one column
has_missing = data.isnull().any(axis=1)
data[has_missing]

Unnamed: 0,NAICSCode,(All),Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,,,,,,,,,,,,,,,
1,Count of LoanRange,Column Labels,,,,,,,,,,,,,
2,Row Labels,1,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,Grand Total,#VALUE!,
3,"JPMorgan Chase Bank, National Association",14431,6429.0,3091.0,1757.0,1214.0,770.0,2314.0,1109.0,266.0,155.0,20.0,31556,8%,
5,"Kabbage, Inc.",19006,519.0,252.0,188.0,107.0,65.0,165.0,85.0,9.0,,,20396,5%,19%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1675,Bruning Bank,1,,,,,,,,,,,1,0%,
1676,"Bank of Hillsboro, National Association",1,,,,,,,,,,,1,0%,
1677,"\tFarm Credit of Western Oklahoma, ACA",1,,,,,,,,,,,1,0%,
1678,Bank of Holly Springs,,,1.0,,,,,,,,,1,0%,


In [4]:
# Keep only the loans that have every field needed downstream:
# CD, City, NAICSCode and BusinessType.
required_columns = ['CD', 'City', 'NAICSCode', 'BusinessType']

# Fail fast with an actionable message instead of a bare KeyError: 'CD' when
# the loaded CSV does not have the expected loan-level columns.
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"Missing expected column(s) {missing_columns}; "
        f"columns found: {list(data.columns)}"
    )

# One dropna over all required columns replaces four chained boolean filters.
# .copy() makes the result an independent frame, so the column assignment
# below is not a write to a slice of the original.
data = data.dropna(subset=required_columns).copy()

# Store the NAICS code as an integer
data['NAICSCode'] = data['NAICSCode'].astype(int)

print(f"Shape after dropping incomplete rows: {data.shape}")
data.head()

KeyError: 'CD'

In [None]:
## Load the NAICS business-profile table and preview the first rows
naics_csv = 'US-Business-Profiles-By-Sales-and-Employees (2).csv'
naics = pd.read_csv(naics_csv)
naics.head()

In [None]:
# Report how many rows have no NAICS code before touching the column:
# astype(int) raises on NaN, so these rows must be handled first.
missing_code = naics['NAICS 1 Code'].isnull()
print(f"Rows without a NAICS 1 Code: {missing_code.sum()} of {len(naics)}")

# Drop the code-less rows (a missing key cannot match any loan in an inner
# join on this column), then cast the key to int.
naics = naics.loc[~missing_code].copy()
naics['NAICS 1 Code'] = naics['NAICS 1 Code'].astype(int)

naics.head()

In [None]:
## Join each loan to its NAICS profile (inner join on the NAICS code)
loan_data = data.merge(naics, left_on='NAICSCode', right_on='NAICS 1 Code')

# Remove the columns that are not kept for modelling
unused_columns = [
    'City', 'DateApproved', 'CD', 'LoanAmount',
    'BusinessType', 'State', 'NAICS 1 Description',
]
loan_data_top = loan_data.drop(columns=unused_columns)

loan_data_top.head()


In [None]:
# Keep only lenders that issued more than `min_loans` loans; lenders with
# `min_loans` or fewer rows are dropped.
min_loans = 5
counts = loan_data_top['Lender'].value_counts()
frequent_lenders = counts.index[counts > min_loans]

loan_data_top = loan_data_top.loc[loan_data_top['Lender'].isin(frequent_lenders)]
print(f"Shape after filtering lenders: {loan_data_top.shape}")

loan_data_top.head()


In [None]:
# Separate the target (Lender) from the remaining feature columns
y = loan_data_top["Lender"]
X = loan_data_top.drop(columns="Lender")
print(X.shape, y.shape)

In [None]:
# Hold out a test set; stratify on y so every lender keeps the same share
# of rows in both splits, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [None]:
# Create a logistic regression model with scikit-learn's default settings;
# the bare name on the last line displays the estimator in the notebook
classifier = LogisticRegression()
classifier

In [None]:
# Fit the model on the training split only; the test split is kept for evaluation
classifier.fit(X_train, y_train)

In [None]:
# Compare mean accuracy on the training split with the held-out test split
train_score = classifier.score(X_train, y_train)
print(f"Training Data Score: {train_score}")
test_score = classifier.score(X_test, y_test)
print(f"Testing Data Score: {test_score}")

In [None]:
# Predict a lender for every test row and compare the first ten with the truth
predictions = classifier.predict(X_test)
first_predicted = predictions[:10]
print(f"First 10 Predictions:   {first_predicted}")
first_actual = y_test[:10].tolist()
print(f"First 10 Actual labels: {first_actual}")

In [None]:
# View each prediction next to the probability the model assigned to it.
# Lender is a multi-class target, so predict_proba returns one column per
# lender (ordered like classifier.classes_), not a fixed binary pair such as
# male/female.
probs = classifier.predict_proba(X_test)
pred_df = pd.DataFrame({
    "Prediction": predictions,
    "Actual": y_test,
    # predict() returns the most probable class, so the row-wise maximum is
    # the probability of the predicted lender
    "P(Prediction)": probs.max(axis=1),
    }).reset_index(drop=True)

pred_df.head(10)

In [None]:
# Model evaluation: build the confusion matrix of actual vs. predicted lenders
from sklearn.metrics import confusion_matrix

lender_confusion = confusion_matrix(y_test, predictions)
lender_confusion

In [None]:
# Label the confusion matrix, one row of counts per lender.
# A multi-class confusion matrix has n_classes**2 cells, so it cannot be
# unpacked into a single (tn, fp, fn, tp). multilabel_confusion_matrix returns
# one 2x2 [[tn, fp], [fn, tp]] matrix per class (that class vs. all others).
from sklearn.metrics import multilabel_confusion_matrix

per_lender = multilabel_confusion_matrix(
    y_test, predictions, labels=classifier.classes_
)
confusion_counts = pd.DataFrame(
    {
        "True Neg": per_lender[:, 0, 0],
        "False Pos": per_lender[:, 0, 1],
        "False Neg": per_lender[:, 1, 0],
        "True Pos": per_lender[:, 1, 1],
    },
    index=classifier.classes_,
)
confusion_counts.head(10)

In [None]:
# Plot confusion matrix.
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement. Fall back to the
# old helper only on versions that predate from_estimator.
from sklearn.metrics import ConfusionMatrixDisplay

if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(classifier, X_test, y_test, cmap="Blues")
else:
    from sklearn.metrics import plot_confusion_matrix
    plot_confusion_matrix(classifier, X_test, y_test, cmap="Blues")

In [None]:
# Precision and Recall
### Lender is a multi-class target, so there is no natural "positive" label like there would be when classifying emails as spam or patients as having cancer.
## Precision and recall must therefore be computed one lender at a time, treating the chosen lender as the "positive" class and every other lender as "negative".
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
## Spot check 1: precision for JPMorgan Chase Bank, National Association, the lender with the most loans.
# pos_label only applies to binary targets and raises ValueError on a
# multi-class one, so restrict the score to this lender with labels=[...] and
# average=None, then take the single returned value.
precision_score(
    y_test,
    predictions,
    labels=["JPMorgan Chase Bank, National Association"],
    average=None,
)[0]

In [None]:
## Spot check 2: precision for First National Bank of Louisiana, a lender with a low loan count (10 loans).
# pos_label only applies to binary targets and raises ValueError on a
# multi-class one, so restrict the score to this lender with labels=[...] and
# average=None, then take the single returned value.
precision_score(
    y_test,
    predictions,
    labels=["First National Bank of Louisiana"],
    average=None,
)[0]

In [None]:
# F1 score (harmonic mean of precision and recall) for JPMorgan Chase Bank, National Association.
# pos_label only applies to binary targets and raises ValueError on a
# multi-class one, so restrict the score to this lender with labels=[...] and
# average=None, then take the single returned value.
f1_score(
    y_test,
    predictions,
    labels=["JPMorgan Chase Bank, National Association"],
    average=None,
)[0]

In [None]:
# F1 score (harmonic mean of precision and recall) for First National Bank of Louisiana.
# pos_label only applies to binary targets and raises ValueError on a
# multi-class one, so restrict the score to this lender with labels=[...] and
# average=None, then take the single returned value.
f1_score(
    y_test,
    predictions,
    labels=["First National Bank of Louisiana"],
    average=None,
)[0]