# I.	Opening File and Joining Other Data Sets

### A.	Library List

In [None]:
import numpy as np
import pandas as pd

### B.	Opening Data

In [2]:
df = pd.read_csv("loan.csv", low_memory = False)
df.shape


(2260668, 145)

### C.	Changing Several Columns’ Data Type and Adding Needed Columns

In [None]:
# Parse the date-like loan columns and derive the quarter keys used by the GDP join.

# Year of the earliest credit line, taken from the last four characters of the
# raw month-year string; missing values become 0 so the column can be an int.
# This must run before 'earliest_cr_line' is converted to datetime below.
df['earliest_cr_line_year'] = df['earliest_cr_line'].str.strip().str[-4:].fillna(0).astype('int')

# Convert each raw date column to datetime exactly once
for date_col in ['earliest_cr_line', 'issue_d', 'last_pymnt_d']:
    df[date_col] = pd.to_datetime(df[date_col])

# Issue date moved back three months, i.e. into the previous quarter
df['pqissue_d'] = df['issue_d'] - pd.DateOffset(months = 3)

# Quarter of the issue date, and quarter of the shifted (previous-quarter) date
df['issue_q'] = df['issue_d'].dt.to_period('Q')
df['pqissue_q'] = df['pqissue_d'].dt.to_period('Q')


In [None]:
# Calculate age of credit in years.
# The elapsed time is converted to days and divided by 365.2425, the year length
# numpy uses for its 'Y' unit. Dividing by np.timedelta64(1, 'Y') directly is
# not accepted by recent pandas versions because a year is not a fixed duration.
df['ageOfCredit'] = (df['issue_d'] - df['earliest_cr_line']) / np.timedelta64(1, 'D') / 365.2425


In [None]:
# Collapse the open-ended employment-length buckets so every non-null value
# starts with an integer: "10+ years" -> "10 years", "< 1 year" -> "0 years".
# The result is assigned back to the column; calling replace(..., inplace=True)
# on the selected column is not guaranteed to update df itself.
df['emp_length'] = df['emp_length'].replace({'10+ years': '10 years', '< 1 year': '0 years'})


def emp_length_to_int(s):
    """Return the leading number of an employment-length string as np.int8.

    Null inputs (as judged by pd.isnull) are handed back unchanged; any other
    input is split on whitespace and its first token is converted.
    """
    if not pd.isnull(s):
        first_token = s.split()[0]
        return np.int8(first_token)
    return s

# Convert every employment-length value element-wise; nulls pass through unchanged
df['emp_length'] = df['emp_length'].apply(emp_length_to_int)


In [None]:
# Log base 10 the two numeric variables for easier visualizations.
# 1 is added before taking the log so that a value of 0 maps to 0 rather than -inf.
# np.log10 is applied to the whole column at once instead of row by row.
df['log_annual_inc'] = np.log10(df['annual_inc'] + 1)
df['log_revol_bal'] = np.log10(df['revol_bal'] + 1)


### D.	Opening Other Data Set for Easier Join of State Names

In [None]:
# Load the state abbreviation -> state name lookup (the file has no header row)
# and label its first two columns; the abbreviation column is named to match
# the loan data's join key.
sn = pd.read_csv(
    "https://worldpopulationreview.com/static/states/abbr-name.csv",
    header = None,
).rename(columns = {0: "addr_state", 1: "length_as"})

# Left join so every loan row is kept
df = pd.merge(df, sn, on = 'addr_state', how = 'left')


In [None]:
# Respell "District Of Columbia" with a lower-case "of" so the name matches the
# spelling used by the other data sets joined later
dc_rows = df['length_as'] == 'District Of Columbia'
df.loc[dc_rows, 'length_as'] = 'District of Columbia'


### E.	Opening Other Data Set for Joining GDP Information

In [None]:
# Build a per-area, per-quarter table of real GDP growth, ready to join to the loans.
gdp = pd.read_csv("SQGDP1__ALL_AREAS_2005_2020.csv")

# Keep only the real GDP rows
gdp = gdp.loc[gdp['Description'] == 'Real GDP (millions of chained 2012 dollars)']

# Drop the descriptive columns; every column left other than GeoName is
# unpivoted by melt() below
gdp = gdp.drop(columns = ['GeoFIPS', 'Region', 'TableName', 'LineCode', 'IndustryClassification', 'Description', 'Unit'])

# Reshape from one column per period to one row per (area, period)
gdp = gdp.melt(id_vars = ['GeoName'],
         var_name = "YEAR:Q",
         value_name = "RealGDP")

# Rewrite "<year>:Q<n>" as "<year>-Q<n>" before parsing it as a date.
# regex=True is required: the pattern uses capture groups, and without the flag
# newer pandas treats it as a literal string and leaves the values untouched.
gdp['YEAR:Q'] = gdp['YEAR:Q'].str.replace(r'(\d+):(Q\d)', r'\1-\2', regex = True)
gdp['startQuarter'] = pd.to_datetime(gdp['YEAR:Q'])
# QuarterEnd(0) rolls each date forward to the end of its own quarter
gdp['endQuarter'] = pd.to_datetime(gdp['startQuarter'] + pd.tseries.offsets.QuarterEnd(0))


# Sort so that pct_change/shift below walk each area's quarters in order
gdp = gdp.sort_values(by = ['GeoName', 'YEAR:Q'])

# Quarter-over-quarter relative change in real GDP within each area; the first
# quarter of each area has no predecessor and is filled with 0
gdp['Diff'] = gdp.groupby(['GeoName'])['RealGDP'].pct_change().fillna(0)

# Previous-quarter dates and the previous quarter's growth
gdp['prevQuarterStartDate'] = gdp['startQuarter'] - pd.tseries.offsets.DateOffset(months = 3)
gdp['prevQuarterEndDate'] = gdp['endQuarter'] - pd.tseries.offsets.DateOffset(months = 3)
gdp['prevQuarterDiff'] = gdp.groupby(['GeoName'])['Diff'].shift(1)
gdp['issue_q'] = pd.PeriodIndex(pd.to_datetime(gdp['startQuarter']), freq = 'Q')
gdp['pqissue_q'] = pd.PeriodIndex(pd.to_datetime(gdp['prevQuarterStartDate']), freq = 'Q')

# Rename the area column to the loan data's state-name key and keep only the
# columns needed for the join
gdp = gdp.rename(columns = {'GeoName':"length_as"})
gdp = gdp[['length_as', 'issue_q', 'Diff', 'pqissue_q', 'prevQuarterDiff']]
gdp


### F.	Joining with GDP Information

In [None]:
# Attach the GDP growth columns by state name and issue quarter.
# Both frames carry a 'pqissue_q' column; the loan-side copy is removed before
# the join so the GDP-side one ends up in the result under that name.
df = df.drop(columns = 'pqissue_q').merge(gdp, on = ['length_as', 'issue_q'], how = 'left')
df


###  G.	Opening Other Data Set for Joining Unemployment Rate Information


In [None]:
bls = pd.read_excel('ststdsadata.xlsx')

# Full state name -> postal abbreviation, used to recode the state column so it
# matches the loan data's 'addr_state' values
state_to_abbrev = {
    "Alabama": 'AL', "Alaska": 'AK', "Arizona": 'AZ', "Arkansas": 'AR',
    "California": 'CA', "Colorado": 'CO', "Connecticut": 'CT', "Delaware": 'DE',
    "District of Columbia": 'DC', "Florida": 'FL', "Georgia": 'GA', "Hawaii": 'HI',
    "Idaho": 'ID', "Illinois": 'IL', "Indiana": 'IN', "Iowa": 'IA',
    "Kansas": 'KS', "Kentucky": 'KY', "Louisiana": 'LA', "Maine": 'ME',
    "Maryland": 'MD', "Massachusetts": 'MA', "Michigan": 'MI', "Minnesota": 'MN',
    "Mississippi": 'MS', "Missouri": 'MO', "Montana": 'MT', "Nebraska": 'NE',
    "Nevada": 'NV', "New Hampshire": 'NH', "New Jersey": 'NJ', "New Mexico": 'NM',
    "New York": 'NY', "North Carolina": 'NC', "North Dakota": 'ND', "Ohio": 'OH',
    "Oklahoma": 'OK', "Oregon": 'OR', "Pennsylvania": 'PA', "Rhode Island": 'RI',
    "South Carolina": 'SC', "South Dakota": 'SD', "Tennessee": 'TN', "Texas": 'TX',
    "Utah": 'UT', "Vermont": 'VT', "Virginia": 'VA', "Washington": 'WA',
    "West Virginia": 'WV', "Wisconsin": 'WI', "Wyoming": 'WY',
}

# Renaming the state to its abbreviation; values that are not an exact key of
# the mapping are left as they are
bls['Unnamed: 1'] = bls['Unnamed: 1'].replace(state_to_abbrev)

# Creating Month and Year columns in df from the issue date
issue_dates = pd.DatetimeIndex(df['issue_d'])
df['Month'] = issue_dates.month

# Converting to leading 0, because that's the format used in BLS dataset
# for instance, January is 01 instead of 1
df["Month"] = df.Month.map("{:02}".format)

df['Year'] = issue_dates.year

# Renaming the BLS columns that are needed (UR is the unemployment rate)
bls = bls.rename(columns = {
    "Unnamed: 1": "addr_state",
    "Unnamed: 2": "Year",
    "Unnamed: 3": "Month",
    "Unnamed: 10": "UR",
})

# Keep only those four columns and drop every row with a missing value in them
bls = bls[['addr_state', 'Year', 'Month', 'UR']]
bls = bls.dropna() #CLEANED bls
bls.head()


In [None]:
# Build a "<Year>-<Month>" key and the calendar quarter it falls in
year_month = bls['Year'].astype(str) + '-' + bls['Month'].astype(str)
bls['YearMonth'] = year_month
bls['YearQuarter'] = pd.to_datetime(bls['YearMonth']).dt.to_period('Q')

# Order each state's rows by period so the change is taken between consecutive rows
bls = bls.sort_values(by = ['addr_state', 'YearMonth'])

# Relative change in the unemployment rate from one row to the next within a
# state; each state's first row has no predecessor and is filled with 0
ur_by_state = bls.groupby(['addr_state'])['UR']
bls['URDiff'] = ur_by_state.pct_change().fillna(0)

# The previous row's change within the same state (shifted down by one)
bls['URprevMonthDiff'] = bls.groupby(['addr_state'])['URDiff'].shift(1)
bls


### H.	Joining with Unemployment Rate Information

In [None]:
# The merge keys must have the same dtype on both sides, so the BLS year is
# cast to int before joining on month, year and state
bls = bls.assign(Year = bls['Year'].astype(int))
df = df.merge(bls, how = 'left', on = ['Month', 'Year', 'addr_state'])


### I.	Transfer to CSV

In [None]:
# Persist the joined table for the next section. The row index is written too,
# which is why the next section drops an "Unnamed: 0" column after reading it.
df.to_csv('df.csv')

# II.	Data Pre-processing

### A.	Library List

In [None]:
import numpy as np
import pandas as pd


### B.	Opening Folder

In [None]:
# Load the joined table produced by the previous section and discard the
# "Unnamed: 0" column that holds the old row index
df = pd.read_csv("df.csv", low_memory = False).drop(columns = "Unnamed: 0")


### C.	Dealing with Target Values

In [None]:
# The model only distinguishes charged-off (failing) loans from fully paid
# (successful) ones, so rows with any other loan status are removed here.

status_counts_before = df['loan_status'].value_counts(dropna = False)
print(df['loan_status'].describe(), "\n")
print("Before removing other categories")
print(status_counts_before)

is_target_status = df['loan_status'].isin(['Fully Paid', 'Charged Off'])
df = df.loc[is_target_status].copy()

print("After removing other categories", "\n")
print(df['loan_status'].value_counts(dropna = False))


### D.	Filtering Data Sets

In [None]:
# What is available to investor
# Besides raw loan fields this list includes the columns derived in section I:
# credit age, the log transforms, the GDP growth columns ('Diff',
# 'prevQuarterDiff') and the unemployment columns ('UR', 'URDiff', 'URprevMonthDiff').
keep_list1 = ['loan_amnt', 'issue_d', 'loan_status', 'funded_amnt', 
              'funded_amnt_inv', 'verification_status', 'installment', 'grade', 
              'sub_grade', 'home_ownership', 'emp_length', 'emp_title', 
              'addr_state', 'zip_code', 'annual_inc', 'dti', 
              'ageOfCredit', 'earliest_cr_line', 'earliest_cr_line_year',
              'open_acc', 'total_acc', 'revol_bal', 'revol_util', 
              'inq_last_6mths', 'acc_now_delinq', 'delinq_amnt', 'delinq_2yrs', 
              'pub_rec', 'collections_12_mths_ex_med', 'int_rate', 
              'tot_coll_amt', 'purpose', 'term', 'initial_list_status',    
              'application_type', 'length_as', 'pqissue_d', 'issue_q',
              'log_annual_inc', 'log_revol_bal', 'Diff', 'pqissue_q', 
              'prevQuarterDiff', 'Month', 'Year', 'UR', 'YearMonth',
              'YearQuarter', 'URDiff', 'URprevMonthDiff']


In [None]:
# Adjusting after correlation analysis and Cramer's V
# This is the column set that feeds the dummy encoding and every model below.
keep_list3 = ['Month', 'loan_amnt', 'issue_d', 'loan_status',
              'verification_status', 'grade', 'home_ownership', 'emp_length', 
              'addr_state', 'annual_inc', 'log_annual_inc', 'dti',
              'ageOfCredit', 'open_acc', 'total_acc', 'revol_bal', 
              'revol_util', 'acc_now_delinq', 'delinq_amnt', 'delinq_2yrs',  
              'pub_rec', 'collections_12_mths_ex_med',
              'int_rate', 'inq_last_6mths', 'tot_coll_amt', 'purpose', 'term',
              'initial_list_status', 'application_type',
              'log_revol_bal', 'Diff', 'prevQuarterDiff', 'UR', 'URDiff',
              'URprevMonthDiff']


In [None]:
# df1: the keep_list1 columns (selection before the correlation analysis)
# df2: the keep_list3 columns (selection after the correlation analysis)
# keep_list2 is discarded BECAUSE it's not available to potential investor in the GUI
df1 = df.loc[:, keep_list1]
df2 = df.loc[:, keep_list3]


In [None]:
# Discard every row that has a missing value in any of the selected columns
df1, df2 = df1.dropna(), df2.dropna()

print("DF1 shape is: ", df1.shape)
print("DF2 shape is: ", df2.shape)


### E.	Creating Dummy Variables

In [None]:
df2_with_dummies = df2.copy()

# Binary target: 1 when the loan was charged off, 0 otherwise
df2_with_dummies['charged_off'] = (df2_with_dummies['loan_status'] == 'Charged Off').astype(np.uint8)

# Encode grade as an integer starting at 1. sort=True numbers the grades in
# sorted order of their labels; without it the codes follow order of first
# appearance, so the numeric ordering would depend on how the rows happen to
# be arranged.
df2_with_dummies['grade'] = pd.factorize(df2_with_dummies['grade'], sort = True)[0] + 1

# One-hot encode the categorical variables, dropping the first level of each.
# dtype is fixed so the indicator columns are 0/1 integers whatever the
# default dummy dtype of the installed pandas version is.
# NOTE(review): dummy_na=True adds a "_nan" indicator per variable; df2 had its
# missing rows dropped earlier, so these are presumably all zero — confirm
# before removing them, since that would change the column count.
df2_with_dummies = pd.get_dummies(df2_with_dummies,columns=['verification_status', 'home_ownership', 'addr_state', 'purpose', 'term', 'initial_list_status',
                           'application_type'],
               drop_first = True,
               dummy_na = True,
               dtype = np.uint8)

# Getting observations between 2015 and 2018. The issue date is parsed first so
# the comparison is made on dates even when the column holds text (as it does
# after a CSV round trip).
issue_dates = pd.to_datetime(df2_with_dummies['issue_d'])
in_window = (issue_dates >= '2015-01-01') & (issue_dates <= '2018-12-31')
df2_with_dummies = df2_with_dummies[in_window]

# Dropping issue date and the raw status: the first is unnecessary from here on
# and the second is replaced by 'charged_off'. The result is reassigned rather
# than dropped in place, because the frame above is a filtered selection.
df2_with_dummies = df2_with_dummies.drop(columns = ['issue_d', 'loan_status'])


In [None]:
# Check results
df2_with_dummies.shape

Results:
(797003, 108)

# Create new CSV
# The row index is written as well; the sections that read this file back drop
# the resulting "Unnamed: 0" column.
df2_with_dummies.to_csv('df2_with_dummies.csv')


# III.	Principal Component Analysis

### A.	Library List

In [None]:
import pandas as pd, numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import metrics, datasets


### B.	Opening Folder

In [None]:
# Columns removed before PCA:
#   "Unnamed: 0"            - the row index saved with the CSV
#   annual_inc, revol_bal   - replaced by their log versions
#   Month                   - not used
pca_unused_columns = ["Unnamed: 0", 'annual_inc', 'revol_bal', 'Month']

df2_with_dummies = (
    pd.read_csv("df2_with_dummies.csv", low_memory = False)
    .drop(columns = pca_unused_columns)
)


### C.	Selecting Numerical Variables to Scale and Run PCA on the Data Set

In [None]:
# Separating out the features
# Scaled lists the numeric columns that get standardized before PCA; the
# columns not listed here are left as they are.
Scaled = ['loan_amnt', 'emp_length', 'log_annual_inc', 'dti', 'ageOfCredit',
          'open_acc', 'total_acc', 'revol_util', 'inq_last_6mths',          
          'acc_now_delinq', 'delinq_amnt', 'delinq_2yrs', 'pub_rec',
          'collections_12_mths_ex_med', 'int_rate', 'tot_coll_amt',
          'log_revol_bal', 'Diff', 'prevQuarterDiff', 'UR', 'URDiff',  
           'URprevMonthDiff']


In [None]:
# Standardize the numeric columns listed in Scaled, overwriting them in place
scaler = StandardScaler()
df2_with_dummies[Scaled] = scaler.fit_transform(df2_with_dummies[Scaled])

# Work on a copy so the scaled frame itself stays available
df2_with_dummies_PCA = df2_with_dummies.copy()

# Target on one side, every other column on the other
y = df2_with_dummies_PCA['charged_off']
features_for_pca = df2_with_dummies_PCA.drop(columns = ['charged_off'])

# PCA with all components kept, used below to inspect the explained variance
pca = PCA()
x = pca.fit_transform(features_for_pca)

# Show the transformed data (the observations expressed in component space)
x


In [None]:
# Print to determine elbow
# ev holds the share of total variance explained by each component, in order
ev = pca.explained_variance_ratio_
print(ev)
plt.plot(ev)


In [None]:
# Plot the cumulative share of variance explained as components are added
plt.plot(ev.cumsum())


In [None]:
# Isolate target variable
y = df2_with_dummies_PCA['charged_off']
x = df2_with_dummies_PCA.drop(columns = ['charged_off'])

# Apply PCA keeping 25 components. random_state pins the result when
# scikit-learn picks a randomized solver for a truncated decomposition, so the
# exported components are the same on every run.
pca = PCA(n_components=25, random_state=1234)
x = pca.fit_transform(x)

# Per-component share of explained variance (to locate the elbow)
ev = pca.explained_variance_ratio_
print(ev)
plt.plot(ev)

# Cumulative share, drawn on the same axes
plt.plot(ev.cumsum())


In [None]:

# Export to csv
# x is the 25-component PCA output and y the 'charged_off' target; both files
# also get the row index as their first column.
pd.DataFrame(x).to_csv("x_with_dummies_PCA25.csv")
pd.DataFrame(y).to_csv("y_with_dummies_PCA25.csv")


# IV.	Machine Learning Algorithms

### A.	Library List

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import scale
from sklearn.neural_network import MLPClassifier
from sklearn import metrics, datasets,tree
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
import math


### B.	Opening Files and Separating Target and Features

In [None]:
# Load the model-ready table written by the pre-processing section and drop the
# saved row index plus three feature columns.
# NOTE(review): 'log_annual_inc' is dropped while the raw 'annual_inc' stays,
# whereas for revolving balance the raw 'revol_bal' is dropped and the log
# version stays — confirm the asymmetry is intended.
df = pd.read_csv("df2_with_dummies.csv")
df = df.drop(columns = ['Unnamed: 0','log_annual_inc', 'revol_bal', 'UR'])


In [None]:
# Target, and the feature matrix built from every other column. Infinite values
# are first turned into NaN so that dropna(axis=1) removes any column holding
# either a missing or an infinite value.
y = df['charged_off']
X = (
    df.drop(columns = 'charged_off')
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
)

# 70/30 train/test split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = .3, random_state = 1234, stratify = y
)


### C.	Naive Bayes

In [None]:
# Gaussian Naive Bayes on the full (non-PCA) feature set
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics; the predicted
# probability of class 1 feeds the AUC, which measures ranking quality and is
# not meaningful when computed from 0/1 labels.
y_pred_nb = gnb.predict(X_test)
y_score_nb = gnb.predict_proba(X_test)[:, 1]

cm = metrics.confusion_matrix(y_test,y_pred_nb)
print(metrics.classification_report(y_test,y_pred_nb))

print('PERFORMANCE SCORE: NB ENTIRE DATASET')
print ('Accuracy:', accuracy_score(y_test, y_pred_nb))
print ('F1 score:', f1_score(y_test, y_pred_nb))
print ('Recall:', recall_score(y_test, y_pred_nb))
print ('Precision:', precision_score(y_test, y_pred_nb))
print ('AUC:', roc_auc_score(y_test, y_score_nb))

# Root mean squared error of the 0/1 predictions
mse_nb = mean_squared_error(y_test, y_pred_nb)
rmse_nb = math.sqrt(mse_nb)
print(rmse_nb)


### D.	Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree on the full (non-PCA) feature set; the seed makes the fitted
# tree the same on every run
dtm = DecisionTreeClassifier(random_state=1234)
dtm.fit(X_train, y_train)

# Class predictions for the threshold-based metrics, class-1 probabilities for AUC
y_pred_dt = dtm.predict(X_test)
y_score_dt = dtm.predict_proba(X_test)[:, 1]

# Same value dtm.score(X_test, y_test) returns, without predicting a second time
accuracy = accuracy_score(y_test, y_pred_dt)

print (metrics.classification_report(y_test,y_pred_dt))
print ('Accuracy:', accuracy)
print ('F1 score:', f1_score(y_test, y_pred_dt))
print ('Recall:', recall_score(y_test, y_pred_dt))
print ('Precision:', precision_score(y_test, y_pred_dt))
print ('AUC:', roc_auc_score(y_test, y_score_dt))

# Root mean squared error of the 0/1 predictions
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = math.sqrt(mse_dt)
print(rmse_dt)


### E.	Random Forest

In [None]:
# Random forest on the full (non-PCA) feature set; the seed makes the fitted
# forest the same on every run
rfcm = RandomForestClassifier(random_state=1234)
rfcm.fit(X_train, y_train)

# Class predictions for the threshold-based metrics, class-1 probabilities for AUC
y_pred_rf = rfcm.predict(X_test)
y_score_rf = rfcm.predict_proba(X_test)[:, 1]

# Build a confusion matrix and show the Classification Report
cm1 = metrics.confusion_matrix(y_test,y_pred_rf)
print('\nConfusion Matrix','\n',cm1)
print('\nClassification Report','\n',metrics.classification_report(y_test,y_pred_rf))

print ('Accuracy:', accuracy_score(y_test, y_pred_rf))
print ('F1 score:', f1_score(y_test, y_pred_rf))
print ('Recall:', recall_score(y_test, y_pred_rf))
print ('Precision:', precision_score(y_test, y_pred_rf))
print ('AUC:', roc_auc_score(y_test, y_score_rf))

# Root mean squared error of the 0/1 predictions
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = math.sqrt(mse_rf)
print(rmse_rf)


### F.	Neural Network

In [None]:
# Normalize the data
# NOTE(review): scale() is computed on all rows before the split, so the test
# rows contribute to the means and standard deviations — consider fitting the
# scaling on the training rows only.
Xn = scale(X)

# Split features and target in ONE call so both are guaranteed to be cut on the
# same rows (same seed and stratification as the earlier split)
Xn_train, Xn_test, y_train, y_test = train_test_split(
    Xn, y, test_size =.3, random_state=1234, stratify=y
)

# One hidden layer of 20 logistic units; the seed fixes the weight initialisation
nnm = MLPClassifier(hidden_layer_sizes=(20,), max_iter=1000, activation='logistic', random_state=1234)

# Fit, then predict classes (for the threshold metrics) and class-1
# probabilities (for AUC)
nnm.fit(Xn_train, y_train)
y_pred_NN = nnm.predict(Xn_test)
y_score_NN = nnm.predict_proba(Xn_test)[:, 1]

print('\nClassification Report','\n',metrics.classification_report(y_test,y_pred_NN))
print ('Accuracy:', accuracy_score(y_test, y_pred_NN))
print ('F1 score:', f1_score(y_test, y_pred_NN))
print ('Recall:', recall_score(y_test, y_pred_NN))
print ('Precision:', precision_score(y_test, y_pred_NN))
print ('AUC:', roc_auc_score(y_test, y_score_NN))

# Root mean squared error of the 0/1 predictions
mse_nn = mean_squared_error(y_test, y_pred_NN)
rmse_nn = math.sqrt(mse_nn)
print(rmse_nn)


### G.	Naive Bayesian Using PCA25 Data Set

In [None]:
# Load the PCA features and the target written by the PCA section; both files
# carry the saved row index as "Unnamed: 0"
X = pd.read_csv("x_with_dummies_PCA25.csv")
X = X.drop(columns = 'Unnamed: 0')
# Take the target as a 1-D Series (not a one-column frame), which is the shape
# the estimators expect for y
y = pd.read_csv("y_with_dummies_PCA25.csv")['charged_off']

# splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =.3,random_state=1234, stratify=y)

# Gaussian Naive Bayes on the PCA features
gnb_PCA = GaussianNB()
gnb_PCA.fit(X_train, y_train)

# Class predictions for the threshold-based metrics, class-1 probabilities for AUC
y_pred_nb_pca = gnb_PCA.predict(X_test)
y_score_nb_pca = gnb_PCA.predict_proba(X_test)[:, 1]

cm_nb_pca = metrics.confusion_matrix(y_test,y_pred_nb_pca)
print(metrics.classification_report(y_test,y_pred_nb_pca))

print('PERFORMANCE SCORE: NB PCA25 DATASET')
print ('Accuracy:', accuracy_score(y_test, y_pred_nb_pca))
print ('F1 score:', f1_score(y_test, y_pred_nb_pca))
print ('Recall:', recall_score(y_test, y_pred_nb_pca))
print ('Precision:', precision_score(y_test, y_pred_nb_pca))
print ('AUC:', roc_auc_score(y_test, y_score_nb_pca))

# Root mean squared error of the 0/1 predictions of THIS model
mse_nb_pca = mean_squared_error(y_test, y_pred_nb_pca)
rmse_nb_pca = math.sqrt(mse_nb_pca)
print('RMSE:', rmse_nb_pca)


### H.	Decision Tree Using PCA25 Data Set

In [None]:
# Decision tree on the PCA features; the seed makes the fitted tree the same on
# every run
dtm_pca = DecisionTreeClassifier(random_state=1234)
dtm_pca.fit(X_train, y_train)

# Class predictions for the threshold-based metrics, class-1 probabilities for AUC
y_pred_dt_pca = dtm_pca.predict(X_test)
y_score_dt_pca = dtm_pca.predict_proba(X_test)[:, 1]

cm_dt_pca = metrics.confusion_matrix(y_test,y_pred_dt_pca)
print(metrics.classification_report(y_test,y_pred_dt_pca))

print ('Accuracy:', accuracy_score(y_test, y_pred_dt_pca))
print ('F1 score:', f1_score(y_test, y_pred_dt_pca))
print ('Recall:', recall_score(y_test, y_pred_dt_pca))
print ('Precision:', precision_score(y_test, y_pred_dt_pca))
print ('AUC:', roc_auc_score(y_test, y_score_dt_pca))

# Root mean squared error of the 0/1 predictions
mse_dt_pca = mean_squared_error(y_test, y_pred_dt_pca)
rmse_dt_pca = math.sqrt(mse_dt_pca)
print(rmse_dt_pca)


### I.	Random Forest Using PCA25 Data Set

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the PCA features; the seed makes the fitted forest the same
# on every run
rfcm_pca = RandomForestClassifier(random_state=1234)
rfcm_pca.fit(X_train, y_train)

# Class predictions for the threshold-based metrics, class-1 probabilities for AUC
y_pred_rf_pca = rfcm_pca.predict(X_test)
y_score_rf_pca = rfcm_pca.predict_proba(X_test)[:, 1]

print(metrics.classification_report(y_test,y_pred_rf_pca))
print ('Accuracy:', accuracy_score(y_test, y_pred_rf_pca))
print ('F1 score:', f1_score(y_test, y_pred_rf_pca))
print ('Recall:', recall_score(y_test, y_pred_rf_pca))
print ('Precision:', precision_score(y_test, y_pred_rf_pca))
print ('AUC:', roc_auc_score(y_test, y_score_rf_pca))

# Root mean squared error of the 0/1 predictions
mse_rf_pca = mean_squared_error(y_test, y_pred_rf_pca)
rmse_rf_pca = math.sqrt(mse_rf_pca)
print(rmse_rf_pca)


### J.	Neural Network Using PCA25 Data Set

In [None]:
# Normalize the data
# NOTE(review): scale() is computed on all rows before the split, so the test
# rows contribute to the means and standard deviations — consider fitting the
# scaling on the training rows only.
Xn = scale(X)

# Split features and target in ONE call so both are guaranteed to be cut on the
# same rows
Xn_train, Xn_test, y_train, y_test = train_test_split(
    Xn, y, test_size =.3, random_state=1234, stratify=y
)

# One hidden layer of 20 logistic units; the seed fixes the weight initialisation
nnm_PCA = MLPClassifier(hidden_layer_sizes=(20,), max_iter=1000, activation='logistic', random_state=1234)

# Fit, then predict classes (for the threshold metrics) and class-1
# probabilities (for AUC)
nnm_PCA.fit(Xn_train, y_train)
y_pred_NN_PCA = nnm_PCA.predict(Xn_test)
y_score_NN_PCA = nnm_PCA.predict_proba(Xn_test)[:, 1]

print('PERFORMANCE SCORE NN- ENTIRE DATASET')
# The report must describe this network's predictions, not the random forest's
print(metrics.classification_report(y_test,y_pred_NN_PCA), "\n")
print ('Accuracy:', accuracy_score(y_test, y_pred_NN_PCA))
print ('F1 score:', f1_score(y_test, y_pred_NN_PCA))
print ('Recall:', recall_score(y_test, y_pred_NN_PCA))
print ('Precision:', precision_score(y_test, y_pred_NN_PCA))
print ('AUC:', roc_auc_score(y_test, y_score_NN_PCA))

# Root mean squared error of the 0/1 predictions
mse_nn_pca = mean_squared_error(y_test, y_pred_NN_PCA)
rmse_nn_pca = math.sqrt(mse_nn_pca)
print(rmse_nn_pca)


# V.	Machine Learning Algorithms Using Oversampling and Undersampling

### A.	Library List and Opening File

In [None]:
import imblearn
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the model-ready table and remove the saved row index plus three features
unused_columns = ['Unnamed: 0','log_annual_inc', 'revol_bal', 'UR']
df = pd.read_csv("df2_with_dummies.csv").drop(columns = unused_columns)

# Target, and the feature matrix built from every other column. Infinite values
# become NaN first so dropna(axis=1) removes any column holding either.
y = df['charged_off']
X = (
    df.drop(columns = 'charged_off')
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
)

# 70/30 train/test split, stratified on the target, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size = .3, random_state = 1234, stratify = y
)
print("The shape of train data set using df2_with_dummies is: x = {} and y = {}".format(x_train.shape, y_train.shape))
print("The shape of test data set using df2_with_dummies is: x = {} and y = {}". format(x_test.shape, y_test.shape))


In [None]:
# Rebalance the TRAINING split only; the test split is left untouched.
# fit_resample is the current imbalanced-learn method name (fit_sample was
# removed), and random_state makes both resamplers reproducible.

# Random undersampling of the majority class
rus = RandomUnderSampler(sampling_strategy='majority', random_state=1234)
x_train_rus, y_train_rus = rus.fit_resample(x_train, y_train.to_numpy())

# SMOTE oversampling of the minority class
smote = SMOTE(sampling_strategy='minority', random_state=1234)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train.to_numpy())

print("Before RUS, counts of label '1': {}".format(sum(y_train==1)))
print("Before RUS, counts of label '0': {} \n".format(sum(y_train==0)))

# Shapes after undersampling (these names previously pointed at variables that
# were never defined)
print("The shape of train data set using df2_with_dummies is: x = {} and y = {}".format(x_train_rus.shape, y_train_rus.shape))
print("The shape of test data set using df2_with_dummies is: x = {} and y = {} \n". format(x_test.shape, y_test.shape))

print("After RUS, counts of label '1': {}".format(sum(y_train_rus==1)))
print("After RUS, counts of label '0': {}".format(sum(y_train_rus==0)))


### B.	Naive Bayesian Classifier Using df2_with_dummies Data Set

In [None]:
# One Gaussian Naive Bayes model per resampled training set
gnb_rus = GaussianNB()
gnb_smote = GaussianNB()

gnb_rus.fit(x_train_rus, y_train_rus)
gnb_smote.fit(x_train_smote, y_train_smote)

# Class predictions on the untouched test split
y_pred_gnb_rus = gnb_rus.predict(x_test)
y_pred_gnb_smote = gnb_smote.predict(x_test)

# Class-1 probabilities, used for AUC (which measures ranking quality and is
# not meaningful when computed from 0/1 labels)
y_score_gnb_rus = gnb_rus.predict_proba(x_test)[:, 1]
y_score_gnb_smote = gnb_smote.predict_proba(x_test)[:, 1]

# Build a confusion matrix and show the Classification Report
cm_nb_rus = metrics.confusion_matrix(y_test,y_pred_gnb_rus)
print('\nConfusion Matrix Naive Bayes - RUS','\n',cm_nb_rus)
print('\nClassification Report','\n',metrics.classification_report(y_test,y_pred_gnb_rus))
print("---------------------------------------------------------------------------------") 
cm_nb_smote = metrics.confusion_matrix(y_test,y_pred_gnb_smote)
print('\nConfusion Matrix Naive Bayes - SMOTE','\n',cm_nb_smote)
print('\nClassification Report Naive Bayes - SMOTE','\n',metrics.classification_report(y_test,y_pred_gnb_smote))
print("---------------------------------------------------------------------------------") 

print("Performance score for NB - RUS ")
print ('Accuracy:', accuracy_score(y_test, y_pred_gnb_rus))
print ('F1 score:', f1_score(y_test, y_pred_gnb_rus))
print ('Recall:', recall_score(y_test, y_pred_gnb_rus))
print ('Precision:', precision_score(y_test, y_pred_gnb_rus))
print ('AUC:', roc_auc_score(y_test, y_score_gnb_rus))

print("---------------------------------------------------------------------------------------")

print("Performance score for NB - SMOTE ")
print ('Accuracy:', accuracy_score(y_test, y_pred_gnb_smote))
print ('F1 score:', f1_score(y_test, y_pred_gnb_smote))
print ('Recall:', recall_score(y_test, y_pred_gnb_smote))
print ('Precision:', precision_score(y_test, y_pred_gnb_smote))
print ('AUC:', roc_auc_score(y_test, y_score_gnb_smote))


### C.	Decision Trees Classifier Using df2_with_dummies Data Set

In [None]:
# One decision tree per resampled training set; the seed makes each fitted tree
# the same on every run
dtm_rus = DecisionTreeClassifier(random_state=1234)
dtm_smote = DecisionTreeClassifier(random_state=1234)

# Fit each tree on its own resampled training data
dtm_rus.fit(x_train_rus, y_train_rus)
dtm_smote.fit(x_train_smote, y_train_smote)

# Predict on the untouched test split
y_pred_dt_rus = dtm_rus.predict(x_test)
y_pred_dt_smote = dtm_smote.predict(x_test)

# Build a confusion matrix and show the Classification Report
cm_dt_rus = metrics.confusion_matrix(y_test,y_pred_dt_rus)
print('\nConfusion Matrix DT RUS','\n',cm_dt_rus)
print('\nClassification Report DECISION TREE RUS','\n',metrics.classification_report(y_test,y_pred_dt_rus))
print('--------------------------------------------------------------------------------')

cm_dt_smote = metrics.confusion_matrix(y_test,y_pred_dt_smote)
print('\nConfusion Matrix DT SMOTE','\n',cm_dt_smote)
print('\nClassification Report DECISION TREE SMOTE','\n',metrics.classification_report(y_test,y_pred_dt_smote))
print('--------------------------------------------------------------------------------')


### D.	Random Forest Classifier Using df2_with_dummies Data Set

In [None]:
# One random forest per resampled training set; the seed makes each fitted
# forest the same on every run
rfcm_rus = RandomForestClassifier(random_state=1234)
rfcm_smote = RandomForestClassifier(random_state=1234)

# Fit each forest on its own resampled training data
rfcm_rus.fit(x_train_rus, y_train_rus)
rfcm_smote.fit(x_train_smote, y_train_smote)

# Predict on the untouched test split
y_pred_rf_rus = rfcm_rus.predict(x_test)
y_pred_rf_smote = rfcm_smote.predict(x_test)

# Build a confusion matrix and show the Classification Report
cm_rf_rus = metrics.confusion_matrix(y_test,y_pred_rf_rus)
print('\nConfusion Matrix RF RUS','\n',cm_rf_rus)
print('\nClassification Report RF RUS','\n',metrics.classification_report(y_test,y_pred_rf_rus))
print("---------------------------------------------------------------------------------------")


cm_rf_smote = metrics.confusion_matrix(y_test, y_pred_rf_smote)
print('\nConfusion Matrix RF SMOTE','\n',cm_rf_smote)
print('\nClassification Report RF SMOTE','\n',metrics.classification_report(y_test,y_pred_rf_smote))


In [None]:
# Summary scores for the two random forests. AUC is computed from the
# predicted class-1 probabilities rather than from the 0/1 predictions, which
# would only reflect a single threshold.
y_score_rf_rus = rfcm_rus.predict_proba(x_test)[:, 1]
y_score_rf_smote = rfcm_smote.predict_proba(x_test)[:, 1]

print("Performance score for RF - RUS ")
print ('Accuracy:', accuracy_score(y_test, y_pred_rf_rus))
print ('F1 score:', f1_score(y_test, y_pred_rf_rus))
print ('Recall:', recall_score(y_test, y_pred_rf_rus))
print ('Precision:', precision_score(y_test, y_pred_rf_rus))
print ('AUC:', roc_auc_score(y_test, y_score_rf_rus))

print("---------------------------------------------------------------------------------------")

print("Performance score for RF - SMOTE ")
print ('Accuracy:', accuracy_score(y_test, y_pred_rf_smote))
print ('F1 score:', f1_score(y_test, y_pred_rf_smote))
print ('Recall:', recall_score(y_test, y_pred_rf_smote))
print ('Precision:', precision_score(y_test, y_pred_rf_smote))
print ('AUC:', roc_auc_score(y_test, y_score_rf_smote))


### E.	Neural Network Classifier Using df2_with_dummies Data Set

In [None]:
# Normalize the data
# NOTE(review): scale() is computed on all rows before the split, so the test
# rows contribute to the means and standard deviations — consider fitting the
# scaling on the training rows only.
Xn = scale(X)

# Split features and target in ONE call so both are guaranteed to be cut on the
# same rows
Xn_train, Xn_test, y_train, y_test = train_test_split(
    Xn, y, test_size =.3, random_state=1234, stratify=y
)

# Rebalance the training split only. fit_resample is the current
# imbalanced-learn method name (fit_sample was removed); random_state makes the
# resampling reproducible.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=1234)
smote = SMOTE(sampling_strategy = 'minority', random_state=1234)
xn_train_rus, y_train_rus = rus.fit_resample(Xn_train, y_train.to_numpy())
xn_train_smote, y_train_smote = smote.fit_resample(Xn_train, y_train.to_numpy())

# One network per resampled training set: a single hidden layer of 20 logistic
# units, with a seed fixing the weight initialisation
nnm_rus = MLPClassifier(hidden_layer_sizes=(20,), max_iter=1000, activation='logistic', random_state=1234)
nnm_smote = MLPClassifier(hidden_layer_sizes=(20,), max_iter=1000, activation='logistic', random_state=1234)


In [None]:

# Fit each network on its own resampled training data
nnm_rus.fit(xn_train_rus, y_train_rus)
nnm_smote.fit(xn_train_smote, y_train_smote)

# Predict on the scaled test split, which was not resampled
y_pred_nn_rus = nnm_rus.predict(Xn_test)
y_pred_nn_smote = nnm_smote.predict(Xn_test)

print('\n ** Performance Scores **')
# Build a confusion matrix and show the Classification Report
cm_nn_rus = metrics.confusion_matrix(y_test,y_pred_nn_rus)
print('\nConfusion Matrix','\n',cm_nn_rus)
print('\nClassification Report Neural Network - RUS','\n',metrics.classification_report(y_test,y_pred_nn_rus))

print("---------------------------------------------------------------------------------")

# Same two reports for the SMOTE-trained network
cm_nn_smote = metrics.confusion_matrix(y_test,y_pred_nn_smote)
print('\nConfusion Matrix','\n',cm_nn_smote)
print('\nClassification Report Neural Network - SMOTE','\n',metrics.classification_report(y_test,y_pred_nn_smote))


In [None]:
# Summary scores for the two networks. AUC is computed from the predicted
# class-1 probabilities rather than from the 0/1 predictions, and each group of
# scores is labelled so the two models can be told apart in the output.
y_score_nn_rus = nnm_rus.predict_proba(Xn_test)[:, 1]
y_score_nn_smote = nnm_smote.predict_proba(Xn_test)[:, 1]

print("Performance score for NN - RUS ")
print ('Accuracy:', accuracy_score(y_test, y_pred_nn_rus))
print ('F1 score:', f1_score(y_test, y_pred_nn_rus))
print ('Recall:', recall_score(y_test, y_pred_nn_rus))
print ('Precision:', precision_score(y_test, y_pred_nn_rus))
print ('AUC:', roc_auc_score(y_test, y_score_nn_rus))
print("--------------------------------------------")
print("Performance score for NN - SMOTE ")
print ('Accuracy:', accuracy_score(y_test, y_pred_nn_smote))
print ('F1 score:', f1_score(y_test, y_pred_nn_smote))
print ('Recall:', recall_score(y_test, y_pred_nn_smote))
print ('Precision:', precision_score(y_test, y_pred_nn_smote))
print ('AUC:', roc_auc_score(y_test, y_score_nn_smote))


In [None]:
import math
from sklearn.metrics import mean_squared_error

# Root mean squared error between the true test labels and the hard
# predictions of the under-sampled network.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_nn_rus)
rmse = math.sqrt(mse)
print(rmse)


# VI.	Machine Learning Algorithms Using K-Fold Cross Validation

In [None]:
# Load the joined, dummy-encoded data set and discard the "Unnamed: 0"
# column that the CSV carries as its first column.
df2_with_dummies = (
    pd.read_csv("df2_with_dummies.csv", low_memory = False)
    .drop(columns = "Unnamed: 0")
)

# Target vector
y = df2_with_dummies['charged_off']

# Feature matrix: everything except the target. Infinite values are turned
# into NaN so that dropna(axis=1) removes every column containing either.
x = (
    df2_with_dummies
    .drop(columns = ['charged_off'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
)


### A.	Naive Bayesian

In [None]:
# Gaussian Naive Bayes evaluated with 5-fold cross-validation
gnb = GaussianNB()

scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc', 'neg_root_mean_squared_error']
scores = cross_validate(gnb, x, y, cv = 5, scoring = scoring)

# Scores reported as returned: (message prefix, key in the result dict)
reported_as_is = [
    ("Average accuracy is: ", 'test_accuracy'),
    ("Average F1 is: ", 'test_f1'),
    ("Average recall/sensitivity is: ", 'test_recall'),
    ("Average precision is: ", 'test_precision'),
    ("Average ROC is: ", 'test_roc_auc'),
]
for prefix, key in reported_as_is:
    fold_values = scores[key]
    print(prefix, fold_values.mean(), "with values as following: ", fold_values)

# The RMSE scorer is negated (name starts with 'neg_'), so flip the sign
# back before reporting.
neg_rmse = scores['test_neg_root_mean_squared_error']
print("Average RMSE is: ", -neg_rmse.mean(), "with values as following: ", -neg_rmse)


### B.	Decision Tree

In [None]:
# Rebuild target and features from the loaded frame
y = df2_with_dummies['charged_off']
x = (
    df2_with_dummies
    .drop(columns = ['charged_off'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
)

# Decision tree evaluated with 5-fold cross-validation
dtm = DecisionTreeClassifier()
scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc', 'neg_root_mean_squared_error']
scores = cross_validate(dtm, x, y, cv = 5, scoring = scoring)

# Only the fold averages are reported for this model
mean_accuracy = scores['test_accuracy'].mean()
mean_f1 = scores['test_f1'].mean()
mean_recall = scores['test_recall'].mean()
mean_precision = scores['test_precision'].mean()
mean_roc = scores['test_roc_auc'].mean()
# The RMSE scorer is negated, so flip the sign back
mean_rmse = -scores['test_neg_root_mean_squared_error'].mean()

print("Average accuracy is: ", mean_accuracy)
print("Average F1 is: ", mean_f1)
print("Average recall/sensitivity is: ", mean_recall)
print("Average precision is: ", mean_precision)
print("Average ROC is: ", mean_roc)
print("Average RMSE is: ", mean_rmse)


### C.	Random Forest

In [None]:
# Rebuild target and features from the loaded frame
y = df2_with_dummies['charged_off']
x = (
    df2_with_dummies
    .drop(columns = ['charged_off'])
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
)

# Random forest evaluated with 5-fold cross-validation
rfcm = RandomForestClassifier()
scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc', 'neg_root_mean_squared_error']
scores = cross_validate(rfcm, x, y, cv = 5, scoring = scoring)


def _show(prefix, fold_values):
    """Print the fold average followed by the individual fold values."""
    print(prefix, fold_values.mean(), "with values as following: ", fold_values)


_show("Average accuracy is: ", scores['test_accuracy'])
_show("Average F1 is: ", scores['test_f1'])
_show("Average recall/sensitivity is: ", scores['test_recall'])
_show("Average precision is: ", scores['test_precision'])
_show("Average ROC is: ", scores['test_roc_auc'])

# The RMSE scorer is negated, so flip the sign back before reporting
neg_rmse = scores['test_neg_root_mean_squared_error']
print("Average RMSE is: ", -neg_rmse.mean(), "with values as following: ", -neg_rmse)


### D.	Neural Network

In [None]:
from sklearn.pipeline import make_pipeline

# Rebuild target and features from the loaded frame
y = df2_with_dummies['charged_off']
x = df2_with_dummies.drop(columns = ['charged_off'])
x = x.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

# The scaler lives inside the pipeline so that, in every cross-validation
# fold, it is fitted on that fold's training rows only. Scaling the whole
# matrix up front would let the held-out rows influence the scaling
# statistics and bias the scores.
nnm = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(20,), max_iter=2000,activation='logistic'),
)

scores = cross_validate(nnm, x, y, cv = 5, scoring = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc', 'neg_root_mean_squared_error'])

print("Average accuracy is: ", scores['test_accuracy'].mean(), "with values as following: ", scores['test_accuracy'])
print("Average F1 is: ", scores['test_f1'].mean(), "with values as following: ", scores['test_f1'])
print("Average recall/sensitivity is: ", scores['test_recall'].mean(), "with values as following: ", scores['test_recall'])
print("Average precision is: ", scores['test_precision'].mean(), "with values as following: ", scores['test_precision'])
print("Average ROC is: ", scores['test_roc_auc'].mean(), "with values as following: ", scores['test_roc_auc'])
# The RMSE scorer is negated, so flip the sign back before reporting
print("Average RMSE is: ", -scores['test_neg_root_mean_squared_error'].mean(), "with values as following: ", -scores['test_neg_root_mean_squared_error'])


# VII.	Area Under Curve Plotting

### A.	Library List and Opening the File

In [None]:
import pandas as pd, numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.datasets import make_classification
from matplotlib import pyplot as plt
import imblearn
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from sklearn.model_selection  import train_test_split
from sklearn import metrics
import statsmodels.api as sm
from sklearn.metrics import *
import statsmodels.formula.api as smf
import random

from imblearn.under_sampling import RandomUnderSampler


In [None]:
# Load the data set and discard the columns not used in this section
unused_columns = ['Unnamed: 0','log_annual_inc', 'revol_bal', 'UR']
df = pd.read_csv("df2_with_dummies.csv").drop(columns = unused_columns)

# Target and features; columns holding NaN or infinite values are removed
y = df['charged_off']
X = (
    df.drop(columns = 'charged_off')
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
)

# 70/30 stratified train/test split with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size =.3, random_state=1234, stratify=y
)

train_message = "The shape of train data set using df2_with_dummies is: x = {} and y = {}"
test_message = "The shape of test data set using df2_with_dummies is: x = {} and y = {}"
print(train_message.format(x_train.shape, y_train.shape))
print(test_message.format(x_test.shape, y_test.shape))


In [None]:
# Under-sample the majority class of the training split. A fixed
# random_state makes the resampled set reproducible.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=1234)

# fit_resample is the supported imbalanced-learn API; the older fit_sample
# alias was deprecated and later removed.
x_train_rus, y_train_rus = rus.fit_resample(x_train, y_train.ravel())

# Standardize the full feature matrix (used by the neural-network cells).
# NOTE(review): the scaling statistics are computed on all rows before the
# train/test split, so test rows influence them - consider fitting the
# scaler on the training rows only.
Xn = scale(X)


### A.	Neural Network

In [None]:
# Stratify on y so both splits keep the class proportions of the full data.
# The two calls share test_size, random_state and stratify, so the scaled
# features and the labels are split on the same rows.
Xn_train, Xn_test = train_test_split(Xn, test_size =.3,random_state=1234, stratify=y)
y_train, y_test = train_test_split(y, test_size=.3, random_state=1234, stratify=y)

# Under-sample the majority class of the training split only. A fixed
# random_state makes the resampled set reproducible.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=1234)

# fit_resample is the supported imbalanced-learn API; the older fit_sample
# alias was deprecated and later removed.
xn_train_rus, y_train_rus = rus.fit_resample(Xn_train, y_train.ravel())

# Train on the balanced data, predict on the untouched test split
nnm_rus = MLPClassifier(hidden_layer_sizes=(20,), max_iter=2000,activation='logistic')
nnm_rus.fit(xn_train_rus, y_train_rus)

y_pred_nn_rus = nnm_rus.predict(Xn_test)


In [None]:
# A ROC curve needs a continuous score: with hard 0/1 predictions roc_curve
# only sees one threshold and the "curve" degenerates to a single corner
# point. Use the predicted probability of the positive class instead.
# Column 1 belongs to nnm_rus.classes_[1]; assumes the target is coded 0/1 - TODO confirm.
y_score_nn_rus = nnm_rus.predict_proba(Xn_test)[:, 1]
false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, y_score_nn_rus)

plt.subplots(1, figsize=(10,10))
plt.title('ROC Curve for NN - RUS', fontsize = 30)
plt.plot(false_positive_rate1, true_positive_rate1)
# Chance diagonal
plt.plot([0, 1], ls="--")
# Grey guides along the left edge (x = 0) and the top edge (y = 1)
plt.plot([0, 0], [1, 0] , c=".7")
plt.plot([1, 1] , c=".7")
plt.ylabel('True Positive Rate', fontsize = 20)
plt.xlabel('False Positive Rate', fontsize = 20)
plt.show()


In [None]:
# Target and features; columns holding NaN or infinite values are removed
y = df['charged_off']
X = (
    df.drop(columns = 'charged_off')
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1)
)


In [None]:
# Standardize the features for the neural network.
# NOTE(review): the scaling statistics are computed on all rows before the
# split, so test rows influence them - consider fitting on training rows only.
Xn = scale(X)

# One stratified 70/30 split applied to the raw features, the scaled
# features and the target together, so all three are cut on the same rows.
# X_train / X_test (unscaled) are kept for the random-forest cells.
X_train, X_test, Xn_train, Xn_test, y_train, y_test = train_test_split(
    X, Xn, y, test_size =.3, random_state=1234, stratify=y
)

nnm = MLPClassifier(hidden_layer_sizes=(20,), max_iter=1000,activation='logistic')

# Train on the original (not resampled) training split and predict
nnm.fit(Xn_train, y_train)
y_pred_NN = nnm.predict(Xn_test)

# A ROC curve needs a continuous score: with hard 0/1 predictions the curve
# degenerates to a single corner point. Use the predicted probability of
# the positive class instead.
# Column 1 belongs to nnm.classes_[1]; assumes the target is coded 0/1 - TODO confirm.
y_score_NN = nnm.predict_proba(Xn_test)[:, 1]
false_positive_rate1, true_positive_rate1, threshold1 = roc_curve(y_test, y_score_NN)

plt.subplots(1, figsize=(10,10))
plt.title('ROC Curve for NN - Before RUS', fontsize = 30)
plt.plot(false_positive_rate1, true_positive_rate1)
# Chance diagonal
plt.plot([0, 1], ls="--")
# Grey guides along the left edge (x = 0) and the top edge (y = 1)
plt.plot([0, 0], [1, 0] , c=".7")
plt.plot([1, 1] , c=".7")
plt.ylabel('True Positive Rate', fontsize = 20)
plt.xlabel('False Positive Rate', fontsize = 20)

plt.show()


### B.	Random Forest

In [None]:
# Random forest trained on the original (not resampled) training split
rfcm_pca = RandomForestClassifier()
rfcm_pca.fit(X_train, y_train)
y_pred_rf_pca = rfcm_pca.predict(X_test)

# roc_curve needs a continuous score; hard 0/1 predictions give a
# degenerate single-threshold curve, so use the positive-class probability.
# Column 1 belongs to rfcm_pca.classes_[1]; assumes the target is coded 0/1 - TODO confirm.
y_score_rf_pca = rfcm_pca.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_rf_pca)


In [None]:
def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve together with the chance diagonal and show the figure.

    Parameters
    ----------
    fpr : false positive rates (x values of the curve)
    tpr : true positive rates (y values of the curve)
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, label='ROC', color='orange')
    plt.plot(diagonal, diagonal, linestyle='--', color='darkblue')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()


plot_roc_curve(fpr, tpr)


In [None]:
# Random forest trained on the under-sampled training split
rfcm_pca = RandomForestClassifier()
rfcm_pca.fit(x_train_rus, y_train_rus)
y_pred_rf_pca = rfcm_pca.predict(X_test)

# roc_curve needs a continuous score; hard 0/1 predictions give a
# degenerate single-threshold curve, so use the positive-class probability.
# Column 1 belongs to rfcm_pca.classes_[1]; assumes the target is coded 0/1 - TODO confirm.
y_score_rf_pca = rfcm_pca.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_rf_pca)

# plot_roc_curve is defined in the previous cell; it is reused here rather
# than defined a second time.
plot_roc_curve(fpr, tpr)


# VII.	Getting Important Features

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

# Fit an extra-trees ensemble on the full feature matrix and read the
# importance it assigns to each feature.
model = ExtraTreesClassifier()
model.fit(X, y)
print(model.feature_importances_)

# Label the importances with the column names and chart the ten largest
feat_importances = pd.Series(model.feature_importances_, index = X.columns)
feat_importances.nlargest(10).plot(kind = 'barh')
# plt.show must be called; the bare attribute only evaluates to the function
plt.show()


# VIII.	Attempting Clustering Using K-Means Clustering

In [None]:
# Rescale every feature with min-max scaling before clustering. The number
# of clusters k is found with the elbow method in the next cell, and the
# k clusters are created after that.
df_n = minmax_scale(X)
df_n


In [None]:

# Elbow method: fit k-means for every k in 2..49 and record the inertia
# (sum of squared distances) of each fit.
cluster_counts = range(2,50)
ssd = [] # Initialize the list for inertia values - sum of squared distances
for i in cluster_counts:
    km = KMeans(n_clusters=i, random_state=1234)
    km.fit(df_n)
    ssd.append(km.inertia_)

# Check the inertia values.
for k, inertia in zip(cluster_counts, ssd):
    print('{0}: {1:.2f}'.format(k, inertia))

# Draw the plot to find the elbow
plt.plot(cluster_counts, ssd)
plt.grid(True)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances')
plt.show()

# Final model with k = 8 (presumably read off the elbow plot - confirm)
km = KMeans(n_clusters=8, random_state=1234)

# Create clusters
km.fit(df_n)
# By default only the last expression of a cell is rendered, so this bare
# expression (and the .head() / .describe() ones below) displays nothing.
km.inertia_

# Add the cluster number to the original data.
# NOTE(review): assumes the rows of df_n are in the same order as the rows
# of df2_with_dummies - confirm.
df2_with_dummies['ClusterNo'] = km.labels_
df2_with_dummies.head()

# Divide the original data into the clusters.
# The mask previously referenced the undefined name df_with_dummies.
Cluster0 = df2_with_dummies.loc[df2_with_dummies.ClusterNo == 0]
Cluster0.describe()
Cluster0.info()

Cluster1 = df2_with_dummies.loc[df2_with_dummies.ClusterNo == 1]
Cluster2 = df2_with_dummies.loc[df2_with_dummies.ClusterNo == 2]
Cluster3 = df2_with_dummies.loc[df2_with_dummies.ClusterNo == 3]
Cluster4 = df2_with_dummies.loc[df2_with_dummies.ClusterNo == 4]
Cluster5 = df2_with_dummies.loc[df2_with_dummies.ClusterNo == 5]
Cluster6 = df2_with_dummies.loc[df2_with_dummies.ClusterNo == 6]
Cluster7 = df2_with_dummies.loc[df2_with_dummies.ClusterNo == 7]

# Now, you can apply ml algorithms to each cluster.
 


In [None]:
# Description of profiles of each cluster
# (by default only the last expression of a cell is rendered, so these
# bare expressions compute the summaries without displaying them)
df2_with_dummies.groupby(['ClusterNo']).count()
df2_with_dummies.groupby(['ClusterNo']).mean()
df2_with_dummies.groupby(['ClusterNo']).median()
df2_with_dummies.groupby(['ClusterNo']).max()
df2_with_dummies.groupby(['ClusterNo']).min()


def _cluster_features_target(cluster):
    """Return (features, target) for one cluster's rows.

    The features are every column except 'charged_off' and 'ClusterNo';
    columns that contain NaN or infinite values within this cluster are
    removed. The target is the cluster's 'charged_off' column.
    """
    features = cluster.drop(columns=['charged_off','ClusterNo'])
    features = features.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    return features, cluster.charged_off


# Set up X and y for each cluster.
# Going through one helper keeps every cluster on the same code path; the
# hand-written version cleaned the full matrix X instead of X0 for
# cluster 0, leaving X0 and y0 with different numbers of rows.
X0, y0 = _cluster_features_target(Cluster0)
X1, y1 = _cluster_features_target(Cluster1)
X2, y2 = _cluster_features_target(Cluster2)
X3, y3 = _cluster_features_target(Cluster3)
X4, y4 = _cluster_features_target(Cluster4)
X5, y5 = _cluster_features_target(Cluster5)
X6, y6 = _cluster_features_target(Cluster6)
X7, y7 = _cluster_features_target(Cluster7)


In [None]:
# Stratified 70/30 train/test split inside every cluster (same seed for all)
X_train0, X_test0, y_train0, y_test0 = train_test_split(X0, y0, test_size=.3, random_state=1234, stratify=y0)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=.3, random_state=1234, stratify=y1)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=.3, random_state=1234, stratify=y2)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=.3, random_state=1234, stratify=y3)
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size=.3, random_state=1234, stratify=y4)
X_train5, X_test5, y_train5, y_test5 = train_test_split(X5, y5, test_size=.3, random_state=1234, stratify=y5)
X_train6, X_test6, y_train6, y_test6 = train_test_split(X6, y6, test_size=.3, random_state=1234, stratify=y6)
X_train7, X_test7, y_train7, y_test7 = train_test_split(X7, y7, test_size=.3, random_state=1234, stratify=y7)

# One random forest per cluster: create and fit it on the cluster's
# training rows, then predict the cluster's test rows. Clusters are
# processed in order 0..7.
rfcm0 = RandomForestClassifier().fit(X_train0, y_train0)
y_pred0 = rfcm0.predict(X_test0)

rfcm1 = RandomForestClassifier().fit(X_train1, y_train1)
y_pred1 = rfcm1.predict(X_test1)

rfcm2 = RandomForestClassifier().fit(X_train2, y_train2)
y_pred2 = rfcm2.predict(X_test2)

rfcm3 = RandomForestClassifier().fit(X_train3, y_train3)
y_pred3 = rfcm3.predict(X_test3)

rfcm4 = RandomForestClassifier().fit(X_train4, y_train4)
y_pred4 = rfcm4.predict(X_test4)

rfcm5 = RandomForestClassifier().fit(X_train5, y_train5)
y_pred5 = rfcm5.predict(X_test5)

rfcm6 = RandomForestClassifier().fit(X_train6, y_train6)
y_pred6 = rfcm6.predict(X_test6)

rfcm7 = RandomForestClassifier().fit(X_train7, y_train7)
y_pred7 = rfcm7.predict(X_test7)


In [None]:
 
def _print_cluster_scores(accuracy_template, accuracy, cm, y_true, y_pred):
    """Print the header, accuracy, confusion matrix and classification report of one cluster."""
    print('\n ** Performance Scores **')
    print(accuracy_template.format(accuracy))
    print('\nConfusion Matrix','\n',cm)
    print('\nClassification Report','\n',metrics.classification_report(y_true,y_pred))


# Cluster 0
accuracy0 = rfcm0.score(X_test0, y_test0)
cm0 = metrics.confusion_matrix(y_test0,y_pred0)
_print_cluster_scores('Accuracy0: {0:.2f}', accuracy0, cm0, y_test0, y_pred0)

# Cluster 1
accuracy1 = rfcm1.score(X_test1, y_test1)
cm1 = metrics.confusion_matrix(y_test1,y_pred1)
_print_cluster_scores('Accuracy1: {0:.2f}', accuracy1, cm1, y_test1, y_pred1)

# Cluster 2
accuracy2 = rfcm2.score(X_test2, y_test2)
cm2 = metrics.confusion_matrix(y_test2,y_pred2)
_print_cluster_scores('Accuracy2: {0:.2f}', accuracy2, cm2, y_test2, y_pred2)

# Cluster 3
accuracy3 = rfcm3.score(X_test3, y_test3)
cm3 = metrics.confusion_matrix(y_test3,y_pred3)
_print_cluster_scores('Accuracy3: {0:.2f}', accuracy3, cm3, y_test3, y_pred3)

# Header for the cluster-4 results that the next cell prints
print('\n ** Performance Scores **')


In [None]:
# Random-forest results for clusters 4-7. For each cluster: test accuracy,
# confusion matrix and classification report. The header for cluster 4 is
# printed at the end of the previous cell.

# Cluster 4
accuracy4 = rfcm4.score(X_test4, y_test4)
print('Accuracy4: {0:.2f}'.format(accuracy4))
cm4 = metrics.confusion_matrix(y_test4,y_pred4)
print('\nConfusion Matrix','\n',cm4)
print('\nClassification Report','\n',metrics.classification_report(y_test4,y_pred4))

# Cluster 5
print('\n ** Performance Scores **')
accuracy5 = rfcm5.score(X_test5, y_test5)
print('Accuracy5: {0:.2f}'.format(accuracy5))
cm5 = metrics.confusion_matrix(y_test5,y_pred5)
# Print this cluster's own matrix (cm5); the name `cm` is not assigned
# anywhere in this cell.
print('\nConfusion Matrix','\n',cm5)
print('\nClassification Report','\n',metrics.classification_report(y_test5,y_pred5))

# Cluster 6
print('\n ** Performance Scores **')
accuracy6 = rfcm6.score(X_test6, y_test6)
print('Accuracy6: {0:.2f}'.format(accuracy6))
cm6 = metrics.confusion_matrix(y_test6,y_pred6)
print('\nConfusion Matrix','\n',cm6)
print('\nClassification Report','\n',metrics.classification_report(y_test6,y_pred6))

# Cluster 7
print('\n ** Performance Scores **')
accuracy7 = rfcm7.score(X_test7, y_test7)
print('Accuracy7: {0:.2f}'.format(accuracy7))
cm7 = metrics.confusion_matrix(y_test7,y_pred7)
print('\nConfusion Matrix','\n',cm7)
print('\nClassification Report','\n',metrics.classification_report(y_test7,y_pred7))


In [None]:
# One Gaussian Naive Bayes classifier per cluster: fit on the cluster's
# training rows, then predict its test rows. The y_pred0..y_pred7 names
# are reassigned here, replacing the random-forest predictions.
gnb0 = GaussianNB().fit(X_train0, y_train0)
y_pred0 = gnb0.predict(X_test0)

gnb1 = GaussianNB().fit(X_train1, y_train1)
y_pred1 = gnb1.predict(X_test1)

gnb2 = GaussianNB().fit(X_train2, y_train2)
y_pred2 = gnb2.predict(X_test2)

gnb3 = GaussianNB().fit(X_train3, y_train3)
y_pred3 = gnb3.predict(X_test3)

gnb4 = GaussianNB().fit(X_train4, y_train4)
y_pred4 = gnb4.predict(X_test4)

gnb5 = GaussianNB().fit(X_train5, y_train5)
y_pred5 = gnb5.predict(X_test5)

gnb6 = GaussianNB().fit(X_train6, y_train6)
y_pred6 = gnb6.predict(X_test6)

gnb7 = GaussianNB().fit(X_train7, y_train7)
y_pred7 = gnb7.predict(X_test7)
 


In [None]:
# Test accuracy of each cluster's Naive Bayes model
accuracy0 = gnb0.score(X_test0, y_test0)
accuracy1 = gnb1.score(X_test1, y_test1)
accuracy2 = gnb2.score(X_test2, y_test2)
accuracy3 = gnb3.score(X_test3, y_test3)
accuracy4 = gnb4.score(X_test4, y_test4)
accuracy5 = gnb5.score(X_test5, y_test5)
accuracy6 = gnb6.score(X_test6, y_test6)
accuracy7 = gnb7.score(X_test7, y_test7)

# Report them in cluster order, rounded to two decimals
accuracy_lines = [
    ('Accuracy0: {0:.2f}', accuracy0),
    ('Accuracy1: {0:.2f}', accuracy1),
    ('Accuracy2: {0:.2f}', accuracy2),
    ('Accuracy3: {0:.2f}', accuracy3),
    ('Accuracy4: {0:.2f}', accuracy4),
    ('Accuracy5: {0:.2f}', accuracy5),
    ('Accuracy6: {0:.2f}', accuracy6),
    ('Accuracy7: {0:.2f}', accuracy7),
]
for template, value in accuracy_lines:
    print(template.format(value))


In [None]:
# Confusion matrix of each cluster's Naive Bayes predictions
# (stored in cm0..cm7; they are not printed in this cell)
cm0 = metrics.confusion_matrix(y_test0,y_pred0)
cm1 = metrics.confusion_matrix(y_test1,y_pred1)
cm2 = metrics.confusion_matrix(y_test2,y_pred2)
cm3 = metrics.confusion_matrix(y_test3,y_pred3)
cm4 = metrics.confusion_matrix(y_test4,y_pred4)
cm5 = metrics.confusion_matrix(y_test5,y_pred5)
cm6 = metrics.confusion_matrix(y_test6,y_pred6)
cm7 = metrics.confusion_matrix(y_test7,y_pred7)

# Classification report per cluster, in cluster order
truth_and_predictions = [
    (y_test0, y_pred0),
    (y_test1, y_pred1),
    (y_test2, y_pred2),
    (y_test3, y_pred3),
    (y_test4, y_pred4),
    (y_test5, y_pred5),
    (y_test6, y_pred6),
    (y_test7, y_pred7),
]
for y_true, y_hat in truth_and_predictions:
    print(metrics.classification_report(y_true,y_hat))
