In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import math
import seaborn as sns 
import sklearn
from uszipcode import SearchEngine
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import pickle
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw SBA loan table into `credit`; every preprocessing cell below works on this frame.
# low_memory=False parses the file in one pass so column dtypes are not inferred chunk by chunk.
# NOTE(review): absolute, machine-specific path - the cell only runs where this file exists.
credit = pd.read_csv(r"C:\Users\Dell\Documents\Python Credit Risk\Data\SBAnational.csv", low_memory=False)

In [None]:
# Preview the first rows of the raw table.
credit.head()

In [None]:
# Summary statistics of the columns pandas parsed as numeric.
credit.describe()

In [None]:
# Column dtypes and non-null counts, to see which columns need type fixes or imputation.

credit.info()

In [None]:
# List the column names of the raw table.
print(credit.columns)

In [None]:
# Preprocessing and feature engineering
#
# We use the description of the features from the PDF we received.
# Its authors added some derived features (recession flag, share of the loan guaranteed by the SBA,
# loans backed by real estate) which we could build too.
# We will have to deal with all the missing values: either impute them or delete the rows.
# We will have to encode the yes/no features as 0/1, and the target as 0/1 (default / paid).
# We will have to deal with the dates: use them to build derived columns and discard them afterwards.
# City / State / Zip describe roughly the same thing; likewise Bank / BankState, which we will
# delete because they say nothing about the customer.
# NAICS is the type of business, so we might need it.
# We have to fix the NAs.
# We have to decide which features to keep as predictors, which to drop, and whether we need to build new ones.


In [None]:
# Number of missing values per column.
credit.isnull().sum()

In [None]:
# Distribution of the target column. The recorded run showed an imbalanced target with some
# missing labels: Counter({'P I F': 739609, 'CHGOFF': 157558, nan: 1997}).
Counter(credit['MIS_Status'])  # unbalanced + nas Counter({'P I F': 739609, 'CHGOFF': 157558, nan: 1997})

In [None]:
# Fill missing State values by looking the row's ZIP code up with uszipcode.
# Rows and columns are addressed by label ('Zip', 'State'): `missing_rows` holds index labels,
# so positional `iloc` with hard-coded column numbers is only right for one particular
# column order and an untouched default index.
missing_rows = credit.index[credit['State'].isnull()]
search = SearchEngine()

for row_label in missing_rows:
    zipcode = search.by_zipcode(credit.loc[row_label, 'Zip'])
    # An unknown or invalid ZIP may yield no match (or a match without a state);
    # leave such rows missing so the next cell can drop them.
    state = getattr(zipcode, 'state', None)
    if state:
        credit.loc[row_label, 'State'] = state

In [None]:
# After the ZIP-based imputation the recorded run still had 2 rows without a State
# (one with zipcode 0, one with a ZIP the lookup does not know). Remove those rows.
state_required = ['State']
credit = credit.dropna(subset=state_required, how='any')

In [None]:
# Rows with a missing MIS_Status (the target) are removed: they cannot be imputed in a way
# we could trust to be representative, and there are not many of them.
credit = credit.dropna(subset=['MIS_Status'], how='any')

# Relabel the target: "P I F" (paid in full) -> 1, "CHGOFF" (charged off) -> 0.
# Any other value is left untouched.
raw_status = credit['MIS_Status']
relabelled = raw_status.where(raw_status != "P I F", 1)
relabelled = relabelled.where(relabelled != "CHGOFF", 0)
credit["MIS_Status"] = relabelled.astype(object)

In [None]:
# NewExist: impute missing values and recode to 0 = new business, 1 = established business.
# The fillna result is assigned back instead of using `credit[col].fillna(..., inplace=True)`:
# that chained in-place form acts on a temporary and has no effect under pandas Copy-on-Write.
credit["NewExist"] = credit["NewExist"].astype(object).fillna(1)  # missing -> 1 (most frequent)
credit.loc[credit['NewExist'] == 0, 'NewExist'] = 1 # the undefined code 0 -> 1 (most frequent)
credit.loc[credit['NewExist'] == 2, 'NewExist'] = 0 # 2 -> 0, so 0 = new, 1 = established
credit["NewExist"] = credit["NewExist"].astype(object)

In [None]:
# Add the "RealEstate" flag for loans presumed to be backed by real estate:
# RealEstate = 1 if Term >= 240 months, RealEstate = 0 if Term < 240 months.
# The boundary is inclusive so that loans of exactly 240 months are flagged, which is how the
# dataset's reference paper defines the variable. The counts quoted earlier for this cell
# (Counter({0: 831027, 1: 66138})) came from a strict "> 240" comparison and will change.
credit['RealEstate'] = np.where(credit['Term'] >= 240, 1, 0)

In [None]:
# LowDoc loan program flag: impute and recode to 0 = no, 1 = yes.
# Missing values get the most frequent value "N"; the stray codes C, 1, S, A, R, 0 are also
# mapped to "N". The fillna result is assigned back because the chained
# `credit[col].fillna(..., inplace=True)` form has no effect under pandas Copy-on-Write.
credit['LowDoc'] = credit['LowDoc'].fillna("N")
credit['LowDoc'] = credit.LowDoc.replace(dict.fromkeys(['C','1','S','A','R','0'], 'N'))

# Change the labels from N / Y to 0 for no and 1 for yes.
credit.loc[credit['LowDoc'] == "N", 'LowDoc'] = 0
credit.loc[credit['LowDoc'] == "Y", 'LowDoc'] = 1
credit["LowDoc"] = credit.LowDoc.astype(object)

In [None]:
# Build "Portion": the percentage of the loan guaranteed by the SBA, i.e. the ratio of the
# SBA-guaranteed amount to the gross amount approved by the bank (SBA_Appv / GrAppv).
cols_to_change = ['SBA_Appv', 'GrAppv']

for col in cols_to_change:
    # Strip the currency formatting ("$", thousands separators, surrounding blanks) by pattern
    # rather than by character position: fixed slicing silently drops the last digit when a
    # value has no trailing blank, and breaks when this cell is re-run on already numeric data.
    credit[col] = (
        credit[col]
        .astype(str)
        .str.replace(r'[$,\s]', '', regex=True)
        .astype(float)
    )

credit['Portion'] = ((credit['SBA_Appv'] / credit['GrAppv']) * 100).round()  # Build the new column
# NOTE(review): the int cast raises if any ratio is NaN/inf (e.g. GrAppv == 0) - confirm none exist.
credit["Portion"] = credit.Portion.astype(int)

In [None]:
# NAICS is the industry code. In the recorded run 201666 rows carried the placeholder value 0.

# Look at the target split of those rows (recorded counts: 184868 and 16798) before
# discarding them; the decision taken is to delete them.
NAICS_0 = credit.loc[credit['NAICS'].eq(0)]
Counter(NAICS_0['MIS_Status']).values()
credit = credit.loc[credit['NAICS'].ne(0)]

# We will leave just the two first numbers of the code that tells the industry(general) instead of the 5 numbers that are more in detail
def first_two(d):
    """Return the two leading decimal digits of the integer ``d`` as an int.

    Used to reduce a detailed NAICS code to its two-digit sector (541110 -> 54).
    The digits are taken from the decimal string rather than via ``math.log``:
    floating-point rounding makes ``math.log(1000, 10)`` slightly less than 3, which
    yields 100 instead of 10 for exact powers of ten. Values with fewer than two
    digits (including 0) are returned unchanged; the sign is ignored.
    """
    digits = str(abs(int(d)))
    return int(digits[:2])
# Reduce every NAICS code to its two-digit sector and store it as a categorical-like object column.
naics_codes = credit["NAICS"].astype(int)
naics_sector = naics_codes.apply(first_two)
credit["NAICS"] = naics_sector
credit["NAICS"] = credit["NAICS"].astype(object)

In [None]:
# Pairwise scatter plots of the columns seaborn treats as numeric, coloured by the target,
# to eyeball which features separate the two classes.
sns.pairplot(credit, kind="scatter", hue="MIS_Status")
plt.show()

In [None]:
# Column pruning, based on the pair plot above and on what is known at application time.
columns_to_drop = [
    # Name and id are unique per row; City duplicates State/Zip; Bank and BankState have many
    # values and are not related to the customer.
    'LoanNr_ChkDgt', 'City', 'Name', 'Bank', 'BankState',
    # Zip was already used to fill State; Term was used to build RealEstate; SBA_Appv was used
    # to build Portion; the date columns have many NAs and are not used for prediction.
    'Zip', 'ChgOffDate', 'DisbursementDate', 'SBA_Appv', 'ApprovalDate', 'ApprovalFY', 'Term',
    # Judged uninformative about the label in the pair plot.
    'NoEmp', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'RevLineCr',
    # Amounts only known after the end of the credit period, so unusable for prediction.
    'DisbursementGross', 'BalanceGross', 'ChgOffPrinGr',
]
credit = credit.drop(columns=columns_to_drop)

In [None]:
# Inspect the distribution of the approved amount (GrAppv) to decide whether to bin it.
# NOTE(review): the original note calls it "skewed to the left"; a mass of small loans with a
# long tail of large ones would be right-skewed - confirm against the rendered plot.
# sns.histplot replaces the deprecated sns.distplot. With kde=False the bars are raw counts,
# so the y axis is labelled "Count" rather than "Density".
plt.figure(figsize = (15, 8))
sns.histplot(credit.GrAppv, color = "g", kde = False, bins = 50)
plt.ylabel('Count')
plt.title('Distribution of Approved ammount')
plt.show()

In [None]:
# Discretise the two remaining numeric features into equal-width intervals:
# GrAppv into 4 bins and Portion into 5 bins.
for column, n_bins in (('GrAppv', 4), ('Portion', 5)):
    credit[column] = pd.cut(credit[column], bins=n_bins)

In [None]:
# Save the preprocessed dataframe to the current working directory.
# to_csv writes the index as the first (unnamed) column by default, so whoever reads this
# file back has to treat that first column as the index.
credit.to_csv('credit_preprocessed.csv')

In [None]:
# Bar chart of the label counts.
# Class imbalance: there are many more 1 (paid) than 0 (not paid) loans. We will try to fix this
# by downsampling the majority class.
credit['MIS_Status'].value_counts().plot(kind='bar')

In [None]:
# Exact class counts of the target after preprocessing.
Counter(credit['MIS_Status'])

In [None]:
# Fix the class imbalance by downsampling the majority class.

# Separate majority (1 = paid) and minority (0 = charged off) classes
credit_majority = credit[credit.MIS_Status==1]
credit_minority = credit[credit.MIS_Status==0]

# Downsample the majority class to the size of the minority class. The target size is read
# from the data instead of being hard-coded, so the cell stays correct when an upstream
# filter changes the row counts.
credit_majority_downsampled = resample(credit_majority,
                                 replace=False,    # sample without replacement
                                 n_samples=len(credit_minority),  # match minority class
                                 random_state=123) # reproducible results

# Combine minority class with downsampled majority class
credit_downsampled = pd.concat([credit_majority_downsampled, credit_minority])

# New class counts (recorded run: Counter({1: 140758, 0: 140758}))
Counter(credit_downsampled['MIS_Status'])

credit_downsampled['MIS_Status'].value_counts().plot(kind='bar') # class imbalance fixed

In [None]:
# One-hot encode the categorical / binned columns of the balanced data, one indicator
# frame per source column.
state_dummy, portion_dummy, grappv_dummy, naics_dummy = (
    pd.get_dummies(credit_downsampled[column])
    for column in ('State', 'Portion', 'GrAppv', 'NAICS')
)

In [None]:
# The source columns are replaced by their indicator columns, so remove them before
# the indicator frames are appended.
encoded_sources = ['State', 'Portion','GrAppv','NAICS']
credit_downsampled = credit_downsampled.drop(encoded_sources, axis=1)

In [None]:
# Assemble the model table: remaining columns first, then the indicator columns in the
# order NAICS, GrAppv, Portion, State.
credit_dummy = pd.concat(
    [credit_downsampled, naics_dummy, grappv_dummy, portion_dummy, state_dummy],
    axis=1,
)


In [None]:
# save data (disabled: the file is expected to exist already from an earlier run)
# credit_dummy.to_csv('credit_downsampled_dummy.csv')

# Load the balanced, one-hot encoded table from disk; index_col=0 restores the saved index.
# This replaces the in-memory `credit_dummy` built above with whatever the file contains.
# NOTE(review): absolute, machine-specific path - confirm the file matches the current preprocessing.
credit_dummy =  pd.read_csv(r"C:\Users\Dell\Documents\Python Credit Risk\Data\credit_downsampled_dummy.csv", low_memory=False, index_col=0)

In [None]:
# Create the train and test datasets (80 / 20).
credit_dummy = credit_dummy.astype(int)
y = credit_dummy.MIS_Status
X = credit_dummy.drop(['MIS_Status'], axis=1)
# The split is seeded so the same rows land in the test set on every run. The model cells
# below re-load pickled estimators; with an unseeded split those estimators may have been
# fitted on rows that are now in X_test, which inflates the reported test metrics.
# NOTE(review): models pickled before this seed was introduced should be retrained.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=123)

In [None]:
#------------------------------------------------------- RANDOM FOREST
# Baseline ("OOB") random forest with default hyper-parameters.
# The fitted model is cached on disk: load it if present, otherwise train and save it.
# Files are opened with `with` so the handles are closed even if (un)pickling fails.
# NOTE(review): pickle.load executes code from the file - only load model files you created.
rf_OOB_path = r'C:\Users\Dell\Documents\Python Credit Risk\Models\rf_OOB.sav'
try:
    with open(rf_OOB_path, 'rb') as model_file:
        rf_OOB = pickle.load(model_file)
except FileNotFoundError:
    print("File doesen't exist, will train the model")
    # Build model
    rf_OOB = RandomForestClassifier()
    rf_OOB.fit(X_train,y_train)
    # Save the model to disk
    with open(rf_OOB_path, 'wb') as model_file:
        pickle.dump(rf_OOB, model_file)

#  predict and performance
rf_OOB_predict = rf_OOB.predict(X_test)
# 10-fold cross-validated ROC AUC on the full data (the estimator is re-fitted per fold)
rfc_cv_score = cross_val_score(rf_OOB, X, y, cv=10, scoring="roc_auc")
# class probabilities, reused later by the averaging ensemble
prob_rf_OOB = rf_OOB.predict_proba(X_test)
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rf_OOB_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rf_OOB_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())
print("Accuracy for model: %.2f" % (accuracy_score(y_test, rf_OOB_predict) * 100), ' % ')  #Accuracy for model: 64  %

In [None]:
# Feature importance of the baseline random forest.
print ("Features sorted by their score:")
# feature_name -> importance, in the column order of the design matrix
feats = dict(zip(X_test.columns, rf_OOB.feature_importances_))

importances = pd.DataFrame.from_dict(feats, orient='index')
importances = importances.rename(columns={0: 'Gini-importance'})
ascending_view = importances.sort_values(by='Gini-importance')
ascending_view.plot(kind='bar', rot=45)
# keep the printed table short (this changes the global pandas display option)
pd.set_option('display.max_rows', 8)
print(importances.sort_values(by='Gini-importance', ascending=False))

In [None]:
#------------------------------------------------------- SVM
# Train a linear SVM (LinearSVC) with a raised iteration cap
svm_OOB_linear = sklearn.svm.LinearSVC(max_iter = 10000)
svm_OOB_linear.fit(X_train, y_train)

# Make prediction
svm_OOB_linear_pred = svm_OOB_linear.predict(X_test)

# Evaluate our model
print("Evaluation linear kernel")
print(classification_report(y_test,svm_OOB_linear_pred))
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, svm_OOB_linear_pred)) # 63 % accuracy
# Save the fitted model; `with` closes the file handle once the dump is done.
with open(r'C:\Users\Dell\Documents\Python Credit Risk\Models\svm_OOB_linear.sav', 'wb') as model_file:
    pickle.dump(svm_OOB_linear, model_file)

In [None]:
#------------------------------------------------------- KNN
# K-nearest-neighbours baseline with K = 5.
knn_5 = KNeighborsClassifier(n_neighbors=5, n_jobs = 3)
knn_5.fit(X_train, y_train)
knn_5_pred = knn_5.predict(X_test)
# (a stray run of '-' characters after this call made the whole cell a SyntaxError)
prob_knn_5 = knn_5.predict_proba(X_test)
print(confusion_matrix(y_test, knn_5_pred ))
print(classification_report(y_test, knn_5_pred ))
print("Accuracy for model: %" ,(accuracy_score(y_test, knn_5_pred ) * 100))
# Save the fitted model; `with` closes the file handle once the dump is done.
with open(r'C:\Users\Dell\Documents\Python Credit Risk\Models\knn_5.sav', 'wb') as model_file:
    pickle.dump(knn_5, model_file)
# 60 % accuracy

In [None]:
# Search for a good number of neighbours: record the test error rate for K = 90 .. 100.
# NOTE(review): K is selected on the test set here, so the final test score is optimistic.
error = []

for n_neighbors in range(90, 101):
    candidate = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    misclassified = candidate.predict(X_test) != y_test
    error.append(np.mean(misclassified))

In [None]:
# Plot the error rate against the number of neighbours.
# The x values are the K values that were actually scanned (starting at 90, one per recorded
# error); plotting against range(1, 100) fails because x and y have different lengths.
# The figure size is in inches, so a normal plot size is used instead of 90 x 101.
k_values = range(90, 90 + len(error))
plt.figure(figsize=(12, 6))
plt.plot(k_values, error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

# Best knn 97

In [None]:
# KNN model with the selected K = 97
knn_97 = KNeighborsClassifier(n_neighbors=97, n_jobs = 3)
knn_97.fit(X_train, y_train)
knn_97_pred = knn_97.predict(X_test)
# class probabilities, reused later by the averaging ensemble
prob_knn = knn_97.predict_proba(X_test)
print(confusion_matrix(y_test, knn_97_pred))
print(classification_report(y_test, knn_97_pred))
print("Accuracy for model: %" ,(accuracy_score(y_test, knn_97_pred) * 100))
# 63 % accuracy
# Save the fitted model; `with` closes the file handle once the dump is done.
with open(r'C:\Users\Dell\Documents\Python Credit Risk\Models\knn_97.sav', 'wb') as model_file:
    pickle.dump(knn_97, model_file)

In [None]:
# So far three models were tried (random forest, SVM, KNN with K = 97); all land at an
# accuracy of roughly 60 %.
# Next attempt: a simple ensemble of KNN and random forest that averages their class
# probabilities and predicts the class with the higher mean probability.
final_pred_average = (prob_knn+prob_rf_OOB)/2

# probability column name -> class label
dct = {"pred_0": 0 ,
       'pred_1': 1 }

averaged_proba = pd.DataFrame(final_pred_average, columns=list(dct))
winning_column = averaged_proba.idxmax(axis=1)
final_pred_average_df = winning_column.map(dct).to_frame(name="Prediction")

In [None]:
# Compare the averaged-ensemble predictions with the true test labels.
# Both frames get the same 1..n index so that concat pairs them row by row
# (y_test keeps the original row labels from the split, the predictions do not).
# NOTE(review): not idempotent - re-running this cell concatenates MIS_Status a second time.
y_test_df = pd.DataFrame(y_test)
y_test_df.index = np.arange(1, len(y_test_df) + 1)
final_pred_average_df.index = np.arange(1, len(final_pred_average_df) + 1)
final_pred_average_df = pd.concat([final_pred_average_df,y_test_df ], axis=1)
final_pred_average_df = final_pred_average_df.astype(int)
# mark each row as a correct or incorrect prediction, then count both outcomes
final_pred_average_df['result'] = np.where(final_pred_average_df['Prediction'] == final_pred_average_df['MIS_Status'], "correct", "incorrect")
Counter(final_pred_average_df["result"])

In [None]:
def percentage(part, whole):
  """Return `part` as a percentage of `whole` (both are converted to float first)."""
  scaled_part = 100 * float(part)
  return scaled_part / float(whole)

# Accuracy of the averaged ensemble, computed from the comparison frame instead of
# hard-coded counts copied from one particular run (36738 correct out of 56304, about 65 %).
n_correct = (final_pred_average_df["result"] == "correct").sum()
percentage(n_correct, len(final_pred_average_df))

# While this is better than a single algorithm at a time, the performance is still poor.
# We will try some other approaches.

In [None]:
# Remove features and train a random forest again. Features are chosen by importance:
# only the 6 most important ones are kept (plus the target).
credit_feature = credit_dummy[["(41.8, 61.2]", "RealEstate", "(61.2, 80.6]","NewExist","LowDoc","62", 'MIS_Status' ]]
yf = credit_feature.MIS_Status
Xf = credit_feature.drop(['MIS_Status'], axis=1)
# Seeded split: the model cells below re-load pickled estimators, and an unseeded split would
# let a cached model be scored on rows it was trained on in an earlier run.
Xf_train, Xf_test, yf_train, yf_test = train_test_split(Xf, yf, test_size=0.20, random_state=123)

In [None]:
# Random forest on the reduced feature set.
# The fitted model is cached on disk: load it if present, otherwise train and save it.
# Files are opened with `with` so the handles are closed even if (un)pickling fails.
# NOTE(review): pickle.load executes code from the file - only load model files you created.
rf_features_path = r'C:\Users\Dell\Documents\Python Credit Risk\Models\rf_features.sav'
try:
    with open(rf_features_path, 'rb') as model_file:
        rf_features = pickle.load(model_file)
except FileNotFoundError:
    print("File doesen't exist, will train the model")
    # Build model
    rf_features = RandomForestClassifier()
    rf_features.fit(Xf_train,yf_train)
    # Save the model to disk
    with open(rf_features_path, 'wb') as model_file:
        pickle.dump(rf_features, model_file)

#  predict and performance
rf_features_predict = rf_features.predict(Xf_test)
# 10-fold cross-validated ROC AUC on the reduced data (the estimator is re-fitted per fold)
rfc_cv_score = cross_val_score(rf_features, Xf, yf, cv=10, scoring="roc_auc")
print("=== Confusion Matrix ===")
print(confusion_matrix(yf_test, rf_features_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(yf_test, rf_features_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())
print("Accuracy for model: %.2f" % (accuracy_score(yf_test, rf_features_predict) * 100), ' % ')  #Accuracy for model: 62  %

# This model has about the same accuracy as the first one but trains much faster.
# We will try to tune it.

In [None]:
# Hyperparameter tuning of the reduced-feature random forest with a small random search.

# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# number of features at every split.
# 'auto' is no longer accepted by RandomForestClassifier in current scikit-learn releases and,
# for classifiers, it was only an alias of 'sqrt' - so the grid lists the valid options.
max_features = ['sqrt', 'log2']

# max depth (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
max_depth.append(None)

# create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }

# Random search of parameters: 3 sampled combinations, 3-fold CV each
rfc_random = RandomizedSearchCV(estimator = rf_features, param_distributions = random_grid, n_iter = 3, cv = 3, verbose=2, random_state=42, n_jobs = 3)

# Fit the model
rfc_random.fit(Xf_train, yf_train)

# print results
# recorded run with the old grid: {'n_estimators': 1000, 'max_features': 'auto', 'max_depth': 140}
print(rfc_random.best_params_)

In [None]:
# Tuned random forest on the reduced feature set.
# The fitted model is cached on disk: load it if present, otherwise train and save it.
# Files are opened with `with` so the handles are closed even if (un)pickling fails.
# NOTE(review): pickle.load executes code from the file - only load model files you created.
rf_tuned_path = r'C:\Users\Dell\Documents\Python Credit Risk\Models\rf_tuned.sav'
try:
    with open(rf_tuned_path, 'rb') as model_file:
        rf_tuned = pickle.load(model_file)
except FileNotFoundError:
    print("File doesen't exist, will train the model")
    # Build model with the chosen hyper-parameters. max_features='sqrt' is what the removed
    # 'auto' value meant for classifiers.
    rf_tuned = RandomForestClassifier(n_estimators=1000, max_depth=140, max_features='sqrt')
    rf_tuned.fit(Xf_train,yf_train)
    # Save the model to disk
    with open(rf_tuned_path, 'wb') as model_file:
        pickle.dump(rf_tuned, model_file)

#  predict and performance
rf_tuned_predict = rf_tuned.predict(Xf_test)
# Cross-validate on the reduced feature set (Xf, yf) the model is built for - not on the
# full design matrix X, y, which would score a different model.
rf_tuned_cv_score = cross_val_score(rf_tuned, Xf, yf, cv=10, scoring='roc_auc', n_jobs = 3)
prob_rf_tuned = rf_tuned.predict_proba(Xf_test)
print("=== Confusion Matrix ===")
print(confusion_matrix(yf_test, rf_tuned_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(yf_test, rf_tuned_predict))
print('\n')
print("=== All AUC Scores ===")
print(rf_tuned_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rf_tuned_cv_score.mean())
print("Accuracy for model: %.2f" % (accuracy_score(yf_test, rf_tuned_predict) * 100))  # Accuracy for model: 62 % same perf

In [None]:
# XG BOOST

# Fix the feature names so we can pass them to xgboost: '[', ']' and '<' are replaced by '_'
# (the interval-style column names contain ']').
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
credit_feature.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<')))
                        else col for col in credit_feature.columns.values]
yg = credit_feature.MIS_Status
Xg = credit_feature.drop(['MIS_Status'], axis=1)
# Seeded split so a cached model is always scored against the same held-out rows.
Xg_train, Xg_test, yg_train, yg_test = train_test_split(Xg, yg, test_size=0.20, random_state=123)

# The fitted model is cached on disk: load it if present, otherwise train and save it.
# The model is saved to the same xgb.sav it is loaded from; saving it to rf_features.sav
# overwrote the cached random forest and never created the xgboost cache.
# NOTE(review): pickle.load executes code from the file - only load model files you created.
xgb_path = r'C:\Users\Dell\Documents\Python Credit Risk\Models\xgb.sav'
try:
    with open(xgb_path, 'rb') as model_file:
        xgb = pickle.load(model_file)
except FileNotFoundError:
    print("File doesen't exist, will train the model")
    # Build model
    xgb = XGBClassifier()
    xgb.fit(Xg_train,yg_train)
    # Save the model to disk
    with open(xgb_path, 'wb') as model_file:
        pickle.dump(xgb, model_file)

#  predict and performance
xgb_predict = xgb.predict(Xg_test)
kfold = sklearn.model_selection.KFold(n_splits=5)
# scoring is set explicitly so the values printed under "All AUC Scores" really are ROC AUC
# (the default scorer of a classifier is accuracy).
results = cross_val_score(xgb, Xg, yg, cv=kfold, scoring="roc_auc")
print("=== Confusion Matrix ===")
print(confusion_matrix(yg_test, xgb_predict))
print('\n')
print("=== All AUC Scores ===")
print(results)
print("Accuracy: %.2f%%" % (accuracy_score(yg_test, xgb_predict) * 100.0))
# Accuracy: 61.66%

In [None]:
# Logistic regression

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
# The matrix is stored under its own name: assigning it to `confusion_matrix` rebinds the
# imported sklearn function to an array and breaks every later (or re-run) call to it.
logreg_confusion = confusion_matrix(y_test, y_pred)
print(logreg_confusion)
print(classification_report(y_test, y_pred)) # 61 %

In [None]:
# Ada Boost: 200 boosted decision stumps (trees of depth 1)
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200
)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
# The function is called through its module path so this cell does not depend on the
# notebook-level name `confusion_matrix`, which the logistic-regression cell rebinds.
cm= sklearn.metrics.confusion_matrix(y_test, predictions)
print(cm)
print("Accuracy: %.2f%%" % (accuracy_score(y_test, predictions) * 100.0)) # 63 %


In [11]:
# At this point all the models seem to perform more or less the same (~60-63 % accuracy).
# We will look into the class balancing as a possible cause of information loss and poor
# performance, and try to use more of the data.
# index_col=0: the preprocessed file was written with its index as the first column; without
# this the index comes back as an extra "Unnamed: 0" column and ends up among the predictors.
credit=pd.read_csv(r"C:\Users\Dell\Documents\Python Credit Risk\Data\credit_preprocessed.csv", index_col=0)

In [12]:
# One-hot encode the categorical / binned columns of the full (unbalanced) data, one
# indicator frame per source column.
state_dummy, portion_dummy, grappv_dummy, naics_dummy = (
    pd.get_dummies(credit[column])
    for column in ('State', 'Portion', 'GrAppv', 'NAICS')
)

In [13]:
# Append the indicator columns (order: NAICS, GrAppv, Portion, State) and then remove the
# source columns they replace.
indicator_frames = [naics_dummy, grappv_dummy, portion_dummy, state_dummy]
credit = pd.concat([credit] + indicator_frames, axis=1)
credit = credit.drop(columns=['State', 'Portion','GrAppv','NAICS'])
# # save data
# credit_down2.to_csv("downsample_clusters.csv")

In [15]:
# Alternative balancing strategy: under-sample with imbalanced-learn's ClusterCentroids
# instead of random downsampling.
# NOTE(review): the recorded run of this cell ended in a MemoryError while allocating a
# float64 array of shape (88, 695492); it did not produce a resampled data set on that machine.
from imblearn.under_sampling import ClusterCentroids
y = credit.MIS_Status
X = credit.drop(['MIS_Status'], axis=1)
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_resample(X, y)
# class counts after resampling
print(sorted(Counter(y_resampled).items()))

MemoryError: Unable to allocate 467. MiB for an array with shape (88, 695492) and data type float64