# Modelos Vía Ensamble para clasificación - MCD UNI

## Librerías

In [1]:
# Importar librerías necesarias

import pandas as pd
import numpy as np
from sklearn import metrics
import warnings
from sklearn import preprocessing
warnings.filterwarnings('ignore')

## Modelos de Machine Learning
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier



## Métricas de los modelos
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report


## Selección de Variables
from sklearn.model_selection import train_test_split

# visualización
import seaborn as sns
import matplotlib.pyplot as plt

# Valor de semilla que se utilizará siempre que sea necesario
seed = 16
np.random.seed(seed)

### Leer los data set del caso de Negocio

In [2]:
## data 
dataset = pd.read_csv('CreditScoring.csv')

dataset.head(100)

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,0,0.245353,37,0,0.288417,6500.0,11,1,1,1,0.0
96,97,0,0.542243,48,2,10.000000,,2,0,0,0,
97,98,0,0.010531,57,0,0.280665,5714.0,6,0,1,0,0.0
98,99,0,0.363200,32,0,0.480524,2900.0,4,0,1,0,0.0


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   ID                                    150000 non-null  int64  
 1   SeriousDlqin2yrs                      150000 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 3   age                                   150000 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 5   DebtRatio                             150000 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 8   NumberOfTimes90DaysLate               150000 non-null  int64  
 9   NumberRealEstateLoansOrLines          150000 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 11  

In [4]:
dataset.ID.nunique()

150000

### Resumen de analisis en los datos

In [5]:
dataset.describe()

Unnamed: 0,ID,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,75000.5,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,43301.414527,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37500.75,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,75000.5,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112500.25,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,150000.0,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [6]:
print(dataset['SeriousDlqin2yrs'].value_counts())
print(dataset['SeriousDlqin2yrs'].value_counts(normalize=True))

SeriousDlqin2yrs
0    139974
1     10026
Name: count, dtype: int64
SeriousDlqin2yrs
0    0.93316
1    0.06684
Name: proportion, dtype: float64


In [7]:
## Primeras inputaciones
### Imputation of the empty data
dataset['MonthlyIncome'] = dataset['MonthlyIncome'].fillna(dataset['MonthlyIncome'].median())
dataset['NumberOfDependents'] = dataset['NumberOfDependents'].fillna(dataset['NumberOfDependents'].median())

In [8]:
dataset.isnull().sum().sum()

0

In [9]:
dataset.drop('ID',axis=1).corr(method= 'pearson')

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
SeriousDlqin2yrs,1.0,-0.001802,-0.115386,0.125587,-0.007602,-0.017151,-0.029669,0.117175,-0.007038,0.102261,0.046869
RevolvingUtilizationOfUnsecuredLines,-0.001802,1.0,-0.005898,-0.001314,0.003961,0.006513,-0.011281,-0.001061,0.006235,-0.001048,0.001193
age,-0.115386,-0.005898,1.0,-0.062995,0.024188,0.027581,0.147705,-0.061005,0.03315,-0.057159,-0.215693
NumberOfTime30-59DaysPastDueNotWorse,0.125587,-0.001314,-0.062995,1.0,-0.006542,-0.00837,-0.055312,0.983603,-0.030565,0.987005,-0.00459
DebtRatio,-0.007602,0.003961,0.024188,-0.006542,1.0,-0.018006,0.049565,-0.00832,0.120046,-0.007533,-0.044476
MonthlyIncome,-0.017151,0.006513,0.027581,-0.00837,-0.018006,1.0,0.086949,-0.0105,0.116273,-0.009252,0.066314
NumberOfOpenCreditLinesAndLoans,-0.029669,-0.011281,0.147705,-0.055312,0.049565,0.086949,1.0,-0.079984,0.433959,-0.071077,0.074026
NumberOfTimes90DaysLate,0.117175,-0.001061,-0.061005,0.983603,-0.00832,-0.0105,-0.079984,1.0,-0.045205,0.992796,-0.011962
NumberRealEstateLoansOrLines,-0.007038,0.006235,0.03315,-0.030565,0.120046,0.116273,0.433959,-0.045205,1.0,-0.039722,0.129399
NumberOfTime60-89DaysPastDueNotWorse,0.102261,-0.001048,-0.057159,0.987005,-0.007533,-0.009252,-0.071077,0.992796,-0.039722,1.0,-0.012678


### Selección de muestras de entrenamiento y validación 

In [10]:
# Prepara la data

X,y = dataset.drop(['ID', 'SeriousDlqin2yrs'], axis=1),dataset[["SeriousDlqin2yrs"]]

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25, random_state=1,stratify=y)

### Regresión Logística

In [11]:
model_RL = LogisticRegression()
model_RL.fit(X_train, y_train)


# Separate the features and the target variable
predict_train_rl = model_RL.predict_proba(X_train)[:,1]
predict_test_rl = model_RL.predict_proba(X_test)[:,1]


# Print scores on both  predict_proba(best_X_train)[:,1]
print("auc on training in LogisticRegression data : {:.3f}".format(roc_auc_score(y_train, predict_train_rl) ))
print("auc on testing in LogisticRegression  data : {:.3f}".format(roc_auc_score(y_test, predict_test_rl) ))

auc on training in LogisticRegression data : 0.681
auc on testing in LogisticRegression  data : 0.679


In [12]:
model_RL.predict(X_train)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [13]:
predict_train_rl

array([0.15391201, 0.01284115, 0.10221353, ..., 0.09217361, 0.14028766,
       0.0052841 ])

In [14]:
clasificacion_predict_train_logit= np.where(predict_train_rl<0.5, 0, 1)
clasificacion_predict_test_logit= np.where(predict_test_rl<0.5, 0, 1)

print("accuracy on training in LogisticRegression data : {:.3f}".format(accuracy_score(y_train, clasificacion_predict_train_logit)))
print("accuracy on testing in LogisticRegression  data : {:.3f}".format(accuracy_score(y_test,clasificacion_predict_test_logit)))

accuracy on training in LogisticRegression data : 0.933
accuracy on testing in LogisticRegression  data : 0.933


### Árboles de desición

In [15]:
model_DT = DecisionTreeClassifier()
model_DT.fit(X_train, y_train)


# Separate the features and the target variable
predict_train_dt = model_DT.predict_proba(X_train)[:,1]
predict_test_dt = model_DT.predict_proba(X_test)[:,1]


# Print scores on both
print("auc on training in DecisionTree data : {:.3f}".format(roc_auc_score(y_train, predict_train_dt) ))
print("auc on testing in DecisionTree  data : {:.3f}".format(roc_auc_score(y_test, predict_test_dt) ))

auc on training in DecisionTree data : 1.000
auc on testing in DecisionTree  data : 0.614


In [16]:
# Clasificación predicha
clasificacion_predict_train_dt= np.where(predict_train_dt<0.5, 0, 1)
clasificacion_predict_test_dt= np.where(predict_test_dt<0.5, 0, 1)

print("accuracy on training in DecisionTreeClassifier data : {:.3f}".format(accuracy_score(y_train, clasificacion_predict_train_dt)))
print("accuracy on testing in DecisionTreeClassifier  data : {:.3f}".format(accuracy_score(y_test,clasificacion_predict_test_dt)))

accuracy on training in DecisionTreeClassifier data : 1.000
accuracy on testing in DecisionTreeClassifier  data : 0.897


### Red Neuronal

In [17]:
model_RN = MLPClassifier()
model_RN.fit(X_train, y_train)


# Separate the features and the target variable
predict_train_rn = model_RN.predict_proba(X_train)[:,1]
predict_test_rn = model_RN.predict_proba(X_test)[:,1]


# Print scores on both
print("auc on training in Red Neuronal data : {:.3f}".format(roc_auc_score(y_train, predict_train_rn) ))
print("auc on testing in Red Neuronal  data : {:.3f}".format(roc_auc_score(y_test, predict_test_rn) ))

auc on training in Red Neuronal data : 0.738
auc on testing in Red Neuronal  data : 0.731


In [18]:
# Clasificación predicha
clasificacion_predict_train_rn= np.where(predict_train_rn<0.5, 0, 1)
clasificacion_predict_test_rn= np.where(predict_test_rn<0.5, 0, 1)

print("accuracy on training in Red Neuronal data : {:.3f}".format(accuracy_score(y_train, clasificacion_predict_train_rn)))
print("accuracy on testing in Red Neuronal  data : {:.3f}".format(accuracy_score(y_test,clasificacion_predict_test_rn)))

accuracy on training in Red Neuronal data : 0.574
accuracy on testing in Red Neuronal  data : 0.571


### Voting

In [19]:
from sklearn.ensemble import VotingClassifier

model = VotingClassifier(estimators=[('lg', model_RL), ('Tree', model_DT), ('rn', model_RN)], voting='hard')
model.fit(X_train,y_train)

In [20]:
VotingClassifier_test_pred = model.predict(X_test)

# Print scores on both
print("accuracy on testing in VotingClassifier  data : {:.3f}".format(accuracy_score(y_test, VotingClassifier_test_pred)))

accuracy on testing in VotingClassifier  data : 0.934


### Averaging

In [21]:
finalpred_Averaging = (predict_test_rl+predict_test_dt+predict_test_rn)/3

print("auc on testing in Averaging  data : {:.3f}".format(roc_auc_score(y_test, finalpred_Averaging)))

auc on testing in Averaging  data : 0.756


### Weighted Average

In [22]:
finalpred_W_Averaging = (predict_test_rn*0.5+predict_test_dt*0.2+predict_test_rl*0.3)/3

print("auc on testing in Weighted Average  data : {:.3f}".format(roc_auc_score(y_test, finalpred_W_Averaging)))

auc on testing in Weighted Average  data : 0.753


### Stacking

In [23]:
estimators = [('lg', model_RL), ('Tree', model_DT), ('rn', model_RN)]

In [24]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
stacking = StackingClassifier(
 estimators=estimators, final_estimator=LogisticRegression()
 )

In [25]:
stacking.fit(X_train, y_train)

In [26]:
stacking_test_pred = stacking.predict(X_test)
stacking_test_prod = stacking.predict_proba(X_test)[:,1]

# Print scores on both
print("accuracy on testing in Stacking  data : {:.3f}".format(accuracy_score(y_test, stacking_test_pred))) 
print("auc on testing in Stacking  data : {:.3f}".format(roc_auc_score(y_test, stacking_test_prod)))  

accuracy on testing in Stacking  data : 0.934
auc on testing in Stacking  data : 0.741


### Ahora usaremos Bagging

In [27]:
bagging = BaggingClassifier(n_estimators=50,random_state=0)
bagging.fit(X_train, y_train)
# Separate the features and the target variable
bagging_test_pred = bagging.predict(X_test)
bagging_test_prob = bagging.predict_proba(X_test)[:,1]

# Print scores on both
print("accuracy on testing in BaggingClassifier  data : {:.3f}".format(accuracy_score(y_test, bagging_test_pred))) 
print("auc on testing in BaggingClassifier  data : {:.3f}".format(roc_auc_score(y_test, bagging_test_prob))) 

accuracy on testing in BaggingClassifier  data : 0.933
auc on testing in BaggingClassifier  data : 0.824


### Random Forest

In [28]:
rf = RandomForestClassifier(
 random_state      = 0,   # semilla inicial de aleatoriedad del algoritmo
 n_estimators      = 100, # cantidad de arboles a crear
 min_samples_split = 5,   # cantidad minima de observaciones para dividir un nodo
 min_samples_leaf  = 2,   # observaciones minimas que puede tener una hoja del arbol
 n_jobs            = -1    # tareas en paralelo. para todos los cores disponibles usar -1
 )

rf.fit(X_train, y_train)

rf_test_pred = rf.predict(X_test)
rf_test_prob = rf.predict_proba(X_test)[:,1]

# Print scores on both
print("accuracy on testing in RandomForestClassifier  data : {:.3f}".format(accuracy_score(y_test, rf_test_pred))) 
print("auc on testing in RandomForestClassifier  data : {:.3f}".format(roc_auc_score(y_test, rf_test_prob))) 

accuracy on testing in RandomForestClassifier  data : 0.935
auc on testing in RandomForestClassifier  data : 0.846


## Validacion -cruzada con Kfolds

In [29]:
data_train = pd.concat([X_train,  y_train], axis=1).reset_index(drop=True)
data_test = pd.concat([X_test,  y_test], axis=1).reset_index(drop=True)

In [30]:
from sklearn.model_selection import StratifiedKFold
kf=StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

i=1

r=[]

features = X_train.columns
target = 'SeriousDlqin2yrs'

importancias=pd.DataFrame()

importancias['variable']=features

i=1


for train_index, test_index in kf.split(data_train[features],data_train[target]):

    rf_cv=RandomForestClassifier(n_estimators = 100,min_samples_split = 5,min_samples_leaf  = 2, random_state = 1)

    rf_cv.fit(data_train.loc[train_index,features],data_train.loc[train_index,target])
    data_test["FOLD_"+str(i)] = rf_cv.predict_proba(data_test[features])[:,1]



    print ("Fold_"+str(i))
    a= roc_auc_score(data_train.loc[test_index,target],np.expm1(rf_cv.predict_proba(data_train.loc[test_index,features])[:,1]))
    r.append(a)
    print (a)
    print ("")

    importancias['gain_'+str(i)]=rf_cv.feature_importances_

    i=i+1

w=[x for x in importancias.columns if 'gain_' in x]

importancias['gain-avg']=importancias[w].mean(axis=1)

importancias=importancias.sort_values('gain-avg',ascending=False).reset_index(drop=True)

importancias=importancias[['variable']+w+['gain-avg']]


print ("mean: "+str(np.mean(np.array(r))))
print ("std: "+str(np.std(np.array(r))))

Fold_1
0.8517649338810276

Fold_2
0.8500606150158289

Fold_3
0.8406779071140196

Fold_4
0.8532444946453833

Fold_5
0.8551806062096934

mean: 0.8501857113731905
std: 0.005044067092982864


In [31]:
data_test

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs,FOLD_1,FOLD_2,FOLD_3,FOLD_4,FOLD_5
0,0.534155,56,0,0.339333,14000.0,11,0,1,0,3.0,0,0.038095,0.029500,0.023262,0.042000,0.030667
1,1.004680,24,0,0.352465,1480.0,4,0,0,0,0.0,0,0.058286,0.095560,0.199889,0.119921,0.162778
2,0.929007,36,0,9.864000,124.0,11,0,1,0,2.0,0,0.121512,0.152091,0.156750,0.110810,0.141338
3,0.998002,30,1,0.730129,2100.0,4,0,0,0,6.0,1,0.248503,0.173421,0.195277,0.215431,0.260429
4,1.000000,43,3,0.699003,2607.0,3,1,1,0,2.0,0,0.469413,0.590690,0.606833,0.472226,0.647353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37495,0.461766,60,0,1284.000000,5400.0,11,0,0,0,0.0,0,0.052750,0.049762,0.054083,0.056333,0.105083
37496,0.190672,65,0,0.887819,5294.0,27,0,3,0,1.0,0,0.011667,0.003537,0.003710,0.018429,0.010357
37497,0.060226,37,0,3549.000000,5400.0,12,0,2,0,0.0,0,0.003509,0.013307,0.020274,0.012381,0.007273
37498,0.021249,82,0,0.106298,3000.0,6,0,0,0,0.0,0,0.000000,0.000000,0.000504,0.000000,0.000000


In [32]:
w=[x for x in data_test.columns if 'FOLD' in x]

data_test['Predict']=data_test[w].mean(axis=1)

In [33]:
clasificacion_predict_test_rf_cv= np.where(data_test.Predict<0.5, 0, 1)

print("auc on training in RF CV data : {:.3f}".format(roc_auc_score(data_test.SeriousDlqin2yrs, data_test.Predict)))
print("accuracy on testing in RF CV   data : {:.3f}".format(accuracy_score(data_test.SeriousDlqin2yrs,clasificacion_predict_test_rf_cv)))

auc on training in RF CV data : 0.851
accuracy on testing in RF CV   data : 0.935
