<a href="https://colab.research.google.com/github/jpantojaj/Credit_Scoring_Specialization/blob/main/Sesi%C3%B3n_12_13_LGD_EAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importar Datos

In [None]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [None]:
loan_data_defaults = pd.read_csv('loan_data_defaults.csv')

# Exploramos los datos

In [None]:
loan_data_defaults.columns.values

In [None]:
loan_data_defaults.info()

In [None]:
loan_data_defaults.head()

In [None]:
loan_data_defaults.shape

### Eliminamos posibles duplicados

In [None]:
# Compare rows on every column except 'Unnamed: 0'. That column is dropped
# further down as an artifact; it looks like a row identifier written by an
# earlier to_csv (TODO confirm), and a unique identifier would prevent any
# two rows from ever being detected as duplicates.
dedup_columns = [col for col in loan_data_defaults.columns if col != 'Unnamed: 0']
loan_data_defaults = loan_data_defaults.drop_duplicates(subset=dedup_columns)
# reset_index returns a new frame, so the result must be assigned back;
# drop=True avoids adding the old index as an extra column.
loan_data_defaults = loan_data_defaults.reset_index(drop=True)
loan_data_defaults.shape

### Veamos el missing rate

In [None]:
# Missing-value report: absolute count and share of nulls per column
null_counts = loan_data_defaults.isnull().sum()
null_values = pd.DataFrame({
    'number_null_values': null_counts,
    'ratio_null_values': null_counts / len(loan_data_defaults),
})
null_values

In [None]:
loan_data_defaults.select_dtypes(include=['number']).describe().transpose()

In [None]:
loan_data_defaults=loan_data_defaults.drop(columns = ['Unnamed: 0'])

In [None]:
loan_data_defaults.drop(columns = ['recovery_rate','CCF']).hist(figsize = (15, 15))
plt.show()

In [None]:
loan_data_defaults.drop(columns = ['recovery_rate','CCF']).boxplot(figsize = (20, 12))
plt.yscale('log')
plt.xticks(rotation = 45)
plt.show()

### Generemos un reporte de outliers

In [None]:
def outliers_col(df):
  """Print an IQR-based outlier count for every non-object column of df.

  For each column the quartiles Q1 and Q3 are computed and the fences
  Q1 - 1.5*IQR and Q3 + 1.5*IQR are derived from them. One line is printed
  per column in the form "<column> | <values below lower fence> |
  <values above upper fence>".

  Missing values are ignored when computing the quartiles, so a column with
  NaN entries is still reported from its observed values; NaN entries are
  never counted as outliers.

  Args:
    df: pandas DataFrame to inspect. Columns with object dtype are skipped.

  Returns:
    None. The report is written to standard output.
  """
  for columna in df:
    serie = df[columna]
    if serie.dtype == object:
      continue
    # Series.quantile skips NaN; linear interpolation is its default.
    q1 = serie.quantile(0.25)
    q3 = serie.quantile(0.75)
    iqr = q3 - q1
    lim_inf = q1 - 1.5 * iqr
    lim_sup = q3 + 1.5 * iqr
    # Comparisons against NaN are False, so missing values are not counted.
    n_outliers_inf = int((serie < lim_inf).sum())
    n_outliers_sup = int((serie > lim_sup).sum())
    print("{} | {} | {}".format(serie.name, n_outliers_inf, n_outliers_sup))

In [None]:
outliers_col(loan_data_defaults)

# Recovery Rate

In [None]:
loan_data_defaults['recovery_rate'].describe()

In [None]:
# Cap the recovery rate to the [0, 1] interval
loan_data_defaults['recovery_rate'] = loan_data_defaults['recovery_rate'].clip(lower=0, upper=1)

In [None]:
loan_data_defaults['recovery_rate'].describe()

# Exploramos las variables

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
#Ploteamos el recovery rate
plt.hist(loan_data_defaults['recovery_rate'], bins = 20)
plt.show()

In [None]:
# Plotea variables
plt.scatter(loan_data_defaults['annual_inc'], loan_data_defaults['recovery_rate'],c='blue', alpha=0.5)
plt.xlabel('annual_inc')
plt.ylabel('recovery_rate')
plt.show()

In [None]:
# Plotea variables
plt.scatter(loan_data_defaults['mths_since_last_delinq'], loan_data_defaults['recovery_rate'],c='blue', alpha=0.5)
plt.xlabel('mths_since_last_delinq')
plt.ylabel('recovery_rate')
plt.show()

In [None]:
# Recovery rate is heavily concentrated at 0, so derive a binary flag:
# 0 when the recovery rate is exactly 0, 1 otherwise
loan_data_defaults['recovery_rate_0_1'] = (loan_data_defaults['recovery_rate'] != 0).astype(int)

In [None]:
loan_data_defaults['recovery_rate_0_1'].value_counts()

## Generamos un Modelo de LGD: Dividimos la generación del modelo en 2 etapas

### Dividimos la data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stage-1 split: inputs are every column except the targets; the target is
# the binary flag recovery_rate_0_1. 20% of the rows are held out for testing.
(
    lgd_inputs_stage_1_train,
    lgd_inputs_stage_1_test,
    lgd_targets_stage_1_train,
    lgd_targets_stage_1_test,
) = train_test_split(
    loan_data_defaults.drop(columns=['recovery_rate', 'recovery_rate_0_1', 'CCF']),
    loan_data_defaults['recovery_rate_0_1'],
    test_size=0.2,
    random_state=123,
)

### Preparing the Inputs

In [None]:
# Columns used as model inputs by the LGD stage-1, LGD stage-2 and EAD models.
# NOTE(review): every listed level of each dummy group is included (no
# reference category is left out). If these are all the levels present in the
# data, each group sums to 1 and is collinear with the intercept — confirm
# this is intended, especially for the linear regressions.
# Dummy variables: loan grade
features_all = ['grade:A',
'grade:B',
'grade:C',
'grade:D',
'grade:E',
'grade:F',
'grade:G',
# Dummy variables: home ownership
'home_ownership:MORTGAGE',
'home_ownership:NONE',
'home_ownership:OTHER',
'home_ownership:OWN',
'home_ownership:RENT',
# Dummy variables: verification status
'verification_status:Not Verified',
'verification_status:Source Verified',
'verification_status:Verified',
# Dummy variables: loan purpose
'purpose:car',
'purpose:credit_card',
'purpose:debt_consolidation',
'purpose:educational',
'purpose:home_improvement',
'purpose:house',
'purpose:major_purchase',
'purpose:medical',
'purpose:moving',
'purpose:other',
'purpose:renewable_energy',
'purpose:small_business',
'purpose:vacation',
'purpose:wedding',
# Dummy variables: initial list status
'initial_list_status:f',
'initial_list_status:w',
# Remaining (non-dummy) columns
'term_int',
'emp_length_int',
'mths_since_issue_d',
'mths_since_earliest_cr_line',
'funded_amnt',
'int_rate',
'installment',
'annual_inc',
'dti',
'delinq_2yrs',
'inq_last_6mths',
'mths_since_last_delinq',
'mths_since_last_record',
'open_acc',
'pub_rec',
'total_acc',
'acc_now_delinq',
'total_rev_hi_lim']

In [None]:
lgd_inputs_stage_1_train = lgd_inputs_stage_1_train[features_all]

### Estimamos el Modelo

In [None]:
# Fit the stage-1 logistic regression on the binary target recovery_rate_0_1
from sklearn.linear_model import LogisticRegression
# NOTE(review): default hyperparameters are used and no rescaling of the
# inputs is visible in this notebook. Any convergence warning would be hidden
# by the warnings.filterwarnings('ignore') call at the top — confirm that the
# solver actually converges.
reg_lgd_st_1 = LogisticRegression()
reg_lgd_st_1.fit(lgd_inputs_stage_1_train, lgd_targets_stage_1_train)

In [None]:
feature_name = lgd_inputs_stage_1_train.columns.values
feature_name

In [None]:
print(reg_lgd_st_1.intercept_)
print(reg_lgd_st_1.coef_)

In [None]:
# Coefficient table for the stage-1 model: intercept first (row 0),
# followed by one row per input feature
intercept_row = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficients': [reg_lgd_st_1.intercept_[0]],
})
coefficient_rows = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficients': reg_lgd_st_1.coef_.ravel(),
})
summary_table = pd.concat([intercept_row, coefficient_rows], ignore_index=True)
summary_table

### Revisamos el performance del modelo (TEST)

In [None]:
lgd_inputs_stage_1_test = lgd_inputs_stage_1_test[features_all]
lgd_inputs_stage_1_test.head()

In [None]:
lgd_inputs_stage_1_test.shape

In [None]:
y_hat_test_lgd_stage_1 = reg_lgd_st_1.predict(lgd_inputs_stage_1_test)
y_hat_test_lgd_stage_1

In [None]:
y_hat_test_proba_lgd_stage_1 = reg_lgd_st_1.predict_proba(lgd_inputs_stage_1_test)
y_hat_test_proba_lgd_stage_1

In [None]:
# Keep only the second column of predict_proba, i.e. the probability of
# class 1 (recovery rate > 0). The value is taken straight from the model
# instead of re-slicing the variable in place, so this cell can be re-run
# without failing on an already one-dimensional array.
y_hat_test_proba_lgd_stage_1 = reg_lgd_st_1.predict_proba(lgd_inputs_stage_1_test)[:, 1]
y_hat_test_proba_lgd_stage_1

In [None]:
lgd_targets_stage_1_test.shape

In [None]:
preds_lr_df = pd.DataFrame(y_hat_test_proba_lgd_stage_1, columns = ['y_hat_test_proba_lgd_stage_1'])
preds_lr_df.head()

In [None]:
df_actual_predicted_probs = pd.concat([lgd_targets_stage_1_test.reset_index(drop = True).rename('lgd_targets_stage_1_test'), preds_lr_df], axis = 1)
df_actual_predicted_probs.head()

### Evaluamos la Precisión del Modelo

In [None]:
tr = 0.5
df_actual_predicted_probs['y_hat_test_lgd_stage_1'] = np.where(df_actual_predicted_probs['y_hat_test_proba_lgd_stage_1'] > tr, 1, 0)

In [None]:
pd.crosstab(df_actual_predicted_probs['lgd_targets_stage_1_test'], df_actual_predicted_probs['y_hat_test_lgd_stage_1'], rownames = ['Actual'], colnames = ['Predicted'])

In [None]:
pd.crosstab(df_actual_predicted_probs['lgd_targets_stage_1_test'], df_actual_predicted_probs['y_hat_test_lgd_stage_1'], rownames = ['Actual'], colnames = ['Predicted']) / df_actual_predicted_probs.shape[0]

In [None]:
# Accuracy: share of test rows whose predicted class equals the actual class.
# Comparing the two columns directly avoids building the confusion matrix
# twice and still works when the model predicts a single class (in which case
# the crosstab is not 2x2 and positional indexing would fail).
(df_actual_predicted_probs['lgd_targets_stage_1_test'] == df_actual_predicted_probs['y_hat_test_lgd_stage_1']).mean()

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
fpr, tpr, thresholds = roc_curve(df_actual_predicted_probs['lgd_targets_stage_1_test'], df_actual_predicted_probs['y_hat_test_proba_lgd_stage_1'])

In [None]:
plt.plot(fpr, tpr)
plt.plot(fpr, fpr, linestyle = '--', color = 'k')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.show()

In [None]:
AUC = roc_auc_score(df_actual_predicted_probs['lgd_targets_stage_1_test'], df_actual_predicted_probs['y_hat_test_proba_lgd_stage_1'])
AUC

In [None]:
gini=2*AUC-1
gini

### Entrenamos un modelo alternativo: Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
RF_clf_lgd_st_1 = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
RF_clf_lgd_st_1.fit(lgd_inputs_stage_1_train, lgd_targets_stage_1_train)

In [None]:
y_hat_test_lgd_stage_1_rf_clf = RF_clf_lgd_st_1.predict(lgd_inputs_stage_1_test)
y_hat_test_proba_lgd_stage_1_rf_clf = RF_clf_lgd_st_1.predict_proba(lgd_inputs_stage_1_test)

In [None]:
# One-column frame with the random forest's probability for class 1
rf_class_1_proba = y_hat_test_proba_lgd_stage_1_rf_clf[:, 1]
preds_rf_df = pd.DataFrame({'y_hat_test_proba_lgd_stage_1_rf_clf': rf_class_1_proba})
preds_rf_df.head()

In [None]:
df_actual_predicted_probs_rf_clf = pd.concat([lgd_targets_stage_1_test.reset_index(drop = True).rename('lgd_targets_stage_1_test'), preds_rf_df], axis = 1)
df_actual_predicted_probs_rf_clf.head()

In [None]:
fpr, tpr, thresholds = roc_curve(df_actual_predicted_probs_rf_clf['lgd_targets_stage_1_test'], df_actual_predicted_probs_rf_clf['y_hat_test_proba_lgd_stage_1_rf_clf'])

In [None]:
plt.plot(fpr, tpr)
plt.plot(fpr, fpr, linestyle = '--', color = 'k')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.show()

In [None]:
AUC = roc_auc_score(df_actual_predicted_probs_rf_clf['lgd_targets_stage_1_test'], df_actual_predicted_probs_rf_clf['y_hat_test_proba_lgd_stage_1_rf_clf'])
AUC

In [None]:
gini=2*AUC-1
gini

### Con este modelo alternativo estaríamos ganando más de 10 puntos de Gini respecto al modelo de regresión logística

### Stage 2: Regresión Lineal sobre los valores >0

In [None]:
lgd_stage_2_data = loan_data_defaults[loan_data_defaults['recovery_rate_0_1'] == 1]

In [None]:
#Ploteamos el recovery rate
plt.hist(lgd_stage_2_data['recovery_rate'], bins = 50)
plt.show()

In [None]:
# Plotea variables
plt.scatter(lgd_stage_2_data['annual_inc'], lgd_stage_2_data['recovery_rate'],c='blue', alpha=0.5)
plt.xlabel('annual_inc')
plt.ylabel('recovery_rate')
plt.show()

In [None]:
# Split the stage-2 data (rows with recovery_rate_0_1 == 1); the target is the
# continuous recovery_rate and 20% of the rows are held out for testing.
# NOTE(review): this split is drawn independently from the stage-1 split, so
# rows of the stage-1 test set can end up in the stage-2 training set. Keep
# that in mind where the stage-2 model is applied to lgd_inputs_stage_1_test.
lgd_inputs_stage_2_train, lgd_inputs_stage_2_test, lgd_targets_stage_2_train, lgd_targets_stage_2_test = train_test_split(lgd_stage_2_data.drop(['recovery_rate','recovery_rate_0_1', 'CCF'], axis = 1), lgd_stage_2_data['recovery_rate'], test_size = 0.2, random_state = 123)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
lgd_inputs_stage_2_train = lgd_inputs_stage_2_train[features_all]

In [None]:
reg_lgd_st_2 = LinearRegression()
reg_lgd_st_2.fit(lgd_inputs_stage_2_train, lgd_targets_stage_2_train)

In [None]:
print(reg_lgd_st_2.intercept_)
print(reg_lgd_st_2.coef_)

In [None]:
feature_name = lgd_inputs_stage_2_train.columns.values
feature_name

In [None]:
# Coefficient table for the stage-2 linear regression: intercept first
# (row 0), followed by one row per input feature
intercept_row = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficients': [reg_lgd_st_2.intercept_],
})
coefficient_rows = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficients': reg_lgd_st_2.coef_.ravel(),
})
summary_table = pd.concat([intercept_row, coefficient_rows], ignore_index=True)
summary_table

### Stage 2 – Evaluación del Modelo

In [None]:
lgd_inputs_stage_2_test = lgd_inputs_stage_2_test[features_all]
lgd_inputs_stage_2_test.head()

In [None]:
lgd_inputs_stage_2_test.shape

In [None]:
y_hat_test_lgd_stage_2 = reg_lgd_st_2.predict(lgd_inputs_stage_2_test)
y_hat_test_lgd_stage_2

In [None]:
y_hat_test_lgd_stage_2.shape

In [None]:
pred_linreg_stg2=pd.DataFrame(y_hat_test_lgd_stage_2,columns=['recovery_rate_pred'])

In [None]:
pred_linreg_stg2.head()

In [None]:
df_pred_stg2=pd.concat([lgd_targets_stage_2_test.reset_index(drop = True), pred_linreg_stg2], axis = 1)
df_pred_stg2.head()

In [None]:
df_pred_stg2.corr()

In [None]:
# Plotea variables
plt.scatter(df_pred_stg2['recovery_rate_pred'], df_pred_stg2['recovery_rate'],c='blue', alpha=0.5)
plt.xlabel('recovery_rate_pred')
plt.ylabel('recovery_rate')
plt.show()

In [None]:
r2_score(lgd_targets_stage_2_test, y_hat_test_lgd_stage_2)

In [None]:
np.sqrt(mean_squared_error(lgd_targets_stage_2_test,y_hat_test_lgd_stage_2))

In [None]:
# Distribution of the stage-2 residuals (actual minus predicted recovery rate).
# histplot with a KDE overlay replaces distplot, which seaborn has deprecated;
# stat='density' keeps the histogram on a density scale.
sns.histplot(lgd_targets_stage_2_test - y_hat_test_lgd_stage_2, kde=True, stat='density')
plt.show()

In [None]:
pd.DataFrame(y_hat_test_lgd_stage_2).describe()

In [None]:
# Cap the stage-2 predictions to the [0, 1] interval.
y_hat_test_lgd_stage_2 = np.clip(y_hat_test_lgd_stage_2, 0, 1)
# df_pred_stg2 was built before capping; refresh its prediction column so
# that summaries of that frame describe the capped values, as is already the
# case for the EAD predictions.
df_pred_stg2['recovery_rate_pred'] = y_hat_test_lgd_stage_2

In [None]:
df_pred_stg2.describe()

### Entrenamos un modelo alternativo: Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
RF_reg_lgd_st_2 = RandomForestRegressor(n_estimators = 100, random_state = 0)
RF_reg_lgd_st_2.fit(lgd_inputs_stage_2_train, lgd_targets_stage_2_train)

In [None]:
y_hat_test_lgd_stage_2_rf_reg= RF_reg_lgd_st_2.predict(lgd_inputs_stage_2_test)
y_hat_test_lgd_stage_2_rf_reg

In [None]:
pred_rf_reg_stg2=pd.DataFrame(y_hat_test_lgd_stage_2_rf_reg, columns=['recovery_rate_pred'])

In [None]:
df_pred_stg2_rf_reg=pd.concat([lgd_targets_stage_2_test.reset_index(drop = True), pred_rf_reg_stg2], axis = 1)
df_pred_stg2_rf_reg.head()

In [None]:
df_pred_stg2_rf_reg.corr()

In [None]:
# Plotea variables
plt.scatter(df_pred_stg2_rf_reg['recovery_rate_pred'], df_pred_stg2_rf_reg['recovery_rate'],c='blue', alpha=0.5)
plt.xlabel('recovery_rate_pred')
plt.ylabel('recovery_rate')
plt.show()

In [None]:
r2_score(lgd_targets_stage_2_test, y_hat_test_lgd_stage_2_rf_reg)

In [None]:
np.sqrt(mean_squared_error(lgd_targets_stage_2_test, y_hat_test_lgd_stage_2_rf_reg))

In [None]:
# Distribution of the random-forest stage-2 residuals (actual minus predicted).
# histplot with a KDE overlay replaces distplot, which seaborn has deprecated;
# stat='density' keeps the histogram on a density scale.
sns.histplot(lgd_targets_stage_2_test - y_hat_test_lgd_stage_2_rf_reg, kde=True, stat='density')
plt.show()

In [None]:
df_pred_stg2_rf_reg.describe()

### Combinamos los Stages 1 y 2: primero predecimos si la recuperación es igual a 0 o mayor a 0 y, para los casos mayores a 0, estimamos su tasa de recuperación

In [None]:
y_hat_test_lgd_stage_2_all = reg_lgd_st_2.predict(lgd_inputs_stage_1_test)
y_hat_test_lgd_stage_2_all

In [None]:
# Combine both stages: the class predicted by the random forest classifier
# (RF_clf_lgd_st_1.predict) multiplied by the stage-2 linear regression
# estimate, so rows predicted as class 0 get a combined value of 0.
# NOTE(review): despite the `lgd` name, this is a predicted recovery rate
# (the stage-2 target is recovery_rate), not a loss rate.
y_hat_test_lgd = y_hat_test_lgd_stage_1_rf_clf * y_hat_test_lgd_stage_2_all

In [None]:
pd.DataFrame(y_hat_test_lgd).describe()

In [None]:
# Cap the combined prediction to the [0, 1] interval
y_hat_test_lgd = np.clip(y_hat_test_lgd, 0, 1)

In [None]:
pd.DataFrame(y_hat_test_lgd).describe()

# Modelo para estimar EAD

In [None]:
loan_data_defaults['CCF'].describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
plt.hist(loan_data_defaults['CCF'], bins = 50)
plt.show()

In [None]:
# EAD split: inputs are every column except the targets; the target is CCF.
# 20% of the rows are held out for testing.
(
    ead_inputs_train,
    ead_inputs_test,
    ead_targets_train,
    ead_targets_test,
) = train_test_split(
    loan_data_defaults.drop(columns=['recovery_rate', 'recovery_rate_0_1', 'CCF']),
    loan_data_defaults['CCF'],
    test_size=0.2,
    random_state=123,
)

In [None]:
ead_inputs_train = ead_inputs_train[features_all]
ead_inputs_train.head()

In [None]:
reg_ead = LinearRegression()
reg_ead.fit(ead_inputs_train, ead_targets_train)

In [None]:
print(reg_ead.intercept_)
print(reg_ead.coef_)

In [None]:
feature_name = ead_inputs_train.columns.values

In [None]:
# Coefficient table for the EAD linear regression: intercept first (row 0),
# followed by one row per input feature
intercept_row = pd.DataFrame({
    'Feature name': ['Intercept'],
    'Coefficients': [reg_ead.intercept_],
})
coefficient_rows = pd.DataFrame({
    'Feature name': feature_name,
    'Coefficients': reg_ead.coef_.ravel(),
})
summary_table = pd.concat([intercept_row, coefficient_rows], ignore_index=True)
summary_table

### Evaluamos el modelo

In [None]:
ead_inputs_test = ead_inputs_test[features_all]

In [None]:
ead_inputs_test.head()

In [None]:
ead_inputs_test.shape

In [None]:
y_hat_test_ead = reg_ead.predict(ead_inputs_test)
y_hat_test_ead

In [None]:
y_hat_test_ead.shape

In [None]:
pred_ead=pd.DataFrame(y_hat_test_ead,columns=['y_hat_test_ead'])
pred_ead.head()

In [None]:
df_ead=pd.concat([ead_targets_test.reset_index(drop = True), pred_ead], axis = 1)
df_ead.head()

In [None]:
# Distribution of the EAD model residuals (actual minus predicted CCF).
# histplot with a KDE overlay replaces distplot, which seaborn has deprecated;
# stat='density' keeps the histogram on a density scale.
sns.histplot(ead_targets_test - y_hat_test_ead, kde=True, stat='density')
plt.show()

In [None]:
reg_ead.score(ead_inputs_test,ead_targets_test)

In [None]:
rmse_ead = np.sqrt(mean_squared_error(ead_targets_test,y_hat_test_ead))
rmse_ead

In [None]:
pred_ead.describe()

In [None]:
# Cap the predicted CCF to the [0, 1] interval
pred_ead['y_hat_test_ead'] = pred_ead['y_hat_test_ead'].clip(lower=0, upper=1)

In [None]:
pred_ead.describe()