<div >
<img src = "../banner.jpg" />
</div>

<a target="_blank" href="https://colab.research.google.com/github/ignaciomsarmiento/BDML_202401/blob/main/Modulo04/Modulo04_Classification.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>



# Classification: Predicting Firm Exit

Our first task is to build a model that predicts the probability that a firm defaults and therefore exits the business.

In [None]:
# Silence warnings for the whole session to keep the notebook output compact.
# NOTE(review): this hides every warning, including any raised during fitting.
options(warn = -1)

# Load libraries.
# library() stops with an error when pacman is missing, whereas require()
# only returns FALSE and lets the following p_load() call fail obscurely.
library(pacman)
p_load(
  tidyverse,
  caret,
  glmnet
)


In [None]:
# Read the data: download the firm-level dataset stored as an RDS file.
# readRDS() does not destroy a connection object it is handed, so the
# connection is closed explicitly once the read has finished (or failed).
db_con <- url("https://github.com/ignaciomsarmiento/datasets/raw/main/bisnode_firms_clean.rds?raw=true")
db <- tryCatch(readRDS(db_con), finally = close(db_con))
head(db)




| Category       | Description                                                                                                                                                        |
|----------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Firm           | Age of firm, squared age, a dummy if newly established, industry categories, location regions for its headquarters, and dummy if located in a big city.        |
| Financial 1    | Winsorized financial variables: sales, fixed, liquid, current, intangible assets, current liabilities, inventories, equity shares, subscribed capital, sales revenues, income before tax, extra income, material, personnel and extra expenditure, extra profit. |
| Financial 2    | Flags (extreme, low, high, zero – when applicable) and polynomials: quadratic terms are created for profit and loss, extra profit and loss, income before tax, and share equity.                                   |
| Growth         | Sales growth is captured by a winsorized growth variable, its quadratic term and flags for extreme low and high values.                                                                   |
| HR             | For the CEO: female dummy, winsorized age and flags, flag for missing information; foreign management dummy; labor cost, and flag for missing labor cost information.                                                |
| Data Quality   | Variables related to the data quality of the financial information, flag for a problem, and the length of the year that the balance sheet covers.                                                                     |
| Interactions   | Interactions with sales growth, firm size, and industry.                                                                                                                                   |



In [None]:
# Percentage of observations in each level of the factor outcome
100 * prop.table(table(db$default_f))

In [None]:
# Same breakdown, in percent, for the `default` column
100 * prop.table(table(db$default))

## Model variables
### Hand-Picked

In [None]:

# Hand-picked predictor set (five variables)
X1 <- c(
  "sales_mil_log",
  "sales_mil_log_sq",
  "d1_sales_mil_log_mod",
  "profit_loss_year_pl",
  "ind2_cat"
)


### Hand-Picked + Firm specific

In [None]:
# Hand-picked predictors extended with firm-specific variables.
# The names carry no surrounding whitespace, so they can be matched exactly
# against column names (e.g. for direct column selection), not only parsed
# inside a formula where stray spaces happen to be ignored.
X2 <- c(
  "sales_mil_log", "sales_mil_log_sq", "d1_sales_mil_log_mod",
  "profit_loss_year_pl", "fixed_assets_bs", "share_eq_bs",
  "curr_liab_bs", "curr_liab_bs_flag_high", "curr_liab_bs_flag_error",
  "age", "foreign_management", "ind2_cat"
)


###  Firm specific + Financial + Lags

In [None]:
# Firm characteristics: age terms, new-firm dummy, industry and location
firm <- c(
  "age", "age2", "new",
  "ind2_cat", "m_region_loc", "urban_m"
)

# Financial variables
engvar <- c(
  "total_assets_bs", "fixed_assets_bs", "liq_assets_bs",
  "curr_assets_bs", "share_eq_bs", "subscribed_cap_bs",
  "intang_assets_bs", "extra_exp_pl", "extra_inc_pl",
  "extra_profit_loss_pl", "inc_bef_tax_pl", "inventories_pl",
  "material_exp_pl", "profit_loss_year_pl", "personnel_exp_pl"
)

# Sales growth, its quadratic term, and flags for extreme low/high values
d1 <- c(
  "d1_sales_mil_log_mod",
  "d1_sales_mil_log_mod_sq",
  "flag_low_d1_sales_mil_log",
  "flag_high_d1_sales_mil_log"
)

In [None]:
# Specification 3: size terms plus the firm, financial and growth groups
X3 <- c(
  "sales_mil_log", "sales_mil_log_sq",
  firm,
  engvar,
  d1
)

###  All but interactions

In [None]:

# Quadratic terms of selected financial variables
engvar2 <- c("extra_profit_loss_pl_quad", "inc_bef_tax_pl_quad",
             "profit_loss_year_pl_quad", "share_eq_bs_quad")

# Flag columns, picked from the column names of `db` by their suffix.
# Only the end-of-string anchor is needed; a leading "*" (glob habit) is a
# quantifier with nothing to repeat, which has no defined meaning in a POSIX
# regular expression and is rejected outright by the PCRE engine.
engvar3 <- c(grep("flag_low$", names(db), value = TRUE),
             grep("flag_high$", names(db), value = TRUE),
             grep("flag_error$", names(db), value = TRUE),
             grep("flag_zero$", names(db), value = TRUE))

# CEO / management and labour variables
hr <- c("female", "ceo_age", "flag_high_ceo_age", "flag_low_ceo_age",
        "flag_miss_ceo_age", "ceo_count", "labor_avg_mod",
        "flag_miss_labor_avg", "foreign_management")

# Balance-sheet data-quality variables
qualityvars <- c("balsheet_flag", "balsheet_length", "balsheet_notfullyear")

In [None]:
# Specification 4: every variable group except the interactions
X4 <- c(
  "sales_mil_log", "sales_mil_log_sq",
  firm,
  engvar, engvar2, engvar3,
  d1,
  hr,
  qualityvars
)

###  All 

In [None]:


# Interaction terms, written in formula syntax ("a*b")

# ... with industry category
interactions1 <- c(
  "ind2_cat*age",
  "ind2_cat*age2",
  "ind2_cat*d1_sales_mil_log_mod",
  "ind2_cat*sales_mil_log",
  "ind2_cat*ceo_age",
  "ind2_cat*foreign_management",
  "ind2_cat*female",
  "ind2_cat*urban_m",
  "ind2_cat*labor_avg_mod"
)

# ... with log sales
interactions2 <- c(
  "sales_mil_log*age",
  "sales_mil_log*female",
  "sales_mil_log*profit_loss_year_pl",
  "sales_mil_log*foreign_management"
)


In [None]:
# Specification 5: all variable groups plus both sets of interactions
X5 <- c(
  "sales_mil_log", "sales_mil_log_sq",
  firm,
  engvar, engvar2, engvar3,
  d1,
  hr,
  qualityvars,
  interactions1, interactions2
)

## Out of sample prediction

In [None]:

set.seed(13505)

# Hold out 20% of the rows as a test set.
# The split is stratified on the factor outcome `default_f`, the variable that
# every model below predicts. When given a numeric outcome,
# createDataPartition() stratifies on quantile bins instead; for a 0/1
# variable those bins can collapse into one, which silently turns the split
# into a plain random sample.
# NOTE(review): assumes db$default is the numeric counterpart of
# db$default_f — confirm. Changing the stratification variable changes which
# rows land in each set, so downstream numbers will differ from earlier runs.
train_indices <- as.integer(createDataPartition(db$default_f, p = 0.8, list = FALSE))
train <- db[train_indices, ]
test <- db[-train_indices, ]

dim(train)
dim(test)


### Logit

In [None]:
# Resampling scheme: 5-fold cross-validation that computes class
# probabilities and stores the hold-out predictions.
train_control <- trainControl(method = "cv", number = 5,
                              summaryFunction = defaultSummary,
                              classProbs = TRUE,
                              savePredictions = TRUE)


In [None]:
# Logit on the hand-picked predictors (X1), resampled with `train_control`
set.seed(13505)
glm_model_handpicked <- train(
  reformulate(X1, response = "default_f"),
  data = train,
  method = "glm",
  family = "binomial",
  trControl = train_control
)

In [None]:
# Display the fitted caret object for the hand-picked logit
glm_model_handpicked

In [None]:
# Logit on every variable group except the interactions (X4)
set.seed(13505)
glm_model_all <- train(
  reformulate(X4, response = "default_f"),
  data = train,
  method = "glm",
  family = "binomial",
  trControl = train_control
)

glm_model_all

In [None]:
# Show the full model formula built from X5 (including the interactions)
reformulate(X5, response = "default_f")

In [None]:

# Tuning grid: 100 penalty values from 10^-1 down to 10^-4, alpha fixed at 1
lambda <- 10^seq(-1, -4, length.out = 100)
grid <- expand.grid(alpha = 1, lambda = lambda)

# Penalised logit on the full specification (X5); predictors are centred and
# scaled before fitting.
set.seed(13505)
glm_model_lasso <- train(
  reformulate(X5, response = "default_f"),
  data = train,
  method = "glmnet",
  family = "binomial",
  preProcess = c("center", "scale"),
  tuneGrid = grid,
  trControl = train_control
)

glm_model_lasso

In [None]:
# First rows of the stored hold-out predictions
head(glm_model_handpicked[["pred"]], n = 6L)

In [None]:
# Confusion matrix of the stored hold-out class predictions (hand-picked logit),
# reported with precision/recall and "default" as the positive class
with(
  glm_model_handpicked$pred,
  confusionMatrix(data = pred, reference = obs,
                  positive = "default", mode = "prec_recall")
)

In [None]:
# Same confusion matrix for the logit fitted on X4
with(
  glm_model_all$pred,
  confusionMatrix(data = pred, reference = obs,
                  positive = "default", mode = "prec_recall")
)

In [None]:
# Re-classify the hold-out predictions with a probability cut-off of 0.3:
# anything below the cut-off is "no_default", everything else "default".
glm_model_all$pred$pred2 <- factor(
  ifelse(glm_model_all$pred$default < 0.3, "no_default", "default"),
  levels = c("default", "no_default")
)

In [None]:
# Inspect the predictions, now including the new `pred2` column
head(glm_model_all[["pred"]], n = 6L)

In [None]:
# Confusion matrix using the 0.3 cut-off classification (`pred2`)
with(
  glm_model_all$pred,
  confusionMatrix(data = pred2, reference = obs,
                  positive = "default", mode = "prec_recall")
)

In [None]:
# MLeval supplies evalm(); plots = "r" requests the ROC curve
p_load("MLeval")

roc_glm_model_all <- evalm(glm_model_all, plots = "r")

In [None]:
# ROC curves of both logit models on one plot
res_insample_all <- evalm(
  list(glm_model_handpicked, glm_model_all),
  gnames = c("Handpicked", "All"),
  plots = "r"
)

In [None]:
# pROC provides roc() and coords(), used in the cells below
p_load(pROC)

In [None]:
# Columns available for the ROC analysis (observed class and probabilities)
head(glm_model_all[["pred"]], n = 6L)

In [None]:
# Open the help page for roc()
?roc

In [None]:
# ROC curves built from the stored hold-out predictions of both logits.
# `levels` (control first, case second) and `direction` are fixed explicitly
# instead of being auto-detected by pROC, so that "default" is always the case
# class and a higher predicted probability always points towards it. This is
# the convention the `>= threshold` classification and the confusion matrices
# with positive = "default" rely on.
roc_obj_handpicked <- roc(response = glm_model_handpicked$pred$obs,
                          predictor = glm_model_handpicked$pred$default,
                          levels = c("no_default", "default"),
                          direction = "<")

roc_obj_all <- roc(response = glm_model_all$pred$obs,
                   predictor = glm_model_all$pred$default,
                   levels = c("no_default", "default"),
                   direction = "<")

In [None]:
# Cut-off selected by pROC's "closest.topleft" criterion on the X4 logit ROC
rfThresh <- coords(
  roc_obj_all,
  x = "best",
  best.method = "closest.topleft"
)
rfThresh

In [None]:
# Classify with the ROC-based cut-off: below it "no_default", otherwise "default"
glm_model_all$pred$pred3 <- factor(
  ifelse(glm_model_all$pred$default < rfThresh$threshold, "no_default", "default"),
  levels = c("default", "no_default")
)

In [None]:
# Confusion matrix using the ROC-based cut-off classification (`pred3`)
with(
  glm_model_all$pred,
  confusionMatrix(data = pred3, reference = obs,
                  positive = "default", mode = "prec_recall")
)

In [None]:
# Summary function for trainControl(): concatenates the metrics returned by
# twoClassSummary(), defaultSummary() and prSummary(), forwarding all
# arguments unchanged to each of them.
multiStats <- function(...) {
  c(
    twoClassSummary(...),
    defaultSummary(...),
    prSummary(...)
  )
}

# 5-fold CV control that reports the combined metrics from multiStats().
# The arguments are spelled out in full: `verboseIter` rather than the
# partially matched `verbose`, and TRUE rather than the reassignable `T`.
ctrl_multiStats <- trainControl(method = "cv",
                                number = 5,
                                summaryFunction = multiStats,
                                classProbs = TRUE,
                                verboseIter = FALSE,
                                savePredictions = TRUE)

# Same penalty grid as before: 100 values from 10^-1 down to 10^-4, alpha = 1
lambda <- 10^seq(-1, -4, length.out = 100)
grid <- expand.grid(alpha = 1, lambda = lambda)

# Penalised logit on X5, with the penalty chosen on the "Spec" metric
set.seed(13505)
glm_model_lasso_spec <- train(
  reformulate(X5, response = "default_f"),
  data = train,
  method = "glmnet",
  family = "binomial",
  preProcess = c("center", "scale"),
  tuneGrid = grid,
  trControl = ctrl_multiStats,
  ## Specify which metric to optimize
  metric = "Spec"
)

glm_model_lasso_spec

In [None]:
# ROC curve of the lasso, restricted to the hold-out predictions made with the
# selected penalty (caret stores predictions for every lambda in the grid).
# `levels` (control, case) and `direction` are fixed explicitly so that
# "default" is the case class and higher probabilities point towards it,
# matching the `>= threshold` classification used afterwards.
lasso_best_rows <- glm_model_lasso$pred$lambda == glm_model_lasso$bestTune$lambda

roc_obj_lasso <- roc(response = glm_model_lasso$pred$obs[lasso_best_rows],
                     predictor = glm_model_lasso$pred$default[lasso_best_rows],
                     levels = c("no_default", "default"),
                     direction = "<")

In [None]:
# Cut-off selected by pROC's "closest.topleft" criterion on the lasso ROC
rfThresh_lasso <- coords(
  roc_obj_lasso,
  x = "best",
  best.method = "closest.topleft"
)
rfThresh_lasso

In [None]:
# Classify the best-lambda hold-out predictions with the ROC-based cut-off
pred_lasso <- factor(
  ifelse(
    glm_model_lasso$pred$default[glm_model_lasso$pred$lambda == glm_model_lasso$bestTune$lambda] >=
      rfThresh_lasso$threshold,
    "default",
    "no_default"
  ),
  levels = c("default", "no_default")
)

In [None]:
# Confusion matrix for the lasso at the ROC-based cut-off, compared against
# the observed classes of the same best-lambda rows
confusionMatrix(
  data = pred_lasso,
  reference = glm_model_lasso$pred$obs[glm_model_lasso$pred$lambda == glm_model_lasso$bestTune$lambda],
  positive = "default",
  mode = "prec_recall"
)

In [None]:
# Precision and recall of the lasso at 100 evenly spaced cut-offs in [0, 1]
prec_recall <- data.frame(
  coords(
    roc_obj_lasso,
    seq(0, 1, length.out = 100),
    ret = c("threshold", "precision", "recall")
  )
)

In [None]:
# Add the F1 score (harmonic mean of precision and recall) for every cut-off
prec_recall <- mutate(
  prec_recall,
  F1 = (2 * precision * recall) / (precision + recall)
)
prec_recall

In [None]:
# Cut-off with the highest F1 score
with(prec_recall, threshold[which.max(F1)])

In [None]:
# Classify the best-lambda hold-out predictions with the F1-maximising cut-off
pred_lasso_F1 <- factor(
  ifelse(
    glm_model_lasso$pred$default[glm_model_lasso$pred$lambda == glm_model_lasso$bestTune$lambda] >=
      prec_recall$threshold[which.max(prec_recall$F1)],
    "default",
    "no_default"
  ),
  levels = c("default", "no_default")
)

In [None]:
# Confusion matrix for the lasso at the F1-maximising cut-off
confusionMatrix(
  data = pred_lasso_F1,
  reference = glm_model_lasso$pred$obs[glm_model_lasso$pred$lambda == glm_model_lasso$bestTune$lambda],
  positive = "default",
  mode = "prec_recall"
)