<div >
<img src = "../banner.jpg" />
</div>

<a target="_blank" href="https://colab.research.google.com/github/ignaciomsarmiento/BDML_202402/blob/main/Lecture07/Notebook_Classification_intro.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>



# Classification

To work through the steps of probability-based classification, we’ll use a real dataset on unemployment from the Argentine Permanent Household Survey (EPH). This dataset includes socio-economic and demographic variables that allow us to predict whether a person is unemployed or not.

Unemployment prediction is a classic problem of classification and remains one of the key application areas for machine learning: we use previous employment results (employed versus unemployed) to train a model that can predict the employment status of individuals in new cases.

\begin{align}
Unemployment = f(x) + u
\end{align}

where $Unemployment = I(\text{the person is unemployed})$ is an indicator equal to 1 if the person is unemployed and 0 otherwise.


In [None]:
# Load libraries
# library() stops with an error if pacman is not installed; require() would only
# return FALSE and let the failure surface later at p_load().
library(pacman)
p_load(tidyverse)
# Fix the random seed so results that depend on random draws are reproducible.
set.seed(1011)

In [None]:
# Read the data: an .Rds file downloaded from GitHub and stored in `db`
db <- url("https://github.com/ignaciomsarmiento/datasets/blob/main/desempelo_arg_2010.Rds?raw=true") %>%
  readRDS()

# Show the first rows
db %>%
  head()

In [None]:
# Percentage of observations in each value of `desempleado`
100 * prop.table(table(db$desempleado))

In [None]:
# Count observations per value of `desempleado`, then relabel the 0/1 codes
data <- db %>%
  group_by(desempleado) %>%
  tally() %>%
  mutate(
    desempleado = factor(
      desempleado,
      levels = c(0, 1),
      labels = c("empleado", "desempleado")
    )
  )

# Bar chart of the counts, one bar per class
data %>%
  ggplot(aes(x = desempleado, y = n, fill = desempleado)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  scale_fill_manual(
    values = c("desempleado" = "orange", "empleado" = "blue") # colors can be changed
  ) +
  labs(x = "", y = "count") # customize axis labels if needed



## Logit Estimation

\begin{align}
p_i &=\frac{e^{X_i\beta}}{1+e^{X_i\beta}}
\end{align}


In [None]:
# Logit model of `desempleado` on `edad` and `mujer`, fitted on the full data set.
mylogit <- glm(desempleado ~ edad + mujer, data = db, family = "binomial")
# summary() for glm objects has no `type` argument; the stray `type = "text"`
# was silently ignored, so it is removed.
summary(mylogit)

## Prediction


\begin{align}
\hat{p}_i &=\frac{e^{X_i\hat{\beta}}}{1+e^{X_i\hat{\beta}}}
\end{align}

In [None]:

# Append the fitted probabilities to the data.
# type = "response" gives the predicted probabilities.
db <- db %>%
  mutate(prob_hat = predict(mylogit, newdata = db, type = "response"))

# Compare the observed outcome with its predicted probability
db %>%
  select(desempleado, prob_hat) %>%
  head()


## Classification 

\begin{align}
\hat{Y}_i= 1[\hat{p}_i >0.5]
\end{align}

In [None]:
# Classification threshold (Bayes rule)
rule <- 0.5

# Predicted class label: 1 when the predicted probability exceeds the threshold,
# 0 otherwise
db <- db %>%
  mutate(desempleado_hat = ifelse(prob_hat > rule, 1, 0))

db %>%
  select(desempleado, prob_hat, desempleado_hat) %>%
  head()


## Out of sample prediction

In [None]:
p_load("caret")

# Row indices that define the training set
inTrain <- createDataPartition(
  db$desempleado, # the dependent (target) variable
  p = 0.7,        # 70% of the data goes into the training set
  list = FALSE
)

# Rows selected by the partition form the training set; the rest form the test set
train <- db[inTrain, ]
test <- db[-inTrain, ]

In [None]:
# First rows of the training set
train %>%
  head()

In [None]:
# Share of each value of `desempleado` in the training set
table(train$desempleado) %>%
  prop.table()

In [None]:
# Share of each value of `desempleado` in the test set
table(test$desempleado) %>%
  prop.table()

### Logit

In [None]:
# Resampling setup passed to caret::train(): 5-fold cross-validation, with class
# probabilities requested and the resampled predictions kept.
ctrl <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  verboseIter = FALSE,   # full argument name; `verbose` only worked via partial matching
  savePredictions = TRUE # TRUE instead of T, which can be reassigned
)


In [None]:
# caret decides between classification and regression from the type of the
# outcome, and classProbs = TRUE needs factor levels that are valid R names.
# Keep a copy of the original 0/1 coding, then recode the outcome as a labelled
# factor, the same recoding applied to the test set further below.
# Run this cell only once: recoding an already-recoded factor yields NA.
train <- train %>% mutate(desempleado_num = desempleado)
train <- train %>%
  mutate(desempleado = factor(desempleado, levels = c(0, 1), labels = c("empleado", "desempleado")))


In [None]:
# Seed set right before training so the cross-validation folds are reproducible
set.seed(1410)

# Logit fitted through caret (method = "glm", family = "binomial") on the
# training set, using the resampling scheme defined in `ctrl`
mylogit_caret <- train(
  desempleado ~ edad + mujer + nivel_ed + parentesco +
    estado_civil + tipo_vivienda + ing_tot_fam +
    total_miembros_hogar + miembros_hogar_menores10,
  data = train,
  method = "glm",
  trControl = ctrl,
  family = "binomial"
)

# Print the fitted model and its resampling results
mylogit_caret

In [None]:
# Collect, for every test row: the observed outcome, the predicted class
# probabilities and the predicted class label
predictTest_logit <- data.frame(
  obs = test$desempleado,
  predict(mylogit_caret, type = "prob", newdata = test),
  pred = predict(mylogit_caret, type = "raw", newdata = test)
)


In [None]:
# First rows of the test-set predictions
predictTest_logit %>%
  head()

In [None]:
# Recode the 0/1 outcome of the test set as a labelled factor
test <- test %>%
  mutate(
    desempleado = factor(
      desempleado,
      levels = c(0, 1),
      labels = c("empleado", "desempleado")
    )
  )

In [None]:
# Store the class labels predicted by the caret logit as a new column of `test`
test <- test %>%
  mutate(
    desempleo_hat_logit_orig = predict(mylogit_caret, newdata = test, type = "raw")
  )

In [None]:
# Confusion matrix of predicted vs. observed labels on the test set, with
# "desempleado" as the positive class
confusionMatrix(
  data = test$desempleo_hat_logit_orig,
  reference = test$desempleado,
  positive = "desempleado"
)
