<a href="https://colab.research.google.com/github/imdiegolopes/puc-mvp-04-classificacao-de-risco-de-credito/blob/main/ML_Credit_Risk_Classifier_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
# Instalação do scikit-learn (caso não esteja instalado)
!pip install scikit-learn

!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

# fetch dataset
statlog_german_credit_data = fetch_ucirepo(id=144)

# data (as pandas dataframes)
X = statlog_german_credit_data.data.features
y = statlog_german_credit_data.data.targets

# metadata
print(statlog_german_credit_data.metadata)

# variable information
print(statlog_german_credit_data.variables)

{'uci_id': 144, 'name': 'Statlog (German Credit Data)', 'repository_url': 'https://archive.ics.uci.edu/dataset/144/statlog+german+credit+data', 'data_url': 'https://archive.ics.uci.edu/static/public/144/data.csv', 'abstract': 'This dataset classifies people described by a set of attributes as good or bad credit risks. Comes in two formats (one all numeric). Also comes with a cost matrix', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1000, 'num_features': 20, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Other', 'Marital Status', 'Age', 'Occupation'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1994, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5NC77', 'creators': ['Hans Hofmann'], 'intro_paper': None, 'additional_info': {'summary': 'Two datasets are provided.  the original dataset, in the form provided by

# Explicação das Variáveis do Modelo

**Attribute 1:** (qualitative)      
  *Status of existing checking account*
  - A11: ... < 0 DM
  - A12: 0 <= ... < 200 DM
  - A13: ... >= 200 DM / salary assignments for at least 1 year
  - A14: no checking account

**Attribute 2:** (numerical)
  - Duration in month

**Attribute 3:** (qualitative)
  *Credit history*
  - A30: no credits taken/ all credits paid back duly
  - A31: all credits at this bank paid back duly
  - A32: existing credits paid back duly till now
  - A33: delay in paying off in the past
  - A34: critical account/ other credits existing (not at this bank)

**Attribute 4:** (qualitative)
  *Purpose*
  - A40: car (new)
  - A41: car (used)
  - A42: furniture/equipment
  - A43: radio/television
  - A44: domestic appliances
  - A45: repairs
  - A46: education
  - A47: (vacation - does not exist?)
  - A48: retraining
  - A49: business
  - A410: others

**Attribute 5:** (numerical)
  - Credit amount

**Attribute 6:** (qualitative)
  *Savings account/bonds*
  - A61: ... < 100 DM
  - A62: 100 <= ... < 500 DM
  - A63: 500 <= ... < 1000 DM
  - A64: ... >= 1000 DM
  - A65: unknown/ no savings account

**Attribute 7:** (qualitative)
  *Present employment since*
  - A71: unemployed
  - A72: ... < 1 year
  - A73: 1  <= ... < 4 years  
  - A74: 4  <= ... < 7 years
  - A75: ... >= 7 years

**Attribute 8:** (numerical)
  - Installment rate in percentage of disposable income

**Attribute 9:** (qualitative)
  *Personal status and sex*
  - A91: male: divorced/separated
  - A92: female: divorced/separated/married
  - A93: male: single
  - A94: male: married/widowed
  - A95: female: single

**Attribute 10:** (qualitative)
  *Other debtors / guarantors*
  - A101: none
  - A102: co-applicant
  - A103: guarantor

**Attribute 11:** (numerical)
  - Present residence since

**Attribute 12:** (qualitative)
  *Property*
  - A121: real estate
  - A122: if not A121: building society savings agreement/ life insurance
  - A123: if not A121/A122: car or other, not in attribute 6
  - A124: unknown / no property

**Attribute 13:** (numerical)
  - Age in years

**Attribute 14:** (qualitative)
  *Other installment plans*
  - A141: bank
  - A142: stores
  - A143: none

**Attribute 15:** (qualitative)
  *Housing*
  - A151: rent
  - A152: own
  - A153: for free

**Attribute 16:** (numerical)
  - Number of existing credits at this bank

**Attribute 17:** (qualitative)
  *Job*
  - A171: unemployed/ unskilled  - non-resident
  - A172: unskilled - resident
  - A173: skilled employee / official
  - A174: management/ self-employed/ highly qualified employee/ officer

**Attribute 18:** (numerical)
  - Number of people being liable to provide maintenance for

**Attribute 19:** (qualitative)
  *Telephone*
  - A191: none
  - A192: yes, registered under the customer's name

**Attribute 20:** (qualitative)
  *Foreign worker*
  - A201: yes
  - A202: no


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Baseline: one-hot encode the categorical features, train a RandomForest with
# default hyperparameters and evaluate it on a held-out 20% test split.

# Features and target; 'class' is the target column of the fetched dataset.
X = statlog_german_credit_data.data.features
y = statlog_german_credit_data.data.targets['class']

# Categorical columns are the ones stored with object dtype.
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']

# One-hot encode the categorical columns; every other column is passed through
# unchanged. handle_unknown='ignore' makes a category that did not occur in the
# training split encode as all zeros instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# Split the data into training and testing sets (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the encoder on the training split only, then apply it to both splits.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Initialize the RandomForestClassifier (fixed seed for reproducibility).
model = RandomForestClassifier(random_state=42)

# Train the model
model.fit(X_train_preprocessed, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_preprocessed)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)


Accuracy: 0.8

Confusion Matrix:
[[131  10]
 [ 30  29]]

Classification Report:
              precision    recall  f1-score   support

           1       0.81      0.93      0.87       141
           2       0.74      0.49      0.59        59

    accuracy                           0.80       200
   macro avg       0.78      0.71      0.73       200
weighted avg       0.79      0.80      0.79       200



## Resultados da Avaliação do Modelo

### 1. Accuracy:
Accuracy: 0.8
- **Interpretação:** O modelo tem uma acurácia global de 80%.

### 2. Matriz de Confusão:

```
[[131 10]
[ 30 29]]
```

- **Interpretação** (linhas = classe real, colunas = classe prevista; a classe '1' é tratada como positiva):
  - **Verdadeiros Positivos (VP):** 131 instâncias da classe '1' corretamente previstas como '1'.
  - **Verdadeiros Negativos (VN):** 29 instâncias da classe '2' corretamente previstas como '2'.
  - **Falsos Positivos (FP):** 30 instâncias da classe '2' incorretamente previstas como '1'.
  - **Falsos Negativos (FN):** 10 instâncias da classe '1' incorretamente previstas como '2'.

### 3. Relatório de Classificação:

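| classe          | precisão | recall | f1-score | suporte |
|-----------------|----------|--------|----------|---------|
| 1               | 0.81     | 0.93   | 0.87     | 141     |
| 2               | 0.74     | 0.49   | 0.59     | 59      |
| accuracy        |          |        | 0.80     | 200     |
| média macro     | 0.78     | 0.71   | 0.73     | 200     |
| média ponderada | 0.79     | 0.80   | 0.79     | 200     |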

- **Interpretação:**
  - **Precisão para '1' (Bom):** 81%, **Recall:** 93%, **F1-Score:** 87%, **Suporte:** 141.
  - **Precisão para '2' (Ruim):** 74%, **Recall:** 49%, **F1-Score:** 59%, **Suporte:** 59.

### Resumo:
- O modelo se sai bem na identificação de instâncias 'Bom' (classe '1') com alta precisão e recall.
- No entanto, ele tem mais dificuldade com as instâncias 'Ruim' (classe '2'), onde o recall é de apenas 49%: 30 das 59 instâncias 'Ruim' do conjunto de teste foram classificadas como 'Bom'.


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib

# Hyperparameter tuning: same preprocessing and split as the baseline, followed
# by a 5-fold grid search over RandomForest hyperparameters. The best estimator
# is evaluated on the held-out test split and exported with joblib.

# Features and target; 'class' is the target column of the fetched dataset.
X = statlog_german_credit_data.data.features
y = statlog_german_credit_data.data.targets['class']

# Categorical columns are the ones stored with object dtype.
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']

# One-hot encode the categorical columns; every other column is passed through
# unchanged. handle_unknown='ignore' makes a category that did not occur in the
# training split encode as all zeros instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)

# Split the data into training and testing sets (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the encoder on the training split only, then apply it to both splits.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Initialize the RandomForestClassifier (fixed seed for reproducibility).
model = RandomForestClassifier(random_state=42)

# Parameter grid for hyperparameter tuning: 3 * 4 * 3 * 3 = 108 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 5-fold cross-validation (108 * 5 = 540 fits).
# n_jobs=-1 runs the fits on all available cores; it does not change which
# candidate is selected.
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_preprocessed, y_train)

# Best model found by the search, refitted on the whole training split.
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

# Make predictions on the test set
y_pred = best_model.predict(X_test_preprocessed)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the results
print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

# Export the best model. It was trained on the one-hot encoded features, so the
# fitted preprocessor is exported as well: raw feature rows must go through
# preprocessor.transform() before being passed to the loaded model.
joblib.dump(best_model, 'best_credit_risk_classifier_trained_model.pkl')
joblib.dump(preprocessor, 'best_credit_risk_classifier_preprocessor.pkl')