[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/jorge-cardeno/inteligencia-artificial-proyectos/blob/main/04-modelo-con-SGD-Classifier.ipynb)

### Instalar dependencias y librerías

In [None]:
%pip install pandas matplotlib seaborn scikit-learn numpy jupyter notebook kaggle

Collecting matplot
  Downloading matplot-0.1.9-py2.py3-none-any.whl (5.0 kB)
Collecting pyloco>=0.0.134 (from matplot)
  Downloading pyloco-0.0.139-py2.py3-none-any.whl (60 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Collecting ushlex (from pyloco>=0.0.134->matplot)
  Downloading ushlex-0.99.1.tar.gz (4.7 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting twine (from pyloco>=0.0.134->matplot)
  Downloading twine-4.0.2-py3-none-any.whl (36 kB)
Collecting typing (from pyloco>=0.0.134->matplot)
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting

In [None]:
# Colab-only module; this import fails outside Google Colab.
from google.colab import files

# Open the browser file picker; the next lines expect a file named kaggle.json
# to have been uploaded into the current working directory.
files.upload()
# Copy kaggle.json into ~/.kaggle/ and restrict it to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c home-credit-default-risk
!mkdir data
!unzip home-credit-default-risk.zip -d data/
!rm *.zip

Downloading home-credit-default-risk.zip to /home/andre/dev/intro-ia
100%|███████████████████████████████████████▊| 686M/688M [00:26<00:00, 28.2MB/s]
100%|████████████████████████████████████████| 688M/688M [00:26<00:00, 27.5MB/s]
mkdir: cannot create directory ‘data’: File exists
Archive:  home-credit-default-risk.zip
replace data/HomeCredit_columns_description.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    precision_score
)

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

%matplotlib inline

In [None]:
# Variables of interest, taken from the Exploratory Data Analysis.
# Used to select the feature columns of the training table.
INTEREST_VARIABLES = [
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "OCCUPATION_TYPE",
    "ORGANIZATION_TYPE",
    "CODE_GENDER",
    "OWN_CAR_AGE"
]

In [None]:
def model_pipeline(X, y, model):
    """Preprocess the features and train the given model.

    Numeric columns have their missing values replaced by 0. Object (string)
    columns have their missing values replaced by the label "unknown" and are
    then one-hot encoded. Preprocessing and estimator are chained in a single
    scikit-learn ``Pipeline``, which is fitted on ``X`` and ``y``.

    Columns that are neither numeric nor object dtype are not listed in any
    transformer and are therefore dropped by the ``ColumnTransformer``
    (its default ``remainder`` is "drop").

    Parameters
    ----------
    X : DataFrame
        Dataset already filtered to the variables of interest.
    y : Series
        Target variable to predict.
    model : estimator
        scikit-learn estimator to train; used as the last pipeline step.

    Returns
    -------
    Pipeline
        Fitted pipeline (preprocessing followed by the model). Calling
        ``predict`` on it applies the same preprocessing to new data.
    """

    # Identify numeric and categorical columns by dtype family. select_dtypes
    # catches every numeric width (int32, float32, ...), not only int64/float64,
    # so such columns are no longer silently excluded from the model.
    numerical_cols = X.select_dtypes(include="number").columns.tolist()
    categorical_cols = X.select_dtypes(include="object").columns.tolist()

    # Numeric preprocessing: fill missing values with 0.
    numerical_transformer = SimpleImputer(strategy="constant", fill_value=0)

    # Categorical preprocessing: fill missing values with an explicit
    # "unknown" category, then one-hot encode. handle_unknown="ignore" makes
    # categories unseen during fit encode as all zeros instead of raising.
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    # Route each column group to its transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, numerical_cols),
            ("cat", categorical_transformer, categorical_cols)
        ]
    )

    # Full pipeline: preprocessing followed by the estimator.
    pipe = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])

    # Fit preprocessing and model together on the training data.
    pipe.fit(X, y)

    return pipe

In [3]:
# Load the raw training table
df_full = pd.read_csv("data/application_train.csv")

# Target column, and the feature columns restricted to the variables of interest
y = df_full["TARGET"]
X = df_full[INTEREST_VARIABLES]

# Hold out 10% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.9,
    test_size=0.1,
    random_state=0,
)

In [4]:
# Show the dtype and non-null count of every training feature column
X_train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 276759 entries, 131869 to 305711
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   NAME_INCOME_TYPE     276759 non-null  object 
 1   NAME_EDUCATION_TYPE  276759 non-null  object 
 2   NAME_FAMILY_STATUS   276759 non-null  object 
 3   NAME_HOUSING_TYPE    276759 non-null  object 
 4   DAYS_BIRTH           276759 non-null  int64  
 5   DAYS_EMPLOYED        276759 non-null  int64  
 6   OCCUPATION_TYPE      189861 non-null  object 
 7   ORGANIZATION_TYPE    276759 non-null  object 
 8   CODE_GENDER          276759 non-null  object 
 9   OWN_CAR_AGE          94140 non-null   float64
dtypes: float64(1), int64(2), object(7)
memory usage: 23.2+ MB


### Entrenamiento del Modelo

In [5]:
# Train the model.
# random_state is fixed because SGDClassifier shuffles the training data between
# epochs; without a seed every run produces a different model and different metrics.
# NOTE(review): model_pipeline only imputes the numeric columns, it does not scale
# them; the scikit-learn docs say SGD is sensitive to feature scaling — consider
# adding a scaler.
model = model_pipeline(X_train, y_train, SGDClassifier(random_state=0))

### Evaluación del Modelo

In [10]:
# Predict on the held-out test set
preds = model.predict(X_test)

# Compute the evaluation metrics. Every scikit-learn metric takes
# (y_true, y_pred) in that order; accuracy is symmetric, but the order is kept
# the same across all calls so that precision and recall cannot be swapped
# by copying the wrong line.
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

In [11]:
# Report each metric on its own line, in the order they were computed
for label, value in (
    ("Exactitud:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(label, value)

Exactitud: 0.887812174817898
Precision: 0.03771849126034959
Recall: 0.016768916155419224
F1 Score: 0.023216308040770108
