<a href="https://colab.research.google.com/github/inginddie/Maestria/blob/main/german_credit_con_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.dummy import DummyClassifier


In [None]:
# Remote CSV holding the German credit data set used throughout the notebook.
DATA_URL = "https://raw.githubusercontent.com/camilousa/datasets/refs/heads/master/german_credit_data(2)(1).csv"
df = pd.read_csv(filepath_or_buffer=DATA_URL)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the partition
# identical between runs.
split_options = {"test_size": 0.2, "random_state": 42}
train, test = train_test_split(df, **split_options)

In [None]:
# Categorical branch: fill gaps with each column's most frequent value, then
# one-hot encode while dropping the first level of every feature.
# NOTE(review): with drop="first" plus handle_unknown="ignore", a category not
# seen during fit is encoded as all zeros, i.e. the same as the dropped level —
# confirm this is acceptable for the model.
categorical_steps = [
    ("imputador", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore")),
]
pipeline_cat = Pipeline(steps=categorical_steps)

In [None]:

# Numeric branch: replace missing values with the column mean, then rescale
# each feature with MinMaxScaler (default range).
numeric_steps = [
    ("imputador", SimpleImputer(strategy="mean")),
    ("normalizador", MinMaxScaler()),
]
pipeline_num = Pipeline(steps=numeric_steps)

In [None]:
# Route each group of columns to its own preprocessing branch. Columns that are
# not listed here are not passed to either branch.
categorical_columns = ["Sex", "Job", "Housing", "Saving_accounts", "Checking_account", "Purpose"]
numeric_columns = ["Age", "Credit_amount", "Duration"]

column_transformer = ColumnTransformer(
    transformers=[
        ("pipeline_cat", pipeline_cat, categorical_columns),
        ("pipeline_num", pipeline_num, numeric_columns),
    ]
)

In [None]:
# Baseline model: preprocessing followed by a shallow random forest.
# The forest is seeded (same seed as the train/test split) so that the
# cross-validation scores reported below can be reproduced after
# Restart Kernel -> Run All; without a seed every run yields different numbers.
pipeline = Pipeline([
    ("column_transformer", column_transformer),
    ("modelo", RandomForestClassifier(max_depth=3, random_state=42)),
])

In [None]:
# 5-fold cross-validation on the training split. The target is the boolean
# "Risk == 'good'", so precision and recall refer to the 'good' class.
features_train = train.drop(columns="Risk")
target_is_good = train["Risk"] == 'good'

results = cross_validate(
    pipeline,
    features_train,
    target_is_good,
    cv=5,
    scoring=["accuracy", "precision", "recall"],
    return_train_score=True,
)

In [None]:

# Summarise the cross-validation scores: mean and standard deviation across
# folds, for the training folds and the validation folds of each metric.
# Requires numpy as `np` (imported in the first cell).
for metric in ["accuracy", "precision", "recall"]:
    print(f"{metric}:")
    # cross_validate stores validation-fold scores under the "test_" prefix.
    for label, split in (("Train", "train"), ("Validation", "test")):
        fold_scores = results[f"{split}_{metric}"]
        print(f"  {label} - Mean: {np.mean(fold_scores):.4f}, Std: {np.std(fold_scores):.4f}")


accuracy:
  Train - Mean: 0.7200, Std: 0.0069
  Validation - Mean: 0.7050, Std: 0.0047
precision:
  Train - Mean: 0.7144, Std: 0.0054
  Validation - Mean: 0.7058, Std: 0.0045
recall:
  Train - Mean: 0.9987, Std: 0.0011
  Validation - Mean: 0.9911, Std: 0.0138


### Pipeline con balanceo

In [None]:
# Same preprocessing and model as before, with SMOTE oversampling inserted
# before the classifier. An imblearn Pipeline is required because a plain
# scikit-learn Pipeline does not accept sampler steps.
# Both stochastic steps are seeded so the reported scores are reproducible
# after Restart Kernel -> Run All.
pipeline = ImbPipeline([
    ("column_transformer", column_transformer),
    ("balanceo", SMOTE(random_state=42)),
    ("modelo", RandomForestClassifier(max_depth=3, random_state=42)),
])

In [None]:
# Re-run the 5-fold cross-validation with the current `pipeline`, using the
# same features, boolean target and metrics as the earlier evaluation.
cv_options = {
    "cv": 5,
    "scoring": ["accuracy", "precision", "recall"],
    "return_train_score": True,
}
results = cross_validate(
    pipeline,
    train.drop(columns="Risk"),
    train["Risk"] == 'good',
    **cv_options,
)

In [None]:
# Summarise the cross-validation scores: mean and standard deviation across
# folds, for the training folds and the validation folds of each metric.
# Requires numpy as `np` (imported in the first cell).
for metric in ["accuracy", "precision", "recall"]:
    print(f"{metric}:")
    # cross_validate stores validation-fold scores under the "test_" prefix.
    for label, split in (("Train", "train"), ("Validation", "test")):
        fold_scores = results[f"{split}_{metric}"]
        print(f"  {label} - Mean: {np.mean(fold_scores):.4f}, Std: {np.std(fold_scores):.4f}")


accuracy:
  Train - Mean: 0.6978, Std: 0.0328
  Validation - Mean: 0.6562, Std: 0.0471
precision:
  Train - Mean: 0.8274, Std: 0.0129
  Validation - Mean: 0.7992, Std: 0.0268
recall:
  Train - Mean: 0.7164, Std: 0.0446
  Validation - Mean: 0.6781, Std: 0.0611


In [None]:
# Reference point: a classifier that always predicts the most frequent class,
# evaluated with the same folds, target and metrics as the real models.
baseline_model = DummyClassifier(strategy="most_frequent")
results = cross_validate(
    baseline_model,
    X=train.drop(columns="Risk"),
    y=train["Risk"] == 'good',
    cv=5,
    scoring=["accuracy", "precision", "recall"],
    return_train_score=True,
)

In [None]:
# Summarise the cross-validation scores: mean and standard deviation across
# folds, for the training folds and the validation folds of each metric.
# Requires numpy as `np` (imported in the first cell).
for metric in ["accuracy", "precision", "recall"]:
    print(f"{metric}:")
    # cross_validate stores validation-fold scores under the "test_" prefix.
    for label, split in (("Train", "train"), ("Validation", "test")):
        fold_scores = results[f"{split}_{metric}"]
        print(f"  {label} - Mean: {np.mean(fold_scores):.4f}, Std: {np.std(fold_scores):.4f}")


accuracy:
  Train - Mean: 0.6987, Std: 0.0006
  Validation - Mean: 0.6988, Std: 0.0025
precision:
  Train - Mean: 0.6987, Std: 0.0006
  Validation - Mean: 0.6988, Std: 0.0025
recall:
  Train - Mean: 1.0000, Std: 0.0000
  Validation - Mean: 1.0000, Std: 0.0000
