In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd
from pathlib import Path
from decouple import config

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV

In [9]:
model_path = Path("../",config("MODEL_PATH"))
data_path = Path("../"+config("DATA_PATH"))

model_path.mkdir(parents=True, exist_ok=True)
data_path.mkdir(parents=True, exist_ok=True)


In [11]:
credit = fetch_openml(name="credit-g", as_frame=True)
X, y = credit.data, credit.target
X.to_csv(Path(data_path, "X.csv"))
y.to_csv(Path(data_path, "y.csv"))

  warn(
  warn(


In [3]:
categorical_features = list(X.dtypes[X.dtypes == 'category'].index)
numerical_features = list(X.dtypes[X.dtypes != 'category'].index)
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
numerical_pipe = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

vector_cleaning = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_features),
        ("num", numerical_pipe, numerical_features),
    ]
)

pca = PCA(n_components=10)

preprocessing = Pipeline(
    [("vector_cleaning", vector_cleaning),
    ("dim_reduction", pca),]
)

In [4]:
model = RandomForestClassifier()

In [5]:
pipeline = Pipeline([("preprocessing", preprocessing), ("classifier", model)])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1
)

In [14]:
param_grid = {
    "preprocessing__dim_reduction__n_components": [5, 15, 30, 45, 60],
    "classifier__n_estimators": [10, 50, 100, 150, 200],
    "classifier__criterion": ["gini", "entropy", "log_loss"],
}

In [15]:
search = GridSearchCV(pipeline, param_grid, n_jobs=2)
search.fit(X, y)

In [16]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.755):
{'classifier__criterion': 'log_loss', 'classifier__n_estimators': 50, 'preprocessing__dim_reduction__n_components': 45}


In [27]:
def save_obj(obj, name):
    filename = Path(model_path, name+".pkl")
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [28]:
save_obj(search.best_estimator_, "pipeline")