In [26]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
credit = fetch_openml(name="credit-g", as_frame=True)
X, y = credit.data, credit.target
X.to_csv("resources/data/X.csv")
y.to_csv("resources/data/y.csv")

  warn(
  warn(


In [3]:
categorical_features = list(X.dtypes[X.dtypes == 'category'].index)
numerical_features = list(X.dtypes[X.dtypes != 'category'].index)
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
numerical_pipe = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())])

vector_cleaning = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_features),
        ("num", numerical_pipe, numerical_features),
    ]
)

pca = PCA(n_components=10)

preprocessing = Pipeline(
    [("vector_cleaning", vector_cleaning),
    ("dim_reduction", pca),]
)

In [4]:
model = RandomForestClassifier()

In [5]:
pipeline = Pipeline([("preprocessing", preprocessing), ("classifier", model)])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1
)

In [14]:
param_grid = {
    "preprocessing__dim_reduction__n_components": [5, 15, 30, 45, 60],
    "classifier__n_estimators": [10, 50, 100, 150, 200],
    "classifier__criterion": ["gini", "entropy", "log_loss"],
}

In [15]:
search = GridSearchCV(pipeline, param_grid, n_jobs=2)
search.fit(X, y)

In [16]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.755):
{'classifier__criterion': 'log_loss', 'classifier__n_estimators': 50, 'preprocessing__dim_reduction__n_components': 45}


In [27]:
def save_obj(obj, name):
    with open('resources/objects/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [28]:
save_obj(search.best_estimator_, "pipeline")

In [3]:
from fastapi.encoders import jsonable_encoder

In [33]:
categories_raw

checking_status           category
duration                   float64
credit_history            category
purpose                   category
credit_amount              float64
savings_status            category
employment                category
installment_commitment     float64
personal_status           category
other_parties             category
residence_since            float64
property_magnitude        category
age                        float64
other_payment_plans       category
housing                   category
existing_credits           float64
job                       category
num_dependents             float64
own_telephone             category
foreign_worker            category
dtype: object

In [62]:
type(X.sample()["checking_status"].values[0])

str

In [65]:
categories_dict = {}
categories_raw = X.dtypes
example_data = X.sample()
for category in categories_raw.index:
    numeric_data_types = {int, float, complex}
    cat_type = categories_raw[category]
    if cat_type not in numeric_data_types and not pd.api.types.is_numeric_dtype(cat_type):
        example_value = X.sample()[category].values[0]
        categories_dict[category] = (str, example_value)
    else:
        example_value = X.sample()[category].values[0]
        categories_dict[category] = (float, example_value)


In [66]:
def load_obj(name):
    with open('resources/objects/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
pipe = load_obj("pipeline")

In [74]:
data = {'checking_status': '0<=X<200', 'duration': 48.0, 'credit_history': 'existing paid', 'purpose': 'furniture/equipment', 'credit_amount': 7824.0, 'savings_status': '100<=X<500', 'employment': '1<=X<4', 'installment_commitment': 2.0, 'personal_status': 'male single', 'other_parties': 'none', 'residence_since': 4.0, 'property_magnitude': 'car', 'age': 35.0, 'other_payment_plans': 'bank', 'housing': 'rent', 'existing_credits': 2.0, 'job': 'skilled', 'num_dependents': 1.0, 'own_telephone': 'none', 'foreign_worker': 'yes'}

In [76]:
pd.Series(data)

checking_status                      0<=X<200
duration                                 48.0
credit_history                  existing paid
purpose                   furniture/equipment
credit_amount                          7824.0
savings_status                     100<=X<500
employment                             1<=X<4
installment_commitment                    2.0
personal_status                   male single
other_parties                            none
residence_since                           4.0
property_magnitude                        car
age                                      35.0
other_payment_plans                      bank
housing                                  rent
existing_credits                          2.0
job                                   skilled
num_dependents                            1.0
own_telephone                            none
foreign_worker                            yes
dtype: object

In [80]:
pipe.predict(pd.DataFrame([data]))

array(['good'], dtype=object)