In [1]:
import numpy as np
import pandas as pd
from faker import Faker
from sklearn.datasets import load_iris, fetch_20newsgroups
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from feet import pipeline_from_config

In [2]:
# Names of the four iris measurement columns and of the label column.
iris_predictive_columns = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
target_column = 'target'


def get_iris_dataframe():
    """Load iris and recast it as a two-class problem.

    Returns:
        tuple: ``(features, target)`` where ``features`` is a DataFrame whose
        columns are the dataset's ``feature_names`` and ``target`` is an
        integer Series named ``'target'`` holding 0 where the original label
        is 0 and 1 for every other label.
    """
    bunch = load_iris()

    features = pd.DataFrame(bunch['data'], columns=bunch['feature_names'])

    # Collapse the original labels into "label 0" versus "anything else".
    original_labels = pd.Series(bunch['target'], name='target')
    target = (original_labels != 0).astype('int64')

    return features, target

In [3]:
# Test the "gmm" post-processing transformer.
#
# For each of 5 stratified folds: fit the feet pipeline on the training rows,
# transform the held-out rows with the fitted pipeline, train a logistic
# regression on the transformed training features and print the held-out AUC.
# NOTE(review): what "clusters": 50 means is defined by feet's "gmm" step
# (presumably the number of mixture components) -- confirm against feet.
df, y = get_iris_dataframe()
config = {
    "post_process": [
        {"name": "gmm", "config": {"clusters": 50}}
    ]
}

pipeline = pipeline_from_config(config)

skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(df, y):
    # split() yields positional row numbers, so select with .iloc. Label-based
    # selection (.loc / y[...]) only agrees with them while the index happens
    # to be the default 0..n-1.
    X_train = pipeline.fit_transform(df.iloc[train_index])
    y_train = y.iloc[train_index]

    # The held-out fold is only transformed, never used for fitting.
    X_test = pipeline.transform(df.iloc[test_index])
    y_test = y.iloc[test_index]

    clf = LogisticRegression().fit(X_train, y_train)
    # Column 1 of predict_proba is the score for the second class (label 1 here).
    y_pred = clf.predict_proba(X_test)[:, 1]

    print("AUC: %0.3f" % roc_auc_score(y_test, y_pred))

AUC: 1.000
AUC: 1.000
AUC: 1.000
AUC: 1.000
AUC: 1.000


In [4]:
# Test the "kpca" post-processing transformer with each kernel name.
#
# Reuses `df` and `y` from the cell that calls get_iris_dataframe(). For every
# (fold, kernel) pair a fresh pipeline is built, fitted on the training rows,
# applied to the held-out rows, and scored through a logistic regression.
# The previous version also built a pipeline from the preceding cell's
# `config` before the loop; that object was overwritten before any use and made
# this cell depend on leftover state, so it has been dropped.
skf = StratifiedKFold(n_splits=5)
for train_index, test_index in skf.split(df, y):
    for kernel in ["linear", "poly", "rbf", "sigmoid", "cosine"]:
        config = {
            "post_process": [
                {"name": "kpca", "config": {"n_components": 10, "kernel": kernel}}
            ]
        }

        pipeline = pipeline_from_config(config)

        # split() yields positional row numbers, so select with .iloc rather
        # than the label-based .loc / y[...].
        X_train = pipeline.fit_transform(df.iloc[train_index])
        y_train = y.iloc[train_index]

        # The held-out fold is only transformed, never used for fitting.
        X_test = pipeline.transform(df.iloc[test_index])
        y_test = y.iloc[test_index]

        clf = LogisticRegression().fit(X_train, y_train)
        # Column 1 of predict_proba is the score for the second class (label 1 here).
        y_pred = clf.predict_proba(X_test)[:, 1]

        print("Kernel: %s, AUC: %0.3f" % (kernel, roc_auc_score(y_test, y_pred)))

Kernel: linear, AUC: 1.000
Kernel: poly, AUC: 1.000
Kernel: rbf, AUC: 1.000
Kernel: sigmoid, AUC: 1.000
Kernel: cosine, AUC: 1.000
Kernel: linear, AUC: 1.000
Kernel: poly, AUC: 1.000
Kernel: rbf, AUC: 1.000
Kernel: sigmoid, AUC: 1.000
Kernel: cosine, AUC: 1.000
Kernel: linear, AUC: 1.000
Kernel: poly, AUC: 1.000
Kernel: rbf, AUC: 1.000
Kernel: sigmoid, AUC: 1.000
Kernel: cosine, AUC: 1.000
Kernel: linear, AUC: 1.000
Kernel: poly, AUC: 1.000
Kernel: rbf, AUC: 1.000
Kernel: sigmoid, AUC: 1.000
Kernel: cosine, AUC: 1.000
Kernel: linear, AUC: 1.000
Kernel: poly, AUC: 1.000
Kernel: rbf, AUC: 1.000
Kernel: sigmoid, AUC: 0.900
Kernel: cosine, AUC: 1.000
