In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the parent directory to the module search path. Presumably this is what
# makes the project-local `pipelinetweak` package (imported in the Pipeline
# section) resolvable when the notebook runs from its own folder — confirm.
sys.path.append('..')

import numpy as np
from collections import Counter

# Load Toy Dataset

In [3]:
from sklearn.datasets import load_diabetes

# Turn the diabetes regression target into a 3-class label so the data can be
# used as a toy classification problem. With these cut-offs the three classes
# come out roughly balanced (see the printed counts below).
LOW_TARGET_CUTOFF = 99    # targets strictly below this value -> class 0
HIGH_TARGET_CUTOFF = 182  # targets strictly above this value -> class 2

diabetes = load_diabetes()
X = diabetes.data
target = diabetes.target

# np.select evaluates both conditions on the whole array at once instead of
# calling a Python lambda per element (np.vectorize is only a convenience
# loop). Everything matching neither condition, i.e. 99 <= target <= 182,
# falls through to the default class 1.
y = np.select(
    [target < LOW_TARGET_CUTOFF, target > HIGH_TARGET_CUTOFF],
    [0, 2],
    default=1,
)

print(Counter(y))

Counter({2: 149, 1: 148, 0: 145})


# Data Splitting

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Classifier

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Baseline: a single decision tree trained on the raw features.
# The tree is seeded because scikit-learn randomly permutes the candidate
# features at each split; without a fixed random_state the fitted tree (and
# therefore the accuracy printed below) can change between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Distribution of predicted classes, then accuracy on the held-out split.
print(Counter(y_pred))
print(accuracy_score(y_test, y_pred))

Counter({1: 50, 0: 42, 2: 41})
0.39849624060150374


# Pipeline

In [6]:
from sklearn.pipeline import Pipeline, FeatureUnion
from pipelinetweak.pipe import PredT
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Stacked model: the FeatureUnion concatenates two feature blocks, which are
# then fed to a logistic regression.
#   * 'prefit' - a decision tree wrapped in the project's PredT transformer
#                (presumably exposes the tree's predictions as features;
#                confirm against pipelinetweak.pipe).
#   * 'scaler' - the standardised original features.
# The inner tree is seeded so the pipeline's result is reproducible; an
# unseeded DecisionTreeClassifier can produce a different tree on every run.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
# (1.5+), where multinomial is already what lbfgs uses for multiclass targets.
# It is kept here so behaviour is unchanged on older versions — drop the
# argument once the minimum supported scikit-learn version is confirmed.
model = Pipeline(steps=[
    ('trans', FeatureUnion(transformer_list=[
        ('prefit', PredT(DecisionTreeClassifier(random_state=42))),
        ('scaler', StandardScaler())
    ])),
    ('pred', LogisticRegression(solver='lbfgs', multi_class='multinomial'))
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Distribution of predicted classes, then accuracy on the held-out split.
print(Counter(y_pred))
print(accuracy_score(y_test, y_pred))

Counter({1: 51, 0: 44, 2: 38})
0.45864661654135336
