In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

In [2]:
from sklearn import datasets
iris = datasets.load_iris()

In [27]:
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [4]:
from sklearn.model_selection import train_test_split

In [86]:
X = pd.DataFrame(iris.data, columns= iris.feature_names)
y = iris.target

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [8]:
X_train[:3]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
96,5.7,2.9,4.2,1.3
105,7.6,3.0,6.6,2.1
66,5.6,3.0,4.5,1.5


In [9]:
np.unique(y_train, return_counts= True)

(array([0, 1, 2]), array([31, 35, 34], dtype=int64))

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression


In [11]:
X_train.columns.values[:4].tolist()

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [12]:
X.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [61]:
from sklearn.base import BaseEstimator, TransformerMixin

class GetSepalClusters(BaseEstimator, TransformerMixin):
    """Determine if a text starts with a verb."""

    def get_cluster(self, X):
        print(X.columns, '=========================')
        X_scaled = X.iloc[:, :2].copy()
        X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
        kmeans = KMeans(n_clusters= 3, n_init= 10, random_state=0).fit(X_scaled)
        preds_cluster = kmeans.predict(X_scaled)
        preds_cluster = [str(elm) for elm in preds_cluster]
        return preds_cluster

    def fit(self, X, y = None):
        return self

    def transform(self, X):
        res = self.get_cluster(X)
        df = pd.DataFrame({'sepal_cluster': res})
        print(f'Generated Clusters: ===============\n {df.columns}')
        return df

In [65]:
class ShowColumns(BaseEstimator, TransformerMixin):
    """Determine if a text starts with a verb."""

    def fit(self, X, y = None):
        return self
    def transform(self, X):
        print('SHOW', '=' * 50)
        print(X[:2])
        print('=' * 50)
        return X

In [66]:


cols_num = ['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
transformer_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

cols_cat = ['sepal_cluster']
transformer_cat = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value= 'unknown')),
    ('onehot', OneHotEncoder(handle_unknown= 'ignore'))])

transformer = ColumnTransformer(transformers= [
    ('tnum', transformer_num, cols_num),
    ('tcat', transformer_cat, cols_cat)
])

cls = DecisionTreeClassifier()

pipe = Pipeline(steps= [
    ('gen_features', FeatureUnion(transformer_list= [
        ('transformer_num', transformer_num),
        ('cluster_sepal', Pipeline(steps= [
            ('get_cluster', GetSepalClusters()),
            ('transformer_cluster', transformer_cat)
        ]))
    ])),
    ('show_cols', ShowColumns()),
    ('classifier',  cls)
])

pipe.fit(X_train, y_train)



Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
 Index(['sepal_cluster'], dtype='object')
  (0, 0)	-0.1383560330969935
  (0, 1)	-0.26550845385878896
  (0, 2)	0.2222907219123989
  (0, 3)	0.10894943054727887
  (0, 4)	1.0
  (1, 0)	2.1475262528533645
  (1, 1)	-0.026311648580602864
  (1, 2)	1.611607733864892
  (1, 3)	1.1849931890389227
  (1, 6)	1.0


Pipeline(steps=[('gen_features',
                 FeatureUnion(transformer_list=[('transformer_num',
                                                 Pipeline(steps=[('imputer',
                                                                  SimpleImputer(strategy='median')),
                                                                 ('scaler',
                                                                  StandardScaler())])),
                                                ('cluster_sepal',
                                                 Pipeline(steps=[('get_cluster',
                                                                  GetSepalClusters()),
                                                                 ('transformer_cluster',
                                                                  Pipeline(steps=[('imputer',
                                                                                   SimpleImputer(fill_value='unknown',
                     

In [67]:
preds_y = pipe.predict(X_test)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
 Index(['sepal_cluster'], dtype='object')
  (0, 0)	0.3428823428925551
  (0, 1)	-0.5047052591369751
  (0, 2)	0.5117317660691684
  (0, 3)	-0.02555603926417671
  (0, 4)	1.0
  (1, 0)	-0.1383560330969935
  (1, 1)	1.887262793644884
  (1, 2)	-1.2249144988714487
  (1, 3)	-1.236105267567276
  (1, 6)	1.0


In [68]:
(preds_y == y_test).mean()

0.98

In [88]:
feats = X_train.columns
feats

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [89]:
imputer_num = SimpleImputer(strategy= 'median')
X_train = pd.DataFrame(imputer_num.fit_transform(X_train))
X_train.columns = feats

scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train))
X_train.columns = feats

In [91]:
X_train.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.138356,-0.265508,0.222291,0.108949
1,2.147526,-0.026312,1.611608,1.184993
2,-0.258666,-0.026312,0.395955,0.37796


In [98]:
X_cluster = GetSepalClusters().fit_transform(X)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
 Index(['sepal_cluster'], dtype='object')


In [99]:
X_cluster.head()

Unnamed: 0,sepal_cluster
0,0
1,0
2,0
3,0
4,0


In [100]:
X_train = pd.concat([X_train, X_cluster], axis= 1)
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),sepal_cluster
0,-0.138356,-0.265508,0.222291,0.108949,0
1,2.147526,-0.026312,1.611608,1.184993,0
2,-0.258666,-0.026312,0.395955,0.37796,0
3,-0.860214,1.169672,-1.398579,-1.370611,0
4,2.267836,-0.504705,1.669496,1.050488,0


In [101]:
X_train.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
sepal_cluster         object
dtype: object

In [102]:
feats = X_train.columns
feats

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'sepal_cluster'],
      dtype='object')

In [103]:
imputer_cat = SimpleImputer(strategy= 'constant', fill_value= 'unknown')
X_train = pd.DataFrame(imputer_num.fit_transform(X_train))
X_train.columns = feats

In [104]:
X_train.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),sepal_cluster
0,-0.138356,-0.265508,0.222291,0.108949,0.0
1,2.147526,-0.026312,1.611608,1.184993,0.0


In [105]:
cols_cat

['sepal_cluster']

In [109]:
transform_1hot = OneHotEncoder(handle_unknown= 'ignore', sparse= False)
X1hot = pd.DataFrame(transform_1hot.fit_transform(X_train[cols_cat]))

In [114]:
transform_1hot.__dict__

{'categories': 'auto',
 'sparse': False,
 'dtype': numpy.float64,
 'handle_unknown': 'ignore',
 'drop': None,
 'categories_': [array([0., 1., 2.])],
 'drop_idx_': None}

In [115]:
transform_1hot.get_feature_names(cols_cat)

array(['sepal_cluster_0.0', 'sepal_cluster_1.0', 'sepal_cluster_2.0'],
      dtype=object)

In [116]:
X1hot.head()

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
