In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.feature_selection import RFE

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.metrics import accuracy_score, classification_report

In [4]:
from sklearn import set_config
set_config(display = "diagram")

# Pipeline Demo

## Adult dataset

**Prediction task** is to determine whether a person makes over 50K a year.

Data from [here](https://archive.ics.uci.edu/ml/datasets/adult).

### Process the data

In [5]:
income_data = pd.read_csv("../datasets/adult/adult.data", header=None)
income_data.columns = [
    "age",
    "workclass",
    "final_weight",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_class",
]

income_data.income_class = income_data.income_class.str.strip()

income_attributes = income_data.drop(columns="income_class")
income_target = income_data.income_class

X_train, X_test, y_train, y_test = train_test_split(income_attributes, income_target, train_size=0.7)

In [6]:
income_attributes

Unnamed: 0,age,workclass,final_weight,education,education-num,marital-status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


### Pipeline

In [47]:
categorical_columns = income_attributes.dtypes[income_attributes.dtypes == "object"].index.values

step1 = (
    "categorical", 
    FunctionTransformer(func = lambda data: pd.get_dummies(data)),
    categorical_columns
)
step2 = (
    "everything_else",
    MinMaxScaler(),
    range(0, 14)
)

transformer = ColumnTransformer([step1, step2])
transformer

In [29]:
column_selector = make_column_selector(
    pattern = "",
    dtype_exclude = "object",
)

In [48]:
poly = PolynomialFeatures(degree = 2, interaction_only = True)

In [49]:
adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth = 1),
    n_estimators = 2,
)

In [50]:
pipeline = Pipeline([
    ("transformer", transformer),
    ("selector", "passthrough"),
    ("polynomial_features", poly),
    ("classifier", adaboost),
    # ("", RFE()),
])

In [46]:
pipeline

In [55]:
pipeline.fit(income_attributes, income_target)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed