# Pipeline

In [26]:
import pandas as pd

# Read the processed penguins table from its path relative to this notebook,
# then preview the first five rows.
data = pd.read_csv(filepath_or_buffer="../../data/processed/penguins.csv")
data.head(n=5)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,species
0,Torgersen,39.1,18.7,181.0,3750.0,2007,1
1,Torgersen,39.5,17.4,186.0,3800.0,2007,1
2,Torgersen,40.3,18.0,195.0,3250.0,2007,1
3,Torgersen,36.7,19.3,193.0,3450.0,2007,1
4,Torgersen,39.3,20.6,190.0,3650.0,2007,1


In [27]:
# Feature matrix: every column except the last one (the `species` target),
# minus "Unnamed: 0". That column appears to hold only the running row number
# saved alongside the CSV, and the rows look ordered by species (the displayed
# head of the target is all 1, the tail all 0), so keeping it would let a
# model separate the classes from the row number alone (target leakage).
# errors="ignore" keeps this cell working if the table is ever loaded without
# that column.
X = data.iloc[:, 0:-1].drop(columns=["Unnamed: 0"], errors="ignore")
X

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
0,Torgersen,39.1,18.7,181.0,3750.0,2007
1,Torgersen,39.5,17.4,186.0,3800.0,2007
2,Torgersen,40.3,18.0,195.0,3250.0,2007
3,Torgersen,36.7,19.3,193.0,3450.0,2007
4,Torgersen,39.3,20.6,190.0,3650.0,2007
...,...,...,...,...,...,...
269,Biscoe,47.2,13.7,214.0,4925.0,2009
270,Biscoe,46.8,14.3,215.0,4850.0,2009
271,Biscoe,50.4,15.7,222.0,5750.0,2009
272,Biscoe,45.2,14.8,212.0,5200.0,2009


In [28]:
# Target vector: the right-most column of the table (shown as `species` in the
# preview), addressed by its explicit position rather than a negative index.
y = data.iloc[:, data.shape[1] - 1]
y

0      1
1      1
2      1
3      1
4      1
      ..
269    0
270    0
271    0
272    0
273    0
Name: species, Length: 274, dtype: int64

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Numeric features: rescale each column with MinMaxScaler (default range 0..1).
num_pipeline = Pipeline(steps=[
    ('scale', MinMaxScaler())
])

# Categorical features: one-hot encode. handle_unknown='ignore' makes a
# category that was not seen during fit encode as all zeros instead of raising.
cat_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

# Route columns to the matching sub-pipeline by dtype: numeric columns go to
# the scaler, object-dtype columns go to the encoder.
col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])

# Full model: preprocessing followed by a decision tree. random_state pins the
# tree's internal randomness (feature order / tie-breaking between equally good
# splits) so that refitting on the same data yields the same tree; any fixed
# integer works.
model_pipeline = Pipeline(steps=[
    ('preprocessing', col_trans),
    ('model', DecisionTreeClassifier(random_state=6))
])

In [30]:
from sklearn.model_selection import train_test_split
# Hold out 23% of the rows for evaluation; the fixed seed makes the shuffle
# (and therefore the split) repeatable.
test_size, random_state = 0.23, 6
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

In [31]:
# Fit the preprocessing steps and the classifier on the training split only.
model_pipeline.fit(X=X_train, y=y_train)

In [32]:
# Predict labels for the held-out rows; the fitted preprocessing is applied
# to X_test inside the pipeline before the classifier sees it.
y_pred_model_pipeline = model_pipeline.predict(X=X_test)
y_pred_model_pipeline

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1],
      dtype=int64)

In [33]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Report held-out performance: each metric compares the true test labels with
# the pipeline's predictions, printed one per line in a fixed order.
print("Model score")
for _prefix, _metric in (
    ("accuracy: ", accuracy_score),
    ("recall: ", recall_score),
    ("precision: ", precision_score),
):
    print(_prefix, _metric(y_test, y_pred_model_pipeline))


Model score
accuracy:  1.0
recall:  1.0
precision:  1.0
