# Pipeline

In [42]:
import pandas as pd

# Location of the processed diabetes table, relative to this notebook.
DATA_PATH = "../../data/processed/diabetes.csv"

# Read the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(DATA_PATH)
data.head(n=5)

Unnamed: 0,num_pregnant,plas_glucose_concentr,blood_pressure,skin_thick,insulin,mass_index,pedigree_fun,age,class
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [43]:
# Feature matrix: every column of `data` except the final one.
# NOTE(review): if the CSV carries a saved index column (e.g. "Unnamed: 0"),
# it is included here as a feature -- confirm against the raw file.
X = data.iloc[:, :-1]
X

Unnamed: 0,num_pregnant,plas_glucose_concentr,blood_pressure,skin_thick,insulin,mass_index,pedigree_fun,age
0,6,148.0,72.0,35.0,,33.6,0.627,50
1,1,85.0,66.0,29.0,,26.6,0.351,31
2,8,183.0,64.0,,,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
756,10,101.0,76.0,48.0,180.0,32.9,0.171,63
757,2,122.0,70.0,27.0,,36.8,0.340,27
758,5,121.0,72.0,23.0,112.0,26.2,0.245,30
759,1,126.0,60.0,,,30.1,0.349,47


In [44]:
# Target vector: the final column of `data`, looked up by its label.
y = data[data.columns[-1]]
y

0      1
1      0
2      1
3      0
4      1
      ..
756    0
757    0
758    0
759    1
760    0
Name: class, Length: 761, dtype: int64

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import make_column_selector, ColumnTransformer
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

# Fixed seed for the decision tree. scikit-learn permutes the candidate
# features at every split, so without a seed equally good splits can be
# resolved differently from run to run and the reported metrics drift.
MODEL_RANDOM_STATE = 6

# Numeric columns: fill missing values with SimpleImputer's default strategy,
# then rescale each column with MinMaxScaler.
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer()),
    ('scale', MinMaxScaler())
])

# Object-dtype columns: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
cat_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

# Route columns to the matching sub-pipeline by dtype.
# NOTE(review): columns matching neither selector fall under the
# ColumnTransformer `remainder` default -- confirm that is intended.
col_trans = ColumnTransformer([
    ('num_pipeline', num_pipeline, make_column_selector(dtype_include = np.number)),
    ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include = np.object_))
])

# Full estimator: preprocessing followed by a seeded decision tree.
model_pipeline = Pipeline(steps=[
    ('preprocessing', col_trans),
    ('model', DecisionTreeClassifier(random_state=MODEL_RANDOM_STATE))
])

In [46]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for testing, and the seed that fixes the shuffle.
test_size = 0.23
random_state = 6

# Split features and labels into train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)

In [47]:
# Fit the preprocessing steps and the classifier on the training partition.
model_pipeline.fit(X=X_train, y=y_train)

In [48]:
# Predict class labels for the held-out rows and display them.
y_pred_model_pipeline = model_pipeline.predict(X=X_test)
y_pred_model_pipeline

array([1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1],
      dtype=int64)

In [49]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Print a header, then one line per metric comparing the test labels with
# the pipeline's predictions; labels and order are fixed by the tuple below.
print("Model score")
for label, metric in (
    ("accuracy: ", accuracy_score),
    ("recall: ", recall_score),
    ("precision: ", precision_score),
):
    print(label, metric(y_test, y_pred_model_pipeline))


Model score
accuracy:  0.7159090909090909
recall:  0.6470588235294118
precision:  0.6285714285714286
