# Entry 20 notebook - SciKit Learn Pipeline

In [108]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import category_encoders as ce

from sklearn.pipeline import make_pipeline

### Custom functions

In [2]:
def split_data(df, target, train_size):
    y = df[[target]]
    X = df.drop(target, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_size, random_state=12)
    return X_train, X_test, y_train, y_test

# def preprocess_data(train_df, test_df, scaler=StandardScaler(), encoder=ce.OrdinalEncoder):
#     index = train_df.index.tolist()
#     test_index = test_df.index.tolist()
    
#     num_features = train_df.select_dtypes('number').columns.tolist()    
#     num_scale = scaler.fit_transform(train_df[num_features])
#     train_num_df = pd.DataFrame(num_scale, columns=num_features, index=index)
    
#     test_num_scale = scaler.transform(test_df[num_features])
#     test_num_df = pd.DataFrame(test_num_scale, columns=num_features, index=test_index)
    
#     cat_features = train_df.select_dtypes('object').columns.tolist()
#     cat_encoder = encoder(cols=cat_features)
#     cat_encode = cat_encoder.fit_transform(train_df[cat_features])
#     train_cat_df = pd.DataFrame(cat_encode, columns=cat_features, index=index)
    
#     test_cat_encode = cat_encoder.transform(test_df[cat_features])
#     test_cat_df = pd.DataFrame(test_cat_encode, columns=cat_features, index=test_index)
    
#     train_df = pd.concat([train_num_df, train_cat_df], axis=1).fillna(-1)
#     test_df = pd.concat([test_num_df, test_cat_df], axis=1).fillna(-1)
    
#     return train_df, test_df

In [None]:
def train_and_predict(X_train, y_train, X_test, model=LinearRegression()):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return model, preds

In [44]:
df_raw = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original', header=None, sep='\t')
cols = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_yr', 'origin']
df = df_raw[0].str.split(expand=True)
df.columns = cols
df[df == 'NA'] = np.nan
df = df.astype('float32')
df.drop(df[df['mpg'].isna()].index, inplace=True)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_yr,origin
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0


## Standard pipeline with make_pipeline

In [46]:
X_train, X_test, y_train, y_test = split_data(df, 'mpg', 0.8)
pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), LinearRegression())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.8055297594035651

In [57]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2')

array([0.72524258, 0.61189525, 0.88478104, 0.83021284, 0.52796488])

In [56]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [60]:
cross_validate(pipe, X_train, y_train, cv=5, scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error',
                                                      'neg_median_absolute_error'])

{'fit_time': array([0.0048151 , 0.00404716, 0.01251006, 0.00448489, 0.00483203]),
 'score_time': array([0.00393391, 0.00417495, 0.00337911, 0.00355816, 0.00294971]),
 'test_r2': array([0.72524258, 0.61189525, 0.88478104, 0.83021284, 0.52796488]),
 'test_neg_mean_absolute_error': array([-2.93776155, -4.08900356, -2.30192876, -2.38466597, -3.62272048]),
 'test_neg_mean_squared_error': array([-15.96528435, -26.17324066,  -7.6212616 ,  -6.95487976,
        -17.44922638]),
 'test_neg_median_absolute_error': array([-1.88134241, -3.01408482, -2.11343098, -2.48243523, -2.80085754])}

## Mixed variable types with ColumnTransformer

In [61]:
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

steps
- load data
- preprocess data types
- identify numerical vs categorical features
- create transform pipelines for each data type
- create columntransform
- create pipeline the incorporates all of above and trains model
- score predictions

In [90]:
raw_df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.data",
            names=['surgery', 'age', 'hosp_num', 'rectal_temp', 'pulse', 'respiratory_rate', 'extremity_temp',
                  'peripheral_pulse', 'mucous_membranes', 'cap_refill_time', 'pain', 'peristalsis',
                  'ab_distension', 'nasogastric_tube', 'nasogastric_reflux', 'nasogastric_reflux_ph',
                  'rectal_exam', 'abdomen', 'packed_cell_vol', 'total_protein', 'abdominocentesis_app',
                  'abdomcentesis_total_protein', 'outcome', 'surgical_lesion',
                   'lesion_type', 'lesion2', 'lesion3', 'cp_data'], sep='\s', engine='python', na_values='?')

In [91]:
raw_df[raw_df['lesion2'] == 3111]

Unnamed: 0,surgery,age,hosp_num,rectal_temp,pulse,respiratory_rate,extremity_temp,peripheral_pulse,mucous_membranes,cap_refill_time,...,packed_cell_vol,total_protein,abdominocentesis_app,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_type,lesion2,lesion3,cp_data
116,1.0,1,535208,38.0,44.0,12.0,1.0,1.0,1.0,1.0,...,42.0,65.0,,,1.0,1,3205,3111,0,2
178,1.0,1,533968,,60.0,30.0,3.0,3.0,4.0,2.0,...,45.0,70.0,3.0,2.0,1.0,1,6111,3111,0,2
229,1.0,9,534597,38.5,120.0,70.0,,,,,...,35.0,54.0,1.0,1.0,1.0,1,4205,3111,2209,2


Looks like columns 25 and 26 (renamed: lesion2 and lesion3) are for observations with multiple lesions. As I'm not currently working on this type of problem and there are only a few columns with multiple values, I'm going to just drop them.

The hospital number value is more of a unique identifier than a feature, so I'm dropping that too.

In [92]:
raw_df.drop(['lesion2','lesion3', 'hosp_num'], axis=1, inplace=True)
raw_df.head()

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,extremity_temp,peripheral_pulse,mucous_membranes,cap_refill_time,pain,...,rectal_exam,abdomen,packed_cell_vol,total_protein,abdominocentesis_app,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_type,cp_data
0,2.0,1,38.5,66.0,28.0,3.0,3.0,,2.0,5.0,...,3.0,5.0,45.0,8.4,,,2.0,2,11300,2
1,1.0,1,39.2,88.0,20.0,,,4.0,1.0,3.0,...,4.0,2.0,50.0,85.0,2.0,2.0,3.0,2,2208,2
2,2.0,1,38.3,40.0,24.0,1.0,1.0,3.0,1.0,3.0,...,1.0,1.0,33.0,6.7,,,1.0,2,0,1
3,1.0,9,39.1,164.0,84.0,4.0,1.0,6.0,2.0,2.0,...,3.0,,48.0,7.2,3.0,5.3,2.0,1,2208,1
4,2.0,1,37.3,104.0,35.0,,,6.0,2.0,,...,,,74.0,7.4,,,2.0,2,4300,2


In [93]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 25 columns):
surgery                        299 non-null float64
age                            300 non-null int64
rectal_temp                    240 non-null float64
pulse                          276 non-null float64
respiratory_rate               242 non-null float64
extremity_temp                 244 non-null float64
peripheral_pulse               231 non-null float64
mucous_membranes               253 non-null float64
cap_refill_time                268 non-null float64
pain                           245 non-null float64
peristalsis                    256 non-null float64
ab_distension                  244 non-null float64
nasogastric_tube               196 non-null float64
nasogastric_reflux             194 non-null float64
nasogastric_reflux_ph          53 non-null float64
rectal_exam                    198 non-null float64
abdomen                        182 non-null float64
packed_cell_vo

In [95]:
target = 'outcome'
cat_feats = ['surgery', 'age', 'mucous_membranes', 'cap_refill_time', 'pain', 'peristalsis',
             'ab_distension', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam',
             'abdomen', 'abdominocentesis_app', 'surgical_lesion', 'lesion_type', 'cp_data']
num_feats = ['rectal_temp', 'pulse', 'respiratory_rate', 'extremity_temp', 'peripheral_pulse',
             'nasogastric_reflux_ph', 'packed_cell_vol', 'total_protein', 'abdomcentesis_total_protein']

raw_df[cat_feats] = raw_df[cat_feats].astype('category')
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 25 columns):
surgery                        299 non-null category
age                            300 non-null category
rectal_temp                    240 non-null float64
pulse                          276 non-null float64
respiratory_rate               242 non-null float64
extremity_temp                 244 non-null float64
peripheral_pulse               231 non-null float64
mucous_membranes               253 non-null category
cap_refill_time                268 non-null category
pain                           245 non-null category
peristalsis                    256 non-null category
ab_distension                  244 non-null category
nasogastric_tube               196 non-null category
nasogastric_reflux             194 non-null category
nasogastric_reflux_ph          53 non-null float64
rectal_exam                    198 non-null category
abdomen                        182 non-null category
p

### Test data

The test data was separated into a different file. As such, I loaded it into it's own variable and created the train and test X and y DataFrames appropriately.

In [99]:
test_data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.test",
            names=['surgery', 'age', 'hosp_num', 'rectal_temp', 'pulse', 'respiratory_rate', 'extremity_temp',
                  'peripheral_pulse', 'mucous_membranes', 'cap_refill_time', 'pain', 'peristalsis',
                  'ab_distension', 'nasogastric_tube', 'nasogastric_reflux', 'nasogastric_reflux_ph',
                  'rectal_exam', 'abdomen', 'packed_cell_vol', 'total_protein', 'abdominocentesis_app',
                  'abdomcentesis_total_protein', 'outcome', 'surgical_lesion',
                   'lesion_type', 'lesion2', 'lesion3', 'cp_data'], sep='\s', engine='python', na_values='?')
test_data.drop(['lesion2','lesion3', 'hosp_num'], axis=1, inplace=True)
test_data[cat_feats] = test_data[cat_feats].astype('category')
test_data.head()

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,extremity_temp,peripheral_pulse,mucous_membranes,cap_refill_time,pain,...,rectal_exam,abdomen,packed_cell_vol,total_protein,abdominocentesis_app,abdomcentesis_total_protein,outcome,surgical_lesion,lesion_type,cp_data
0,2.0,1,38.5,54.0,20.0,,1.0,2.0,2.0,3.0,...,,2.0,42.0,6.3,,,1.0,2,3111,1
1,2.0,1,37.6,48.0,36.0,,,1.0,1.0,,...,,,44.0,6.3,1.0,5.0,1.0,2,3111,1
2,1.0,1,37.7,44.0,28.0,,4.0,3.0,2.0,5.0,...,3.0,5.0,45.0,70.0,3.0,2.0,1.0,1,3205,2
3,1.0,1,37.0,56.0,24.0,3.0,1.0,4.0,2.0,4.0,...,,,35.0,61.0,3.0,2.0,3.0,2,2205,2
4,2.0,1,38.0,42.0,12.0,3.0,,3.0,1.0,1.0,...,,2.0,37.0,5.8,,,1.0,2,3111,2


In [106]:
raw_df.dropna(subset=['outcome'], inplace=True)
y_train = raw_df['outcome']
X_train = raw_df.drop('outcome', axis=1)
y_test = test_data['outcome']
X_test = test_data.drop('outcome', axis=1)

# pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), LinearRegression())
# pipe.fit(X_train, y_train)
# pipe.score(X_test, y_test)

In [111]:
num_transf = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
cat_transf = make_pipeline(SimpleImputer(strategy='constant', fill_value='missing'), ce.OneHotEncoder())

In [118]:
# mixed_transf = ColumnTransformer(transformers=[
#     ('num', num_transf, raw_df.select_dtypes('number').columns.tolist()),
#     ('cat', cat_transf, raw_df.select_dtypes('category').columns.tolist())
# ])

mixed_transf = ColumnTransformer(transformers=[
    ('num', num_transf, make_column_selector(dtype_include='number')),
    ('cat', cat_transf, make_column_selector(dtype_include='category'))
])

In [122]:
raw_df['outcome'].value_counts()

1.0    178
2.0     77
3.0     44
Name: outcome, dtype: int64

In [123]:
from sklearn.linear_model import LogisticRegression

log_pipe = make_pipeline(mixed_transf,
                    LogisticRegression())

log_pipe.fit(X_train, y_train)
cross_validate(log_pipe, X_train, y_train, cv=5, scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error',
                                                      'neg_median_absolute_error'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

{'fit_time': array([0.13407588, 0.13737607, 0.1309433 , 0.13622332, 0.12957001]),
 'score_time': array([0.03298616, 0.03419995, 0.02904487, 0.03130174, 0.02948093]),
 'test_r2': array([ 0.14460285,  0.15948276, -0.52207002, -0.36986301, -0.1984375 ]),
 'test_neg_mean_absolute_error': array([-0.36666667, -0.3       , -0.5       , -0.48333333, -0.45762712]),
 'test_neg_mean_squared_error': array([-0.46666667, -0.43333333, -0.83333333, -0.75      , -0.66101695]),
 'test_neg_median_absolute_error': array([-0., -0., -0., -0., -0.])}