# Scikit-learn pipeline examples using only continuous variables
* Rounds 1 to 5 of the PyData talk
* Toy example generated with https://guoguibing.github.io/librec/datagen.html

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the toy dataset, then overwrite a handful of cells with NaN so the
# imputation steps used in the rounds below have missing values to fill.
data = pd.read_csv("toy_example.csv")

# Positional column index -> positional row indices that are set to NaN.
nan_cells = {0: [10, 48, 61], 1: [22, 34]}
for col_idx, row_idx in nan_cells.items():
    data.iloc[row_idx, col_idx] = np.nan

# Last expression of the cell: show the first rows.
data.head()

In [None]:
# Show summary statistics of the dataset's columns.
data.describe()

In [None]:
# Plot tenure against avg_products, colouring each point by its nonpayment value.
sns.scatterplot(data = data, x='tenure', y='avg_products', hue='nonpayment')

## Round 1

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns; both are copies so that
# later changes to them never touch `data`.
target_column = 'nonpayment'
y = data[target_column].copy()
X = data.drop(columns=target_column).copy()

# SET a random_state to make your pipeline reproducible!
# 80% of the rows go to train, 20% to test.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### TRAIN

In [None]:
## TRAIN
# Round 1: every preprocessing step is fitted by hand, one after another, and
# each fitted object is pickled separately.
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Step 1: standardise the features with statistics learnt on the train split.
ss = StandardScaler(with_mean=True, with_std=True)
X_train_transform_ss = ss.fit_transform(X_train)

# Step 2: fill missing values with the column mean of the scaled train data.
si = SimpleImputer(strategy='mean')
X_train_transform_ss_si = si.fit_transform(X_train_transform_ss)

# Step 3: fit a 1-nearest-neighbour classifier on the prepared features.
clf = KNeighborsClassifier(n_neighbors=1)

clf.fit(X_train_transform_ss_si, y_train)

print("Accuracy in train: ", accuracy_score(y_train, clf.predict(X_train_transform_ss_si)))


# DISCLAIMER: model registry
# (presumably: a real project would store these artifacts in a model registry
# instead of loose pickle files -- confirm with the talk material)
# Make sure the output folder exists; open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError.
os.makedirs(os.path.join('pkl','round_1'), exist_ok=True)

with open(os.path.join('pkl','round_1','standard_scaler.pkl'),'wb') as f:
    pickle.dump(ss,f)

with open(os.path.join('pkl','round_1','simple_imputer.pkl'),'wb') as f:
    pickle.dump(si,f)

with open(os.path.join('pkl','round_1','knn.pkl'),'wb') as f:
    pickle.dump(clf,f)

### PREDICT

In [None]:
# load
# Read the three Round 1 artifacts back from disk.
round_dir = os.path.join('pkl', 'round_1')
loaded = {}
for file_name in ('standard_scaler.pkl', 'simple_imputer.pkl', 'knn.pkl'):
    with open(os.path.join(round_dir, file_name), 'rb') as f:
        loaded[file_name] = pickle.load(f)

ss = loaded['standard_scaler.pkl']
si = loaded['simple_imputer.pkl']
clf = loaded['knn.pkl']

# Apply the steps in the same order as in training: scale, impute, predict.
# Only transform() is called here, so nothing is re-fitted on the test split.
X_test_transf_ss = ss.transform(X_test)
X_test_transf_ss_si = si.transform(X_test_transf_ss)
prediction = clf.predict(X_test_transf_ss_si)

test_accuracy = accuracy_score(y_test, prediction)
print("Accuracy in test: ", test_accuracy)

## Round 2

### TRAIN

In [None]:
# Round 2: same manual preprocessing as Round 1, but two KNN models
# (k=1 and k=3) are fitted on the same prepared features and compared.
ss = StandardScaler(with_mean=True, with_std=True)
X_train_transform_ss = ss.fit_transform(X_train)

si = SimpleImputer(strategy='mean')
X_train_transform_ss_si = si.fit_transform(X_train_transform_ss)

clf_1 = KNeighborsClassifier(n_neighbors=1)
clf_1.fit(X_train_transform_ss_si, y_train)

clf_3 = KNeighborsClassifier(n_neighbors=3)
clf_3.fit(X_train_transform_ss_si, y_train)

print("Accuracy in train k=1: ", 
      accuracy_score(y_train, clf_1.predict(X_train_transform_ss_si)))
print("Accuracy in train k=3: ", 
      accuracy_score(y_train, clf_3.predict(X_train_transform_ss_si)))


# save: DISCLAIMER: model registry
# Make sure the output folder exists; open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError.
os.makedirs(os.path.join('pkl','round_2'), exist_ok=True)

with open(os.path.join('pkl','round_2','standard_scaler.pkl'),'wb') as f:
    pickle.dump(ss,f)

with open(os.path.join('pkl','round_2','simple_imputer.pkl'),'wb') as f:
    pickle.dump(si,f)

# One pickle per candidate model.
with open(os.path.join('pkl','round_2','knn_1.pkl'),'wb') as f:
    pickle.dump(clf_1,f)

with open(os.path.join('pkl','round_2','knn_3.pkl'),'wb') as f:
    pickle.dump(clf_3,f)

### PREDICT

In [None]:
# Reload the Round 2 preprocessing objects and both KNN candidates from disk.
with open(os.path.join('pkl','round_2','standard_scaler.pkl'), 'rb') as f:
    ss = pickle.load(f)

with open(os.path.join('pkl','round_2','simple_imputer.pkl'), 'rb') as f:
    si = pickle.load(f)

with open(os.path.join('pkl','round_2','knn_1.pkl'), 'rb') as f:
    clf_1 = pickle.load(f)

with open(os.path.join('pkl','round_2','knn_3.pkl'), 'rb') as f:
    clf_3 = pickle.load(f)

# Same order as in training: scale first, then impute.
X_test_transf_ss = ss.transform(X_test)
X_test_transf_ss_si = si.transform(X_test_transf_ss)

# Label each score with its k (mirroring the train cell's output) so the two
# printed lines can be told apart.
print("Accuracy in test k=1: ", accuracy_score(y_test, clf_1.predict(X_test_transf_ss_si)))
print("Accuracy in test k=3: ", accuracy_score(y_test, clf_3.predict(X_test_transf_ss_si)))

## Round 3

### TRAIN

In [None]:
from sklearn.model_selection import GridSearchCV

# Round 3: preprocessing is still done by hand, but the number of neighbours
# is now chosen by a cross-validated grid search.
# NOTE: the scaler and the imputer are fitted on the whole of X_train *before*
# GridSearchCV splits it into folds, so each validation fold has already
# contributed to the preprocessing statistics.
ss = StandardScaler(with_mean=True, with_std=True)
X_train_transform_ss = ss.fit_transform(X_train)

si = SimpleImputer(strategy='mean')
X_train_transform_ss_si = si.fit_transform(X_train_transform_ss)

clf = KNeighborsClassifier()

# You can plug in your own metric
cv = GridSearchCV(
    clf,
    param_grid={'n_neighbors': [1, 3, 5]},
    scoring='accuracy',
    refit=True,  # refit the best configuration on all the data passed to fit()
    cv=3,
    return_train_score=True,
)

cv.fit(X_train_transform_ss_si, y_train)

# One row per candidate value of n_neighbors, with train and validation scores.
cv_results = pd.DataFrame(cv.cv_results_)
display(cv_results[['param_n_neighbors', 'mean_train_score', 'mean_test_score', 'rank_test_score']])

display(cv.best_estimator_.get_params())

# save: DISCLAIMER: model registry
# Make sure the output folder exists; open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError.
os.makedirs(os.path.join('pkl','round_3'), exist_ok=True)

with open(os.path.join('pkl','round_3','standard_scaler.pkl'),'wb') as f:
    pickle.dump(ss,f)

with open(os.path.join('pkl','round_3','simple_imputer.pkl'),'wb') as f:
    pickle.dump(si,f)

# WATCH OUT!
# What gets pickled is the refitted best KNN (cv.best_estimator_), not the
# GridSearchCV object itself.
with open(os.path.join('pkl','round_3','knn.pkl'),'wb') as f:
    pickle.dump(cv.best_estimator_,f)

### PREDICT

In [None]:
# Read the Round 3 scaler, imputer and selected KNN model back from disk.
round_dir = os.path.join('pkl', 'round_3')
loaded = {}
for file_name in ('standard_scaler.pkl', 'simple_imputer.pkl', 'knn.pkl'):
    with open(os.path.join(round_dir, file_name), 'rb') as f:
        loaded[file_name] = pickle.load(f)

ss = loaded['standard_scaler.pkl']
si = loaded['simple_imputer.pkl']
clf = loaded['knn.pkl']

# Scale, then impute, using the already-fitted objects (transform only).
X_test_transf_ss = ss.transform(X_test)
X_test_transf_ss_si = si.transform(X_test_transf_ss)

test_predictions = clf.predict(X_test_transf_ss_si)
print("Accuracy in test: ", accuracy_score(y_test, test_predictions))

## Round 4

### TRAIN

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif 
from sklearn import set_config

# Render estimators as HTML diagrams when they are displayed in the notebook.
set_config(display="diagram")

# Round 4: all steps live in a single Pipeline, so the grid search treats
# preprocessing + model as one estimator and tunes parameters of any step.
pl=Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer()),
    ('feature_selection', SelectKBest(score_func=f_classif)),
    ('classifier',KNeighborsClassifier())
])

# Keys follow the "<step name>__<parameter>" convention.
my_param_grid = [
    {
     'imputer__strategy': ['mean', 'median'],
     'feature_selection__k': [1,2],
     'classifier__n_neighbors': [1, 3, 5],
    }
]

cv = GridSearchCV(
    pl,
    param_grid=my_param_grid,
    scoring='accuracy',
    refit=True,
    cv=3,
    return_train_score=True,
)

# The raw (untransformed) training data goes in; the pipeline does the rest.
cv.fit(X_train,y_train)

# Candidates ordered from best to worst validation score.
cv_results = pd.DataFrame(cv.cv_results_).sort_values(by="rank_test_score")
display(cv_results[['param_imputer__strategy','param_feature_selection__k','param_classifier__n_neighbors', 
                    'mean_train_score', 'mean_test_score', 'rank_test_score']])
print(cv.best_estimator_.get_params())

# Make sure the output folder exists; open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError.
os.makedirs(os.path.join('pkl','round_4'), exist_ok=True)

# WATCH OUT!
# A single pickle now holds the whole refitted pipeline (cv.best_estimator_),
# not the GridSearchCV object.
with open(os.path.join('pkl','round_4','pipeline.pkl'),'wb') as f:
    pickle.dump(cv.best_estimator_,f)

# Last expression of the cell: display the fitted search object.
cv

### PREDICT

In [None]:
# Round 4 needs only one artifact: the pickled pipeline.
pipeline_path = os.path.join('pkl', 'round_4', 'pipeline.pkl')
with open(pipeline_path, 'rb') as f:
    pipe = pickle.load(f)

# The raw test features are passed straight to the pipeline's predict().
test_predictions = pipe.predict(X_test)
print("Accuracy in test: ", accuracy_score(y_test, test_predictions))

## Round 5

### TRAIN

In [None]:
from sklearn.linear_model import LogisticRegression

# Round 5: same pipeline as Round 4, but the grid search also chooses which
# classifier sits in the final step.
pl=Pipeline([
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer()),
    ('feature_selection', SelectKBest(score_func=f_classif)),
    ('classifier',KNeighborsClassifier())
])

# Two sub-grids, one per model family. The 'classifier' key replaces the final
# pipeline step; the 'classifier__*' keys tune that step's own parameters.
my_param_grid = [
    {
     'imputer__strategy': ['mean', 'median'],
     'feature_selection__k': [1,2],
     'classifier': [KNeighborsClassifier()],
     'classifier__n_neighbors': [1, 3, 5],
    },
    {
     'imputer__strategy': ['mean', 'median'],
     'feature_selection__k': [1,2],
     'classifier': [LogisticRegression(random_state=42)],
     'classifier__C': [0.01, 0.1, 1.0, 10.0, 100.0]
    }
]

cv = GridSearchCV(
    pl,
    param_grid=my_param_grid,
    scoring='accuracy',
    refit=True,
    cv=3,
    return_train_score=True,
)

cv.fit(X_train,y_train)

# Candidates ordered from best to worst validation score. The results table
# has both classifier-specific columns (n_neighbors and C) selected below.
cv_results = pd.DataFrame(cv.cv_results_).sort_values(by="rank_test_score")
display(cv_results[['param_imputer__strategy','param_feature_selection__k',
                    'param_classifier','param_classifier__n_neighbors', 'param_classifier__C', 
                    'mean_train_score', 'mean_test_score', 'rank_test_score']])
print(cv.best_estimator_.get_params())

# Make sure the output folder exists; open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError.
os.makedirs(os.path.join('pkl','round_5'), exist_ok=True)

# WATCH OUT!
# A single pickle holds the whole refitted pipeline (cv.best_estimator_),
# not the GridSearchCV object.
with open(os.path.join('pkl','round_5','pipeline.pkl'),'wb') as f:
    pickle.dump(cv.best_estimator_,f)

# Last expression of the cell: display the fitted search object.
cv

### PREDICT

In [None]:
# Load the single pickled Round 5 pipeline.
pipeline_path = os.path.join('pkl', 'round_5', 'pipeline.pkl')
with open(pipeline_path, 'rb') as f:
    pipe = pickle.load(f)

# Score the raw test features with the loaded pipeline.
test_predictions = pipe.predict(X_test)
print("Accuracy in test: ", accuracy_score(y_test, test_predictions))