In [1]:
import fancypipes
from sklearn.datasets import load_boston, load_breast_cancer
from sklearn.linear_model import Lasso, LogisticRegression
import pandas as pd

## Regression

In [2]:
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs against older scikit-learn releases.
X, y = load_boston(return_X_y=True)
# Wrap the feature array in a DataFrame; the columns get integer labels (0-12).
X = pd.DataFrame(X)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
# A single-step pipeline: each step is a (name, estimator) tuple.
pipeline_steps = [("A Simple Regression", Lasso())]
fancy_pipeline = fancypipes.pipeline(steps=pipeline_steps)
# NOTE(review): presumably fits/prepares the pipeline on the full data set -- confirm in fancypipes.
fancy_pipeline.prep_pipeline(X, y)

In [4]:
# 3-fold cross-validation; `k` is passed by keyword, as in the other
# evaluate_k_folds calls in this notebook, so the fold count is explicit.
fancy_pipeline.evaluate_k_folds(X, y, k=3)

Unnamed: 0,fold,AIC,MAE,MAPE,MdAE,MdAPE,RMSE,R^2 Score,record_count
0,1,571.031841,3.653338,0.184273,2.735404,0.141325,5.015435,0.665595,169
1,2,609.723181,3.776837,0.174419,2.172992,0.117006,5.62371,0.668021,169
2,3,583.650505,3.731285,0.176134,2.916553,0.134479,5.257597,0.659941,168


### Generating cross validation scores by a grouping of attributes

Two-fold cross-validation, with scores calculated separately for each unique value in column 3.

In [5]:
# k=2 folds; by=3 reports the metrics separately for each unique value of column 3.
# Rows labelled "mean" in the output average each metric across the folds.
fancy_pipeline.evaluate_k_folds(X, y, k=2, by=3)

Unnamed: 0,fold,3,AIC,MAE,MAPE,MdAE,MdAPE,RMSE,R^2 Score,record_count
0,mean,0.0,780.083157,3.542496,0.176329,2.576874,0.131402,4.958184,0.684105,235.5
1,mean,1.0,100.264916,5.8609,0.187448,3.438695,0.140646,8.482625,0.439538,17.5
2,1,0.0,780.150443,3.502208,0.175405,2.480855,0.126087,4.975747,0.685576,235.0
3,1,1.0,97.010082,4.85555,0.19265,3.101275,0.133691,7.188642,0.478294,18.0
4,2,0.0,780.01587,3.582785,0.177254,2.672893,0.136716,4.940622,0.682633,236.0
5,2,1.0,103.519751,6.866249,0.182247,3.776116,0.147601,9.776609,0.400781,17.0


In [6]:
# Returns a dict mapping each column label of X to its model coefficient.
fancy_pipeline.get_model_coefficients()

{0: -0.060720937179267034,
 1: 0.061706492497533676,
 2: -0.0,
 3: 0.0,
 4: -0.0,
 5: 0.24051040665009998,
 6: 0.03888990767518686,
 7: -0.5488297435198559,
 8: 0.32299162681193655,
 9: -0.018278693436204096,
 10: -0.7307014012762643,
 11: 0.0048867655878474655,
 12: -0.7991113759740984}

## Classification

In [7]:
# Rebind X and y to the breast-cancer data, replacing the regression data used so far.
X, y = load_breast_cancer(return_X_y=True)
# Wrap the feature array in a DataFrame; the columns get integer labels (0-29).
X = pd.DataFrame(X)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
# Rebuild the pipeline around a classifier; this rebinds `fancy_pipeline`.
pipeline_steps = [("A Simple Classification", LogisticRegression())]
fancy_pipeline = fancypipes.pipeline(steps=pipeline_steps)
# NOTE(review): presumably fits/prepares the pipeline on the full data set -- confirm in fancypipes.
fancy_pipeline.prep_pipeline(X, y)

In [9]:
# 5-fold cross-validation; the output has one row of classification metrics per fold.
fancy_pipeline.evaluate_k_folds(X, y, k=5)

Unnamed: 0,fold,Accuracy,F1,Matt Corrcoeff,Precision,ROC-AUC,Recall,log-loss,record_count
0,1,0.95614,0.965517,0.906811,0.945946,0.997707,0.985915,0.078554,114
1,2,0.982456,0.986842,0.961289,1.0,0.998596,0.974026,0.071369,114
2,3,0.912281,0.930556,0.812147,0.917808,0.973141,0.943662,0.193013,114
3,4,0.964912,0.972222,0.925285,0.958904,0.996725,0.985915,0.0861,114
4,5,0.920354,0.933333,0.834587,0.926471,0.988644,0.940299,0.154777,113


In [10]:
# Returns a dict mapping each column label of X to its model coefficient.
fancy_pipeline.get_model_coefficients()

{0: 2.024554209488084,
 1: 0.11960360584516885,
 2: 0.008967067598776347,
 3: -0.0029435294247830124,
 4: -0.14990323355763854,
 5: -0.4039838737710136,
 6: -0.5474174920451653,
 7: -0.2816895461739351,
 8: -0.1765732442024034,
 9: -0.0308488750618698,
 10: 0.01268269141392565,
 11: 0.9772428520015076,
 12: 0.0855724085128264,
 13: -0.11072621398239281,
 14: -0.008658334388449042,
 15: 0.0022073339234964058,
 16: -0.024891329655306906,
 17: -0.028521579625072773,
 18: -0.035340826374865554,
 19: 0.0048337766597172155,
 20: 1.3941455998230001,
 21: -0.3314119175198063,
 22: -0.1833551882705208,
 23: -0.025837308886511104,
 24: -0.2281841847602618,
 25: -1.0720708496021343,
 26: -1.3186927394407701,
 27: -0.5309620676446356,
 28: -0.5913824764098377,
 29: -0.11470370458510881}

# Custom Scorers

You can define your own validation function for your specific use case and pass it to the pipeline through the `diagnostic_package` argument.

In [11]:
# Switch X and y back to the Boston regression data for the custom-scorer example.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs against older scikit-learn releases.
X, y = load_boston(return_X_y=True)
X = pd.DataFrame(X)

In [12]:
def an_alternative_scorer(model, X, y):
    """Score a fitted model by its average residual.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``.
    X : array-like or DataFrame
        Features handed to ``model.predict`` unchanged.
    y : array-like
        True target values; must support ``y - y_pred`` and ``len``.

    Returns
    -------
    pandas.DataFrame
        A single row with two columns: ``"Average Residual"`` (the mean of
        ``y - y_pred``, or NaN when ``y`` is empty) and ``"record_count"``
        (the number of records scored).
    """
    y_pred = model.predict(X)
    record_count = len(y)

    # An empty slice yields NaN rather than a ZeroDivisionError, so one
    # empty group cannot abort a whole evaluation run.
    if record_count:
        average_residual = sum(y - y_pred) / record_count
    else:
        average_residual = float("nan")

    results = {"Average Residual": average_residual, "record_count": record_count}
    return pd.DataFrame.from_records([results])
    

In [13]:
# Lasso is a regressor, so the step is labelled as a regression
# (the previous "A Simple Classification" label was a copy-paste slip).
pipeline_steps = [("A Simple Regression", Lasso())]
# diagnostic_package supplies the custom scorer used when evaluating folds.
fancy_pipeline = fancypipes.pipeline(steps=pipeline_steps, diagnostic_package=an_alternative_scorer)
fancy_pipeline.prep_pipeline(X, y)

In [14]:
# Same grouped 2-fold evaluation as earlier; the output now carries only the
# columns produced by the custom scorer (Average Residual, record_count).
fancy_pipeline.evaluate_k_folds(X, y, k=2, by=3)

Unnamed: 0,fold,3,Average Residual,record_count
0,mean,0.0,-0.208009,235.5
1,mean,1.0,2.959223,17.5
2,1,0.0,-0.096463,235.0
3,1,1.0,1.217237,18.0
4,2,0.0,-0.319556,236.0
5,2,1.0,4.701209,17.0
