In [1]:
import warnings
# Ignore every Python warning raised in this process for the rest of the notebook.
# NOTE(review): warning fragments still show up in later cell outputs — presumably
# they are emitted by worker processes that do not inherit this filter; confirm.
warnings.filterwarnings("ignore")

In [2]:
from pycaret.datasets import get_data
# Load PyCaret's bundled Titanic dataset; get_data also displays a preview of it.
df = get_data(dataset="titanic")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Initial PyCaret Example

In [3]:
from pycaret.classification import *
# Baseline run without Fugue: set up the experiment on the full frame, then
# cross-validate the candidate classifiers and rank them by accuracy.
clf = setup(
    data=df,
    target="Survived",
    session_id=123,  # fixed seed so repeated runs give the same grid
    silent=True,
    verbose=False,
    html=False,
)
models = compare_models(
    fold=10,
    round=4,
    sort="Accuracy",
    turbo=True,
    n_select=5,  # ask for the five best models rather than only the winner
    verbose=False,
)

# pull() returns the score grid produced by the last PyCaret call.
results = pull().reset_index(drop=True)
results

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Gradient Boosting Classifier,0.817,0.8584,0.6878,0.8198,0.7466,0.605,0.612,0.098
1,Ridge Classifier,0.8155,0.0,0.7462,0.7764,0.7603,0.6104,0.6114,0.022
2,Logistic Regression,0.8122,0.8582,0.7498,0.7691,0.7584,0.6048,0.606,0.556
3,Decision Tree Classifier,0.8106,0.7948,0.7215,0.7823,0.7493,0.5976,0.6002,0.023
4,Random Forest Classifier,0.7977,0.8573,0.6887,0.7688,0.7247,0.5662,0.5697,0.208
5,Ada Boost Classifier,0.7961,0.8507,0.6967,0.7679,0.7267,0.565,0.5703,0.047
6,Extra Trees Classifier,0.7898,0.853,0.6807,0.7585,0.7155,0.5501,0.5537,0.194
7,Light Gradient Boosting Machine,0.7881,0.8487,0.6925,0.754,0.7171,0.5489,0.5539,0.072
8,Linear Discriminant Analysis,0.7323,0.7612,0.6242,0.6703,0.6435,0.4303,0.4331,0.053
9,K Neighbors Classifier,0.6919,0.73,0.5865,0.6148,0.5984,0.3492,0.3509,0.066


# Creating a Wrapper

In [4]:
from fugue import transform
import pandas as pd

# Output schema handed to Fugue's transform(): one entry per column that
# `wrapper` returns, using the renamed "Prec" and "TT_Sec" column names.
schema = """Model:str, Accuracy:float, AUC:float, Recall:float, Prec:float, 
F1:float, Kappa:float, MCC:float, TT_Sec:float"""

def wrapper(df: pd.DataFrame) -> pd.DataFrame:
    """Compare PyCaret classifiers on ``df`` and return the top of the score grid.

    Args:
        df: Frame holding the ``Survived`` target column together with the
            feature columns.

    Returns:
        The first five rows of the PyCaret score grid (sorted by accuracy),
        with ``TT (Sec)`` and ``Prec.`` renamed to ``TT_Sec`` and ``Prec``.
    """
    # The objects returned by setup() and compare_models() are not used here;
    # only the score grid fetched through pull() matters, so nothing is bound.
    setup(data=df,
          target="Survived",
          session_id=123,
          silent=True,
          verbose=False,
          html=False)
    compare_models(fold=10,
                   round=4,
                   sort="Accuracy",
                   turbo=True,
                   n_select=5,
                   verbose=False)
    # Fugue can't have spaces or . in column names
    score_grid = (
        pull()
        .reset_index(drop=True)
        .rename(columns={"TT (Sec)": "TT_Sec", "Prec.": "Prec"})
    )
    return score_grid.iloc[0:5]

# Run `wrapper` through Fugue on the whole frame; no partition or engine is
# specified here.
res = transform(
    df,
    wrapper,
    schema=schema,
)
res

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec,F1,Kappa,MCC,TT_Sec
0,Gradient Boosting Classifier,0.817,0.8584,0.6878,0.8198,0.7466,0.605,0.612,0.1
1,Ridge Classifier,0.8155,0.0,0.7462,0.7764,0.7603,0.6104,0.6114,0.021
2,Logistic Regression,0.8122,0.8582,0.7498,0.7691,0.7584,0.6048,0.606,0.086
3,Decision Tree Classifier,0.8106,0.7948,0.7215,0.7823,0.7493,0.5976,0.6002,0.021
4,Random Forest Classifier,0.7977,0.8573,0.6887,0.7688,0.7247,0.5662,0.5697,0.206


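# Partitioning: one model comparison each for male and female passengers

The output below stacks the top five models of each `Sex` partition; at this point the wrapper does not yet record which partition a row came from.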

In [5]:
# numpy is not imported explicitly anywhere above; `np` presumably leaked in
# through `from pycaret.classification import *`. Import it here so this cell
# does not depend on that side effect.
import numpy as np

# NaN is swapped for None before the frame is handed to Fugue
# (presumably so missing values are treated as nulls — TODO confirm).
res = transform(
    df.replace({np.nan: None}),
    wrapper,
    schema=schema,
    partition={"by": "Sex"},  # run `wrapper` once per distinct Sex value
)
res

  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
Traceback (most recent call last):
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/pycaret/internal/pipeline.py", line 118, in fit
    result = super().fit(X, y=y, **fit_kwargs)
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/imblearn/pipeline.py", line 281, in fit
    self._final_estimator.fit(Xt, yt, **fit_params)
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1417, i

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec,F1,Kappa,MCC,TT_Sec
0,Ridge Classifier,0.831,0.0,0.9022,0.877,0.8879,0.5332,0.5435,0.007
1,Extra Trees Classifier,0.8264,0.8472,0.9029,0.8717,0.8859,0.5152,0.5245,0.168
2,Decision Tree Classifier,0.8219,0.7313,0.9092,0.8659,0.8847,0.4801,0.4965,0.004
3,Random Forest Classifier,0.8219,0.8421,0.9213,0.8556,0.8862,0.4763,0.4907,0.177
4,Gradient Boosting Classifier,0.8214,0.865,0.9026,0.8681,0.8833,0.4902,0.5018,0.021
5,Extra Trees Classifier,0.8288,0.6476,0.2125,0.7833,0.3251,0.2625,0.339,0.18
6,Light Gradient Boosting Machine,0.8116,0.7122,0.325,0.5542,0.4039,0.3021,0.3199,0.011
7,Ridge Classifier,0.8115,0.0,0.1375,0.4833,0.2083,0.1568,0.2017,0.006
8,Gradient Boosting Classifier,0.8089,0.6808,0.1125,0.5333,0.1794,0.1309,0.1862,0.039
9,Random Forest Classifier,0.8064,0.6668,0.1,0.4,0.1511,0.1098,0.1488,0.18


# Bringing it to Spark

The score grid is displayed at the bottom, after all of the warnings.

In [6]:
import fugue_spark

# Output schema for the partition-aware wrapper: the metric columns plus a
# trailing "Sex" column that the wrapper below adds to label each partition.
schema = """Model:str, Accuracy:float, AUC:float, Recall:float, Prec:float, 
F1:float, Kappa:float, MCC:float, TT_Sec:float, Sex:str"""

def wrapper(df: pd.DataFrame) -> pd.DataFrame:
    """Compare PyCaret classifiers on one partition and label the result.

    This redefinition replaces the earlier ``wrapper`` of the same name; the
    only functional difference is the added ``Sex`` column.

    Args:
        df: Frame holding the ``Survived`` target column, a ``Sex`` column and
            the remaining feature columns.

    Returns:
        The first five rows of the PyCaret score grid (sorted by accuracy),
        with ``TT (Sec)`` and ``Prec.`` renamed to ``TT_Sec`` and ``Prec`` and
        a ``Sex`` column holding the ``Sex`` value of the first input row.
    """
    # The objects returned by setup() and compare_models() are not used here;
    # only the score grid fetched through pull() matters, so nothing is bound.
    setup(data=df,
          target="Survived",
          session_id=123,
          silent=True,
          verbose=False,
          html=False)
    compare_models(fold=10,
                   round=4,
                   sort="Accuracy",
                   turbo=True,
                   n_select=5,
                   verbose=False)
    # Fugue can't have spaces or . in column names
    score_grid = (
        pull()
        .reset_index(drop=True)
        .rename(columns={"TT (Sec)": "TT_Sec", "Prec.": "Prec"})
    )
    # When the caller partitions by Sex, every input row carries the same
    # value, so the first row is enough to identify the partition.
    score_grid["Sex"] = df.iloc[0]["Sex"]
    return score_grid.iloc[0:5]


# Same per-Sex run as the previous section, but executed with engine="spark";
# the result is converted straight back to pandas so `res` is only ever bound
# to a pandas frame.
res = transform(
    df.replace({np.nan: None}),
    wrapper,
    schema=schema,
    partition={"by": "Sex"},
    engine="spark",
).toPandas()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/27 00:47:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Traceback (most recent call last):
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/pycaret/internal/pipeline.py", line 118, in fit
    result = super().fit(X, y=y, **fit_kwargs)
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/imblearn/pipeline.py", line 281, in fit
    self._final_estimator.fit(Xt, yt, **fit_params)
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1417,

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp /

In [7]:
# Display the collected result; the Sex column tells the two partitions apart.
res

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec,F1,Kappa,MCC,TT_Sec,Sex
0,Ridge Classifier,0.831,0.0,0.9022,0.877,0.8879,0.5332,0.5435,0.005,female
1,Extra Trees Classifier,0.8264,0.8472,0.9029,0.8717,0.8859,0.5152,0.5245,0.166,female
2,Decision Tree Classifier,0.8219,0.7313,0.9092,0.8659,0.8847,0.4801,0.4965,0.008,female
3,Random Forest Classifier,0.8219,0.8421,0.9213,0.8556,0.8862,0.4763,0.4907,0.179,female
4,Gradient Boosting Classifier,0.8214,0.865,0.9026,0.8681,0.8833,0.4902,0.5018,0.023,female
5,Extra Trees Classifier,0.8288,0.6476,0.2125,0.7833,0.3251,0.2625,0.339,0.182,male
6,Light Gradient Boosting Machine,0.8116,0.7122,0.325,0.5542,0.4039,0.3021,0.3199,0.011,male
7,Ridge Classifier,0.8115,0.0,0.1375,0.4833,0.2083,0.1568,0.2017,0.007,male
8,Gradient Boosting Classifier,0.8089,0.6808,0.1125,0.5333,0.1794,0.1309,0.1862,0.039,male
9,Random Forest Classifier,0.8064,0.6668,0.1,0.4,0.1511,0.1098,0.1488,0.19,male


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
Traceback (most recent call last):
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/miniconda3/envs/pycaret/lib/python3.7/site-packages/pycaret/internal/pipeline.py", line 118, in fit
    result = super().

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = c

  overwrite_a=False)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp 

  overwrite_a=False)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  overwrite_a=False)
  overwrite_a=False)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
