# PyCaret AutoML

## Imports and Global Settings

In [1]:
import numpy as np
import pandas as pd
from pycaret.classification import *

pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

## Loading Data

In [2]:
# EC2
filepath_prefix = "/home/ubuntu/"
# Local

# filepath_prefix = "/home/jeff/Documents/Data_Science_Projects/Yelp_Reviews/data/full_data/model_ready/"

In [3]:
train_records_to_load = 10000
test_records_to_load = 10000

In [4]:
datatypes = {'target_reg': 'int16',
 'review_stars': 'int16',
 'NB_prob': 'float32',
 'svm_pred': 'float32',
 'ft_prob': 'float32',
 'lda_t1': 'float32',
 'lda_t2': 'float32',
 'lda_t3': 'float32',
 'lda_t4': 'float32',
 'lda_t5': 'float32',
 'grade_level': 'float32',
 'polarity': 'float32',
 'subjectivity': 'float32',
 'word_cnt': 'int16',
 'character_cnt': 'int16',
 'num_cnt': 'int16',
 'uppercase_cnt': 'int16',
 '#@_cnt': 'int16',
 'sentence_cnt': 'int16',
 'lexicon_cnt': 'int16',
 'syllable_cnt': 'int16',
 'avg_word_len': 'float32',
 'token_cnt': 'int16',
 'stopword_cnt': 'int16',
 'stopword_pct': 'float32',
 'ent_cnt': 'int16',
 'ent_pct': 'float32',
 'pos_adj_pct': 'float32',
 'pos_adj_cnt': 'int16',
 'pos_adp_pct': 'float32',
 'pos_adp_cnt': 'int16',
 'pos_adv_pct': 'float32',
 'pos_adv_cnt': 'int16',
 'pos_aux_pct': 'float32',
 'pos_aux_cnt': 'int16',
 'pos_conj_pct': 'float32',
 'pos_conj_cnt': 'int16',
 'pos_det_pct': 'float32',
 'pos_det_cnt': 'int16',
 'pos_intj_pct': 'float32',
 'pos_intj_cnt': 'int16',
 'pos_noun_pct': 'float32',
 'pos_noun_cnt': 'int16',
 'pos_num_pct': 'float32',
 'pos_num_cnt': 'int16',
 'pos_part_pct': 'float32',
 'pos_part_cnt': 'int16',
 'pos_pron_pct': 'float32',
 'pos_pron_cnt': 'int16',
 'pos_propn_pct': 'float32',
 'pos_propn_cnt': 'int16',
 'pos_punct_pct': 'float32',
 'pos_punct_cnt': 'int16',
 'pos_sconj_pct': 'float32',
 'pos_sconj_cnt': 'int16',
 'pos_sym_pct': 'float32',
 'pos_sym_cnt': 'int16',
 'pos_verb_pct': 'float32',
 'pos_verb_cnt': 'int16',
 'pos_x_pct': 'float32',
 'pos_x_cnt': 'int16',
 'dep_root_pct': 'float32',
 'dep_root_cnt': 'int16',
 'dep_acl_pct': 'float32',
 'dep_acl_cnt': 'int16',
 'dep_acomp_pct': 'float32',
 'dep_acomp_cnt': 'int16',
 'dep_advcl_pct': 'float32',
 'dep_advcl_cnt': 'int16',
 'dep_advmod_pct': 'float32',
 'dep_advmod_cnt': 'int16',
 'dep_agent_pct': 'float32',
 'dep_agent_cnt': 'int16',
 'dep_amod_pct': 'float32',
 'dep_amod_cnt': 'int16',
 'dep_appos_pct': 'float32',
 'dep_appos_cnt': 'int16',
 'dep_attr_pct': 'float32',
 'dep_attr_cnt': 'int16',
 'dep_aux_pct': 'float32',
 'dep_aux_cnt': 'int16',
 'dep_auxpass_pct': 'float32',
 'dep_auxpass_cnt': 'int16',
 'dep_case_pct': 'float32',
 'dep_case_cnt': 'int16',
 'dep_cc_pct': 'float32',
 'dep_cc_cnt': 'int16',
 'dep_ccomp_pct': 'float32',
 'dep_ccomp_cnt': 'int16',
 'dep_compound_pct': 'float32',
 'dep_compound_cnt': 'int16',
 'dep_conj_pct': 'float32',
 'dep_conj_cnt': 'int16',
 'dep_csubj_pct': 'float32',
 'dep_csubj_cnt': 'int16',
 'dep_csubjpass_pct': 'float32',
 'dep_csubjpass_cnt': 'int16',
 'dep_dative_pct': 'float32',
 'dep_dative_cnt': 'int16',
 'dep_dep_pct': 'float32',
 'dep_dep_cnt': 'int16',
 'dep_det_pct': 'float32',
 'dep_det_cnt': 'int16',
 'dep_dobj_pct': 'float32',
 'dep_dobj_cnt': 'int16',
 'dep_expl_pct': 'float32',
 'dep_expl_cnt': 'int16',
 'dep_intj_pct': 'float32',
 'dep_intj_cnt': 'int16',
 'dep_mark_pct': 'float32',
 'dep_mark_cnt': 'int16',
 'dep_meta_pct': 'float32',
 'dep_meta_cnt': 'int16',
 'dep_neg_pct': 'float32',
 'dep_neg_cnt': 'int16',
 'dep_nmod_pct': 'float32',
 'dep_nmod_cnt': 'int16',
 'dep_npadvmod_pct': 'float32',
 'dep_npadvmod_cnt': 'int16',
 'dep_nsubj_pct': 'float32',
 'dep_nsubj_cnt': 'int16',
 'dep_nsubjpass_pct': 'float32',
 'dep_nsubjpass_cnt': 'int16',
 'dep_nummod_pct': 'float32',
 'dep_nummod_cnt': 'int16',
 'dep_oprd_pct': 'float32',
 'dep_oprd_cnt': 'int16',
 'dep_parataxis_pct': 'float32',
 'dep_parataxis_cnt': 'int16',
 'dep_pcomp_pct': 'float32',
 'dep_pcomp_cnt': 'int16',
 'dep_pobj_pct': 'float32',
 'dep_pobj_cnt': 'int16',
 'dep_poss_pct': 'float32',
 'dep_poss_cnt': 'int16',
 'dep_preconj_pct': 'float32',
 'dep_preconj_cnt': 'int16',
 'dep_predet_pct': 'float32',
 'dep_predet_cnt': 'int16',
 'dep_prep_pct': 'float32',
 'dep_prep_cnt': 'int16',
 'dep_prt_pct': 'float32',
 'dep_prt_cnt': 'int16',
 'dep_punct_pct': 'float32',
 'dep_punct_cnt': 'int16',
 'dep_quantmod_pct': 'float32',
 'dep_quantmod_cnt': 'int16',
 'dep_relcl_pct': 'float32',
 'dep_relcl_cnt': 'int16',
 'dep_xcomp_pct': 'float32',
 'dep_xcomp_cnt': 'int16',
 'ent_cardinal_pct': 'float32',
 'ent_cardinal_cnt': 'int16',
 'ent_date_pct': 'float32',
 'ent_date_cnt': 'int16',
 'ent_event_pct': 'float32',
 'ent_event_cnt': 'int16',
 'ent_fac_pct': 'float32',
 'ent_fac_cnt': 'int16',
 'ent_gpe_pct': 'float32',
 'ent_gpe_cnt': 'int16',
 'ent_language_pct': 'float32',
 'ent_language_cnt': 'int16',
 'ent_law_pct': 'float32',
 'ent_law_cnt': 'int16',
 'ent_loc_pct': 'float32',
 'ent_loc_cnt': 'int16',
 'ent_money_pct': 'float32',
 'ent_money_cnt': 'int16',
 'ent_norp_pct': 'float32',
 'ent_norp_cnt': 'int16',
 'ent_ordinal_pct': 'float32',
 'ent_ordinal_cnt': 'int16',
 'ent_org_pct': 'float32',
 'ent_org_cnt': 'int16',
 'ent_percent_pct': 'float32',
 'ent_percent_cnt': 'int16',
 'ent_person_pct': 'float32',
 'ent_person_cnt': 'int16',
 'ent_product_pct': 'float32',
 'ent_product_cnt': 'int16',
 'ent_quantity_pct': 'float32',
 'ent_quantity_cnt': 'int16',
 'ent_time_pct': 'float32',
 'ent_time_cnt': 'int16',
 'ent_work_of_art_pct': 'float32',
 'ent_work_of_art_cnt': 'int16'}

In [5]:
train = pd.read_csv(f"{filepath_prefix}train.csv", nrows=train_records_to_load,
                    true_values=["True"], false_values=["False"], dtype=datatypes)
test = pd.read_csv(f"{filepath_prefix}test.csv", nrows=test_records_to_load,
                   true_values=["True"], false_values=["False"], dtype=datatypes)

In [6]:
X_train = train.drop(columns=['review_id', 'target_clf', 'target_reg'])
X_test = test.drop(columns=['review_id', 'target_clf', 'target_reg'])
y_train = train['target_clf']
y_test = test['target_clf']

## Basic Overview

In [7]:
train.head(5)

Unnamed: 0,review_id,target_clf,target_reg,review_stars,nb_prob,svm_pred,ft_prob,lda_t1,lda_t2,lda_t3,lda_t4,lda_t5,grade_level,polarity,subjectivity,word_cnt,character_cnt,num_cnt,uppercase_cnt,#@_cnt,sentence_cnt,lexicon_cnt,syllable_cnt,avg_word_len,token_cnt,stopword_cnt,stopword_pct,ent_cnt,ent_pct,pos_adj_pct,pos_adj_cnt,pos_adp_pct,pos_adp_cnt,pos_adv_pct,pos_adv_cnt,pos_aux_pct,pos_aux_cnt,pos_conj_pct,pos_conj_cnt,pos_det_pct,pos_det_cnt,pos_intj_pct,pos_intj_cnt,pos_noun_pct,pos_noun_cnt,pos_num_pct,pos_num_cnt,pos_part_pct,pos_part_cnt,pos_pron_pct,pos_pron_cnt,pos_propn_pct,pos_propn_cnt,pos_punct_pct,pos_punct_cnt,pos_sconj_pct,pos_sconj_cnt,pos_sym_pct,pos_sym_cnt,pos_verb_pct,pos_verb_cnt,pos_x_pct,pos_x_cnt,dep_root_pct,dep_root_cnt,dep_acl_pct,dep_acl_cnt,dep_acomp_pct,dep_acomp_cnt,dep_advcl_pct,dep_advcl_cnt,dep_advmod_pct,dep_advmod_cnt,dep_agent_pct,dep_agent_cnt,dep_amod_pct,dep_amod_cnt,dep_appos_pct,dep_appos_cnt,dep_attr_pct,dep_attr_cnt,dep_aux_pct,dep_aux_cnt,dep_auxpass_pct,dep_auxpass_cnt,dep_case_pct,dep_case_cnt,dep_cc_pct,dep_cc_cnt,dep_ccomp_pct,dep_ccomp_cnt,dep_compound_pct,dep_compound_cnt,dep_conj_pct,dep_conj_cnt,dep_csubj_pct,dep_csubj_cnt,dep_csubjpass_pct,dep_csubjpass_cnt,dep_dative_pct,dep_dative_cnt,dep_dep_pct,dep_dep_cnt,dep_det_pct,dep_det_cnt,dep_dobj_pct,dep_dobj_cnt,dep_expl_pct,dep_expl_cnt,dep_intj_pct,dep_intj_cnt,dep_mark_pct,dep_mark_cnt,dep_meta_pct,dep_meta_cnt,dep_neg_pct,dep_neg_cnt,dep_nmod_pct,dep_nmod_cnt,dep_npadvmod_pct,dep_npadvmod_cnt,dep_nsubj_pct,dep_nsubj_cnt,dep_nsubjpass_pct,dep_nsubjpass_cnt,dep_nummod_pct,dep_nummod_cnt,dep_oprd_pct,dep_oprd_cnt,dep_parataxis_pct,dep_parataxis_cnt,dep_pcomp_pct,dep_pcomp_cnt,dep_pobj_pct,dep_pobj_cnt,dep_poss_pct,dep_poss_cnt,dep_preconj_pct,dep_preconj_cnt,dep_predet_pct,dep_predet_cnt,dep_prep_pct,dep_prep_cnt,dep_prt_pct,dep_prt_cnt,dep_punct_pct,dep_punct_cnt,dep_quantmod_pct,dep_quantmod_cnt,dep_relcl_pct,dep_relcl_cnt,dep_xcomp_pct,dep_xcomp_cnt,ent_cardinal_pct,ent_cardinal_cnt,ent_date_pct,ent_date_cnt,ent_event_pct,ent_event_cnt,ent_fac_pct,ent_fac_cnt,ent_gpe_pct,ent_gpe_cnt,ent_language_pct,ent_language_cnt,ent_law_pct,ent_law_cnt,ent_loc_pct,ent_loc_cnt,ent_money_pct,ent_money_cnt,ent_norp_pct,ent_norp_cnt,ent_ordinal_pct,ent_ordinal_cnt,ent_org_pct,ent_org_cnt,ent_percent_pct,ent_percent_cnt,ent_person_pct,ent_person_cnt,ent_product_pct,ent_product_cnt,ent_quantity_pct,ent_quantity_cnt,ent_time_pct,ent_time_cnt,ent_work_of_art_pct,ent_work_of_art_cnt
0,syrAB11Ayj0qb64M3orNyQ,False,0,4,0.0,0.805,0.78156,0.0049,0.00489,0.83379,0.15149,0.00493,12.1,0.13819,0.58714,91,552,0,3,0,3,95,123,4.71875,129,51,0.39535,9,0.06977,0.10078,13,0.06202,8,0.05426,7,0.05426,7,0.0,0,0.09302,12,0.00775,1,0.09302,12,0.0,0,0.0155,2,0.07752,10,0.12403,16,0.16279,21,0.02326,3,0.0,0,0.06977,9,0.0,0,0.06977,9,0.0,0,0.0155,2,0.0,0,0.05426,7,0.0,0,0.07752,10,0.0155,2,0.00775,1,0.02326,3,0.0,0,0.0155,2,0.02326,3,0.02326,3,0.06977,9,0.04651,6,0.0,0,0.0,0,0.0,0,0.0,0,0.04651,6,0.02326,3,0.0,0,0.00775,1,0.00775,1,0.0,0,0.0,0,0.0,0,0.0,0,0.06977,9,0.0,0,0.0,0,0.0,0,0.00775,1,0.0,0,0.06977,9,0.03101,4,0.0,0,0.0,0,0.08527,11,0.0,0,0.16279,21,0.0,0,0.00775,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.00775,1,0.0,0,0.0,0,0.0,0,0.0,0,0.02326,3,0.0,0,0.0,0,0.0,0,0.03876,5,0.0,0,0.0,0,0.0,0,0.0,0
1,ybCCcr1ICVynGJBx0lpBAw,False,0,4,0.885,-0.356,0.61055,0.00686,0.00729,0.97217,0.00684,0.00685,5.3,0.44034,0.66364,57,338,0,0,0,5,56,80,4.94737,66,29,0.43939,2,0.0303,0.06061,4,0.07576,5,0.09091,6,0.09091,6,0.0,0,0.10606,7,0.0,0,0.18182,12,0.0,0,0.0,0,0.0303,2,0.06061,4,0.13636,9,0.0303,2,0.0,0,0.12121,8,0.0,0,0.09091,6,0.01515,1,0.0303,2,0.0,0,0.09091,6,0.0,0,0.04545,3,0.01515,1,0.0,0,0.0303,2,0.0303,2,0.0,0,0.01515,1,0.0303,2,0.04545,3,0.01515,1,0.0,0,0.0,0,0.0,0,0.0,0,0.09091,6,0.0303,2,0.0,0,0.0,0,0.01515,1,0.0,0,0.0,0,0.0,0,0.0,0,0.09091,6,0.0303,2,0.0,0,0.0,0,0.0,0,0.0,0,0.06061,4,0.0,0,0.0,0,0.0,0,0.06061,4,0.0303,2,0.12121,8,0.0,0,0.01515,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.01515,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.01515,1
2,HBuNpb82_z25gLK2htZjZw,True,4,5,0.004,-0.014,0.59267,0.00297,0.00297,0.21996,0.51115,0.26294,27.6,0.28628,0.54253,151,764,0,6,0,2,143,188,4.0604,184,80,0.43478,3,0.0163,0.08696,16,0.08152,15,0.05978,11,0.05435,10,0.0,0,0.13587,25,0.0,0,0.17391,32,0.0,0,0.0163,3,0.06522,12,0.0163,3,0.14674,27,0.00543,1,0.00543,1,0.07609,14,0.0,0,0.10326,19,0.00543,1,0.00543,1,0.00543,1,0.04891,9,0.0,0,0.08152,15,0.0,0,0.02174,4,0.02174,4,0.0,0,0.0,0,0.03261,6,0.0163,3,0.03261,6,0.0163,3,0.0,0,0.0,0,0.0,0,0.0,0,0.08696,16,0.03261,6,0.00543,1,0.0,0,0.00543,1,0.0,0,0.01087,2,0.0,0,0.0,0,0.08152,15,0.0,0,0.0,0,0.0,0,0.00543,1,0.0,0,0.07065,13,0.02174,4,0.0,0,0.0,0,0.06522,12,0.00543,1,0.13587,25,0.00543,1,0.01087,2,0.01087,2,0.0,0,0.00543,1,0.0,0,0.0,0,0.00543,1,0.0,0,0.0,0,0.00543,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
3,RAIaaKEIg9gSJ-B4XcXBwA,False,0,1,0.0,0.892,0.55841,0.57003,0.0015,0.00151,0.42545,0.00151,6.1,0.02428,0.49481,327,1685,2,3,0,20,326,411,4.15596,373,204,0.54692,16,0.0429,0.04021,15,0.06971,26,0.06434,24,0.05362,20,0.0,0,0.13673,51,0.0134,5,0.16622,62,0.03753,14,0.02949,11,0.07239,27,0.01877,7,0.0992,37,0.0134,5,0.0,0,0.12869,48,0.0,0,0.06971,26,0.0,0,0.01609,6,0.01609,6,0.0563,21,0.0,0,0.03485,13,0.00536,2,0.00268,1,0.02949,11,0.00536,2,0.00536,2,0.05898,22,0.0134,5,0.01072,4,0.0563,21,0.0,0,0.0,0,0.0,0,0.00536,2,0.08311,31,0.06971,26,0.0,0,0.00804,3,0.00804,3,0.0,0,0.01609,6,0.00536,2,0.0,0,0.10992,41,0.00268,1,0.02145,8,0.00268,1,0.0,0,0.00268,1,0.05898,22,0.04021,15,0.0,0,0.0,0,0.06166,23,0.00268,1,0.0992,37,0.01072,4,0.00536,2,0.00536,2,0.03217,12,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.00268,1,0.00804,3,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0
4,4fbqvddoQTLa7ChLJDYreg,False,0,5,0.0,-3.863,0.25632,0.01018,0.01022,0.95915,0.01019,0.01026,4.6,0.5125,0.67375,37,198,0,2,0,4,37,50,4.37838,43,17,0.39535,1,0.02326,0.13953,6,0.04651,2,0.04651,2,0.02326,1,0.0,0,0.16279,7,0.0,0,0.18605,8,0.0,0,0.0,0,0.04651,2,0.04651,2,0.13953,6,0.0,0,0.0,0,0.09302,4,0.0,0,0.11628,5,0.0,0,0.02326,1,0.0,0,0.04651,2,0.0,0,0.06977,3,0.02326,1,0.0,0,0.0,0,0.0,0,0.0,0,0.06977,3,0.0,0,0.02326,1,0.06977,3,0.0,0,0.0,0,0.0,0,0.0,0,0.11628,5,0.04651,2,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.04651,2,0.09302,4,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.02326,1,0.02326,1,0.0,0,0.02326,1,0.02326,1,0.02326,1,0.13953,6,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.02326,1,0.0,0,0.0,0,0.0,0,0.0,0


In [8]:
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 189 columns):
 #    Column               Dtype  
---   ------               -----  
 0    review_id            object 
 1    target_clf           bool   
 2    target_reg           int16  
 3    review_stars         int16  
 4    nb_prob              float64
 5    svm_pred             float32
 6    ft_prob              float32
 7    lda_t1               float32
 8    lda_t2               float32
 9    lda_t3               float32
 10   lda_t4               float32
 11   lda_t5               float32
 12   grade_level          float32
 13   polarity             float32
 14   subjectivity         float32
 15   word_cnt             int16  
 16   character_cnt        int16  
 17   num_cnt              int16  
 18   uppercase_cnt        int16  
 19   #@_cnt               int16  
 20   sentence_cnt         int16  
 21   lexicon_cnt          int16  
 22   syllable_cnt         int16  
 23   avg_word_l

## PyCaret AutoML Script - EC2

### Optimizer Setup

In [9]:
# https://pycaret.readthedocs.io/en/latest/api/classification.html#module-pycaret.classification
pycaret_clf = setup(# Experiment Name
                    experiment_name="PyCaret_Clf_1M_2",
                    # Inbound Data
                    data=train.drop(columns=['review_id', 'target_reg']),
                    test_data=test.drop(columns=['review_id', 'target_reg']),
                    target='target_clf',
                    # Options
                    silent=True, preprocess=True, fold=5,
                    n_jobs=-1, log_experiment=True, session_id=7,
                    # Data Rescaling Options
                    normalize=True,
                    transformation=False,
                    # Data Trimming Options
                    remove_multicollinearity=False,
                    remove_outliers=False,
                    # PCA Dimensionality Reduction
                    pca=False, pca_components=10,
                    # Options to Add Features
                    create_clusters=False, 
                    polynomial_features=False,
                    trigonometry_features=False,
                    feature_interaction=False,
                    feature_ratio=False,
                    # Feature Reduction
                    feature_selection=False)

Unnamed: 0,Description,Value
0,session_id,7
1,Target,target_clf
2,Target Type,Binary
3,Label Encoded,"False: 0, True: 1"
4,Original Data,"(10000, 187)"
5,Missing Values,False
6,Numeric Features,186
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


### Compare Models

In [10]:
top_models = compare_models(n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.6713,0.0,0.632,0.6868,0.6581,0.3427,0.3439,0.054
lr,Logistic Regression,0.671,0.7338,0.6246,0.6894,0.6552,0.3421,0.3438,1.67
lda,Linear Discriminant Analysis,0.6708,0.7303,0.6318,0.6862,0.6577,0.3417,0.3429,0.182
gbc,Gradient Boosting Classifier,0.6706,0.7353,0.6386,0.6833,0.6599,0.3413,0.3423,4.894
ada,Ada Boost Classifier,0.6669,0.7263,0.6152,0.687,0.6488,0.3339,0.336,1.016
rf,Random Forest Classifier,0.6635,0.7186,0.6316,0.6757,0.6526,0.3271,0.328,0.926
lightgbm,Light Gradient Boosting Machine,0.6599,0.7246,0.6258,0.6727,0.6481,0.3199,0.3209,0.842
et,Extra Trees Classifier,0.6518,0.7079,0.6424,0.6557,0.6488,0.3036,0.3038,0.582
qda,Quadratic Discriminant Analysis,0.6138,0.6631,0.4541,0.6684,0.5389,0.228,0.241,0.13
nb,Naive Bayes,0.6114,0.6767,0.4115,0.687,0.5144,0.2233,0.2438,0.05


### Specific Model Creation and Tuning

In [11]:
log_reg = create_model('lr', fold = 10)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.661,0.7076,0.63,0.6716,0.6502,0.322,0.3226
1,0.675,0.7523,0.62,0.6966,0.6561,0.35,0.3521
2,0.655,0.7228,0.6028,0.6741,0.6365,0.3101,0.3119
3,0.664,0.7193,0.6148,0.6829,0.6471,0.3281,0.3298
4,0.67,0.7358,0.6228,0.6887,0.6541,0.3401,0.3417
5,0.664,0.7221,0.5868,0.695,0.6364,0.3282,0.3323
6,0.655,0.7274,0.6048,0.6733,0.6372,0.3101,0.3118
7,0.71,0.769,0.6327,0.7494,0.6861,0.4202,0.4254
8,0.678,0.7535,0.6687,0.6823,0.6754,0.356,0.3561
9,0.673,0.7486,0.6407,0.6859,0.6625,0.3461,0.3468


In [12]:
tuned_log_reg = tune_model(log_reg, optimize = 'Accuracy')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6715,0.7293,0.6274,0.6886,0.6566,0.3431,0.3444
1,0.6605,0.7186,0.6034,0.6817,0.6402,0.3211,0.3232
2,0.6735,0.7276,0.6128,0.6985,0.6528,0.3472,0.3498
3,0.678,0.7426,0.6148,0.7048,0.6567,0.3562,0.3591
4,0.6805,0.7503,0.6627,0.6881,0.6751,0.361,0.3613
Mean,0.6728,0.7337,0.6242,0.6923,0.6563,0.3457,0.3476
SD,0.0069,0.0113,0.0207,0.0082,0.0112,0.0139,0.0136


### Evaluate Model/Plotting

In [13]:
evaluate_model(tuned_log_reg)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [15]:
interpret_model(tuned_log_reg)

ImportError: shap library not found. pip install shap to use interpret_model function.

### Predict on Test Data

In [16]:
predict_model(tuned_log_reg);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.6418,0.6951,0.5917,0.6567,0.6226,0.2835,0.2849


### Finalize and Save Model

In [21]:
# save_model(tuned_log_reg, 'pycaret_log_reg')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[],
                                       target='target_clf', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_...
                                        target_variable='target_clf',
                                        threshold=0.9)),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  LogisticRegression(C=0.46, cl

In [22]:
# save_model(tuned_gbm, 'pycaret_light_gbm')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[],
                                       target='target_clf', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_...
                                 boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, feature_fraction=0.9,
                                 importance_type='split', learning_rate=0.2,
                                 max_depth=-1,