### Importing all of the classification modules from pycaret. Important to note that prior to this I created a python environment in which I installed pycaret. I did not do it within the notebook because I have other libraries and dependencies I didn't want to mess with.

In [1]:
from pycaret.classification import *

### Importing additional libraries now that we need to complete some of the data organization.

In [2]:
import pandas as pd
import numpy as np

### The following section is code from the scikit-learn library. PyCaret is only compatible with version 0.23.2 of sklearn, however in 1.0 and beyond a new very useful method of CV was created--Stratified Group K Fold. This is borrowing code from their github to instantiate this class so it can be passed as a custom cv object. 

In [31]:
from collections.abc import Iterable
from collections import defaultdict
import warnings
from itertools import chain, combinations
from math import ceil, floor
import numbers
from abc import ABCMeta, abstractmethod
from inspect import signature

import numpy as np
from scipy.special import comb

from sklearn.utils import indexable, check_random_state, _safe_indexing
from sklearn.utils import _approximate_mode
from sklearn.utils.validation import _num_samples, column_or_1d
from sklearn.utils.validation import check_array
from sklearn.utils.multiclass import type_of_target
from sklearn.base import _pprint

In [32]:
class BaseCrossValidator(metaclass=ABCMeta):
    """Base class for all cross-validators
    Implementations must define `_iter_test_masks` or `_iter_test_indices`.
    """

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            The target variable for supervised learning problems.
        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        indices = np.arange(_num_samples(X))
        for test_index in self._iter_test_masks(X, y, groups):
            train_index = indices[np.logical_not(test_index)]
            test_index = indices[test_index]
            yield train_index, test_index

    # Since subclasses must implement either _iter_test_masks or
    # _iter_test_indices, neither can be abstract.
    def _iter_test_masks(self, X=None, y=None, groups=None):
        """Generates boolean masks corresponding to test sets.
        By default, delegates to _iter_test_indices(X, y, groups)
        """
        for test_index in self._iter_test_indices(X, y, groups):
            test_mask = np.zeros(_num_samples(X), dtype=bool)
            test_mask[test_index] = True
            yield test_mask

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Generates integer indices corresponding to test sets."""
        raise NotImplementedError

    @abstractmethod
    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator"""

    def __repr__(self):
        return _build_repr(self)
    
class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta):
    """Base class for KFold, GroupKFold, and StratifiedKFold"""

    @abstractmethod
    def __init__(self, n_splits, *, shuffle, random_state):
        if not isinstance(n_splits, numbers.Integral):
            raise ValueError(
                "The number of folds must be of Integral type. "
                "%s of type %s was passed." % (n_splits, type(n_splits))
            )
        n_splits = int(n_splits)

        if n_splits <= 1:
            raise ValueError(
                "k-fold cross-validation requires at least one"
                " train/test split by setting n_splits=2 or more,"
                " got n_splits={0}.".format(n_splits)
            )

        if not isinstance(shuffle, bool):
            raise TypeError("shuffle must be True or False; got {0}".format(shuffle))

        if not shuffle and random_state is not None:  # None is the default
            raise ValueError(
                "Setting a random_state has no effect since shuffle is "
                "False. You should leave "
                "random_state to its default (None), or set shuffle=True.",
            )

        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.
        y : array-like of shape (n_samples,), default=None
            The target variable for supervised learning problems.
        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        if self.n_splits > n_samples:
            raise ValueError(
                (
                    "Cannot have number of splits n_splits={0} greater"
                    " than the number of samples: n_samples={1}."
                ).format(self.n_splits, n_samples)
            )

        for train, test in super().split(X, y, groups):
            yield train, test

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator
        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.
        y : object
            Always ignored, exists for compatibility.
        groups : object
            Always ignored, exists for compatibility.
        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits
    
class StratifiedGroupKFold(_BaseKFold):
    """Stratified K-Folds iterator variant with non-overlapping groups.
    This cross-validation object is a variation of StratifiedKFold attempts to
    return stratified folds with non-overlapping groups. The folds are made by
    preserving the percentage of samples for each class.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    The difference between GroupKFold and StratifiedGroupKFold is that
    the former attempts to create balanced folds such that the number of
    distinct groups is approximately the same in each fold, whereas
    StratifiedGroupKFold attempts to create folds which preserve the
    percentage of samples for each class as much as possible given the
    constraint of non-overlapping groups between splits.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.
    shuffle : bool, default=False
        Whether to shuffle each class's samples before splitting into batches.
        Note that the samples within each split will not be shuffled.
        This implementation can only shuffle groups that have approximately the
        same y distribution, no global shuffle will be performed.
    random_state : int or RandomState instance, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold for each class.
        Otherwise, leave `random_state` as `None`.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import StratifiedGroupKFold
    >>> X = np.ones((17, 2))
    >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8])
    >>> cv = StratifiedGroupKFold(n_splits=3)
    >>> for train_idxs, test_idxs in cv.split(X, y, groups):
    ...     print("TRAIN:", groups[train_idxs])
    ...     print("      ", y[train_idxs])
    ...     print(" TEST:", groups[test_idxs])
    ...     print("      ", y[test_idxs])
    TRAIN: [1 1 2 2 4 5 5 5 5 8 8]
           [0 0 1 1 1 0 0 0 0 0 0]
     TEST: [3 3 3 6 6 7]
           [1 1 1 0 0 0]
    TRAIN: [3 3 3 4 5 5 5 5 6 6 7]
           [1 1 1 1 0 0 0 0 0 0 0]
     TEST: [1 1 2 2 8 8]
           [0 0 1 1 0 0]
    TRAIN: [1 1 2 2 3 3 3 6 6 7 8 8]
           [0 0 1 1 1 1 1 0 0 0 0 0]
     TEST: [4 5 5 5 5]
           [1 0 0 0 0]
    Notes
    -----
    The implementation is designed to:
    * Mimic the behavior of StratifiedKFold as much as possible for trivial
      groups (e.g. when each group contains only one sample).
    * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to
      ``y = [1, 0]`` should not change the indices generated.
    * Stratify based on samples as much as possible while keeping
      non-overlapping groups constraint. That means that in some cases when
      there is a small number of groups containing a large number of samples
      the stratification will not be possible and the behavior will be close
      to GroupKFold.
    See also
    --------
    StratifiedKFold: Takes class information into account to build folds which
        retain class distributions (for binary or multiclass classification
        tasks).
    GroupKFold: K-fold iterator variant with non-overlapping groups.
    """

    def __init__(self, n_splits=5, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def _iter_test_indices(self, X, y, groups):
        # Implementation is based on this kaggle kernel:
        # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation
        # and is a subject to Apache 2.0 License. You may obtain a copy of the
        # License at http://www.apache.org/licenses/LICENSE-2.0
        # Changelist:
        # - Refactored function to a class following scikit-learn KFold
        #   interface.
        # - Added heuristic for assigning group to the least populated fold in
        #   cases when all other criteria are equal
        # - Swtch from using python ``Counter`` to ``np.unique`` to get class
        #   distribution
        # - Added scikit-learn checks for input: checking that target is binary
        #   or multiclass, checking passed random state, checking that number
        #   of splits is less than number of members in each class, checking
        #   that least populated class has more members than there are splits.
        rng = check_random_state(self.random_state)
        y = np.asarray(y)
        type_of_target_y = type_of_target(y)
        allowed_target_types = ("binary", "multiclass")
        if type_of_target_y not in allowed_target_types:
            raise ValueError(
                "Supported target types are: {}. Got {!r} instead.".format(
                    allowed_target_types, type_of_target_y
                )
            )

        y = column_or_1d(y)
        _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True)
        if np.all(self.n_splits > y_cnt):
            raise ValueError(
                "n_splits=%d cannot be greater than the"
                " number of members in each class." % (self.n_splits)
            )
        n_smallest_class = np.min(y_cnt)
        if self.n_splits > n_smallest_class:
            warnings.warn(
                "The least populated class in y has only %d"
                " members, which is less than n_splits=%d."
                % (n_smallest_class, self.n_splits),
                UserWarning,
            )
        n_classes = len(y_cnt)

        _, groups_inv, groups_cnt = np.unique(
            groups, return_inverse=True, return_counts=True
        )
        y_counts_per_group = np.zeros((len(groups_cnt), n_classes))
        for class_idx, group_idx in zip(y_inv, groups_inv):
            y_counts_per_group[group_idx, class_idx] += 1

        y_counts_per_fold = np.zeros((self.n_splits, n_classes))
        groups_per_fold = defaultdict(set)

        if self.shuffle:
            rng.shuffle(y_counts_per_group)

        # Stable sort to keep shuffled order for groups with the same
        # class distribution variance
        sorted_groups_idx = np.argsort(
            -np.std(y_counts_per_group, axis=1), kind="mergesort"
        )

        for group_idx in sorted_groups_idx:
            group_y_counts = y_counts_per_group[group_idx]
            best_fold = self._find_best_fold(
                y_counts_per_fold=y_counts_per_fold,
                y_cnt=y_cnt,
                group_y_counts=group_y_counts,
            )
            y_counts_per_fold[best_fold] += group_y_counts
            groups_per_fold[best_fold].add(group_idx)

        for i in range(self.n_splits):
            test_indices = [
                idx
                for idx, group_idx in enumerate(groups_inv)
                if group_idx in groups_per_fold[i]
            ]
            yield test_indices

    def _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts):
        best_fold = None
        min_eval = np.inf
        min_samples_in_fold = np.inf
        for i in range(self.n_splits):
            y_counts_per_fold[i] += group_y_counts
            # Summarise the distribution over classes in each proposed fold
            std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0)
            y_counts_per_fold[i] -= group_y_counts
            fold_eval = np.mean(std_per_class)
            samples_in_fold = np.sum(y_counts_per_fold[i])
            is_current_fold_better = (
                fold_eval < min_eval
                or np.isclose(fold_eval, min_eval)
                and samples_in_fold < min_samples_in_fold
            )
            if is_current_fold_better:
                min_eval = fold_eval
                min_samples_in_fold = samples_in_fold
                best_fold = i
        return best_fold

In [33]:
### TESTING THE INSTANTIATION

X = np.ones((17, 2))
y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8])
cv = StratifiedGroupKFold(n_splits=3)
for train_idxs, test_idxs in cv.split(X, y, groups):
    print("TRAIN:", groups[train_idxs])
    print("      ", y[train_idxs])
    print(" TEST:", groups[test_idxs])
    print("      ", y[test_idxs])

TRAIN: [1 1 2 2 4 5 5 5 5 8 8]
       [0 0 1 1 1 0 0 0 0 0 0]
 TEST: [3 3 3 6 6 7]
       [1 1 1 0 0 0]
TRAIN: [3 3 3 4 5 5 5 5 6 6 7]
       [1 1 1 1 0 0 0 0 0 0 0]
 TEST: [1 1 2 2 8 8]
       [0 0 1 1 0 0]
TRAIN: [1 1 2 2 3 3 3 6 6 7 8 8]
       [0 0 1 1 1 1 1 0 0 0 0 0]
 TEST: [4 5 5 5 5]
       [1 0 0 0 0]


### Organizing Dataset -- Getting rid of variables that are not of interest and have far too many NA values, getting all points before 2008, Grabbing the subset of our data containing the 28 accounting variables of interest as well as the gvkey (unique for each firm) and the year of the observation

In [34]:
df = pd.read_csv("Documents/findata.csv")
df.head()
df.drop(labels=['sich', 'insbnk', 'understatement', 'option','p_aaer','new_p_aaer'], axis=1, inplace=True)
df.head()
dffin = df[df['Year'] < 2008]
dffin.head()
data = dffin.iloc[:, list((x for x in range(0,31)))]
data = data.dropna()
data.head()

Unnamed: 0,Year,gvkey,Fraud,Current Assets,Accounts Payable,Total Assets,Common Equity,Cash and Short-Term Investments,Cost of Goods Sold,Common Shares Outstanding,...,"Property, Plant, and Equipment",Preferred Stock,Retained Earnings,Receivables,Net Sales,Sale of Common and Preferred Stock,Income Taxes Payable,Total Income Taxes,Interest and Related Expense,"Price Close, Annual, Fiscal"
0,1990.0,1009.0,0.0,10.047,3.736,32.335,6.262,0.002,30.633,2.526,...,31.767,0.0,5.42,6.895,40.522,0.0,0.0,0.769,2.333,6.0
1,1990.0,1011.0,0.0,1.247,0.803,7.784,0.667,0.171,1.125,3.556,...,7.328,0.0,-3.339,0.29,3.635,0.006,0.0,0.0,0.64,1.188
2,1990.0,1017.0,0.0,55.04,3.601,118.12,44.393,3.132,107.343,3.882,...,78.331,0.0,46.63,47.366,144.258,0.0,0.0,0.986,3.962,5.125
3,1990.0,1021.0,0.0,24.684,3.948,34.591,7.751,0.411,31.214,4.755,...,11.145,1.295,3.28,8.522,48.292,0.0,0.448,0.365,2.269,1.562
4,1990.0,1028.0,0.0,17.325,3.52,27.542,-12.142,1.017,32.662,6.735,...,5.782,0.0,-25.955,6.354,33.543,0.0,0.0,0.0,3.51,1.125


### Instantiating our 'Custom' CV build thanks to scikit-learn source code

In [35]:
CustomCV = StratifiedGroupKFold(n_splits=10)

### Setup Pycaret Classification

In [40]:
s = setup(data, target = 'Fraud', normalize = True, data_split_stratify = True, fold_strategy = 'groupkfold', fold_groups = 'gvkey')

Unnamed: 0,Description,Value
0,session_id,7330
1,Target,Fraud
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(107369, 31)"
5,Missing Values,False
6,Numeric Features,30
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [28]:
mlp = create_model('mlp', probability_threshold=.8, hidden_layer_sizes = [2,32])

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9918,0.7289,0.0,0.0,0.0,0.0,0.0
1,0.9906,0.7362,0.0,0.0,0.0,0.0,0.0
2,0.9923,0.6613,0.0,0.0,0.0,0.0,0.0
3,0.9916,0.6556,0.0,0.0,0.0,0.0,0.0
4,0.991,0.6631,0.0,0.0,0.0,0.0,0.0
5,0.9936,0.6438,0.0,0.0,0.0,0.0,0.0
6,0.9935,0.7328,0.0,0.0,0.0,0.0,0.0
7,0.9936,0.7308,0.0,0.0,0.0,0.0,0.0
8,0.9932,0.7058,0.0,0.0,0.0,0.0,0.0
9,0.992,0.6979,0.0,0.0,0.0,0.0,0.0


In [41]:
tuned = tune_model(mlp, n_iter = 50, optimize = 'Prec.')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9911,0.6361,0.0746,0.5,0.1299,0.1279,0.1907
1,0.991,0.6941,0.0938,0.375,0.15,0.1471,0.1842
2,0.9933,0.6978,0.1509,0.6154,0.2424,0.2403,0.3026
3,0.992,0.706,0.0962,0.2778,0.1429,0.1398,0.1601
4,0.9902,0.7477,0.2034,0.3077,0.2449,0.2402,0.2454
5,0.992,0.7105,0.0943,0.2941,0.1429,0.1399,0.1633
6,0.9906,0.6834,0.0597,0.3333,0.1013,0.0988,0.138
7,0.9923,0.6513,0.0784,0.2667,0.1212,0.1185,0.1416
8,0.9923,0.6523,0.1228,0.4667,0.1944,0.1919,0.2366
9,0.9925,0.6831,0.0727,0.4444,0.125,0.1232,0.1776


In [42]:
print(mlp)

CustomProbabilityThresholdClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999,
                                     classifier=MLPClassifier(activation='relu',
                                                              alpha=0.0001,
                                                              batch_size='auto',
                                                              beta_1=0.9,
                                                              beta_2=0.999,
                                                              early_stopping=False,
                                                              epsilon=1e-08,
                                                              hidden_layer_sizes=[2,
                                                                                  32],
                                                              learning_rate='constant',
  

In [43]:
print(tuned)

MLPClassifier(activation='relu', alpha=1e-06, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=[100, 100], learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=8667, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)
