# Write scripts that Pipeline uses

In [1]:
import os
import yaml

In [2]:
# Load the notebook parameters from the YAML file in the working directory.
# ``safe_load`` only builds plain Python objects (dicts, lists, scalars).
with open('param_config.yaml', 'r') as config_file:
    config_params = yaml.safe_load(config_file)

In [3]:
# Directory the ``%%writefile`` cells below write the generated scripts into.
preprocessor_source_dir = config_params['processor_dir']

# 1a. Transformer Classes

The classes used to preprocess the data.

In [4]:
%%writefile $preprocessor_source_dir/transformers_script.py
"""
Custom transformers to preprocess the data for training.

    
References
-----------
    See BaseEstimator and TransformerMixin for compatibility with sklearn.
    See MultiLabelBinarizer and OneHotEncoder for dependencies.

"""

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

class FeatureFiller(BaseEstimator, TransformerMixin):
    """
    Aligns incoming data to the columns that were seen at training.

    Columns captured in ``fit`` that are absent from the data passed to
    ``transform`` are added back with missing values; columns that were not
    captured in ``fit`` are discarded.
    """

    def __init__(self):
        self._col_names = None
        self._col_dict = {}

    def fit(self, X, y=None):
        """
        Captures the column names used at training and their data type.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : FeatureFiller
            The fitted transformer. ``_col_names`` holds the list of training
            columns and ``_col_dict`` maps each column name to the name of
            its dtype.

        Notes
        -----
        This step ensures that if features that were used at training aren't in the inference data
        that the model will still run. This is necessary because the data captured at inference
        doesn't retain features that are empty.
        """

        self._col_names = list(X.columns)
        # Column name -> dtype name (for example 'float64' or 'object').
        self._col_dict = X.dtypes.apply(lambda dtype: dtype.name).to_dict()
        return self

    def transform(self, X, y=None):
        """
        Reindexes the data to the training columns, stacked under an empty
        data frame carrying the column names and data types captured in fit.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        new_df : pd.DataFrame
            Data restricted to, and ordered by, the training columns.
        """

        # Empty, correctly typed template of the training schema.
        new_X = pd.DataFrame(columns=self._col_names)
        new_X = new_X.astype(self._col_dict)

        # Adds training columns missing from X (as NaN) and drops extras.
        reindex_X = X.reindex(columns=new_X.columns)

        new_df = pd.concat([new_X, reindex_X])
        return new_df

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names

class TrueFalseTransformer(BaseEstimator, TransformerMixin):
    """
    Ensures boolean values are represented uniformly as 0 and 1.

    Missing values (including the literal string ``'None'``) become -1.
    """

    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        """
        Captures column names used at training.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : TrueFalseTransformer
            The fitted transformer; ``_col_names`` holds the training columns.
        """

        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        """
        Maps ``'true'``/``'false'`` to 1/0 and converts every column to numeric.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        X : pd.DataFrame
            Transformed feature data. Values that cannot be parsed as a
            number are coerced to ``NaN``.
        """

        # 'None' strings and real nulls share the same sentinel.
        X = X.replace({'None': np.nan}).fillna('-1')
        X = X.replace({'true': '1', 'false': '0'})
        X = X.apply(pd.to_numeric, errors='coerce')
        return X

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names

class DateTransformer(BaseEstimator, TransformerMixin):
    """Creates features from datetime values."""

    def __init__(self):
        self._col_names = None

    @staticmethod
    def _derived_names(columns):
        """Builds the output column names, in the order ``transform`` creates them."""

        suffixes = ('month', 'day_of_week', 'hour', 'day_of_month',
                    'is_month_start', 'is_month_end')
        return [f'{col}-{suffix}' for col in columns for suffix in suffixes]

    def fit(self, X, y=None):
        """
        Captures the names of the columns that ``transform`` will produce.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : DateTransformer
            The fitted transformer; ``_col_names`` holds the derived column
            names when ``X`` exposes ``columns``.
        """

        # Names are known up front, so they are available before transform runs.
        if hasattr(X, 'columns'):
            self._col_names = self._derived_names(X.columns)
        return self

    def transform(self, X, y=None):
        """
        Transforms data and sets the column names attribute.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        temp_df : pd.DataFrame
            One month, day-of-week, hour, day-of-month, month-start and
            month-end column per input column. Missing dates yield -1 in the
            first four and 0 in the two flags.
        """

        # ``replace`` returns a new frame, so the caller's data is not modified.
        X = X.replace({'None': np.nan})
        temp_df = pd.DataFrame(index=X.index.copy())

        for col in X.columns:
            X[col] = pd.to_datetime(X[col])
            dates = X[col].dt
            temp_df[f'{col}-month'] = dates.month.astype(float)
            temp_df[f'{col}-day_of_week'] = dates.dayofweek.astype(float)
            temp_df[f'{col}-hour'] = dates.hour.astype(float)
            temp_df[f'{col}-day_of_month'] = dates.day.astype(float)
            temp_df[f'{col}-is_month_start'] = dates.is_month_start.astype(int)
            temp_df[f'{col}-is_month_end'] = dates.is_month_end.astype(int)
        self._col_names = list(temp_df.columns)
        temp_df = temp_df.fillna(-1)
        return temp_df

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names

class FloatTransformer(BaseEstimator, TransformerMixin):
    """
    Ensures columns with strictly numeric values are all represented
    as floats.

    Notes
    -----
    Experimental results indicate than an integer feature (example: 1) will
    be evaluated differently in a model than a float (example: 1.0).
    """

    def __init__(self):
        self._col_names = None

    def fit(self, X, y=None):
        """
        Captures column names at training.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : FloatTransformer
            The fitted transformer; ``_col_names`` holds the training columns.
        """

        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        """
        Sets the type as float and set null values to -1.0.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        X : pd.DataFrame
            Transformed data.
        """

        # ``replace`` returns a new frame, so the caller's data is not modified.
        X = X.replace({'None': np.nan})
        for col in self._col_names:
            if X[col].dtype != 'float':
                X[col] = X[col].astype(float)
        X = X.fillna(-1.0)
        return X

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names

class ListMaxTransformer(BaseEstimator, TransformerMixin):
    """Evaluates a list value and returns the maximum value of the list as a single value."""

    def __init__(self):
        self._col_names = None

    @staticmethod
    def _to_unique_items(value):
        """Splits a comma-separated string into its unique items; other values pass through."""

        if isinstance(value, str):
            # dict.fromkeys de-duplicates while keeping a deterministic order.
            return list(dict.fromkeys(value.split(',')))
        return value

    def fit(self, X, y=None):
        """
        Captures column names used at training.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : ListMaxTransformer
            The fitted transformer; ``_col_names`` holds the training columns.
        """

        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        """
        Ensures uniform boolean handling, ensures all values are numeric,
        finds maximum value, fills null values.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data. Cells may hold lists or comma-separated strings.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        temp_df : pd.DataFrame
            One column per training column holding the per-row maximum.
            Missing cells yield -1; a row whose items are all non-numeric
            yields 0.
        """

        X = X.replace({'None': np.nan})
        temp_df = pd.DataFrame(index=X.index.copy())
        for col in self._col_names:
            # Split per value rather than testing ``dtype == 'str'``: string
            # columns normally have object dtype, so that test never matched
            # and comma-separated strings were left unsplit.
            temp_series = X[col].apply(self._to_unique_items).explode()
            temp_series = (
                temp_series.replace({'true': '1', 'false': '0'})
                .fillna('-1')
                .apply(pd.to_numeric, args=('coerce',))
            )
            # explode repeats the row label, so grouping by it folds items back per row.
            temp_series = temp_series.groupby(temp_series.index).max()
            temp_df = temp_df.merge(temp_series, left_index=True, right_index=True, how='outer')
        temp_df = temp_df.fillna(0)
        return temp_df

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names

class ListNuniqueTransformer(BaseEstimator, TransformerMixin):
    """Evaluates a list value and returns the count of unique items."""

    def __init__(self):
        self._col_names = None

    @staticmethod
    def _to_unique_items(value):
        """Splits a comma-separated string into its unique items; other values pass through."""

        if isinstance(value, str):
            # dict.fromkeys de-duplicates while keeping a deterministic order.
            return list(dict.fromkeys(value.split(',')))
        return value

    def fit(self, X, y=None):
        """
        Captures column names used at training.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : ListNuniqueTransformer
            The fitted transformer; ``_col_names`` holds the training columns.
        """

        self._col_names = list(X.columns)
        return self

    def transform(self, X, y=None):
        """
        Counts unique items in a list for each observation.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data. Cells may hold lists or comma-separated strings.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        temp_df : pd.DataFrame
            One column per training column holding the per-row count of
            distinct items; missing cells count as 0.
        """

        X = X.replace({'None': np.nan})
        temp_df = pd.DataFrame(index=X.index.copy())
        for col in self._col_names:
            # Split per value rather than testing ``dtype == 'str'``: string
            # columns normally have object dtype, so that test never matched
            # and a string such as 'a,b,c' was counted as one item.
            temp_series = X[col].apply(self._to_unique_items).explode()
            # explode repeats the row label, so grouping by it folds items back per row.
            temp_series = temp_series.groupby(temp_series.index).nunique()
            temp_df = temp_df.merge(temp_series, left_index=True, right_index=True, how='outer')
        temp_df = temp_df.fillna(0)
        return temp_df

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names

class DescStatTransformer(BaseEstimator, TransformerMixin):
    """
    Evaluates a list of numbers and returns the minimum value, maximum value,
    mean, standard deviation, and count of unique items. Each of these is
    assigned to its own column.
    """

    def __init__(self):
        self._col_names = None

    @staticmethod
    def _to_unique_items(value):
        """Splits a comma-separated string into its unique items; other values pass through."""

        if isinstance(value, str):
            # dict.fromkeys de-duplicates while keeping a deterministic order.
            return list(dict.fromkeys(value.split(',')))
        return value

    @staticmethod
    def _derived_names(columns):
        """Builds the output column names, in the order ``transform`` creates them."""

        stats = ('min', 'max', 'mean', 'std', 'nunique')
        return [f'{col}-{stat}' for col in columns for stat in stats]

    def fit(self, X, y=None):
        """
        Captures the names of the columns that ``transform`` will produce.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : DescStatTransformer
            The fitted transformer; ``_col_names`` holds the derived column
            names when ``X`` exposes ``columns``.
        """

        # Names are known up front, so they are available before transform runs.
        if hasattr(X, 'columns'):
            self._col_names = self._derived_names(X.columns)
        return self

    def transform(self, X, y=None):
        """
        Ensures numeric items in list; calculates the minimum, maximum, mean,
        standard deviation, and count of unique items; fills null values.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data. Cells may hold lists or comma-separated strings.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        temp_df : pd.DataFrame
            Five statistic columns per input column, named
            ``<column>-<statistic>``. Undefined statistics are filled with 0.
        """

        X = X.replace({'None': np.nan})
        temp_df = pd.DataFrame(index=X.index.copy())
        for col in X.columns:
            # Split per value rather than testing ``dtype == 'str'``: string
            # columns normally have object dtype, so that test never matched
            # and comma-separated strings were left unsplit.
            temp_series = X[col].apply(self._to_unique_items).explode()
            temp_series = temp_series.fillna('-1').apply(pd.to_numeric, args=('coerce',))
            # explode repeats the row label, so grouping by it folds items back per row.
            temp_series = temp_series.groupby(temp_series.index).agg(['min', 'max', 'mean', 'std', 'nunique'])
            temp_series.columns = [f'{col}-{x}' for x in temp_series.columns]
            temp_df = temp_df.merge(temp_series, left_index=True, right_index=True, how='outer')
        temp_df = temp_df.fillna(0)
        self._col_names = list(temp_df.columns)
        return temp_df

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names

class OneHotTransformer(BaseEstimator, TransformerMixin):
    """
    One hot encodes categorical columns that hold a single value per claim.
    Each category is assigned its own column. Each claim will have a positive
    value for a maximum of one column.
    """

    def __init__(self):
        """
        Sets attributes for initial instantiation.

        ``_filler`` is the value used to fill missing values.
        """

        self._filler = 'ml_empty'
        self._col_names = None
        self._encoder = None
        self._transformer = None
        self._transformed_feats = []

    def fit(self, X, y=None):
        """
        Fills null values, fits the encoder, and sets the encoded column names.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : OneHotTransformer
            The fitted transformer. ``_col_names`` holds the input columns
            that are not entirely missing, ``_encoder``/``_transformer`` hold
            the fitted ``OneHotEncoder`` and ``_transformed_feats`` holds the
            encoded column names used by ``get_feature_names`` and
            ``get_feature_names_out``.

        See Also
        -----------
        sklearn.preprocessing.OneHotEncoder
        """

        # Treat the literal 'None' as missing here exactly as transform does;
        # otherwise the encoder learns a 'None' category that transform can
        # never produce.
        X = X.replace({'None': np.nan})
        self._col_names = list(X.dropna(axis=1, how='all').columns)
        X = X[self._col_names].fillna(self._filler)
        self._encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self._transformer = self._encoder.fit(X)
        self._transformed_feats = self._transformer.get_feature_names_out()
        return self

    def transform(self, X, y=None):
        """
        Transforms data.

        Parameters
        -----------
        X : pd.DataFrame
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        X : array-like
            One hot encoded data as returned by the fitted encoder.
            Categories not seen in ``fit`` encode as all zeros.
        """

        X = X.replace({'None': np.nan}).fillna(self._filler)
        X = self._transformer.transform(X[self._col_names])
        return X

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return list(self._transformed_feats)

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return list(self._transformed_feats)

class MultilabelTransformer(BaseEstimator, TransformerMixin):
    """
    One hot encoding for categorical columns that can have multiple items
    per claim. Each category is assigned its own column. Each claim can have
    a positive value for more than one column.

    Notes
    -----
    ``fit`` and ``transform`` use the ``.str`` accessor and ``X.name``, so
    they operate on a single named ``pd.Series`` of comma-separated strings.
    """

    def __init__(self):
        """
        Sets attributes for initial instantiation.

        ``_filler`` is the value used to fill missing values.
        """
        self._filler = 'ml_empty'
        self._encoder = None
        self._col_names = None

    def _to_label_lists(self, X):
        """Turns each comma-separated string into a list of its unique labels."""

        # Shared by fit and transform so both see 'None' as missing.
        X = X.replace({'None': np.nan})
        return X.fillna(self._filler).str.split(pat=',').apply(set).apply(list)

    def fit(self, X, y=None):
        """
        Sets attributes for specific instantiation, fits multi label encoder,
        and sets the column names to reflect original column name in the
        category column name.

        Parameters
        -----------
        X : pd.Series
            Feature data for one column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : MultilabelTransformer
            The fitted transformer. ``_encoder`` holds the fitted
            ``MultiLabelBinarizer`` and ``_col_names`` holds the encoded
            column names (``<series name>_<label>``) used by
            ``get_feature_names`` and ``get_feature_names_out``.

        See also
        -----------
        sklearn.preprocessing.MultiLabelBinarizer
        """

        X = self._to_label_lists(X)
        self._encoder = MultiLabelBinarizer()
        self._encoder.fit(X)
        self._col_names = [X.name + '_' + x for x in self._encoder.classes_]
        return self

    def transform(self, X, y=None):
        """
        Ensures unique list of items, transforms data.

        Parameters
        -----------
        X : pd.Series
            Feature data for one column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        df : pd.DataFrame
            One indicator column per label learned in ``fit``, indexed like ``X``.
        """

        X = self._to_label_lists(X)
        trans_array = self._encoder.transform(X)
        df = pd.DataFrame(trans_array, columns=self._col_names, index=X.index)
        return df

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names
    
class DropSingleValueCols(BaseEstimator, TransformerMixin):
    """Drops columns where all rows have the same value."""

    def __init__(self):
        self._col_index = []
        self._col_names = []

    def fit(self, X, y=None):
        """
        Iterates through columns, if the number of unique values in the column is
        greater than 1, it retains the column, otherwise the column is dropped.

        Parameters
        -----------
        X : pd.DataFrame | np.ndarray
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : DropSingleValueCols
            The fitted transformer. ``_col_index`` holds the positions and
            ``_col_names`` the names of the retained columns.
        """

        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        # Rebuild from scratch so refitting the same instance does not
        # accumulate (and duplicate) positions from an earlier fit.
        self._col_index = [
            i for i in range(len(X.columns)) if X.iloc[:, i].nunique() > 1
        ]
        self._col_names = list(X.iloc[:, self._col_index].columns)
        return self

    def transform(self, X, y=None):
        """
        Retains columns that had more than 1 unique value during fitting.

        Parameters
        -----------
        X : pd.DataFrame | np.ndarray
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        X : pd.DataFrame
            Transformed data.
        """

        if isinstance(X, np.ndarray):
            print('Processing numpy array')
            X = pd.DataFrame(X)
        X = X[self._col_names]
        return X

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names
       
class RemoveCollinearity(BaseEstimator, TransformerMixin):
    """
    Reduces columns that represent the same information in the dataset to
    a single, representative column.
    """

    def __init__(self):
        self._corr_dict = {}
        self._drop_cols = set()
        self._col_index = []
        self._col_names = []

    def fit(self, X, y=None):
        """
        Calculates correlations between columns. Compares correlated
        columns and drops one if the correlation is greater than 0.97.
        Sets the column name attribute for columns to keep.

        Parameters
        -----------
        X : pd.DataFrame | np.ndarray
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : RemoveCollinearity
            The fitted transformer. ``_corr_dict`` maps a column to the later
            columns it is correlated with, ``_drop_cols`` is the set of
            columns to drop, and ``_col_index``/``_col_names`` hold the
            positions and names of the retained columns.
        """

        drop_list = []
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        # Start clean so refitting the same instance does not carry over
        # results from an earlier fit.
        self._corr_dict = {}
        corr_df = X.corr()
        for i, col in enumerate(corr_df.columns):
            # Only rows below the diagonal: each pair is examined once and the
            # later column of a correlated pair is the one dropped.
            sliced_col = abs(corr_df.iloc[i+1:, i])
            corr_feats = sliced_col[sliced_col > .97].index.tolist()
            if len(corr_feats) > 0:
                self._corr_dict[col] = corr_feats
                drop_list += corr_feats
        self._drop_cols = set(drop_list)
        print('Number of collinear feature:', len(drop_list))
        # Keep the input order (a set difference would make the order depend
        # on string hashing); dict.fromkeys removes duplicate names.
        self._col_names = list(
            dict.fromkeys(col for col in X.columns if col not in self._drop_cols)
        )
        self._col_index = [
            i for i, col in enumerate(X.columns) if col not in self._drop_cols
        ]
        return self

    def transform(self, X, y=None):
        """
        Retains the columns that were not flagged as correlated during fitting.

        Parameters
        -----------
        X : pd.DataFrame | np.ndarray
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -----------
        X : pd.DataFrame
            Transformed data.
        """

        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        X = X[self._col_names]
        return X

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names
    
class SetColumnOrder(BaseEstimator, TransformerMixin):
    """
    Ensures uniform column order.

    Notes
    -----

    Column names for preprocessing are stored in a json object for
    easy updating. When imported into Python as a dictionary, the
    order of keys is not guaranteed, thus the order of the columns
    must be standardized before being sent to the next step.
    """

    def __init__(self):
        self._col_names = []

    def fit(self, X, y=None):
        """
        Records the unique column names in the order they appear at training.

        Parameters
        -----------
        X : pd.DataFrame | np.ndarray
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : SetColumnOrder
            The fitted transformer; ``_col_names`` holds the column order
            that ``transform`` will enforce.
        """

        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        # dict.fromkeys removes duplicates like set() did, but keeps the input
        # order instead of an order that depends on string hashing.
        self._col_names = list(dict.fromkeys(X.columns))
        return self

    def transform(self, X, y=None):
        """
        Orders the columns in the dataframe.

        Parameters
        -----------
        X : pd.DataFrame | np.ndarray
            Feature data.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        X : pd.DataFrame
            Transformed data.
        """

        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        X = X[self._col_names]
        return X

    def get_feature_names(self):
        """Returns column name attribute for specific instantiation."""

        return self._col_names

    def get_feature_names_out(self, input_features=None):
        """
        Returns column name attribute for specific instantiation.

        Parameters
        -----------
        input_features : ignored
            Accepted because scikit-learn's ``Pipeline`` and
            ``ColumnTransformer`` pass the input feature names when they
            call this method.
        """

        return self._col_names

Writing source_dir/transformers_script.py


# 1b. Function Scripts/Modules

In [5]:
%%writefile $preprocessor_source_dir/functions_module.py
import os
import pandas as pd
import json
import pickle

def pandas_reads(file_path, col_list=None):
    """
    Reads all useful data file formats

    Parameters
    -----------
    file_path : str
        The directory path and file name of the file to read. The text after
        the last ``.`` selects the reader.

    col_list : list
        Optional list of column names to load. This functionality
        is only compatible for parquet, csv, and txt file formats.

    Returns
    -------
    doc : dataframe
        A pandas dataframe of the data for parquet, csv and txt files.
        For json files the parsed JSON object is returned as-is, and for
        pickle files whatever object was pickled is returned.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported types.
    """

    file_type = file_path.split('.')[-1]
    if file_type == 'parquet':
        return pd.read_parquet(file_path, columns=col_list)
    if file_type == 'csv':
        return pd.read_csv(file_path, usecols=col_list)
    if file_type == 'json':
        with open(file_path, 'r') as f:
            return json.load(f)
    if file_type == 'txt':
        # .txt files are treated as tab-separated tables.
        return pd.read_csv(file_path, sep='\t', usecols=col_list)
    if file_type == 'pickle':
        # NOTE: unpickling can run arbitrary code; only read trusted files.
        return pd.read_pickle(file_path)
    # Fail explicitly instead of printing and then crashing on an unbound
    # variable at the return statement.
    raise ValueError(
        f"Unsupported file type '{file_type}' for '{file_path}'. "
        "Accepted file types include parquet, csv, json, txt, and pickle. "
        "Use one of these file types or add to the pandas_reads function "
        "in functions_module.py"
    )

def pandas_writes(df, file_path):
    """
    Writes a dataframe to disk in the format implied by the file extension.

    Parameters
    -----------
    df : dataframe
        The dataframe to be written.

    file_path : str
        The directory path and file name of the file to write. Supported
        extensions are parquet, csv, json, txt (tab separated) and pickle.
        For any other extension an error message is printed and nothing
        is written.
    """

    extension = file_path.rsplit('.', 1)[-1]

    if extension == 'parquet':
        df.to_parquet(file_path)
        return
    if extension == 'csv':
        df.to_csv(file_path)
        return
    if extension == 'json':
        with open(file_path, "w") as handle:
            # NOTE(review): ``to_json`` already returns a JSON string, so the
            # file holds that string JSON-encoded a second time - confirm
            # readers expect this.
            serialized = json.dumps(df.to_json(date_format='iso'))
            handle.write(serialized)
        return
    if extension == 'txt':
        df.to_csv(file_path, sep='\t')
        return
    if extension == 'pickle':
        df.to_pickle(file_path)
        return

    print("""Error: Accepted file type is dataframe only.
        If you have a dictionary, turn it into a dataframe first.
        Accepted write types include parquet, csv, json, txt, and pickle.
        Use one of these file types or add to the pandas_writes
         function in functions_module.py""")

def get_file_list(input_path):
    """
    Make a list of file paths for all files in a directory.

    Parameters
    -----------
    input_path : str
        The file path to the directory where the files are stored.

    Returns
    --------
    file_list : list
        A list of the file paths where each file path is a string. Only
        entries ending in parquet, csv, json, txt or pickle are included;
        the list is empty when the directory holds none.

    Raises
    ------
    OSError
        If ``input_path`` cannot be listed (for example it does not exist
        or is not a directory).

    Notes
    -----
    This is a helper function that allows for a dynamic number of files
    to be found in a directory. This allows for changes in
    underlying data and the tables or collections where they're stored
    without having to change this script.
    """

    file_types = ('parquet', 'csv', 'json', 'txt', 'pickle')
    # A listing failure is allowed to propagate: the real OSError names the
    # problem, whereas catching it here left ``file_list`` unbound.
    return [
        os.path.join(input_path, file)
        for file in os.listdir(input_path)
        if file.split('.')[-1] in file_types
    ]

def read_in_files(input_path):
    """
    Reads all the files in the directory specified.

    Parameters
    -----------
    input_path : str
        The file path to the directory where the files are stored.

    Returns
    -------
    df_dict : dictionary
        A dictionary keyed by file name (the text before the first ``.``)
        whose values are the loaded contents of that file. Files that share
        that leading name overwrite each other.

    Notes
    -----
    The dictionary structure is used to accommodate the dynamic
    number of files.
    """

    df_dict = {}
    for file in get_file_list(input_path):
        # basename honours the platform path separator, unlike split('/').
        file_name = os.path.basename(file).split('.')[0]
        print(f'read_in_files: reading file {file}')
        df_dict[file_name] = pandas_reads(file)
    return df_dict

Writing source_dir/functions_module.py


In [6]:
%%writefile $preprocessor_source_dir/model_functions_module.py
import numpy as np
import pandas as pd
import importlib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, cross_validate
from transformers_script import FeatureFiller
from transformers_script import DropSingleValueCols
from transformers_script import RemoveCollinearity
from transformers_script import SetColumnOrder
from sklearn.metrics import(
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    precision_score,
    roc_auc_score,
    f1_score)

import skexplain
import shap
import alibi

from sklearn import set_config
set_config(transform_output="pandas")


def define_algo_pipe(algo, col_dict):
    """
    Builds the full preprocessing + estimator sklearn pipeline.

    Parameters
    -----------
    algo : estimator instance
        The estimator placed in the final ``algorithm`` step.
    col_dict : dict
        Maps a column-group name to a dict with two entries:
        ``transformer`` (the name of a class in ``transformers_script``)
        and ``columns`` (the columns that class is applied to). The
        ``drop_cols`` group is required and only needs ``columns``.

    Returns
    -------
    algo_pipe : sklearn pipeline
        A two-step pipeline: ``all_preprocess`` followed by ``algorithm``.

    Notes
    -----
    The ``list_to_labels`` group gets one transformer instance per column
    (step name ``list_to_labels_<column>``); every other group gets a
    single instance that receives all of its columns at once.
    """

    transformers_module = importlib.import_module("transformers_script")

    column_steps = []
    for group_name in [name for name in col_dict.keys() if name != 'drop_cols']:
        group_cfg = col_dict[group_name]
        transformer_cls = getattr(transformers_module, group_cfg['transformer'])
        if group_name != 'list_to_labels':
            column_steps.append((group_name, transformer_cls(), group_cfg['columns']))
            continue
        for column in group_cfg['columns']:
            column_steps.append((group_name + '_' + column, transformer_cls(), column))

    # Columns listed under drop_cols are removed by the ColumnTransformer.
    column_steps.append(("drop_cols", 'drop', col_dict["drop_cols"]["columns"]))

    preprocessing_steps = [
        ('featurefiller', FeatureFiller()),
        ('preprocess', ColumnTransformer(column_steps)),
        ('dropsingle', DropSingleValueCols()),
        ('removemulticollinear', RemoveCollinearity()),
        ('setcolumorder', SetColumnOrder()),
    ]

    return Pipeline([
        ('all_preprocess', Pipeline(preprocessing_steps)),
        ('algorithm', algo),
    ])

def train_model(algo, algo_data, col_dict, Xtrain, ytrain):
    """
    Cross-validates one algorithm and returns its evaluation metrics.

    Parameters
    -----------
    algo : algorithm
        The estimator class to instantiate. Example: `XGBClassifier`
    algo_data : dict
        Algorithm metadata. ``class`` is used as the label for the
        results; ``defaults``, when present, is passed to ``algo`` as
        keyword arguments.
    col_dict : dict
        The column-group to transformer mapping handed to
        ``define_algo_pipe``.
    Xtrain : pd.DataFrame
        The training feature dataset.
    ytrain : pd.DataFrame
        The training ground truth data.

    Returns
    --------
    metric_result_dict : dict
        ``{algo_data['class']: cross_validate results}`` with every
        result array converted to a list. Empty when anything raised.

    Notes
    -----
    No fitted model is returned: this step only compares algorithms with
    5-fold cross validation. Any exception is printed and swallowed so
    one failing algorithm yields an empty result rather than an error.
    """

    algo_label = algo_data['class']
    scoring_metrics = ['accuracy', 'precision', 'f1', 'recall', 'roc_auc']
    metric_result_dict = {}
    results = None

    print(algo_label)
    try:
        model = algo(**algo_data['defaults']) if 'defaults' in algo_data else algo()
        results = cross_validate(
            define_algo_pipe(model, col_dict),
            Xtrain,
            ytrain,
            cv=5,
            scoring=scoring_metrics)
        # Convert in place so the arrays become plain lists.
        for metric_name in results.keys():
            results[metric_name] = results[metric_name].tolist()
        metric_result_dict[algo_label] = results
    except Exception as error:
        print("Error on", algo_label)
        print("Error:", error)

    print('\n')
    print(algo_label, "Result Dictionary:")
    print(results)
    return metric_result_dict

def compile_algo_metrics(metric_dict):
    """
    Summarises the per-fold cross validation metrics of the baseline
    models into one row per algorithm.

    Parameters
    -----------
    metric_dict : dict
        Maps an algorithm name (i.e. ``XGBoostClassifier``) to a dict of
        metric name (i.e. ``test_f1``) to the list of per-fold values.

    Returns
    -------
    metric_desc_stat_df : dataframe
        One row per algorithm. Every metric whose name does not contain
        ``time`` contributes three columns: ``<metric>_mean``,
        ``<metric>_stdev`` and ``<metric>_penalized``, where penalized is
        ``mean * (1 - stdev)``. Rows are sorted from best to worst by the
        penalized f1, ROC AUC, recall, precision and accuracy, in that
        order.
    """

    per_algo = pd.DataFrame(metric_dict).T
    summary = pd.DataFrame()

    # Timing columns (fit_time, score_time) are not quality metrics.
    scored_metrics = [name for name in per_algo.columns if 'time' not in name]
    for metric in scored_metrics:
        fold_mean = per_algo[metric].apply(np.mean)
        fold_std = per_algo[metric].apply(np.std)
        summary[metric + '_mean'] = fold_mean
        summary[metric + '_stdev'] = fold_std
        # Reward high averages, discount unstable ones.
        summary[metric + '_penalized'] = summary[metric + '_mean'] * (1 - summary[metric + '_stdev'])

    ranking = [
        f'test_{name}_penalized'
        for name in ('f1', 'roc_auc', 'recall', 'precision', 'accuracy')
    ]
    return summary.sort_values(ranking, ascending=False)

def tune_best_algo(algo, algo_data, col_dict, Xtrain, ytrain):
    """
    Trains and tunes a model.

    Parameters
    -----------
    algo : algorithm
        The algorithm to use for model training. Example: ``XGBClassifier``
    algo_data : dict
        The dictionary holding algorithm information. ``params`` holds
        the hyperparameter search space (estimator parameter name to
        candidate values); ``defaults``, when present, holds constructor
        keyword arguments applied before tuning.
    col_dict : dict
        The dictionary of mappings for feature to preprocessing method,
        passed through to ``define_algo_pipe``.
    Xtrain : dataframe
        The training feature dataset.
    ytrain : dataframe
        The training ground truth data.

    Returns
    --------
    model : trained model
        The best pipeline found by ``RandomizedSearchCV``, refit on the
        full training data using the ``f1`` score.

    Notes
    -----
    The estimator is built with the same ``defaults`` that ``train_model``
    uses, so the tuned model starts from the configuration that was
    evaluated as a baseline. Searched parameters still override defaults.

    See Also
    --------
    sklearn.model_selection.RandomizedSearchCV
    """

    if 'defaults' in algo_data:
        estimator = algo(**algo_data['defaults'])
    else:
        estimator = algo()
    algo_pipe = define_algo_pipe(estimator, col_dict)

    # Prefix each parameter so the search reaches the 'algorithm' step.
    param_grid = {
        'algorithm__' + key: value
        for key, value in algo_data['params'].items()
    }
    search_params = {"estimator": algo_pipe,
                     "cv": 5,
                     "param_distributions": param_grid,
                     "scoring": {'f1': 'f1',
                                 'auc': 'roc_auc'},
                     "verbose": 5,
                     "refit": "f1",
                     "random_state": 12}
    print("Tuning for hyperparameters")
    tuner = RandomizedSearchCV(**search_params)
    print("Training model")
    tuner.fit(Xtrain, ytrain)
    model = tuner.best_estimator_
    print(model)
    return model

def metrics_row(y_test, model, testX):
    """
    Evaluate a trained binary classifier using accuracy, precision,
    ROC AUC, f1, recall, and the confusion matrix.

    Parameters
    -----------
    y_test : pd.DataFrame
        The validation ground truth values.
    model : trained model
        The model trained on the training dataset. Must provide both
        ``predict`` and ``predict_proba``.
    testX : pd.DataFrame
        The validation feature dataset.

    Returns
    -------
    result_dict : dict
        The evaluation metrics rounded to 3 decimal places (``Accuracy``,
        ``Precision``, ``F1``, ``Recall``, ``ROC AUC``) plus
        ``conf_matrix``: a nested list of text annotations, one per
        confusion matrix cell, giving the cell label, its count and the
        share of that actual class that landed in the cell.

    Notes
    -----
    ROC AUC is computed from the predicted probability of the positive
    class; scoring hard 0/1 predictions would discard the ranking
    information the metric is meant to measure.
    """

    predictions = model.predict(testX)
    proba_predictions = model.predict_proba(testX)[:, 1]

    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    precision = precision_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, proba_predictions)
    f1 = f1_score(y_test, predictions)
    recall = recall_score(y_test, predictions)

    # keepdims keeps the totals as a column vector so each row is divided
    # by its own total (rows are the actual classes).
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_norm = np.round(conf_matrix / row_totals, 3)

    # Labels follow the row-major cell order of a 2x2 confusion matrix.
    lbls = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
    annotation_list = []
    ct = 0
    for counts, rates in zip(conf_matrix, conf_matrix_norm):
        row_annotations = []
        for count, rate in zip(counts, rates):
            row_annotations.append(
                f"{lbls[ct]}\n\nCount: {count}\nActl Rate: {rate}")
            ct += 1
        annotation_list.append(row_annotations)

    result_dict = {"Accuracy": round(accuracy, 3),
                   "Precision": round(precision, 3),
                   "F1": round(f1, 3),
                   "Recall": round(recall, 3),
                   "ROC AUC": round(roc_auc, 3),
                   "conf_matrix": annotation_list}
    return result_dict

# def eval_report(y_test, model, testX):
def eval_report(y_test, preprocessor, model, testX):
    """
    Evaluates the trained and tuned model against the hold out/test
    data.

    Parameters
    -----------
    y_test : dataframe
        The test dataset ground truth values.
    preprocessor : sklearn pipeline
        The preprocessor steps from the sklearn pipeline with the
        named step ``all_preprocess``.
    model : sklearn estimator
        The algorithm step from the sklearn pipeline with the named
        step ``algorithm``. It receives the preprocessed features.
    testX : dataframe
        The test dataset features (not yet preprocessed).

    Returns
    --------
    report_dict : dict
        A dictionary with the single key ``binary_classification_metrics``
        holding ``accuracy``, ``precision``, ``f1``, ``recall`` and
        ``roc_auc`` (each as ``{"value": ..., "standard_deviation": "NaN"}``)
        and a nested ``confusion_matrix`` keyed by actual then predicted
        class. The layout is intended for the Model Registry's Model
        Quality page.

    Notes
    -----
    Only metrics that end up in the report are computed. Earlier versions
    also called ``precision_recall_curve``, ``roc_curve`` and ``auc``,
    which this module never imports, so the function failed with a
    ``NameError`` before it could return.
    """

    Xprocessed = preprocessor.transform(testX)
    predictions = model.predict(Xprocessed)

    # ROC AUC is a ranking metric, so prefer the positive-class
    # probability; fall back to hard labels for estimators without one.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(Xprocessed)[:, 1]
    else:
        scores = predictions

    accuracy = accuracy_score(y_test, predictions)
    conf_matrix = confusion_matrix(y_test, predictions)
    precision = precision_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, scores)
    f1 = f1_score(y_test, predictions)
    recall = recall_score(y_test, predictions)

    report_dict = {
        "binary_classification_metrics": {
            "accuracy": {"value": accuracy, "standard_deviation": "NaN"},
            "precision": {"value": precision, "standard_deviation": "NaN"},
            "f1": {"value": f1, "standard_deviation": "NaN"},
            "recall": {"value": recall, "standard_deviation": "NaN"},
            "roc_auc": {"value": roc_auc, "standard_deviation": "NaN"},
            # int() turns numpy integers into plain Python ints.
            "confusion_matrix": {"0": {"0": int(conf_matrix[0][0]), "1": int(conf_matrix[0][1])},
                                 "1": {"0": int(conf_matrix[1][0]), "1": int(conf_matrix[1][1])}
                                },
        }
    }
    return report_dict

def explainability_mods(model, Xtrain):
    """
    Builds a shap explainer and fits an anchor explainer on the
    preprocessed training data.

    Parameters
    ----------
    model : pipeline
        Fitted pipeline exposing the named steps ``all_preprocess`` and
        ``algorithm``.
    Xtrain : pd.DataFrame
        The raw training dataset; it is run through ``all_preprocess``
        before being handed to the explainers.

    Returns
    -------
    shap_explainer : shap.Explainer
        A permutation explainer over the algorithm's ``predict``, using a
        ``Partition`` masker built from the preprocessed data
        (``max_samples=1000``).
    anchor_explainer : alibi.explainers.AnchorTabular
        An anchor explainer fitted on the preprocessed data.

    See Also
    ---------
    shap documentation
    alibi documentation
    """

    processed_train = model.named_steps['all_preprocess'].transform(Xtrain)
    estimator = model.named_steps['algorithm']

    background = shap.maskers.Partition(processed_train, max_samples=1000)
    shap_explainer = shap.Explainer(
        estimator.predict,
        background,
        algorithm="permutation")

    def predict_fn(x):
        # Thin wrapper so the explainer only sees a plain callable.
        return estimator.predict(x)

    anchor_explainer = alibi.explainers.AnchorTabular(
        predict_fn, processed_train.columns, seed=1)
    anchor_explainer.fit(processed_train.to_numpy())

    return shap_explainer, anchor_explainer

Writing source_dir/model_functions_module.py


# 2a. Create Data

In [7]:
%%writefile create_data.py
import os
import pandas as pd
import numpy as np
import random
import datetime
from sklearn.model_selection import train_test_split
from functions_module import pandas_writes
import argparse

def calc_wts(col, predictability):
    """
    Builds sampling weights for the values in ``col``.

    Parameters
    ----------
    col : sized collection
        The candidate values; only its length is used.
    predictability : number
        A value between 0 and 1. ``0`` gives every value the same weight.
        Anything greater than ``0`` gives that weight to the first value
        and splits the remainder evenly over all other values.

    Returns
    -------
    wts : list
        One weight per value in ``col``.
    xwts : list
        The same weights in reverse order, so the heavy weight sits on
        the last value instead of the first.

    Raises
    ------
    ValueError
        If ``predictability`` is outside the range 0 to 1.
    """
    if not 0 <= predictability <= 1:
        # Out-of-range input used to crash later (negative values) or
        # silently produce negative weights (values above 1).
        raise ValueError(
            f"predictability must be between 0 and 1, got {predictability!r}. "
            "A value of 0 gives equal weight to all values; a value greater "
            "than 0 gives that weight to the first value and equal weights "
            "to all other values.")

    low_wt = 1 - predictability
    col_len = len(col)
    if predictability > 0:
        wts = [predictability] + [low_wt / (col_len - 1) for _ in range(col_len - 1)]
    else:
        wts = [low_wt / col_len for _ in range(col_len)]
    return wts, wts[::-1]

def generate_dates(df, target_name, year=2022):
    """
    Generates random dates for the positive and negative target rows.

    Parameters
    ----------
    df : pd.DataFrame
        Data holding the target column; only the row counts per target
        value are used.
    target_name : str
        Name of the target column. Rows equal to ``1`` are treated as
        positive, rows equal to ``0`` as negative.
    year : int, default 2022
        The year used for every generated date.

    Returns
    -------
    true_vals : list
        One entry per positive row: a ``datetime.date``, or ``np.nan``
        when the drawn month/day pair is not a real date (e.g. Feb 30).
    false_vals : list
        The same for the negative rows.

    Notes
    -----
    Months are drawn uniformly from January through December. Days are
    drawn from 1-30 with 80% of the weight on day 1 for positive rows and
    on day 30 for negative rows, which is what makes the date carry
    signal. The day range is deliberately left at 30: moving the heavy
    weight to day 31 would make it invalid in every 30-day month and
    sharply raise the share of missing dates.
    """
    months = range(1, 13)  # 13 is exclusive, so December is included
    days = range(1, 31)
    month_wts, month_xwts = calc_wts(months, 0)
    day_wts, day_xwts = calc_wts(days, 0.8)

    def _draw_dates(count, m_wts, d_wts):
        # Draw months first, then days, and pair them up; impossible
        # combinations become NaN to simulate missing data.
        drawn_months = random.choices(months, m_wts, k=count)
        drawn_days = random.choices(days, d_wts, k=count)
        dates = []
        for month, day in zip(drawn_months, drawn_days):
            try:
                dates.append(datetime.date(year, month, day))
            except ValueError:
                dates.append(np.nan)
        return dates

    n_true = len(df[df[target_name] == 1])
    n_false = len(df[df[target_name] == 0])

    true_vals = _draw_dates(n_true, month_wts, day_wts)
    false_vals = _draw_dates(n_false, month_xwts, day_xwts)
    return true_vals, false_vals

def _random_list_column(n_rows, pick_pool, max_len=6):
    """Builds a column of comma-joined random values with occasional gaps.

    Each row draws a length between 0 and ``max_len``. A length of 0
    becomes ``np.nan`` to simulate a missing value; otherwise that many
    items are sampled with replacement from ``pick_pool()`` and joined
    with commas.
    """
    column = []
    for _ in range(n_rows):
        n_vals = random.randint(0, max_len)
        if n_vals < 1:
            column.append(np.nan)
        else:
            column.append(','.join(random.choices(pick_pool(), k=n_vals)))
    return column


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--target-name', type=str, dest='target_name')
    parser.add_argument('--target-rate', type=str, dest='target_rate')
    parser.add_argument('--sample-size', type=str, dest='sample_size')
    parser.add_argument('--predictability', type=str, dest='predictability')
    parser.add_argument('--train-size', type=str, dest='train_size')
    parser.add_argument('--input-path', type=str, dest='input_path')
    parser.add_argument('--output-path', type=str, dest='output_path')

    args = parser.parse_args()
    target_name = args.target_name
    input_path = args.input_path  # accepted on the command line; not read by this script
    output_path = args.output_path

    # Arguments arrive as strings; convert them once so they can be used
    # as counts and weights below.
    target_rate = float(args.target_rate)
    sample_size = int(args.sample_size)
    predictability = float(args.predictability)
    train_size = float(args.train_size)
    if train_size > 1:
        # Allow a percentage (e.g. 80) as well as a ratio (e.g. 0.8).
        train_size = train_size / 100

    for split_name in ("train", "validate", "test"):
        try:
            os.makedirs(os.path.join(output_path, split_name), exist_ok=True)
        except OSError as err:
            # Best effort: report the problem instead of hiding it.
            print(f"Could not create output directory for {split_name}: {err}")

    # create ground truth
    target_vals = [0, 1]
    target_wts = [1 - target_rate, target_rate]
    target_col = random.choices(target_vals, target_wts, k=sample_size)

    gt_df = pd.DataFrame({target_name: target_col})
    print(f"Ground truth provided sample size: {sample_size}")
    print(f'Ground truth dataframe shape: {gt_df.shape}')
    print(f'Ground truth distribution:\n{gt_df[target_name].value_counts()}')

    # Create Features
    sample_df = gt_df.copy()
    is_true = sample_df[target_name] == 1
    is_false = sample_df[target_name] == 0
    n_true = int(is_true.sum())
    n_false = int(is_false.sum())

    # All features include null, NaN, or None values to simulate missing data.

    # single value observations
    # Examples include 0, 1, 5, 9, or 0.85
    # These simulate numeric features, categories, and booleans stored as numbers
    col_names = ['true_false', 'one_hot', 'floats', 'random_col', 'other']

    tf_vals = ['true', 'false', np.nan, '1', '0']
    onehot_vals = ['red', 'orange', 'yellow', np.nan, 'green', 'blue', 'purple']
    float_vals = list(range(0, 10)) + [np.nan] + [x / 10 for x in range(0, 100, 5)]
    drop_vals = [np.nan] + list(range(0, 10))
    xrand_vals = list(range(5))

    # The two "true" spellings share the predictable weight, the two
    # "false" spellings share the rest, and 1% is reserved for NaN.
    tf_high = predictability / 2
    tf_low = (1 - predictability - 0.01) / 2
    tf_wts = [tf_high, tf_low, 0.01, tf_high, tf_low]
    tf_xwts = [tf_low, tf_high, 0.01, tf_low, tf_high]
    onehot_wts, onehot_xwts = calc_wts(onehot_vals, predictability)
    float_wts, float_xwts = calc_wts(float_vals, predictability)
    drop_wts, drop_xwts = calc_wts(drop_vals, 0)
    xrand_wts, xrand_xwts = calc_wts(xrand_vals, predictability)

    col_list = zip(col_names,
                   [tf_vals, onehot_vals, float_vals, drop_vals, xrand_vals],
                   [tf_wts, onehot_wts, float_wts, drop_wts, xrand_wts],
                   [tf_xwts, onehot_xwts, float_xwts, drop_xwts, xrand_xwts])

    for col, vals, col_wts, col_xwts in col_list:
        # Positive rows use the weights as given, negative rows the
        # reversed weights, so the column correlates with the target.
        true_vals = random.choices(vals, col_wts, k=n_true)
        false_vals = random.choices(vals, col_xwts, k=n_false)
        sample_df.loc[is_true, col] = true_vals
        sample_df.loc[is_false, col] = false_vals

    # date observations
    # These simulate date features in the YYYY-MM-DD (2022-12-15) date format.
    true_dates, false_dates = generate_dates(sample_df, target_name)
    sample_df.loc[is_true, 'dates'] = true_dates
    sample_df.loc[is_false, 'dates'] = false_dates

    # multivalue observations
    # These simulate list type features such as [red, blue, purple], [1, 2, 3, 4]
    nbr_strs = [str(x) for x in range(0, 10)]
    str_vals = ['apple', 'orange', 'grape', 'pineapple', 'strawberry', 'blueberry', 'grapefruit', 'apple']

    # Each row of this column is either all words or all digit strings.
    nunique_col = _random_list_column(
        sample_size,
        lambda: str_vals if random.randint(0, 10) < 5 else nbr_strs)
    descstat_col = _random_list_column(sample_size, lambda: nbr_strs)
    max_col = _random_list_column(sample_size, lambda: nbr_strs)
    multi_col = _random_list_column(sample_size, lambda: str_vals)

    # Add to dataframe
    sample_df['max_of_list'] = max_col
    sample_df['nunique_of_list'] = nunique_col
    sample_df['desc_stats'] = descstat_col
    sample_df['multi_label'] = multi_col

    print(f"Dataset provided sample size: {sample_size}")
    print(f'Full dataframe shape: {sample_df.shape}')

    # The rows left over after the training split are halved into test
    # and validation sets, all stratified on the target.
    train, other = train_test_split(sample_df, train_size=train_size, random_state=12, stratify=sample_df[target_name])
    test, validate = train_test_split(other, train_size=0.5, random_state=12, stratify=other[target_name])

    pandas_writes(train, os.path.join(output_path, 'train', 'train.parquet'), index=False)
    pandas_writes(validate, os.path.join(output_path, 'validate', 'validate.parquet'), index=False)
    pandas_writes(test, os.path.join(output_path, 'test', 'test.json'), index=False)

Writing create_data.py


# 3a. Train Baseline Models

In [8]:
%%writefile train_model.py
"""
Trains an algorithm using specified defaults.

**Input**:
    col_dict : json
        A json object comprised of metadata for column transformations.
        A human readable column type is used as the key. The value for each
        key includes the following two pieces of metadata:
        * transformer: the custom transformer to apply to applicable columns.
        * columns: the column names to be processed with the transformer. Each
        column should only be included in one column type.
    algo_data : json
        The metadata for the algorithm to run. Information includes:
        * 'module': the module to import for the algorithm.
        * 'class': the specific name of the algorithm.
        * 'params': the parameters to use for fine tuning. These values aren't used
        in this step, but are required in the tuning step.
        * 'defaults': optional parameter used when alternate values are desired for the 
        default training parameters.                                    
    train : parquet
        The training dataset only, excludes data set aside for validation and testing.
        
**Arguments**:
    --target : str
        The name of the event to be used. `--target` name is case insensitive.
    --train-size : number
        The size for the training/test split.
        Can be expressed either as a ratio (floating point between 0 and 1)
        or as a percentage (whole number between 0 and 100).
    --algo-name : str
        The key from the algo_dict. This is dynamically populated from
        the Sagemaker Pipeline from the algo_dict.json file.
    
**Output**:
    algo_result_dict : json
        The evaluation metrics for the trained model. This is used to
        determine which algorithm is the best candidate for
        hyperparameter tuning.

Notes
-----
    The algorithm, algo package, and any defaults are specified in
    the Sagemaker Pipeline. Algorithm package is loaded dynamically
    so the same script can be used for multiple algorithms while
    loading a minimal number of packages. All relevant json files
    are stored in the ``preprocessing_source_dir`` folder and loaded
    to the EC2 upon startup.
"""

from sklearn import set_config
set_config(transform_output="pandas")


if __name__ == '__main__':
    # Entry point: cross-validate one algorithm with its default settings
    # and write the resulting metrics as <algo-name>_result_dict.json.
    import numpy as np
    import pandas as pd
    import sklearn
    import json
    import importlib
    import os
    import warnings
    import argparse
    import platform
    from model_functions_module import define_algo_pipe, train_model
    from functions_module import pandas_reads, pandas_writes

    from sklearn import set_config
    set_config(transform_output="pandas")

    # Record the runtime versions in the job log.
    print(f"numpy version: {np.__version__}")
    print(f"pandas version: {pd.__version__}")
    print(f"sklearn version: {sklearn.__version__}")
    print(f"python version: {platform.python_version()}")

    warnings.simplefilter("once")

    cli = argparse.ArgumentParser()
    cli.add_argument('--target', type=str, dest='target_col')
    cli.add_argument('--train-size', type=str, dest='train_size', default='0.8')
    cli.add_argument('--algo-name', type=str, dest='algo_name')
    cli.add_argument('--input-path', type=str, dest='input_path')
    cli.add_argument('--output-path', type=str, dest='output_path')
    args = cli.parse_args()

    target_col, algo_name = args.target_col, args.algo_name
    input_path, output_path = args.input_path, args.output_path

    print(f'Args: {args}')

    # Both config files are read relative to the working directory.
    col_dict = pandas_reads("col_dict.json")
    algo_data = pandas_reads("algo_dict.json")[algo_name]

    # Resolve the estimator class named in the algorithm metadata.
    algo = getattr(importlib.import_module(algo_data['module']), algo_data['class'])

    # Accept either a ratio (0.8) or a percentage (80).
    train_size = float(args.train_size)
    if train_size > 1:
        train_size = train_size/100

    print('Loading data')
    train_df = pandas_reads(os.path.join(input_path, 'train.parquet'))
    print(f'Train data size: {train_df.shape}')

    ytrain = train_df[[target_col]]
    Xtrain = train_df.drop(target_col, axis=1)

    print('Training algorithm with defaults')
    metrics = train_model(algo, algo_data, col_dict, Xtrain, ytrain)
    algo_result_dict = pd.DataFrame(metrics)

    print('Saving metrics')
    result_path = os.path.join(output_path, algo_name + '_result_dict.json')
    pandas_writes(algo_result_dict, result_path)

Writing train_model.py


# 3b. Evaluate Baseline Models

In [9]:
%%writefile eval_basemodels.py
"""
Compiles the evaluation results from all trained baseline models and
calculates some aggregated metrics.

**Input**:
    input_files : json
        Intakes json files holding the evaluation metrics from the trained
        baseline models. These are automatically updated in the Sagemaker
        Pipeline based on the algorithm steps.

**Output**:
    agg_metrics_df : parquet
        A parquet file holding dataframe of the compiled metrics.
        
Notes
------    
    The results for all algorithms are retained and saved to s3 in this
    step to retain training provenance for reference and audit purposes.
"""

if __name__ == '__main__':
    # Entry point: merge the per-algorithm metric files found under
    # --input-path and write the aggregated table to --output-path.
    import numpy as np
    import pandas as pd
    import json
    import os
    import argparse
    import platform
    from functions_module import pandas_reads, pandas_writes, get_file_list, read_in_files
    from model_functions_module import compile_algo_metrics

    # Record the runtime versions in the job log.
    print(f"numpy version: {np.__version__}")
    print(f"pandas version: {pd.__version__}")
    print(f"python version: {platform.python_version()}")

    cli = argparse.ArgumentParser()
    cli.add_argument('--input-path', type=str, dest='input_path')
    cli.add_argument('--output-path', type=str, dest='output_path')
    args = cli.parse_args()
    input_path, output_path = args.input_path, args.output_path

    print('creating metrics dataframe')
    metrics_dict = {}
    for raw_metrics in read_in_files(input_path).values():
        # Each file decodes to {algorithm name: metrics}; a later file
        # overwrites an earlier one that uses the same algorithm name.
        metrics_dict.update(json.loads(raw_metrics).items())

    agg_metrics_df = compile_algo_metrics(metrics_dict)

    target_file = os.path.join(output_path, 'basemodel_metrics.parquet')
    pandas_writes(agg_metrics_df, target_file)

Writing eval_basemodels.py


# 4. Tune Best Algorithm

In [10]:
%%writefile tune_model.py
"""
Trains and tunes a final model for deployment. Includes functions that
are used at prediction and are specified by the Sagemaker Pipeline
environment/ecosystem.

Deployed model should be able to handle both single observations
as well as batches of observations.

**Input**:
    basemodel_metrics : parquet
        The evaluation metrics for the baseline models that was compiled
        in the prior Sagemaker Pipeline step and autopopulates into the
        pipeline step that corresponds with this script.
    col_dict : json
        A json object comprised of metadata for column transformations.
        A human readable column type is used as the key. The value for each
        key includes the following two pieces of metadata:
        * transformer: the custom transformer to apply to applicable columns.
        * columns: the column names to be processed with the transformer. Each
        column should only be included in one column type.
    algo_data : json
        The metadata for the algorithm to run. Information includes:
        * 'module': the module to import for the algorithm.
        * 'class': the specific name of the algorithm.
        * 'params': the parameters to use for fine tuning. These values aren't used
        in this step, but are required in the tuning step.
        * 'defaults': optional parameter used when alternate values are desired for the 
        default training parameters.                                    
    train : parquet
        The training dataset only, excludes data set aside for validation and testing.
    validate : parquet
        The validation dataset used to evaluate the model during hyperparameter tuning.
    required_cols : json
        The columns or tables required in order to make a prediction. This is currently a
        hardcoded file; todo: dynamically populate this based on features used by the
        trained and tuned model.        

**Arguments**:
    --target : str
        The name of the event to be used. `--target` name is case insensitive.
    --train-size : number
        The size for the training/test split.
        Can be expressed either as a ratio (floating point between 0 and 1)
        or as a percentage (whole number between 0 and 100).
    --model_dir : str
        The directory where model artifacts are stored.
    --output-data-dir : str
        The directory where other artifacts, such as the required features,
        are stored.
    --train-data-dir : str
        The directory where the training data is placed by the Sagemaker
        Pipeline.
    --validate-data-dir : str
        The directory where the validation data is placed by the Sagemaker
        Pipeline.
    --metric-data-dir : str
        The directory where the consolidated evaluation data from the
        previous step is placed by the Sagemaker Pipeline.

**Output**:
    processor_steps : joblib
        The portion of the sklearn pipeline that completes all processing steps.
    model_step : joblib
        The portion of the sklearn pipeline that makes a prediction.
    shap_explainer : joblib
        The fitted shap explainer.
    anchor_explainer : dill
        Returns a folder that includes two dill files, one that is the explainer and one
        that holds metadata.
    required_cols : json
        The columns or tables required in order to make a prediction. This is currently a
        hardcoded file; todo: dynamically populate this based on features used by the
        trained and tuned model.

Notes
-----
    Functions required for the model to make predictions at inference are
    ``input_fn``, ``output_fn``, ``predict_fn``, and ``model_fn``.
"""

import numpy as np
import pandas as pd
import sklearn
import json
import os
import warnings
import joblib
import argparse
import importlib
from io import StringIO
from sagemaker_containers.beta.framework import encoders, worker
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# import skexplain
# import shap
# import alibi
from functions_module import pandas_reads, pandas_writes
from model_functions_module import define_algo_pipe, tune_best_algo, metrics_row
# from model_functions_module import explainability_mods

# Ask sklearn transformers to hand back pandas DataFrames from ``transform``
# rather than numpy arrays.
from sklearn import set_config
set_config(transform_output="pandas")

# Log library and interpreter versions so they show up in the job logs.
print("numpy version:", np.__version__)
print("pandas version:", pd.__version__)
print("sklearn version:", sklearn.__version__)
import platform
print("python version:", platform.python_version())

# Report each distinct warning only once; the second filter is inserted ahead
# of the first, so pandas PerformanceWarning is silenced entirely.
warnings.simplefilter("once")
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
# warnings.simplefilter(action='ignore', category=FutureWarning)

# NOTE(review): these look like the SageMaker processing-container default
# paths; neither constant is referenced in the visible part of this script --
# confirm they are still needed.
input_path = '/opt/ml/processing/input'
output_path = '/opt/ml/processing/output'

if __name__ == '__main__':
    # Training entry point: picks the best base algorithm from the metrics
    # produced by the previous pipeline step, tunes it on the training data,
    # reports its metrics, and stores the fitted artifacts for inference.
    parser = argparse.ArgumentParser()
    parser.add_argument('--target', type=str, dest='target_col')
    parser.add_argument('--train-size', type=str, dest='train_size', default='0.8')
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--train-data-dir', type=str, default=os.environ.get('SM_CHANNEL_TRAIN_DATA'))
    parser.add_argument('--validate-data-dir', type=str, default=os.environ.get('SM_CHANNEL_VALIDATE_DATA'))
    parser.add_argument('--metric-data-dir', type=str, default=os.environ.get('SM_CHANNEL_METRIC_DATA'))

    args = parser.parse_args()

    print('Args: {}'.format(args))

    # The split size may be given as a ratio (0-1) or as a percentage (1-100).
    # NOTE(review): train_size is not used further down in this script.
    train_size = float(args.train_size)
    if train_size > 1:
        train_size = train_size/100

    # The best algorithm is the row label with the highest penalized F1 score.
    metric_desc_stat_df = pandas_reads(os.path.join(args.metric_data_dir,'basemodel_metrics.parquet'))
    best_algo = metric_desc_stat_df['test_f1_penalized'].idxmax()
    best_results = metric_desc_stat_df.loc[best_algo, ['test_f1_mean',
                                              'test_roc_auc_mean',
                                              'test_recall_mean',
                                              'test_precision_mean',
                                              'test_accuracy_mean']]
    print('Best model is:', best_algo)
    print("Model's performance was:")
    print(best_results)

    # ecr_basepath = "/opt/app"
    # col_dict = pandas_reads(os.path.join(ecr_basepath, "col_dict.json"))
    # algo_data = pandas_reads(os.path.join(ecr_basepath, "algo_dict.json"))[best_algo]
    col_dict = pandas_reads("col_dict.json")
    # Look up the configuration of the selected algorithm. The original code
    # indexed with ``algo_name``, which is never defined in this script.
    algo_data = pandas_reads("algo_dict.json")[best_algo]

    # Resolve the estimator class dynamically from its module and class names.
    module = importlib.import_module(algo_data['module'])
    algo = getattr(module, algo_data['class'])

    train_filepath = os.path.join(args.train_data_dir, 'train.parquet')
    validate_filepath = os.path.join(args.validate_data_dir, 'validate.parquet')

    print('Loading data')
    train_df = pandas_reads(train_filepath)
    print(f'Train data size: {train_df.shape}')
    validate_df = pandas_reads(validate_filepath)
    print(f'Validate data size: {validate_df.shape}')

    # Split features from the target; the target is kept as a one-column frame.
    Xtrain = train_df.drop(args.target_col, axis=1)
    ytrain = train_df[[args.target_col]]
    Xvalidate = validate_df.drop(args.target_col, axis=1)
    yvalidate = validate_df[[args.target_col]]

    print('Tuning best algorithm')
    model = tune_best_algo(algo, algo_data, col_dict, Xtrain, ytrain)
    # shap_explainer, anchor_explainer = explainability_mods(model, Xtrain)
    tuned_val_metrics = metrics_row(yvalidate, model, Xvalidate)

    print("Tuned Model's preformance on training data:")
    print(metrics_row(ytrain, model, Xtrain))

    print("Tuned Model's preformance on validation data:")
    print(tuned_val_metrics)

    print('Saving model')
    # This is done so that it is all put into a tar.gz file that can be used at inference
    processor_steps = model.named_steps['all_preprocess']
    model_step = model.named_steps['algorithm']
    # joblib.dump(model, os.path.join(args.model_dir, 'model.joblib'))
    joblib.dump(processor_steps, os.path.join(args.model_dir, 'preprocessor.joblib'))
    joblib.dump(model_step, os.path.join(args.model_dir, 'model.joblib'))
    # joblib.dump(shap_explainer, os.path.join(args.model_dir, 'shap_explainer.joblib'))
    # alibi.saving.save_explainer(anchor_explainer, os.path.join(args.model_dir, 'anchor_explainer'))

    # Write required cols to output path
    required_cols = pandas_reads("required_cols.json")
    pandas_writes(required_cols, os.path.join(args.output_data_dir, 'required-features.json'))
        
# Functions for Inference

# Note: these are required in order for the deployed model to
# make predictions once deployed.

def input_fn(input_data, content_type):
    """
    Parses the input data payload into a dataframe.

    Accepted data formats are csv, json, and parquet.

    Parameters
    -----------

    input_data : str or parquet
        The feature data to be used to make a prediction. Accepted file
        formats include csv, json, or parquet.
    content_type : {'text/csv', 'application/x-parquet', 'application/json'}
        The mimetype describing the format of ``input_data``.

    Returns
    -----------

    df : dataframe
        Data is converted to a dataframe for further processing.

    Raises
    -----------

    ValueError
        If ``content_type`` is not one of the supported mimetypes.

    Notes
    -----------

    There appears to be some kind of handling completed by AWS when the
    data is passed through the end point. JSON payloads have been observed
    as a JSON string that itself contains the JSON record, so a second
    decode is applied when the first one yields a string. A payload that
    is a plain JSON object is accepted as well.
    """

    # print('Pandas version:', pd.__version__)
    # print('Sklearn version:', sklearn.__version__)

    print(f'Loading input data with content type {content_type}')

    if content_type == 'text/csv':
        df = pd.read_csv(StringIO(input_data))
    elif content_type == 'application/x-parquet':
        df = pd.read_parquet(input_data)
    elif content_type == 'application/json':
        # contents = json.loads(input_data.read())
        contents = json.loads(input_data)
        # Only decode again when the payload was double-encoded; decoding an
        # already-parsed object would raise a TypeError.
        if isinstance(contents, str):
            contents = json.loads(contents)
        # index=[0] builds a single-row frame from a record of scalar values.
        df = pd.DataFrame(contents, index=[0])
    else:
        raise ValueError("{} not supported by script".format(content_type))
    # print(df)
    return df
        
def output_fn(prediction, accept):
    """
    Takes the prediction and formats it for output.

    Parameters
    -----------

    prediction : dict
        The model's prediction, as returned by ``predict_fn``.
    accept : {'application/json', 'text/csv'}
        The mimetype for the output. The only tested and proven file
        format is json. File format csv hasn't been tested.

    Returns
    -----------

    prediction : worker.Response
        The encoded prediction for output with the mimetype set to the
        specified file format.

    Raises
    -----------

    TypeError, ValueError
        If the prediction cannot be serialized to JSON.
    RuntimeError
        If ``accept`` is not a supported mimetype.

    Notes
    -----------
    The default accept/content-type between containers for serial
    inference is JSON. We also want to set the ContentType or mimetype
    as the same value as accept so the next container can read the
    response payload correctly.
    """

    print(f"Prediction object: {prediction}")
    print(f"Prediction object type: {type(prediction)}")

    print(f'Running output function with accept type {accept}')

    if accept == 'application/json':
        try:
            # json_df = prediction.to_json(orient='records')
            json_df = json.dumps(prediction)
        except (TypeError, ValueError) as e:
            # Log and re-raise: continuing would only fail later with an
            # unrelated UnboundLocalError on ``json_df``.
            print('Error when creating json object', e)
            raise
        print('Prediction to json:', json_df)
        print('New type', type(json_df))
        return worker.Response(json_df, mimetype=accept)
    elif accept == 'text/csv':
        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)
    else:
        # ``RuntimeException`` is not a Python builtin; use RuntimeError and
        # actually interpolate the offending accept type into the message.
        raise RuntimeError('{} accept type is not supported by this script'.format(accept))
        
def predict_fn(df, estimator):
    """
    Runs the preprocessing steps on the incoming features and returns the
    predicted class probabilities.

    Parameters
    -----------

    df : dataframe
        The features, as produced by ``input_fn``.
    estimator : tuple
        The ``(processor_steps, model_step)`` pair produced by
        ``model_fn``.

    Returns
    -----------

    prediction : dict
        Dictionary holding the prediction probabilities. The probabilities
        of the first row are stored under the key ``'prediction'``, keyed
        by class label.

    Notes
    -----------

    ``predict_proba`` is used so that probabilities are returned rather
    than single yes/no values.

    The shap and anchor explanations are currently disabled; the code
    that produced them is kept below as comments.

    Example
    -----------

    An example of the return for this function is:

    {'prediction':
        {0: 0.9995577335357666, 1: 0.00044227897888049483}}
    """

    from sklearn import set_config
    set_config(transform_output="pandas")

    # processor_steps, model_step, shap_explainer, anchor_explainer = estimator
    processor_steps, model_step = estimator

    Xprocessed = processor_steps.transform(df)
    probabilities = model_step.predict_proba(Xprocessed)

    # One column per input row after transposing; the first row's column is
    # relabelled 'prediction', and the class labels become the inner keys.
    proba_by_row = pd.DataFrame(probabilities, columns=model_step.classes_)
    proba_by_class = proba_by_row.T.rename(columns={0: 'prediction'})
    prediction_dict = proba_by_class.to_dict()

    # Only consumed by the disabled shap explainer call below.
    max_evals = Xprocessed.shape[1] * 2 + 1
#     shap_results = shap_explainer(Xprocessed, max_evals=max_evals)
#     shap_df = pd.DataFrame(shap_results.values.T,
#                            index=Xprocessed.columns,
#                            columns=['shap'])
#     shap_df['shap_abs'] = shap_df['shap'].abs()
#     shap_largest = shap_df.nlargest(10, 'shap_abs', keep='all').sort_values('shap', ascending=False)
#     shap_dict = shap_largest[['shap']].to_dict()

#     anchor_explainer.predictor(Xprocessed)
#     anchor_explanation = anchor_explainer.explain(Xprocessed.to_numpy())
#     anchor_dict = {'anchor': {'values':anchor_explanation.anchor,
#                               'precision':anchor_explanation.precision,
#                               'coverage':anchor_explanation.coverage}}

#     return {**prediction_dict, **shap_dict, **anchor_dict}
    return dict(prediction_dict)
    
def model_fn(model_dir):
    """
    Restores the fitted preprocessing steps and the fitted algorithm from
    the model directory.

    Parameters
    -----------

    model_dir : str
        The directory path where ``preprocessor.joblib`` and
        ``model.joblib`` are stored.

    Returns
    -----------

    estimator : tuple
        A ``(processor_steps, model_step)`` tuple holding the
        preprocessing for the trained model and the trained model.

    Notes
    -----------

    The artifacts are bundled into a tuple so that all of them can be
    passed to ``predict_fn``. Loading of the shap and anchor explainers
    is currently disabled; the code is kept below as comments.
    """

    print('Loading model')    
    # Preprocessor first, then the algorithm -- same order as the tuple.
    artifact_files = ('preprocessor.joblib', 'model.joblib')
    processor_steps, model_step = (
        joblib.load(os.path.join(model_dir, artifact_file))
        for artifact_file in artifact_files
    )
    # model = joblib.load(os.path.join(model_dir, 'model.joblib'))
    # processor_steps = model.named_steps['all_preprocess']
    # model_step = model.named_steps['algorithm']

#     print('Loading shap arguments')    
#     shap_explainer = joblib.load(os.path.join(model_dir, 'shap_explainer.joblib'))

#     print('Loading anchor explainer')    
#     predict_fn = lambda x: model_step.predict(x)
#     anchor_explainer = alibi.saving.load_explainer(os.path.join(model_dir, 'anchor_explainer'), predict_fn)

#     return processor_steps, model_step, shap_explainer, anchor_explainer
    return processor_steps, model_step

Writing tune_model.py


# 5. Evaluation

Uses the test set's label column and the batch-transformed predictions to evaluate how well the model performs. Requires that the headers be removed before the data is passed in.

In [11]:
%%writefile evaluate.py
"""
Evaluates the trained and tuned model against the hold out/test dataset
and stores that information for inclusion in the Sagemaker Studio
Model Registry page.

**Input**:
    model : model.tar.gz
        The file that holds the preprocessor and model.
    test_y : json
        The hold out/test data used to evaluate the model.

**Arguments**:
    --target : string
        The name of the column that stores the ground truth values.

**Output**:
    report_dict : json
        The json file containing the evaluation metrics for the trained
        and tuned model as evaluated against the hold out/test dataset.
        This object includes accuracy, precision, f1, recall, ROC AUC,
        and the confusion matrix.
    
Notes
-----
    When correctly configured, this information can be included in
    Sagemaker Studio's Model Registry page for easy reference.
"""


if __name__ == "__main__":
    # Evaluation entry point: unpacks the trained artifacts, loads the
    # hold-out data, builds the evaluation report and writes it as JSON.
    import json
    import os
    import joblib
    import argparse
    import pandas as pd
    import numpy as np
    import sklearn
    import tarfile
    import pathlib
    import platform
    from io import StringIO
    from sklearn import set_config
    from sklearn.pipeline import Pipeline
    # ``eval_report`` is called below, so its import must be active.
    # NOTE(review): assumes functions_module is available in this container
    # and that eval_report accepts (y, preprocessor, model, X) -- confirm.
    from functions_module import eval_report

    set_config(transform_output="pandas")

    print("numpy version:", np.__version__)
    print("pandas version:", pd.__version__)
    print("sklearn version:", sklearn.__version__)
    print("python version:", platform.python_version())

    parser = argparse.ArgumentParser()
    parser.add_argument('--target', type=str, dest='target_col')
    args = parser.parse_args()

    print('Args: {}'.format(args))

    model_path = "/opt/ml/processing/input/model"
    test_y_path = "/opt/ml/processing/input/test/test.json"

    # NOTE(review): extractall trusts the archive contents; acceptable only
    # because the archive is produced by this pipeline's own training step.
    with tarfile.open(os.path.join(model_path, "model.tar.gz")) as tar:
        tar.extractall(path=".")

    processor_step = joblib.load("preprocessor.joblib")
    model_step = joblib.load("model.joblib")
    # model =  Pipeline([
    #     ('all_preprocess', processor_step),
    #     ('algorithm', model_step)])
    # model = joblib.load("model.joblib")

    # The file holds a JSON string that itself contains the JSON table, so
    # the first decode yields the text that pandas then parses.
    with open(test_y_path, "r") as xf:
        contents = json.load(xf)
    # print("Contents variable:", contents)
    # Wrap the literal JSON in StringIO: passing a raw JSON string to
    # read_json is deprecated in recent pandas releases.
    test_df = pd.read_json(StringIO(contents))
    print("Test dataframe type:")
    print(type(test_df))
    # print('Test dataframe')
    print(test_df)
    Xtest = test_df.drop(args.target_col, axis=1)
    ytest = test_df[[args.target_col]]

    report_dict = eval_report(ytest, processor_step, model_step, Xtest)
    # report_dict = eval_report(ytest, model, Xtest)
    print('Report Dict:', report_dict)

    output_dir = "/opt/ml/processing/evaluation"
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = os.path.join(output_dir, 'evaluation.json')
    with open(evaluation_path, "w") as f:
        f.write(json.dumps(report_dict))

Writing evaluate.py
