# LIME Expansion

## Setup

In [1]:
# Install/upgrade the required packages (the leading `!` runs each command in a shell from this cell):
!pip install -U pip
!pip install -U setuptools wheel
!pip install autogluon  # autogluon==0.4.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.1.2-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 7.1 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting setuptools
  Downloading setuptools-63.2.0-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
[31mERROR: pip's dependency resolver does not

In [2]:
# import all required modules
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

pd.set_option('display.max_rows', None)  # no limit on the number of rows shown for a DataFrame
pd.set_option('display.max_columns', None)  # no limit on the number of columns shown
pd.set_option('display.width', None)  # no fixed display width (pandas decides)
pd.set_option('display.max_colwidth', None)  # do not truncate long cell contents

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from numpy/pandas/sklearn.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import cohen_kappa_score
from sklearn.inspection import permutation_importance
from numpy.lib.shape_base import row_stack
import pickle
import re

In [3]:
# Mount Google Drive at /content/drive so the notebook can access files stored there.
# Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## LIME Expansion - Class Definition

In [4]:
class LimeError(Exception):
    """Error raised by the LIME explanation code (e.g. for an invalid explanation mode)."""

In [5]:
"""
Explanation class, with visualization functions.
"""
from io import open
import os
import os.path
import json
import string
import numpy as np


from sklearn.utils import check_random_state


def id_generator(size=15, random_state=None):
    """Generate a random identifier of uppercase ASCII letters and digits.

    Useful for producing unique div ids when embedding HTML into IPython
    notebooks.

    Args:
        size: number of characters in the returned identifier.
        random_state: source of randomness. May be an object exposing a
            ``choice`` method (e.g. ``numpy.random.RandomState``), an integer
            seed, or None, in which case numpy's global random state is used.
    Returns:
        A string of length ``size``.
    """
    if random_state is None:
        # The default used to crash on ``None.choice``; the numpy.random
        # module itself exposes ``choice`` backed by the global state.
        random_state = np.random
    elif not hasattr(random_state, 'choice'):
        # Treat anything else (e.g. an int) as a seed.
        random_state = np.random.RandomState(random_state)
    chars = list(string.ascii_uppercase + string.digits)
    return ''.join(random_state.choice(chars, size, replace=True))


class DomainMapper:
    """Translates generic feature ids into a concrete domain.

    Each domain (text, tables, images, ...) is expected to provide its own
    subclass, which keeps the general Explanation class free of any
    domain-specific visualization details.
    """

    def __init__(self):
        """Nothing to set up in the base mapper."""

    def map_exp_ids(self, exp, **kwargs):
        """Convert feature ids into concrete names.

        The base implementation is the identity; subclasses may override it.

        Args:
            exp: list of tuples [(id, weight), (id, weight)]
            kwargs: optional keyword arguments (ignored here)
        Returns:
            exp: list of tuples [(name, weight), (name, weight)...]
        """
        return exp

    def visualize_instance_html(self, exp, label, div_name, exp_object_name,
                                **kwargs):
        """Build the html/js used to visualize the explained instance.

        The base implementation produces nothing; subclasses may override it.

        Args:
             exp: list of tuples [(id, weight), (id, weight)]
             label: label id (integer)
             div_name: name of div object to be used for rendering (in js)
             exp_object_name: name of js explanation object
             kwargs: optional keyword arguments (ignored here)
        Returns:
             js code for visualizing the instance (an empty string here)
        """
        return ''


class Explanation(object):
    """Object returned by explainers.

    Stores, per label, the local explanation (feature id / weight pairs), the
    intercept, the score and the local prediction, and offers helpers to
    export them as a list, a matplotlib figure or an html page.
    """

    def __init__(self,
                 domain_mapper,
                 mode='classification',
                 class_names=None,
                 random_state=None):
        """
        Initializer.
        Args:
            domain_mapper: must inherit from DomainMapper class
            mode: "classification" or "regression"
            class_names: list of class names (only used for classification)
            random_state: an integer or numpy.RandomState that will be used to
                generate random numbers. If None, the random state will be
                initialized using the internal numpy seed.
        Raises:
            LimeError: if mode is neither "classification" nor "regression".
        """
        self.random_state = random_state
        self.mode = mode
        self.domain_mapper = domain_mapper
        self.local_exp = {}
        self.intercept = {}
        self.score = {}
        self.local_pred = {}
        if mode == 'classification':
            self.class_names = class_names
            self.top_labels = None
            self.predict_proba = None
        elif mode == 'regression':
            self.class_names = ['negative', 'positive']
            self.predicted_value = None
            self.min_value = 0.0
            self.max_value = 1.0
            self.dummy_label = 1
        else:
            raise LimeError('Invalid explanation mode "{}". '
                            'Should be either "classification" '
                            'or "regression".'.format(mode))

    def available_labels(self):
        """
        Returns the list of classification labels for which we have any explanations.
        Raises:
            NotImplementedError: for regression explanations.
        """
        # Explicit check rather than ``assert`` so the guard also holds when
        # Python runs with assertions disabled (-O).
        if self.mode != "classification":
            raise NotImplementedError('Not supported for regression explanations.')
        ans = self.top_labels if self.top_labels else self.local_exp.keys()
        return list(ans)

    def as_list(self, label=1, **kwargs):
        """Returns the explanation as a list.
        Args:
            label: desired label. If you ask for a label for which an
                explanation wasn't computed, will throw an exception.
                Will be ignored for regression explanations.
            kwargs: keyword arguments, passed to domain_mapper
        Returns:
            list of tuples (representation, weight), where representation is
            given by domain_mapper. Weight is a float.
        """
        label_to_use = label if self.mode == "classification" else self.dummy_label
        ans = self.domain_mapper.map_exp_ids(self.local_exp[label_to_use], **kwargs)
        # Cast weights to plain floats so the result is json-serializable.
        ans = [(x[0], float(x[1])) for x in ans]
        return ans

    def as_map(self):
        """Returns the map of explanations.
        Returns:
            Map from label to list of tuples (feature_id, weight).
        """
        return self.local_exp

    def as_pyplot_figure(self, label=1, figsize=(4,4), **kwargs):
        """Returns the explanation as a pyplot figure.
        Will throw an error if you don't have matplotlib installed
        Args:
            label: desired label. If you ask for a label for which an
                   explanation wasn't computed, will throw an exception.
                   Will be ignored for regression explanations.
            figsize: desired size of pyplot in tuple format, defaults to (4,4).
            kwargs: keyword arguments, passed to domain_mapper
        Returns:
            pyplot figure (barchart).
        """
        import matplotlib.pyplot as plt
        exp = self.as_list(label=label, **kwargs)
        fig = plt.figure(figsize=figsize)
        vals = [x[1] for x in exp]
        names = [x[0] for x in exp]
        # barh draws from the bottom up, so reverse to keep the first
        # (most important) entry at the top of the chart.
        vals.reverse()
        names.reverse()
        colors = ['green' if x > 0 else 'red' for x in vals]
        pos = np.arange(len(exp)) + .5
        plt.barh(pos, vals, align='center', color=colors)
        plt.yticks(pos, names)
        if self.mode == "classification":
            title = 'Local explanation for class %s' % self.class_names[label]
        else:
            title = 'Local explanation'
        plt.title(title)
        return fig

    def show_in_notebook(self,
                         labels=None,
                         predict_proba=True,
                         show_predicted_value=True,
                         **kwargs):
        """Shows html explanation in ipython notebook.
        See as_html() for parameters.
        This will throw an error if you don't have IPython installed"""

        # IPython.display is the public location; importing these names from
        # IPython.core.display is deprecated.
        from IPython.display import display, HTML
        display(HTML(self.as_html(labels=labels,
                                  predict_proba=predict_proba,
                                  show_predicted_value=show_predicted_value,
                                  **kwargs)))

    def save_to_file(self,
                     file_path,
                     labels=None,
                     predict_proba=True,
                     show_predicted_value=True,
                     **kwargs):
        """Saves html explanation to file.
        Params:
            file_path: file to save explanations to
        See as_html() for additional parameters.
        """
        # Render first so a failure in as_html() does not leave an empty file
        # behind; the context manager closes the handle even if write() fails.
        html = self.as_html(labels=labels,
                            predict_proba=predict_proba,
                            show_predicted_value=show_predicted_value,
                            **kwargs)
        with open(file_path, 'w', encoding='utf8') as file_:
            file_.write(html)

    def as_html(self,
                labels=None,
                predict_proba=True,
                show_predicted_value=True,
                **kwargs):
        """Returns the explanation as an html page.
        Args:
            labels: desired labels to show explanations for (as barcharts).
                If you ask for a label for which an explanation wasn't
                computed, will throw an exception. If None, will show
                explanations for all available labels. (only used for classification)
            predict_proba: if true, add  barchart with prediction probabilities
                for the top classes. (only used for classification)
            show_predicted_value: if true, add  barchart with expected value
                (only used for regression)
            kwargs: keyword arguments, passed to domain_mapper
        Returns:
            code for an html page, including javascript includes.
        Raises:
            OSError: if 'bundle.js' cannot be read from the directory of this
                module (or the current working directory when ``__file__`` is
                undefined, as in a notebook cell).
        """

        def jsonize(x):
            return json.dumps(x, ensure_ascii=False)

        if labels is None and self.mode == "classification":
            labels = self.available_labels()

        # ``__file__`` does not exist when this class is defined in a notebook
        # cell; fall back to the working directory instead of a NameError.
        try:
            this_dir, _ = os.path.split(__file__)
        except NameError:
            this_dir = os.getcwd()
        with open(os.path.join(this_dir, 'bundle.js'),
                  encoding="utf8") as bundle_file:
            bundle = bundle_file.read()

        out = u'''<html>
        <meta http-equiv="content-type" content="text/html; charset=UTF8">
        <head><script>%s </script></head><body>''' % bundle
        random_id = id_generator(size=15, random_state=check_random_state(self.random_state))
        out += u'''
        <div class="lime top_div" id="top_div%s"></div>
        ''' % random_id

        predict_proba_js = ''
        if self.mode == "classification" and predict_proba:
            predict_proba_js = u'''
            var pp_div = top_div.append('div')
                                .classed('lime predict_proba', true);
            var pp_svg = pp_div.append('svg').style('width', '100%%');
            var pp = new lime.PredictProba(pp_svg, %s, %s);
            ''' % (jsonize([str(x) for x in self.class_names]),
                   jsonize(list(self.predict_proba.astype(float))))

        predict_value_js = ''
        if self.mode == "regression" and show_predicted_value:
            # reference self.predicted_value
            # (svg, predicted_value, min_value, max_value)
            predict_value_js = u'''
                    var pp_div = top_div.append('div')
                                        .classed('lime predicted_value', true);
                    var pp_svg = pp_div.append('svg').style('width', '100%%');
                    var pp = new lime.PredictedValue(pp_svg, %s, %s, %s);
                    ''' % (jsonize(float(self.predicted_value)),
                           jsonize(float(self.min_value)),
                           jsonize(float(self.max_value)))

        exp_js = '''var exp_div;
            var exp = new lime.Explanation(%s);
        ''' % (jsonize([str(x) for x in self.class_names]))

        if self.mode == "classification":
            for label in labels:
                exp = jsonize(self.as_list(label))
                exp_js += u'''
                exp_div = top_div.append('div').classed('lime explanation', true);
                exp.show(%s, %d, exp_div);
                ''' % (exp, label)
        else:
            exp = jsonize(self.as_list())
            exp_js += u'''
            exp_div = top_div.append('div').classed('lime explanation', true);
            exp.show(%s, %s, exp_div);
            ''' % (exp, self.dummy_label)

        raw_js = '''var raw_div = top_div.append('div');'''

        # The raw-instance view is rendered for the first label only
        # (or the dummy label in regression mode).
        if self.mode == "classification":
            html_data = self.local_exp[labels[0]]
        else:
            html_data = self.local_exp[self.dummy_label]

        raw_js += self.domain_mapper.visualize_instance_html(
                html_data,
                labels[0] if self.mode == "classification" else self.dummy_label,
                'raw_div',
                'exp',
                **kwargs)
        out += u'''
        <script>
        var top_div = d3.select('#top_div%s').classed('lime top_div', true);
        %s
        %s
        %s
        %s
        </script>
        ''' % (random_id, predict_proba_js, predict_value_js, exp_js, raw_js)
        out += u'</body></html>'

        return out

#### LIME Base

In [6]:
"""
Contains abstract functionality for learning locally linear sparse model.
"""
import numpy as np
import scipy as sp
from sklearn.linear_model import Ridge, lars_path
from sklearn.utils import check_random_state


class LimeBase(object):
    """Class for learning a locally linear sparse model from perturbed data"""
    def __init__(self,
                 kernel_fn,
                 verbose=False,
                 random_state=None):
        """Init function
        Args:
            kernel_fn: function that transforms an array of distances into an
                        array of proximity values (floats).
            verbose: if true, print local prediction values from linear model.
            random_state: an integer or numpy.RandomState that will be used to
                generate random numbers. If None, the random state will be
                initialized using the internal numpy seed.
        """
        self.kernel_fn = kernel_fn
        self.verbose = verbose
        self.random_state = check_random_state(random_state)

    @staticmethod
    def generate_lars_path(weighted_data, weighted_labels):
        """Generates the lars path for weighted data.
        Args:
            weighted_data: data that has been weighted by kernel
            weighted_label: labels, weighted by kernel
        Returns:
            (alphas, coefs), both are arrays corresponding to the
            regularization parameter and coefficients, respectively
        """
        x_vector = weighted_data
        alphas, _, coefs = lars_path(x_vector,
                                     weighted_labels,
                                     method='lasso',
                                     verbose=False)
        return alphas, coefs

    def forward_selection(self, data, labels, weights, num_features):
        """Iteratively adds features to the model.

        At each step, fits a weighted Ridge model on the already selected
        features plus one candidate and keeps the candidate with the best
        weighted score.

        Returns:
            numpy array with the indices of the selected features, in the
            order in which they were added.
        """
        clf = Ridge(alpha=0, fit_intercept=True, random_state=self.random_state)
        used_features = []
        for _ in range(min(num_features, data.shape[1])):
            max_ = -100000000
            best = 0
            for feature in range(data.shape[1]):
                if feature in used_features:
                    continue
                clf.fit(data[:, used_features + [feature]], labels,
                        sample_weight=weights)
                score = clf.score(data[:, used_features + [feature]],
                                  labels,
                                  sample_weight=weights)
                if score > max_:
                    best = feature
                    max_ = score
            used_features.append(best)
        return np.array(used_features)

    def feature_selection(self, data, labels, weights, num_features, method):
        """Selects features for the model. see explain_instance_with_data to
           understand the parameters.

        Returns:
            array of selected feature indices.
        Raises:
            ValueError: if method is not one of 'none', 'forward_selection',
                'highest_weights', 'lasso_path' or 'auto'.
        """
        if method == 'none':
            return np.array(range(data.shape[1]))
        elif method == 'forward_selection':
            return self.forward_selection(data, labels, weights, num_features)
        elif method == 'highest_weights':
            clf = Ridge(alpha=0.01, fit_intercept=True,
                        random_state=self.random_state)
            clf.fit(data, labels, sample_weight=weights)

            coef = clf.coef_
            if sp.sparse.issparse(data):
                coef = sp.sparse.csr_matrix(clf.coef_)
                weighted_data = coef.multiply(data[0])
                # Note: most efficient to slice the data before reversing
                sdata = len(weighted_data.data)
                argsort_data = np.abs(weighted_data.data).argsort()
                # Edge case where data is more sparse than requested number of feature importances
                # In that case, we just pad with zero-valued features
                if sdata < num_features:
                    nnz_indexes = argsort_data[::-1]
                    indices = weighted_data.indices[nnz_indexes]
                    num_to_pad = num_features - sdata
                    indices = np.concatenate((indices, np.zeros(num_to_pad, dtype=indices.dtype)))
                    indices_set = set(indices)
                    pad_counter = 0
                    for i in range(data.shape[1]):
                        if i not in indices_set:
                            indices[pad_counter + sdata] = i
                            pad_counter += 1
                            if pad_counter >= num_to_pad:
                                break
                else:
                    nnz_indexes = argsort_data[sdata - num_features:sdata][::-1]
                    indices = weighted_data.indices[nnz_indexes]
                return indices
            else:
                # Rank features by |coefficient * value in the first row|
                # (the first row is assumed to be the explained instance).
                weighted_data = coef * data[0]
                feature_weights = sorted(
                    zip(range(data.shape[1]), weighted_data),
                    key=lambda x: np.abs(x[1]),
                    reverse=True)
                return np.array([x[0] for x in feature_weights[:num_features]])
        elif method == 'lasso_path':
            # Center with the weighted mean and scale by sqrt(weight) so that
            # the unweighted lars path solves the weighted problem.
            weighted_data = ((data - np.average(data, axis=0, weights=weights))
                             * np.sqrt(weights[:, np.newaxis]))
            weighted_labels = ((labels - np.average(labels, weights=weights))
                               * np.sqrt(weights))
            nonzero = range(weighted_data.shape[1])
            _, coefs = self.generate_lars_path(weighted_data,
                                               weighted_labels)
            # Walk the path backwards until at most num_features coefficients
            # are non-zero.
            for i in range(len(coefs.T) - 1, 0, -1):
                nonzero = coefs.T[i].nonzero()[0]
                if len(nonzero) <= num_features:
                    break
            used_features = nonzero
            return used_features
        elif method == 'auto':
            if num_features <= 6:
                n_method = 'forward_selection'
            else:
                n_method = 'highest_weights'
            return self.feature_selection(data, labels, weights,
                                          num_features, n_method)
        else:
            # Previously fell through and returned None, which only surfaced
            # later as an obscure failure while fitting the local model.
            raise ValueError(
                'Unknown feature selection method "{}". Expected one of '
                '"none", "forward_selection", "highest_weights", '
                '"lasso_path" or "auto".'.format(method))

    def explain_instance_with_data(self,
                                   neighborhood_data,
                                   neighborhood_labels,
                                   distances,
                                   label,
                                   num_features,
                                   feature_selection='auto',
                                   model_regressor=None):
        """Takes perturbed data, labels and distances, returns explanation.
        Args:
            neighborhood_data: perturbed data, 2d array. first element is
                               assumed to be the original data point.
            neighborhood_labels: corresponding perturbed labels. should have as
                                 many columns as the number of possible labels.
            distances: distances to original data point.
            label: label for which we want an explanation
            num_features: maximum number of features in explanation
            feature_selection: how to select num_features. options are:
                'forward_selection': iteratively add features to the model.
                    This is costly when num_features is high
                'highest_weights': selects the features that have the highest
                    product of absolute weight * original data point when
                    learning with all the features
                'lasso_path': chooses features based on the lasso
                    regularization path
                'none': uses all features, ignores num_features
                'auto': uses forward_selection if num_features <= 6, and
                    'highest_weights' otherwise.
            model_regressor: sklearn regressor to use in explanation.
                Defaults to Ridge regression if None. Must have
                model_regressor.coef_ and 'sample_weight' as a parameter
                to model_regressor.fit()
        Returns:
            (intercept, exp, score, local_pred):
            intercept is a float.
            exp is a sorted list of tuples, where each tuple (x,y) corresponds
            to the feature id (x) and the local weight (y). The list is sorted
            by decreasing absolute value of y.
            score is the R^2 value of the returned explanation
            local_pred is the prediction of the explanation model on the original instance
        Raises:
            ValueError: if feature_selection is not a supported method name.
        """

        weights = self.kernel_fn(distances)
        labels_column = neighborhood_labels[:, label]
        used_features = self.feature_selection(neighborhood_data,
                                               labels_column,
                                               weights,
                                               num_features,
                                               feature_selection)
        if model_regressor is None:
            model_regressor = Ridge(alpha=1, fit_intercept=True,
                                    random_state=self.random_state)
        easy_model = model_regressor
        easy_model.fit(neighborhood_data[:, used_features],
                       labels_column, sample_weight=weights)
        prediction_score = easy_model.score(
            neighborhood_data[:, used_features],
            labels_column, sample_weight=weights)

        # Row 0 is the original instance (see neighborhood_data above).
        local_pred = easy_model.predict(neighborhood_data[0, used_features].reshape(1, -1))

        if self.verbose:
            print('Intercept', easy_model.intercept_)
            print('Prediction_local', local_pred)
            print('Right:', neighborhood_labels[0, label])
        return (easy_model.intercept_,
                sorted(zip(used_features, easy_model.coef_),
                       key=lambda x: np.abs(x[1]), reverse=True),
                prediction_score, local_pred)

#### Discretizer

In [7]:
"""
Discretizers classes, to be used in lime_tabular
"""
import numpy as np
import sklearn
import sklearn.tree
import scipy
from sklearn.utils import check_random_state
from abc import ABCMeta, abstractmethod


class BaseDiscretizer():
    """
    Abstract class - Build a class that inherits from this class to implement
    a custom discretizer.
    Method bins() is to be redefined in the child class, as it is the actual
    custom part of the discretizer.
    """

    # NOTE(review): Python 2 style metaclass declaration. Under Python 3 this
    # is an ordinary class attribute, so @abstractmethod is not enforced at
    # instantiation; an un-overridden bins() fails with NotImplementedError.
    __metaclass__ = ABCMeta  # abstract class

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
                 data_stats=None):
        """Initializer
        Args:
            data: numpy 2d array
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. These features will not be discretized.
                Everything else will be considered continuous, and will be
                discretized.
            feature_names: list of names (strings) corresponding to the columns
                in the training data.
            labels: passed through to bins(); its use depends on the subclass.
            random_state: an integer or numpy.RandomState, normalized with
                check_random_state and used when sampling in undiscretize.
            data_stats: must have 'means', 'stds', 'mins' and 'maxs', use this
                if you don't want these values to be computed from data
        """
        self.to_discretize = ([x for x in range(data.shape[1])
                               if x not in categorical_features])
        self.data_stats = data_stats
        self.names = {}
        self.lambdas = {}
        self.means = {}
        self.stds = {}
        self.mins = {}
        self.maxs = {}
        self.random_state = check_random_state(random_state)

        # To override when implementing a custom binning
        bins = self.bins(data, labels)
        bins = [np.unique(x) for x in bins]

        # Read the stats from data_stats if exists
        if data_stats:
            self.means = self.data_stats.get("means")
            self.stds = self.data_stats.get("stds")
            self.mins = self.data_stats.get("mins")
            self.maxs = self.data_stats.get("maxs")

        for feature, qts in zip(self.to_discretize, bins):
            n_bins = qts.shape[0]  # Actually number of borders (= #bins-1)
            name = feature_names[feature]

            self.names[feature] = ['%s <= %.2f' % (name, qts[0])]
            for i in range(n_bins - 1):
                self.names[feature].append('%.2f < %s <= %.2f' %
                                           (qts[i], name, qts[i + 1]))
            self.names[feature].append('%s > %.2f' % (name, qts[n_bins - 1]))

            # ``qts=qts`` binds the current borders at definition time so each
            # feature keeps its own borders.
            self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)

            # If data stats are provided no need to compute the below set of details
            if data_stats:
                continue

            boundaries = np.min(data[:, feature]), np.max(data[:, feature])
            discretized = self.lambdas[feature](data[:, feature])

            self.means[feature] = []
            self.stds[feature] = []
            for x in range(n_bins + 1):
                selection = data[discretized == x, feature]
                mean = 0 if len(selection) == 0 else np.mean(selection)
                self.means[feature].append(mean)
                std = 0 if len(selection) == 0 else np.std(selection)
                # Small offset keeps the std strictly positive, since it is
                # used as a divisor in get_undiscretize_values.
                std += 0.00000000001
                self.stds[feature].append(std)
            self.mins[feature] = [boundaries[0]] + qts.tolist()
            self.maxs[feature] = qts.tolist() + [boundaries[1]]

    @abstractmethod
    def bins(self, data, labels):
        """
        To be overridden
        Returns for each feature to discretize the boundaries
        that form each bin of the discretizer
        """
        raise NotImplementedError("Must override bins() method")

    def discretize(self, data):
        """Discretizes the data.
        Args:
            data: numpy 2d or 1d array
        Returns:
            numpy array of same dimension, discretized.
        """
        ret = data.copy()
        for feature in self.lambdas:
            if len(data.shape) == 1:
                ret[feature] = int(self.lambdas[feature](ret[feature]))
            else:
                ret[:, feature] = self.lambdas[feature](
                    ret[:, feature]).astype(int)
        return ret

    def get_undiscretize_values(self, feature, values):
        """Draws continuous values for the given bin indices of one feature.

        Args:
            feature: index of the feature (key into the per-feature stats).
            values: integer array of bin indices.
        Returns:
            array shaped like ``values``. Entries whose standardized bin
            bounds differ are sampled from a truncated normal built from the
            bin's mean/std and limited to the bin's min/max; the remaining
            entries keep the standardized lower bound.
        """
        mins = np.array(self.mins[feature])[values]
        maxs = np.array(self.maxs[feature])[values]

        means = np.array(self.means[feature])[values]
        stds = np.array(self.stds[feature])[values]
        minz = (mins - means) / stds
        maxz = (maxs - means) / stds
        min_max_unequal = (minz != maxz)

        ret = minz
        ret[np.where(min_max_unequal)] = scipy.stats.truncnorm.rvs(
            minz[min_max_unequal],
            maxz[min_max_unequal],
            loc=means[min_max_unequal],
            scale=stds[min_max_unequal],
            random_state=self.random_state
        )
        return ret

    def undiscretize(self, data):
        """Maps discretized data back to sampled continuous values.

        Args:
            data: numpy 2d or 1d array of bin indices (for discretized columns).
        Returns:
            copy of data where every column that has stats is replaced by the
            output of get_undiscretize_values.
        """
        ret = data.copy()
        for feature in self.means:
            if len(data.shape) == 1:
                # The helper returns a (1, 1) array here; extract the scalar
                # explicitly, since implicit conversion of an ndim > 0 array
                # to a scalar on assignment is deprecated in NumPy.
                ret[feature] = self.get_undiscretize_values(
                    feature, ret[feature].astype(int).reshape(-1, 1)
                ).item()
            else:
                ret[:, feature] = self.get_undiscretize_values(
                    feature, ret[:, feature].astype(int)
                )
        return ret


class StatsDiscretizer(BaseDiscretizer):
    """
        Discretizer whose bin borders are read from the supplied data_stats
        (used when discretize_continuous is true and stats are provided).
    """

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
                 data_stats=None):
        # Pure pass-through to the base initializer.
        BaseDiscretizer.__init__(
            self,
            data,
            categorical_features,
            feature_names,
            labels=labels,
            random_state=random_state,
            data_stats=data_stats,
        )

    def bins(self, data, labels):
        """Returns the borders stored under data_stats["bins"], one array per
        feature to discretize; features without an entry are left out."""
        stats_bins = self.data_stats.get("bins")
        if stats_bins is None:
            return []
        candidates = (stats_bins.get(feature) for feature in self.to_discretize)
        return [np.array(borders) for borders in candidates if borders is not None]


class QuartileDiscretizer(BaseDiscretizer):
    """Discretizer that uses the 25th, 50th and 75th percentiles of each
    continuous feature as bin borders."""

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        # Pure pass-through to the base initializer.
        BaseDiscretizer.__init__(
            self,
            data,
            categorical_features,
            feature_names,
            labels=labels,
            random_state=random_state,
        )

    def bins(self, data, labels):
        """Returns one array of quartile borders per feature to discretize."""
        return [
            np.array(np.percentile(data[:, column], [25, 50, 75]))
            for column in self.to_discretize
        ]


class DecileDiscretizer(BaseDiscretizer):
    """Discretizer that uses the 10th, 20th, ..., 90th percentiles of each
    continuous feature as bin borders."""

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        # Pure pass-through to the base initializer.
        BaseDiscretizer.__init__(
            self,
            data,
            categorical_features,
            feature_names,
            labels=labels,
            random_state=random_state,
        )

    def bins(self, data, labels):
        """Returns one array of decile borders per feature to discretize."""
        decile_points = [10, 20, 30, 40, 50, 60, 70, 80, 90]
        return [
            np.array(np.percentile(data[:, column], decile_points))
            for column in self.to_discretize
        ]


class EntropyDiscretizer(BaseDiscretizer):
    """Discretizer whose cut points are the split thresholds of a shallow
    entropy-criterion decision tree fitted per feature against the labels."""

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        """Check that labels were supplied, then defer to BaseDiscretizer.__init__.

        Raises:
            ValueError: if labels is None.
        """
        if labels is None:
            raise ValueError('Labels must be not None when using \
                             EntropyDiscretizer')
        BaseDiscretizer.__init__(
            self, data, categorical_features, feature_names,
            labels=labels, random_state=random_state)

    def bins(self, data, labels):
        """Return sorted tree split thresholds for every column in self.to_discretize.

        Args:
            data: 2d array indexed as data[:, column].
            labels: targets the per-column decision tree is fitted against.

        Returns:
            A list with one numpy array of cut points per discretized column.
            When the tree makes no split, the column median is the single cut point.
        """
        cut_points = []
        for column in self.to_discretize:
            # max_depth=3 caps the tree at 8 leaves, i.e. at most 8 bins.
            tree = sklearn.tree.DecisionTreeClassifier(
                criterion='entropy',
                max_depth=3,
                random_state=self.random_state)
            single_feature = np.reshape(data[:, column], (-1, 1))
            tree.fit(single_feature, labels)

            # Keep only nodes that have a left child; the rest are filtered out.
            has_left_child = tree.tree_.children_left > -1
            thresholds = tree.tree_.threshold[has_left_child]

            if thresholds.shape[0] == 0:
                thresholds = np.array([np.median(data[:, column])])
            else:
                thresholds = np.sort(thresholds)
            cut_points.append(thresholds)

        return cut_points

In [8]:
!pip install pyDOE2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyDOE2
  Downloading pyDOE2-1.3.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyDOE2
  Building wheel for pyDOE2 (setup.py) ... [?25l[?25hdone
  Created wheel for pyDOE2: filename=pyDOE2-1.3.0-py3-none-any.whl size=25521 sha256=3c52a10bf6856df5d8215daf3654d9e8090934c5f48d4613400c7316625e142e
  Stored in directory: /root/.cache/pip/wheels/49/91/2d/d08e80806bf7756193541f6c03c0492af288fcd6158d3d0998
Successfully built pyDOE2
Installing collected packages: pyDOE2
Successfully installed pyDOE2-1.3.0
[0m

#### Tabular LIME

In [9]:
"""
Functions for explaining classifiers that use tabular data (matrices).
"""
import collections
import copy
from functools import partial
import json
import warnings

import numpy as np
import scipy as sp
import sklearn
import sklearn.preprocessing
from sklearn.utils import check_random_state
from pyDOE2 import lhs
from scipy.stats.distributions import norm



class TableDomainMapper(DomainMapper):
    """Translates feature ids into names and renders the explained row as a table."""

    def __init__(self, feature_names, feature_values, scaled_row,
                 categorical_features, discretized_feature_names=None,
                 feature_indexes=None):
        """Store the per-feature metadata of the explained row.

        Args:
            feature_names: list of feature names, in order
            feature_values: list of strings with the values of the original row
            scaled_row: scaled row (dense array or scipy sparse matrix)
            categorical_features: list of categorical features ids (ints)
            discretized_feature_names: optional names that take precedence
                over feature_names in map_exp_ids
            feature_indexes: optional feature indexes used in the sparse case
        """
        self.feature_names = feature_names
        self.exp_feature_names = feature_names
        self.discretized_feature_names = discretized_feature_names
        self.feature_values = feature_values
        self.feature_indexes = feature_indexes
        self.scaled_row = scaled_row
        self.categorical_features = categorical_features
        # A sparse row is never flagged as all-categorical.
        self.all_categorical = (
            not sp.sparse.issparse(scaled_row)
            and len(categorical_features) == len(scaled_row))

    def map_exp_ids(self, exp):
        """Replace the feature id of every explanation entry with its name.

        Args:
            exp: list of tuples [(id, weight), (id,weight)]

        Returns:
            list of tuples (feature_name, weight); discretized names are used
            when they were provided to the constructor.
        """
        if self.discretized_feature_names is None:
            lookup = self.exp_feature_names
        else:
            lookup = self.discretized_feature_names
        return [(lookup[entry[0]], entry[1]) for entry in exp]

    def visualize_instance_html(self,
                                exp,
                                label,
                                div_name,
                                exp_object_name,
                                show_table=True,
                                show_all=False):
        """Build the JavaScript call that renders the explained row as a table.

        Args:
             exp: list of tuples [(id, weight), (id,weight)]
             label: label id (integer)
             div_name: name of div object to be used for rendering(in js)
             exp_object_name: name of js explanation object
             show_table: if False, don't show table visualization.
             show_all: if True, show zero-weighted features in the table.

        Returns:
            The JavaScript snippet as a string, or '' when show_table is False.
        """
        if not show_table:
            return ''

        # Dense weight vector: zero everywhere except the explained features.
        weights = [0] * len(self.feature_names)
        for entry in exp:
            weights[entry[0]] = entry[1]

        if self.feature_indexes is None:
            rows = list(zip(self.exp_feature_names,
                            self.feature_values,
                            weights))
            if not show_all:
                rows = [rows[entry[0]] for entry in exp]
        else:
            # Sparse case: only display the non-zero values and importances
            sparse_names = [self.exp_feature_names[i] for i in self.feature_indexes]
            sparse_weights = [weights[i] for i in self.feature_indexes]
            if show_all:
                rows = list(zip(sparse_names,
                                self.feature_values,
                                sparse_weights))
            else:
                row_by_index = {
                    index: (fname, fvalue, fweight)
                    for index, fname, fvalue, fweight in zip(self.feature_indexes,
                                                             sparse_names,
                                                             self.feature_values,
                                                             sparse_weights)
                }
                rows = [row_by_index.get(entry[0], (str(entry[0]), 0.0, 0.0))
                        for entry in exp]

        serialized_rows = json.dumps(rows, ensure_ascii=False)
        return u'''
            %s.show_raw_tabular(%s, %d, %s);
        ''' % (exp_object_name, serialized_rows, label, div_name)


class LimeTabularExplainer(object):
    """Explains predictions on tabular (i.e. matrix) data.
    For numerical features, perturb them by sampling from a Normal(0,1) and
    doing the inverse operation of mean-centering and scaling, according to the
    means and stds in the training data. For categorical features, perturb by
    sampling according to the training distribution, and making a binary
    feature that is 1 when the value is the same as the instance being
    explained."""

    def __init__(self,
                 training_data,
                 mode="classification",
                 training_labels=None,
                 feature_names=None,
                 categorical_features=None,
                 categorical_names=None,
                 kernel_width=None,
                 kernel=None,
                 verbose=False,
                 class_names=None,
                 feature_selection='auto',
                 discretize_continuous=True,
                 discretizer='quartile',
                 sample_around_instance=False,
                 random_state=None,
                 training_data_stats=None):
        """Set up the discretizer, similarity kernel, scaler and the per-feature
        value/frequency tables used to perturb instances.

        Args:
            training_data: numpy 2d array
            mode: "classification" or "regression"
            training_labels: labels for training data. Not required, but may be
                used by discretizer.
            feature_names: list of names (strings) corresponding to the columns
                in the training data. Defaults to the column indices as strings.
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. Everything else will be considered
                continuous. Values in these columns MUST be integers.
            categorical_names: map from int to list of names, where
                categorical_names[x][y] represents the name of the yth value of
                column x.
            kernel_width: kernel width for the exponential kernel.
                If None, defaults to sqrt (number of columns) * 0.75
            kernel: similarity kernel that takes euclidean distances and kernel
                width as input and outputs weights in (0,1). If None, defaults to
                an exponential kernel.
            verbose: if true, print local prediction values from linear model
            class_names: list of class names, ordered according to whatever the
                classifier is using. If not present, class names will be '0',
                '1', ...
            feature_selection: feature selection method. can be
                'forward_selection', 'lasso_path', 'none' or 'auto'.
                See function 'explain_instance_with_data' in lime_base.py for
                details on what each of the options does.
            discretize_continuous: if True (and training_data is not sparse),
                all non-categorical features will be discretized using
                `discretizer`.
            discretizer: only matters if discretize_continuous is True
                and data is not sparse. Options are 'quartile', 'decile',
                'entropy' or a BaseDiscretizer instance.
            sample_around_instance: if True, will sample continuous features
                in perturbed samples from a normal centered at the instance
                being explained. Otherwise, the normal is centered on the mean
                of the feature data.
            random_state: an integer or numpy.RandomState that will be used to
                generate random numbers. If None, the random state will be
                initialized using the internal numpy seed.
            training_data_stats: a dict object having the details of training data
                statistics. If None, the statistics are computed from
                training_data. When given, the per-feature values and
                frequencies are read from it, and (if discretization is active)
                a StatsDiscretizer built from it replaces `discretizer`.
                Must have the following keys:
                "means", "mins", "maxs", "stds", "feature_values",
                "feature_frequencies"

        Raises:
            Exception: if training_data_stats is given but lacks a required key.
            ValueError: if discretization is active and `discretizer` is not
                one of the accepted options.
        """
        self.random_state = check_random_state(random_state)
        self.mode = mode
        self.categorical_names = categorical_names or {}
        self.sample_around_instance = sample_around_instance
        self.training_data_stats = training_data_stats

        # Fail early with a clear error if supplied stats are missing required keys
        if self.training_data_stats:
            self.validate_training_data_stats(self.training_data_stats)

        if categorical_features is None:
            categorical_features = []
        if feature_names is None:
            feature_names = [str(i) for i in range(training_data.shape[1])]

        self.categorical_features = list(categorical_features)
        self.feature_names = list(feature_names)

        self.discretizer = None
        if discretize_continuous and not sp.sparse.issparse(training_data):
            # Supplied stats take precedence: the `discretizer` argument is
            # replaced by a StatsDiscretizer, which the isinstance branch
            # below then installs.
            if self.training_data_stats:
                discretizer = StatsDiscretizer(
                    training_data, self.categorical_features,
                    self.feature_names, labels=training_labels,
                    data_stats=self.training_data_stats,
                    random_state=self.random_state)

            if discretizer == 'quartile':
                self.discretizer = QuartileDiscretizer(
                        training_data, self.categorical_features,
                        self.feature_names, labels=training_labels,
                        random_state=self.random_state)
            elif discretizer == 'decile':
                self.discretizer = DecileDiscretizer(
                        training_data, self.categorical_features,
                        self.feature_names, labels=training_labels,
                        random_state=self.random_state)
            elif discretizer == 'entropy':
                self.discretizer = EntropyDiscretizer(
                        training_data, self.categorical_features,
                        self.feature_names, labels=training_labels,
                        random_state=self.random_state)
            elif isinstance(discretizer, BaseDiscretizer):
                self.discretizer = discretizer
            else:
                raise ValueError('''Discretizer must be 'quartile',''' +
                                 ''' 'decile', 'entropy' or a''' +
                                 ''' BaseDiscretizer instance''')
            # Once discretized, every column is treated as categorical.
            self.categorical_features = list(range(training_data.shape[1]))

            # Get the discretized_training_data when the stats are not provided
            if(self.training_data_stats is None):
                discretized_training_data = self.discretizer.discretize(
                    training_data)

        if kernel_width is None:
            kernel_width = np.sqrt(training_data.shape[1]) * .75
        kernel_width = float(kernel_width)

        if kernel is None:
            # Default similarity kernel: sqrt(exp(-d^2 / kernel_width^2)).
            def kernel(d, kernel_width):
                return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

        # Bind the width so the kernel can be called with distances only.
        kernel_fn = partial(kernel, kernel_width=kernel_width)

        self.feature_selection = feature_selection
        self.base = LimeBase(kernel_fn, verbose, random_state=self.random_state)
        self.class_names = class_names

        # The scaler is always fitted on training_data, even when
        # training data stats are provided.
        self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        self.scaler.fit(training_data)
        self.feature_values = {}
        self.feature_frequencies = {}

        # Build, per categorical column, the observed values and their
        # relative frequencies.
        for feature in self.categorical_features:
            if training_data_stats is None:
                if self.discretizer is not None:
                    column = discretized_training_data[:, feature]
                else:
                    column = training_data[:, feature]

                feature_count = collections.Counter(column)
                values, frequencies = map(list, zip(*(sorted(feature_count.items()))))
            else:
                values = training_data_stats["feature_values"][feature]
                frequencies = training_data_stats["feature_frequencies"][feature]

            self.feature_values[feature] = values
            self.feature_frequencies[feature] = (np.array(frequencies) /
                                                 float(sum(frequencies)))
            # Neutralize the fitted scaling for this column (mean 0, scale 1).
            self.scaler.mean_[feature] = 0
            self.scaler.scale_[feature] = 1

    @staticmethod
    def convert_and_round(values):
        return ['%.2f' % v for v in values]

    @staticmethod
    def validate_training_data_stats(training_data_stats):
        """
            Method to validate the structure of training data stats
        """
        stat_keys = list(training_data_stats.keys())
        valid_stat_keys = ["means", "mins", "maxs", "stds", "feature_values", "feature_frequencies"]
        missing_keys = list(set(valid_stat_keys) - set(stat_keys))
        if len(missing_keys) > 0:
            raise Exception("Missing keys in training_data_stats. Details: %s" % (missing_keys))

    def explain_instance(self,
                         data_row,
                         predict_fn,
                         labels=(1,),
                         top_labels=None,
                         num_features=10,
                         num_samples=5000,
                         distance_metric='euclidean',
                         model_regressor=None,
                         sampling_method='gaussian'):
        """Generates explanations for a prediction.
        First, we generate neighborhood data by randomly perturbing features
        from the instance (see __data_inverse). We then learn locally weighted
        linear models on this neighborhood data to explain each of the classes
        in an interpretable way (see lime_base.py).
        Args:
            data_row: 1d numpy array or scipy.sparse matrix, corresponding to a row
            predict_fn: prediction function. For classifiers, this should be a
                function that takes a numpy array and outputs prediction
                probabilities. For regressors, this takes a numpy array and
                returns the predictions. For ScikitClassifiers, this is
                `classifier.predict_proba()`. For ScikitRegressors, this
                is `regressor.predict()`. The prediction function needs to work
                on multiple feature vectors (the vectors randomly perturbed
                from the data_row).
            labels: iterable with labels to be explained. Ignored in
                regression mode, where the single label 0 is explained.
            top_labels: if not None, ignore labels and produce explanations for
                the K labels with highest prediction probabilities, where K is
                this parameter.
            num_features: maximum number of features present in explanation
            num_samples: size of the neighborhood to learn the linear model
            distance_metric: the distance metric to use for weights.
            model_regressor: sklearn regressor to use in explanation. Defaults
                to Ridge regression in LimeBase. Must have model_regressor.coef_
                and 'sample_weight' as a parameter to model_regressor.fit()
            sampling_method: Method to sample synthetic data. Defaults to Gaussian
                sampling ('gaussian'). Can also use Latin Hypercube Sampling
                ('lhs').
        Returns:
            An Explanation object (see explanation.py) with the corresponding
            explanations.
        Raises:
            NotImplementedError: in classification mode, if predict_fn returns
                a 1-dimensional array (no probability scores).
            ValueError: if the prediction output has an unsupported number of
                dimensions for the current mode.
        """
        if sp.sparse.issparse(data_row) and not sp.sparse.isspmatrix_csr(data_row):
            # Preventative code: if sparse, convert to csr format if not in csr format already
            data_row = data_row.tocsr()
        data, inverse = self.__data_inverse(data_row, num_samples, sampling_method)
        if sp.sparse.issparse(data):
            # Note in sparse case we don't subtract mean since data would become dense
            scaled_data = data.multiply(self.scaler.scale_)
            # Multiplying with csr matrix can return a coo sparse matrix
            if not sp.sparse.isspmatrix_csr(scaled_data):
                scaled_data = scaled_data.tocsr()
        else:
            scaled_data = (data - self.scaler.mean_) / self.scaler.scale_
        # Distance of every perturbed sample to the explained instance (row 0).
        distances = sklearn.metrics.pairwise_distances(
                scaled_data,
                scaled_data[0].reshape(1, -1),
                metric=distance_metric
        ).ravel()

        yss = predict_fn(inverse)

        # for classification, the model needs to provide a list of tuples - classes
        # along with prediction probabilities
        if self.mode == "classification":
            if len(yss.shape) == 1:
                raise NotImplementedError("LIME does not currently support "
                                          "classifier models without probability "
                                          "scores. If this conflicts with your "
                                          "use case, please let us know: "
                                          "https://github.com/datascienceinc/lime/issues/16")
            elif len(yss.shape) == 2:
                if self.class_names is None:
                    self.class_names = [str(x) for x in range(yss[0].shape[0])]
                else:
                    self.class_names = list(self.class_names)
                if not np.allclose(yss.sum(axis=1), 1.0):
                    warnings.warn("""
                    Prediction probabilties do not sum to 1, and
                    thus does not constitute a probability space.
                    Check that you classifier outputs probabilities
                    (Not log probabilities, or actual class predictions).
                    """)
            else:
                raise ValueError("Your model outputs "
                                 "arrays with {} dimensions".format(len(yss.shape)))

        # for regression, the output should be a one-dimensional array of predictions
        else:
            # Accept (n, k) output by keeping the first value of every row.
            if len(yss.shape) != 1 and len(yss[0].shape) == 1:
                yss = np.array([v[0] for v in yss])
            # Explicit check instead of `assert`, which `python -O` strips.
            if not (isinstance(yss, np.ndarray) and len(yss.shape) == 1):
                raise ValueError("Your model needs to output single-dimensional "
                                 "numpy arrays, not arrays of shape {}".format(yss.shape))

            predicted_value = yss[0]
            min_y = min(yss)
            max_y = max(yss)

            # add a dimension to be compatible with downstream machinery
            yss = yss[:, np.newaxis]

        feature_names = copy.deepcopy(self.feature_names)
        if feature_names is None:
            feature_names = [str(x) for x in range(data_row.shape[0])]

        if sp.sparse.issparse(data_row):
            values = self.convert_and_round(data_row.data)
            feature_indexes = data_row.indices
        else:
            values = self.convert_and_round(data_row)
            feature_indexes = None

        # Render categorical features as "name=value" with the value 'True';
        # features handled by the discretizer are named further below.
        for i in self.categorical_features:
            if self.discretizer is not None and i in self.discretizer.lambdas:
                continue
            name = int(data_row[i])
            if i in self.categorical_names:
                name = self.categorical_names[i][name]
            feature_names[i] = '%s=%s' % (feature_names[i], name)
            values[i] = 'True'
        categorical_features = self.categorical_features

        discretized_feature_names = None
        if self.discretizer is not None:
            categorical_features = range(data.shape[1])
            discretized_instance = self.discretizer.discretize(data_row)
            discretized_feature_names = copy.deepcopy(feature_names)
            for f in self.discretizer.names:
                discretized_feature_names[f] = self.discretizer.names[f][int(
                        discretized_instance[f])]

        domain_mapper = TableDomainMapper(feature_names,
                                          values,
                                          scaled_data[0],
                                          categorical_features=categorical_features,
                                          discretized_feature_names=discretized_feature_names,
                                          feature_indexes=feature_indexes)
        ret_exp = Explanation(domain_mapper,
                              mode=self.mode,
                              class_names=self.class_names)
        if self.mode == "classification":
            ret_exp.predict_proba = yss[0]
            if top_labels:
                # Highest-probability labels first.
                labels = np.argsort(yss[0])[-top_labels:]
                ret_exp.top_labels = list(labels)
                ret_exp.top_labels.reverse()
        else:
            ret_exp.predicted_value = predicted_value
            ret_exp.min_value = min_y
            ret_exp.max_value = max_y
            labels = [0]
        for label in labels:
            (ret_exp.intercept[label],
             ret_exp.local_exp[label],
             ret_exp.score[label],
             ret_exp.local_pred[label]) = self.base.explain_instance_with_data(
                    scaled_data,
                    yss,
                    distances,
                    label,
                    num_features,
                    model_regressor=model_regressor,
                    feature_selection=self.feature_selection)

        if self.mode == "regression":
            # Expose the regression explanation under label 1 as well, and
            # store the sign-flipped weights under label 0.
            ret_exp.intercept[1] = ret_exp.intercept[0]
            ret_exp.local_exp[1] = [x for x in ret_exp.local_exp[0]]
            ret_exp.local_exp[0] = [(i, -1 * j) for i, j in ret_exp.local_exp[1]]

        return ret_exp

    def __data_inverse(self,
                       data_row,
                       num_samples,
                       sampling_method):
        """Generates a neighborhood around a prediction.
        For numerical features, perturb them by sampling from a Normal(0,1) and
        doing the inverse operation of mean-centering and scaling, according to
        the means and stds in the training data. For categorical features,
        perturb by sampling according to the training distribution, and making
        a binary feature that is 1 when the value is the same as the instance
        being explained.
        Args:
            data_row: 1d numpy array, corresponding to a row
            num_samples: size of the neighborhood to learn the linear model
            sampling_method: 'gaussian' or 'lhs'
        Returns:
            A tuple (data, inverse), where:
                data: dense num_samples * K matrix, where categorical features
                are encoded with either 0 (not equal to the corresponding value
                in data_row) or 1. The first row is the original instance.
                inverse: same as data, except the categorical features are not
                binary, but categorical (as the original data)
        """
        is_sparse = sp.sparse.issparse(data_row)
        if is_sparse:
            num_cols = data_row.shape[1]
            data = sp.sparse.csr_matrix((num_samples, num_cols), dtype=data_row.dtype)
        else:
            num_cols = data_row.shape[0]
            data = np.zeros((num_samples, num_cols))
        categorical_features = range(num_cols)
        if self.discretizer is None:
            instance_sample = data_row
            scale = self.scaler.scale_
            mean = self.scaler.mean_
            if is_sparse:
                # Perturb only the non-zero values
                non_zero_indexes = data_row.nonzero()[1]
                num_cols = len(non_zero_indexes)
                instance_sample = data_row[:, non_zero_indexes]
                scale = scale[non_zero_indexes]
                mean = mean[non_zero_indexes]

            if sampling_method == 'gaussian':
                data = self.random_state.normal(0, 1, num_samples * num_cols
                                                ).reshape(num_samples, num_cols)
                data = np.array(data)
            elif sampling_method == 'lhs':
                data = lhs(num_cols, samples=num_samples
                           ).reshape(num_samples, num_cols)
                means = np.zeros(num_cols)
                stdvs = np.array([1]*num_cols)
                for i in range(num_cols):
                    data[:, i] = norm(loc=means[i], scale=stdvs[i]).ppf(data[:, i])
                data = np.array(data)
            else:
                warnings.warn('''Invalid input for sampling_method.
                                 Defaulting to Gaussian sampling.''', UserWarning)
                data = self.random_state.normal(0, 1, num_samples * num_cols
                                                ).reshape(num_samples, num_cols)
                data = np.array(data)

            if self.sample_around_instance:
                data = data * scale + instance_sample
            else:
                data = data * scale + mean
            if is_sparse:
                if num_cols == 0:
                    data = sp.sparse.csr_matrix((num_samples,
                                                 data_row.shape[1]),
                                                dtype=data_row.dtype)
                else:
                    indexes = np.tile(non_zero_indexes, num_samples)
                    indptr = np.array(
                        range(0, len(non_zero_indexes) * (num_samples + 1),
                              len(non_zero_indexes)))
                    data_1d_shape = data.shape[0] * data.shape[1]
                    data_1d = data.reshape(data_1d_shape)
                    data = sp.sparse.csr_matrix(
                        (data_1d, indexes, indptr),
                        shape=(num_samples, data_row.shape[1]))
            categorical_features = self.categorical_features
            first_row = data_row
        else:
            first_row = self.discretizer.discretize(data_row)
        data[0] = data_row.copy()
        inverse = data.copy()
        for column in categorical_features:
            values = self.feature_values[column]
            freqs = self.feature_frequencies[column]
            inverse_column = self.random_state.choice(values, size=num_samples,
                                                      replace=True, p=freqs)
            binary_column = (inverse_column == first_row[column]).astype(int)
            binary_column[0] = 1
            inverse_column[0] = data[0, column]
            data[:, column] = binary_column
            inverse[:, column] = inverse_column
        if self.discretizer is not None:
            inverse[1:] = self.discretizer.undiscretize(inverse[1:])
        inverse[0] = data_row
        return data, inverse


class RecurrentTabularExplainer(LimeTabularExplainer):
    """Tabular LIME explainer for sequence models that take 3D input.

    Keras-style recurrent networks consume arrays shaped
    (n_samples, n_timesteps, n_features). This subclass flattens every
    sample into one row laid out feature by feature, i.e.
    (val1_t1, val1_t2, val1_t3, ..., val2_t1, ..., valn_tn), hands that 2D
    view to LimeTabularExplainer, and wraps the prediction function so the
    model still receives 3D arrays. Training and test data can therefore be
    passed in exactly the shape the network itself expects.
    """

    def __init__(self, training_data, mode="classification",
                 training_labels=None, feature_names=None,
                 categorical_features=None, categorical_names=None,
                 kernel_width=None, kernel=None, verbose=False, class_names=None,
                 feature_selection='auto', discretize_continuous=True,
                 discretizer='quartile', random_state=None):
        """Flatten the 3D training data and initialise the tabular explainer.

        Every argument except training_data and feature_names is forwarded
        untouched to LimeTabularExplainer.__init__.

        Args:
            training_data: numpy 3d array with shape
                (n_samples, n_timesteps, n_features).
            mode: "classification" or "regression".
            training_labels: labels for the training data. Optional, but
                may be used by the discretizer.
            feature_names: one name (string) per feature of the 3D input.
                When None, 'feature0', 'feature1', ... are generated. Each
                name is expanded into n_timesteps names with the suffix
                '_t-<k>', where k counts down to 0 at the last timestep.
            categorical_features: list of indices (ints) of the categorical
                columns; everything else is treated as continuous. Values
                in these columns MUST be integers.
            categorical_names: map from int to list of names, where
                categorical_names[x][y] is the name of the yth value of
                column x.
            kernel_width: kernel width for the exponential kernel. If
                None, defaults to sqrt(number of columns) * 0.75.
            kernel: similarity kernel that takes euclidean distances and
                kernel width as input and outputs weights in (0,1). If
                None, defaults to an exponential kernel.
            verbose: if true, print local prediction values from the
                linear model.
            class_names: list of class names, ordered according to
                whatever the classifier is using. If not present, class
                names will be '0', '1', ...
            feature_selection: feature selection method. Can be
                'forward_selection', 'lasso_path', 'none' or 'auto'. See
                'explain_instance_with_data' in lime_base.py for details.
            discretize_continuous: if True, all non-categorical features
                will be discretized into quartiles.
            discretizer: only matters if discretize_continuous is True.
                Options are 'quartile', 'decile', 'entropy' or a
                BaseDiscretizer instance.
            random_state: an integer or numpy.RandomState used to generate
                random numbers. If None, the random state is initialised
                using the internal numpy seed.
        """
        n_samples, n_timesteps, n_features = training_data.shape
        self.n_timesteps = n_timesteps
        self.n_features = n_features

        # Move the feature axis in front of the time axis, then collapse
        # both so that each sample becomes one row with all timesteps of
        # feature 0 first, then all timesteps of feature 1, and so on.
        feature_major = np.transpose(training_data, (0, 2, 1))
        flattened = feature_major.reshape(n_samples,
                                          n_timesteps * n_features)

        base_names = feature_names
        if base_names is None:
            base_names = ['feature%d' % idx for idx in range(n_features)]

        # One column name per (feature, timestep) pair, in the same order
        # as the flattened columns.
        lagged_names = []
        for base in base_names:
            for step in range(n_timesteps):
                lag = n_timesteps - (step + 1)
                lagged_names.append('{}_t-{}'.format(base, lag))

        # Everything else is delegated to the tabular explainer.
        super().__init__(
            flattened,
            mode=mode,
            training_labels=training_labels,
            feature_names=lagged_names,
            categorical_features=categorical_features,
            categorical_names=categorical_names,
            kernel_width=kernel_width,
            kernel=kernel,
            verbose=verbose,
            class_names=class_names,
            feature_selection=feature_selection,
            discretize_continuous=discretize_continuous,
            discretizer=discretizer,
            random_state=random_state)

    def _make_predict_proba(self, func):
        """Wrap a 3D-input prediction function so it accepts flat 2D rows.

        LIME perturbs the flattened representation, but the wrapped model
        expects (n_samples, n_timesteps, n_features). The returned callable
        restores that shape before delegating to func.

        Args:
            func: prediction function of the recurrent model.
        Returns:
            A function taking a 2D array of flattened rows and returning
            whatever func returns for the reshaped 3D array.
        """

        def predict_proba(X):
            # Undo the flattening done in __init__/explain_instance:
            # rows -> (samples, features, timesteps) -> swap last two axes.
            feature_major = X.reshape(
                (X.shape[0], self.n_features, self.n_timesteps))
            return func(np.transpose(feature_major, (0, 2, 1)))

        return predict_proba

    def explain_instance(self, data_row, classifier_fn, labels=(1,),
                         top_labels=None, num_features=10, num_samples=5000,
                         distance_metric='euclidean', model_regressor=None):
        """Generates explanations for a prediction on one 2D sequence.

        The instance is flattened to the layout used at construction time,
        the classifier is wrapped to re-inflate perturbed rows to 3D, and
        the work is then delegated to LimeTabularExplainer.explain_instance.

        Args:
            data_row: 2d numpy array, corresponding to a row
                (transposed and flattened to n_timesteps * n_features
                values before being explained).
            classifier_fn: classifier prediction probability function,
                which takes a numpy array and outputs prediction
                probabilities. For ScikitClassifiers, this is
                classifier.predict_proba.
            labels: iterable with labels to be explained.
            top_labels: if not None, ignore labels and produce explanations
                for the K labels with highest prediction probabilities,
                where K is this parameter.
            num_features: maximum number of features present in explanation
            num_samples: size of the neighborhood to learn the linear model
            distance_metric: the distance metric to use for weights.
            model_regressor: sklearn regressor to use in explanation.
                Defaults to Ridge regression in LimeBase. Must have
                model_regressor.coef_ and 'sample_weight' as a parameter
                to model_regressor.fit()
        Returns:
            An Explanation object (see explanation.py) with the
            corresponding explanations.
        """
        # Same feature-major flattening that was applied to the training data.
        flat_row = data_row.T.reshape(self.n_timesteps * self.n_features)

        # The base class only ever sees 2D data; the wrapper hides that
        # from the model.
        wrapped_fn = self._make_predict_proba(classifier_fn)

        return super().explain_instance(
            flat_row, wrapped_fn,
            labels=labels,
            top_labels=top_labels,
            num_features=num_features,
            num_samples=num_samples,
            distance_metric=distance_metric,
            model_regressor=model_regressor)

#### Text LIME

In [10]:
"""
Functions for explaining text classifiers.
"""
from functools import partial
import itertools
import json
import re

import numpy as np
import scipy as sp
import sklearn
from sklearn.utils import check_random_state


class TextDomainMapper(DomainMapper):
    """Maps feature ids to words or word-positions"""

    def __init__(self, indexed_string):
        """Stores the indexed document that feature ids refer to.

        Args:
            indexed_string: lime_text.IndexedString, original string
        """
        self.indexed_string = indexed_string

    def map_exp_ids(self, exp, positions=False):
        """Translates feature ids in an explanation into readable labels.

        Args:
            exp: list of tuples [(id, weight), (id,weight)]
            positions: if True, append the character offsets of every
                occurrence to the word, joined with '-'
        Returns:
            list of tuples (word, weight), or (word_positions, weight) if
            positions is True.
            examples: ('bad', 1) or ('bad_3-6-12', 1)
        """
        doc = self.indexed_string
        if not positions:
            return [(doc.word(entry[0]), entry[1]) for entry in exp]

        labelled = []
        for entry in exp:
            feature_id = entry[0]
            token = doc.word(feature_id)
            offsets = '-'.join(
                str(offset) for offset in doc.string_position(feature_id))
            labelled.append(('%s_%s' % (token, offsets), entry[1]))
        return labelled

    def visualize_instance_html(self, exp, label, div_name, exp_object_name,
                                text=True, opacity=True):
        """Builds the js call that renders the text with highlighted words.

        Args:
             exp: list of tuples [(id, weight), (id,weight)]
             label: label id (integer)
             div_name: name of div object to be used for rendering(in js)
             exp_object_name: name of js explanation object
             text: if False, return empty
             opacity: if True, fade colors according to weight
        Returns:
            A string holding a single `show_raw_text` javascript call, or
            an empty string when text is False.
        """
        if not text:
            return u''

        doc = self.indexed_string
        raw = doc.raw_string().encode('utf-8', 'xmlcharrefreplace')
        # Markup-significant characters are replaced with '|' so the raw
        # document cannot inject tags into the rendered page.
        displayed = re.sub(r'[<>&]', '|', raw.decode('utf-8'))

        # One (word, start offset, weight) triple per occurrence of every
        # explained feature; offsets are cast to plain ints for json.
        occurrences = []
        for entry in exp:
            token = doc.word(entry[0])
            starts = doc.string_position(entry[0])
            weight = entry[1]
            for start in starts:
                occurrences.append((token, int(start), weight))

        call_template = '''
            %s.show_raw_text(%s, %d, %s, %s, %s);
            '''
        return call_template % (
            exp_object_name, json.dumps(occurrences), label,
            json.dumps(displayed), div_name, json.dumps(opacity))


class IndexedString(object):
    """String with various indexes.

    The raw text is cut into consecutive pieces (words and the separators
    between them) so that joining `as_list` reproduces the input. Words
    receive integer ids; `positions` maps an id to the index (or indices,
    when bow=True) of the matching pieces in `as_list`, and `string_start`
    gives the character offset at which each piece begins.
    """

    def __init__(self, raw_string, split_expression=r'\W+', bow=True,
                 mask_string=None):
        """Initializer.
        Args:
            raw_string: string with raw text in it
            split_expression: Regex string or callable. If regex string, will be used with re.split.
                If callable, the function should return an iterable of
                tokens (a list, or a one-shot iterator such as a generator).
            bow: if True, a word is the same everywhere in the text - i.e. we
                 will index multiple occurrences of the same word. If False,
                 order matters, so that the same word will have different ids
                 according to position.
            mask_string: If not None, replace words with this if bow=False
                if None, default value is UNKWORDZ
        Raises:
            ValueError: if a callable split_expression yields a token that
                cannot be located in raw_string.
        """
        self.raw = raw_string
        self.mask_string = 'UNKWORDZ' if mask_string is None else mask_string

        if callable(split_expression):
            # Materialise the tokens once: they are needed both for the
            # segmentation and for the membership set below, and a
            # generator would be exhausted after the first use.
            tokens = list(split_expression(self.raw))
            self.as_list = self._segment_with_tokens(self.raw, tokens)
            tokens = set(tokens)

            def non_word(string):
                return string not in tokens

        else:
            # The split expression is wrapped in a capturing group, so
            # re.split keeps the separators in its output; empty/None
            # pieces are dropped. Keeping the separators is what allows
            # inverse_removing to rebuild the text. A piece is a separator
            # when the splitter matches at its start.
            splitter = re.compile(r'(%s)|$' % split_expression)
            self.as_list = [s for s in splitter.split(self.raw) if s]
            non_word = splitter.match

        self.as_np = np.array(self.as_list)
        # Character offset of every piece. Summing a list that starts with
        # 0 keeps the dtype integral even when there are no preceding
        # pieces (single-piece or empty documents).
        self.string_start = np.cumsum(
            [0] + [len(piece) for piece in self.as_list[:-1]])
        vocab = {}
        self.inverse_vocab = []
        self.positions = []
        self.bow = bow
        non_vocab = set()
        for i, word in enumerate(self.as_np):
            if word in non_vocab:
                continue
            if non_word(word):
                # Remember separators so the check runs once per distinct
                # piece.
                non_vocab.add(word)
                continue
            if bow:
                if word not in vocab:
                    vocab[word] = len(vocab)
                    self.inverse_vocab.append(word)
                    self.positions.append([])
                idx_word = vocab[word]
                self.positions[idx_word].append(i)
            else:
                self.inverse_vocab.append(word)
                self.positions.append(i)
        if not bow:
            # Explicit integer dtype: an empty list would otherwise become
            # a float64 array, which cannot be used as an index.
            self.positions = np.array(self.positions, dtype=int)

    def raw_string(self):
        """Returns the original raw string"""
        return self.raw

    def num_words(self):
        """Returns the number of tokens in the vocabulary for this document."""
        return len(self.inverse_vocab)

    def word(self, id_):
        """Returns the word that corresponds to id_ (int)"""
        return self.inverse_vocab[id_]

    def string_position(self, id_):
        """Returns a np array with the character offsets of id_ (int) occurrences"""
        if self.bow:
            return self.string_start[self.positions[id_]]
        else:
            return self.string_start[[self.positions[id_]]]

    def inverse_removing(self, words_to_remove):
        """Returns a string after removing the appropriate words.
        If self.bow is false, replaces word with self.mask_string
        (UNKWORDZ by default) instead of removing it.
        Args:
            words_to_remove: list of ids (ints) to remove
        Returns:
            original raw string with appropriate words removed.
        """
        mask = np.ones(self.as_np.shape[0], dtype='bool')
        mask[self.__get_idxs(words_to_remove)] = False
        if not self.bow:
            return ''.join(
                [self.as_list[i] if mask[i] else self.mask_string
                 for i in range(mask.shape[0])])
        return ''.join([self.as_list[v] for v in mask.nonzero()[0]])

    @staticmethod
    def _segment_with_tokens(text, tokens):
        """Segment a string around the tokens created by a passed-in tokenizer.

        Args:
            text: the full raw string.
            tokens: iterable of token strings, in the order they occur in
                text.
        Returns:
            list of strings alternating between tokens and the text found
            between them; joining the list reproduces text.
        Raises:
            ValueError: if a token does not occur at or after the end of
                the previous token.
        """
        list_form = []
        text_ptr = 0
        for token in tokens:
            # str.find scans from the current position without re-slicing
            # the text and reports a missing token as -1, including when
            # the text has already been fully consumed.
            token_start = text.find(token, text_ptr)
            if token_start < 0:
                raise ValueError("Tokenization produced tokens that do not belong in string!")
            if token_start > text_ptr:
                list_form.append(text[text_ptr:token_start])
            list_form.append(token)
            text_ptr = token_start + len(token)
        if text_ptr < len(text):
            list_form.append(text[text_ptr:])
        return list_form

    def __get_idxs(self, words):
        """Returns indexes (into as_list) of the pieces for the given word ids."""
        if self.bow:
            return list(itertools.chain.from_iterable(
                [self.positions[z] for z in words]))
        else:
            return self.positions[words]


class IndexedCharacters(object):
    """String with various indexes, where every character is a token.

    Mirrors the interface of IndexedString (raw_string, num_words, word,
    string_position, inverse_removing) at character granularity.
    """

    def __init__(self, raw_string, bow=True, mask_string=None):
        """Initializer.
        Args:
            raw_string: string with raw text in it
            bow: if True, a char is the same everywhere in the text - i.e. we
                 will index multiple occurrences of the same character. If False,
                 order matters, so that the same character will have different
                 ids according to position.
            mask_string: If not None, replace characters with this if bow=False
                if None, default value is chr(0)
        """
        self.raw = raw_string
        self.as_list = list(self.raw)
        self.as_np = np.array(self.as_list)
        self.mask_string = chr(0) if mask_string is None else mask_string
        # Every piece is exactly one character, so piece i starts at offset i.
        self.string_start = np.arange(len(self.raw))
        vocab = {}
        self.inverse_vocab = []
        self.positions = []
        self.bow = bow
        for i, char in enumerate(self.as_np):
            if bow:
                if char not in vocab:
                    vocab[char] = len(vocab)
                    self.inverse_vocab.append(char)
                    self.positions.append([])
                idx_char = vocab[char]
                self.positions[idx_char].append(i)
            else:
                self.inverse_vocab.append(char)
                self.positions.append(i)
        if not bow:
            # Explicit integer dtype: an empty list would otherwise become
            # a float64 array, which cannot be used as an index.
            self.positions = np.array(self.positions, dtype=int)

    def raw_string(self):
        """Returns the original raw string"""
        return self.raw

    def num_words(self):
        """Returns the number of tokens in the vocabulary for this document."""
        return len(self.inverse_vocab)

    def word(self, id_):
        """Returns the character that corresponds to id_ (int)"""
        return self.inverse_vocab[id_]

    def string_position(self, id_):
        """Returns a np array with indices to id_ (int) occurrences"""
        if self.bow:
            return self.string_start[self.positions[id_]]
        else:
            return self.string_start[[self.positions[id_]]]

    def inverse_removing(self, words_to_remove):
        """Returns a string after removing the appropriate characters.
        If self.bow is false, replaces each character with self.mask_string
        (chr(0) by default) instead of removing it.
        Args:
            words_to_remove: list of ids (ints) to remove
        Returns:
            original raw string with appropriate characters removed.
        """
        mask = np.ones(self.as_np.shape[0], dtype='bool')
        mask[self.__get_idxs(words_to_remove)] = False
        if not self.bow:
            return ''.join(
                [self.as_list[i] if mask[i] else self.mask_string
                 for i in range(mask.shape[0])])
        return ''.join([self.as_list[v] for v in mask.nonzero()[0]])

    def __get_idxs(self, words):
        """Returns indexes (into as_list) of the characters for the given ids."""
        if self.bow:
            return list(itertools.chain.from_iterable(
                [self.positions[z] for z in words]))
        else:
            return self.positions[words]


class LimeTextExplainer(object):
    """Explains text classifiers.

    Perturbed versions of a document are produced by hiding tokens; a
    locally weighted linear model is then fit on those perturbations (via
    LimeBase). Sample weights come from an exponential kernel applied to
    the distance (cosine by default) between the original and perturbed
    binary token vectors, and explanations are restricted to tokens that
    are present in the document."""

    def __init__(self,
                 kernel_width=25,
                 kernel=None,
                 verbose=False,
                 class_names=None,
                 feature_selection='auto',
                 split_expression=r'\W+',
                 bow=True,
                 mask_string=None,
                 random_state=None,
                 char_level=False):
        """Init function.

        Args:
            kernel_width: kernel width for the exponential kernel.
            kernel: similarity kernel called as kernel(d, kernel_width=...)
                with distances d; should output weights in (0,1). If None,
                defaults to sqrt(exp(-d**2 / kernel_width**2)).
            verbose: if true, print local prediction values from linear model
            class_names: list of class names, ordered according to whatever
                the classifier is using. If not present, class names will
                be '0', '1', ...
            feature_selection: feature selection method. can be
                'forward_selection', 'lasso_path', 'none' or 'auto'.
                See function 'explain_instance_with_data' in lime_base.py
                for details on what each of the options does.
            split_expression: Regex string or callable. If regex string,
                will be used with re.split. If callable, the function
                should return a list of tokens.
            bow: if True (bag of words), will perturb input data by removing
                all occurrences of individual words or characters.
                Explanations will be in terms of these words. Otherwise,
                will explain in terms of word-positions, so that a word may
                be important the first time it appears and unimportant the
                second. Only set to false if the classifier uses word order
                in some way (bigrams, etc), or if you set char_level=True.
            mask_string: String used to mask tokens or characters if
                bow=False. If None, will be 'UNKWORDZ' if char_level=False,
                chr(0) otherwise.
            random_state: an integer or numpy.RandomState that will be used
                to generate random numbers. If None, the random state will
                be initialized using the internal numpy seed.
            char_level: a boolean indicating that each character is treated
                as an independent occurrence in the string
        """
        if kernel is None:
            def kernel(d, kernel_width):
                scaled_sq = (d ** 2) / kernel_width ** 2
                return np.sqrt(np.exp(-scaled_sq))

        # Plain configuration first; none of it depends on the objects
        # built below.
        self.class_names = class_names
        self.vocabulary = None
        self.feature_selection = feature_selection
        self.bow = bow
        self.mask_string = mask_string
        self.split_expression = split_expression
        self.char_level = char_level

        self.random_state = check_random_state(random_state)
        # LimeBase receives the kernel with its width already bound.
        self.base = LimeBase(partial(kernel, kernel_width=kernel_width),
                             verbose,
                             random_state=self.random_state)

    def explain_instance(self,
                         text_instance,
                         classifier_fn,
                         labels=(1,),
                         top_labels=None,
                         num_features=10,
                         num_samples=5000,
                         distance_metric='cosine',
                         model_regressor=None):
        """Generates explanations for a prediction.

        Neighborhood data is generated by randomly hiding features from
        the instance (see __data_labels_distances). Locally weighted linear
        models are then learned on this neighborhood data to explain each
        of the classes in an interpretable way (see lime_base.py).

        Args:
            text_instance: raw text string to be explained.
            classifier_fn: classifier prediction probability function,
                which takes a list of d strings and outputs a (d, k) numpy
                array with prediction probabilities, where k is the number
                of classes. For ScikitClassifiers, this is
                classifier.predict_proba.
            labels: iterable with labels to be explained.
            top_labels: if not None, ignore labels and produce explanations
                for the K labels with highest prediction probabilities,
                where K is this parameter.
            num_features: maximum number of features present in explanation
            num_samples: size of the neighborhood to learn the linear model
            distance_metric: the distance metric to use for sample
                weighting, defaults to cosine similarity
            model_regressor: sklearn regressor to use in explanation.
                Defaults to Ridge regression in LimeBase. Must have
                model_regressor.coef_ and 'sample_weight' as a parameter to
                model_regressor.fit()
        Returns:
            An Explanation object (see explanation.py) with the
            corresponding explanations.
        """
        # Tokenize the document, either per character or with IndexedString.
        if self.char_level:
            indexed_string = IndexedCharacters(
                text_instance, bow=self.bow, mask_string=self.mask_string)
        else:
            indexed_string = IndexedString(
                text_instance, bow=self.bow,
                split_expression=self.split_expression,
                mask_string=self.mask_string)
        domain_mapper = TextDomainMapper(indexed_string)

        # Build the perturbed instances (binary rows marking hidden tokens
        # with 0), the classifier outputs for them and their distances.
        data, yss, distances = self.__data_labels_distances(
            indexed_string, classifier_fn, num_samples,
            distance_metric=distance_metric)

        original_prediction = yss[0]
        if self.class_names is None:
            # Note: the generated names are stored on the explainer itself.
            self.class_names = list(
                map(str, range(original_prediction.shape[0])))

        explanation = Explanation(domain_mapper=domain_mapper,
                                  class_names=self.class_names,
                                  random_state=self.random_state)
        explanation.predict_proba = original_prediction

        if top_labels:
            labels = np.argsort(original_prediction)[-top_labels:]
            # argsort is ascending; store the labels most probable first.
            explanation.top_labels = list(labels)[::-1]

        for label in labels:
            intercept, local_exp, score, local_pred = (
                self.base.explain_instance_with_data(
                    data, yss, distances, label, num_features,
                    model_regressor=model_regressor,
                    feature_selection=self.feature_selection))
            explanation.intercept[label] = intercept
            explanation.local_exp[label] = local_exp
            explanation.score[label] = score
            explanation.local_pred[label] = local_pred
        return explanation

    def __data_labels_distances(self,
                                indexed_string,
                                classifier_fn,
                                num_samples,
                                distance_metric='cosine'):
        """Generates a neighborhood around a prediction.

        Neighborhood data is created by randomly removing words from the
        instance and predicting with the classifier. Distances between the
        original and the perturbed instances are computed with
        distance_metric on the binary representation.

        Args:
            indexed_string: document (IndexedString) to be explained,
            classifier_fn: classifier prediction probability function,
                which takes a list of strings and outputs prediction
                probabilities. For ScikitClassifier, this is
                classifier.predict_proba.
            num_samples: size of the neighborhood to learn the linear model
            distance_metric: the distance metric to use for sample
                weighting, defaults to cosine similarity.
        Returns:
            A tuple (data, labels, distances), where:
                data: dense num_samples * K binary matrix, where K is the
                    number of tokens in indexed_string. The first row is
                    the original instance, and thus a row of ones.
                labels: whatever classifier_fn returns for the perturbed
                    texts (a num_samples * L matrix, where L is the number
                    of target labels)
                distances: distance between the original instance and each
                    perturbed instance (computed in the binary 'data'
                    matrix), times 100.
        """
        vocab_size = indexed_string.num_words()
        # How many tokens to hide in each perturbed sample (1..vocab_size).
        removal_counts = self.random_state.randint(
            1, vocab_size + 1, num_samples - 1)

        # Start from all ones; row 0 stays untouched and so represents the
        # original instance.
        data = np.ones((num_samples, vocab_size))
        candidate_ids = range(vocab_size)
        perturbed_texts = [indexed_string.raw_string()]
        for row, count in enumerate(removal_counts, start=1):
            hidden = self.random_state.choice(candidate_ids, count,
                                              replace=False)
            data[row, hidden] = 0
            perturbed_texts.append(indexed_string.inverse_removing(hidden))

        labels = classifier_fn(perturbed_texts)

        sparse_data = sp.sparse.csr_matrix(data)
        pairwise = sklearn.metrics.pairwise.pairwise_distances(
            sparse_data, sparse_data[0], metric=distance_metric)
        distances = pairwise.ravel() * 100
        return data, labels, distances

#### MultiModal LIME

In [11]:
class LimeMultimodalExplainer(object):
    """Explains classifiers that take one free-text field plus tabular fields.

    Only the text field is perturbed (tokens are randomly removed); the
    tabular values of the explained observation are attached unchanged to
    every perturbed sample before it is handed to the prediction function.
    Currently, we are using an exponential kernel on cosine distance, and
    restricting explanations to words that are present in the document."""

    def __init__(self,
                 kernel_width=25,
                 kernel=None,
                 verbose=False,
                 class_names=None,
                 feature_selection='auto',
                 split_expression=r'\W+',
                 bow=True,
                 mask_string=None,
                 random_state=None,
                 char_level=False):
        """Init function.
        Args:
            kernel_width: kernel width for the exponential kernel.
            kernel: similarity kernel that takes distances and kernel
                width as input and outputs weights in (0,1). If None, defaults to
                an exponential kernel.
            verbose: if true, print local prediction values from linear model
            class_names: list of class names, ordered according to whatever the
                classifier is using. If not present, class names will be '0',
                '1', ...
            feature_selection: feature selection method. can be
                'forward_selection', 'lasso_path', 'none' or 'auto'.
                See function 'explain_instance_with_data' in lime_base.py for
                details on what each of the options does.
            split_expression: Regex string or callable. If regex string, will be used with re.split.
                If callable, the function should return a list of tokens.
            bow: if True (bag of words), will perturb input data by removing
                all occurrences of individual words or characters.
                Explanations will be in terms of these words. Otherwise, will
                explain in terms of word-positions, so that a word may be
                important the first time it appears and unimportant the second.
                Only set to false if the classifier uses word order in some way
                (bigrams, etc), or if you set char_level=True.
            mask_string: String used to mask tokens or characters if bow=False
                if None, will be 'UNKWORDZ' if char_level=False, chr(0)
                otherwise.
            random_state: an integer or numpy.RandomState that will be used to
                generate random numbers. If None, the random state will be
                initialized using the internal numpy seed.
            char_level: a boolean identifying that we treat each character
                as an independent occurrence in the string
        """

        if kernel is None:
            def kernel(d, kernel_width):
                return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

        kernel_fn = partial(kernel, kernel_width=kernel_width)

        self.random_state = check_random_state(random_state)
        self.base = LimeBase(kernel_fn, verbose,
                             random_state=self.random_state)
        self.class_names = class_names
        self.vocabulary = None
        self.feature_selection = feature_selection
        self.bow = bow
        self.mask_string = mask_string
        self.split_expression = split_expression
        self.char_level = char_level

    def explain_instance(self,
                         sample_observation,
                         predict_fn,
                         labels=(1,),
                         top_labels=None,
                         num_features=10,
                         num_samples=5000,
                         distance_metric='cosine',
                         model_regressor=None,
                         text_column='discharge'):
        """Generates explanations for a prediction.
        First, we generate neighborhood data by randomly hiding tokens of the
        text field (see __data_labels_distances). We then learn locally
        weighted linear models on this neighborhood data to explain each of
        the classes in an interpretable way (see lime_base.py).
        Args:
            sample_observation: the observation to explain; it is indexed by
                label and must support ``.drop([...])`` (a pandas Series is
                what the rest of this class expects). It holds the raw text
                under ``text_column`` and the tabular values under the
                remaining labels.
            predict_fn: classifier prediction probability function. It is
                called once with a list of d pandas Series (the tabular values
                followed by the perturbed text) and must return a (d, k)
                array of prediction probabilities, where k is the number of
                classes.
            labels: iterable with labels to be explained.
            top_labels: if not None, ignore labels and produce explanations for
                the K labels with highest prediction probabilities, where K is
                this parameter.
            num_features: maximum number of features present in explanation
            num_samples: size of the neighborhood to learn the linear model
            distance_metric: the distance metric to use for sample weighting,
                defaults to cosine similarity
            model_regressor: sklearn regressor to use in explanation. Defaults
                to Ridge regression in LimeBase. Must have model_regressor.coef_
                and 'sample_weight' as a parameter to model_regressor.fit()
            text_column: label of the text field inside sample_observation.
                Defaults to 'discharge', the column this notebook uses.
        Returns:
            An Explanation object (see explanation.py) with the corresponding
            explanations.
        """
        text_instance = sample_observation[text_column]
        tabular_instance = sample_observation.drop([text_column])
        # Tokenize the text so that individual tokens can be switched off.
        if self.char_level:
            indexed_string = IndexedCharacters(
                text_instance, bow=self.bow, mask_string=self.mask_string)
        else:
            indexed_string = IndexedString(
                text_instance, bow=self.bow,
                split_expression=self.split_expression,
                mask_string=self.mask_string)
        domain_mapper = TextDomainMapper(indexed_string)
        # data: 0/1 rows (0 -> token removed); yss: predictions for each row.
        data, yss, distances = self.__data_labels_distances(
            indexed_string, tabular_instance, predict_fn, num_samples,
            distance_metric=distance_metric)
        if self.class_names is None:
            self.class_names = [str(x) for x in range(yss[0].shape[0])]
        ret_exp = Explanation(domain_mapper=domain_mapper,
                              class_names=self.class_names,
                              random_state=self.random_state)
        # Row 0 of the neighborhood is the unperturbed observation.
        ret_exp.predict_proba = yss[0]
        if top_labels:
            labels = np.argsort(yss[0])[-top_labels:]
            ret_exp.top_labels = list(labels)
            ret_exp.top_labels.reverse()
        for label in labels:
            # Fit one local surrogate model per requested label.
            (ret_exp.intercept[label],
             ret_exp.local_exp[label],
             ret_exp.score[label],
             ret_exp.local_pred[label]) = self.base.explain_instance_with_data(
                data, yss, distances, label, num_features,
                model_regressor=model_regressor,
                feature_selection=self.feature_selection)
        return ret_exp

    def __data_labels_distances(self,
                                indexed_string, tabular_instance,
                                predict_fn,
                                num_samples,
                                distance_metric='cosine'):
        """Generates a neighborhood around a prediction.
        Generates neighborhood data by randomly removing words from
        the text, attaching the unchanged tabular values to every perturbed
        text, and predicting with the classifier. Distances between the
        original and the perturbed instances are computed on the binary
        token matrix only.
        Args:
            indexed_string: document (IndexedString) to be explained,
            tabular_instance: pandas Series with the tabular values of the
                observation; it is prepended, unmodified, to every sample.
            predict_fn: classifier prediction probability function, called
                once with the list of all generated samples.
            num_samples: size of the neighborhood to learn the linear model
            distance_metric: the distance metric to use for sample weighting,
                defaults to cosine similarity.
        Returns:
            A tuple (data, labels, distances), where:
                data: dense num_samples * K binary matrix, where K is the
                    number of tokens in indexed_string. The first row is the
                    original instance, and thus a row of ones.
                labels: whatever predict_fn returns for the generated samples
                distances: distance between the original instance and
                    each perturbed instance (computed in the binary 'data'
                    matrix), times 100.
        """

        def distance_fn(x):
            return sklearn.metrics.pairwise.pairwise_distances(
                x, x[0], metric=distance_metric).ravel() * 100

        def with_tabular(text):
            # pd.concat replaces Series.append, which was removed in pandas 2.0;
            # the resulting Series is identical (tabular labels, then label 0
            # for the text).
            return pd.concat([tabular_instance, pd.Series([text])])

        doc_size = indexed_string.num_words()
        # For every perturbed sample, how many tokens to remove (1..doc_size).
        sample = self.random_state.randint(1, doc_size + 1, num_samples - 1)
        # Start from "all tokens present"; row 0 stays untouched and therefore
        # represents the original instance.
        data = np.ones((num_samples, doc_size))
        features_range = range(doc_size)
        inverse_data = [with_tabular(indexed_string.raw_string())]
        for i, size in enumerate(sample, start=1):
            # Pick `size` distinct token ids to switch off for this sample.
            inactive = self.random_state.choice(features_range, size,
                                                replace=False)
            data[i, inactive] = 0
            inverse_data.append(
                with_tabular(indexed_string.inverse_removing(inactive)))

        labels = predict_fn(inverse_data)
        distances = distance_fn(sp.sparse.csr_matrix(data))
        return data, labels, distances

#### TextMixed LIME

In [12]:
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse

In [13]:
class TextMixedDomainMapper(DomainMapper):
    """Maps feature ids to names, generates table views, etc.

    Feature ids up to ``max(categorical_features)`` are looked up in the
    tabular feature names; larger ids are treated as tokens of the text
    column and resolved through ``indexed_string``."""

    def __init__(self, feature_names, feature_values, scaled_row,
                 categorical_features, indexed_string, discretized_feature_names=None,
                 feature_indexes=None):
        """Init.
        Args:
            feature_names: list of feature names, in order
            feature_values: list of strings with the values of the original row
            scaled_row: scaled row
            categorical_features: list of categorical features ids (ints)
            indexed_string: tokenized text of the explained row; must offer
                ``word(id)`` and ``string_position(id)``
            discretized_feature_names: optional names that take precedence
                over feature_names in map_exp_ids
            feature_indexes: optional feature indexes used in the sparse case
        """
        self.exp_feature_names = feature_names
        self.discretized_feature_names = discretized_feature_names
        self.feature_names = feature_names
        self.feature_values = feature_values
        self.feature_indexes = feature_indexes
        self.scaled_row = scaled_row
        self.indexed_string = indexed_string
        if sp.sparse.issparse(scaled_row):
            self.all_categorical = False
        else:
            self.all_categorical = len(categorical_features) == len(scaled_row)
        self.categorical_features = categorical_features

    def map_exp_ids(self, exp, positions=False):
        """Maps ids to feature names.
        Args:
            exp: list of tuples [(id, weight), (id,weight)]
            positions: if True, text features are rendered as
                '<word>_<pos1>-<pos2>-...' instead of just the word
        Returns:
            list of tuples (feature_name, weight)
        """
        names = self.exp_feature_names
        if self.discretized_feature_names is not None:
            names = self.discretized_feature_names
        exp_list = []
        # First id that belongs to the text column. Resolved lazily so that an
        # empty `exp` never touches categorical_features.
        text_offset = None
        for x in exp:
            if text_offset is None:
                text_offset = max(self.categorical_features) + 1
            if x[0] >= text_offset:
                # Text feature: the vocabulary ids of indexed_string start at
                # 0, so the tabular offset has to be removed in BOTH branches
                # (the positions branch previously used the raw id).
                word_id = x[0] - text_offset
                if positions:
                    single_exp = ('%s_%s' % (
                        self.indexed_string.word(word_id),
                        '-'.join(
                            map(str,
                                self.indexed_string.string_position(word_id)))), x[1])
                else:
                    single_exp = (self.indexed_string.word(word_id), x[1])
            else:
                single_exp = (names[x[0]], x[1])
            exp_list.append(single_exp)
        return exp_list

    def visualize_instance_html(self,
                                exp,
                                label,
                                div_name,
                                exp_object_name,
                                show_table=True,
                                show_all=False):
        """Shows the current example in a table format.
        Args:
             exp: list of tuples [(id, weight), (id,weight)]
             label: label id (integer)
             div_name: name of div object to be used for rendering(in js)
             exp_object_name: name of js explanation object
             show_table: if False, don't show table visualization.
             show_all: if True, show zero-weighted features in the table.
        Returns:
             A JavaScript snippet (str) calling ``show_raw_tabular`` on the
             explanation object, or '' when show_table is False.
        """
        if not show_table:
            return ''
        # NOTE(review): weights is sized by feature_names, so an id in `exp`
        # that is >= len(feature_names) (e.g. a text-token id) raises
        # IndexError here — confirm callers only pass ids within that range.
        weights = [0] * len(self.feature_names)
        for x in exp:
            weights[x[0]] = x[1]
        if self.feature_indexes is not None:
            # Sparse case: only display the non-zero values and importances
            fnames = [self.exp_feature_names[i] for i in self.feature_indexes]
            fweights = [weights[i] for i in self.feature_indexes]
            if show_all:
                out_list = list(zip(fnames,
                                    self.feature_values,
                                    fweights))
            else:
                out_dict = dict(map(lambda x: (x[0], (x[1], x[2], x[3])),
                                zip(self.feature_indexes,
                                    fnames,
                                    self.feature_values,
                                    fweights)))
                out_list = [out_dict.get(x[0], (str(x[0]), 0.0, 0.0)) for x in exp]
        else:
            out_list = list(zip(self.exp_feature_names,
                                self.feature_values,
                                weights))
            if not show_all:
                out_list = [out_list[x[0]] for x in exp]
        ret = u'''
            %s.show_raw_tabular(%s, %d, %s);
        ''' % (exp_object_name, json.dumps(out_list, ensure_ascii=False), label, div_name)
        return ret

In [14]:
class LimeTextMixed(object):
    """Explains predictions on tabular (i.e. matrix) data.
    For numerical features, perturb them by sampling from a Normal(0,1) and
    doing the inverse operation of mean-centering and scaling, according to the
    means and stds in the training data. For categorical features, perturb by
    sampling according to the training distribution, and making a binary
    feature that is 1 when the value is the same as the instance being
    explained."""

    def __init__(self,
                 training_data,
                 mode="classification",
                 training_labels=None,
                 feature_names=None,
                 categorical_features=None,
                 text_features=None,
                 categorical_names=None,
                 kernel_width=25,
                 kernel=None,
                 verbose=False,
                 class_names=None,
                 feature_selection='auto',
                 discretize_continuous=True,
                 discretizer='quartile',
                 sample_around_instance=False,
                 random_state=None,
                 training_data_stats=None,
                 split_expression=r'\W+',
                 bow=True,
                 mask_string=None,
                 char_level=False):
        """Init function.
        Args:
            training_data: pandas DataFrame holding the tabular columns and
                the text column. The caller's frame is not modified.
            mode: "classification" or "regression"
            training_labels: labels for training data. Not required, but may be
                used by discretizer.
            feature_names: list of names (strings). If None, defaults to
                '0', '1', ... with one entry per encoded non-text column plus
                one extra entry for the text feature.
            categorical_features: list of column labels of the categorical
                columns of training_data. They are one-hot encoded internally.
                Every other non-text column is considered continuous.
            text_features: label of the text column in training_data.
                Required; if it also appears in categorical_features it is
                removed from that list.
            categorical_names: NOTE(review): currently ignored — the
                attribute of the same name is set to the list of categorical
                column labels. Confirm whether this is intended.
            kernel_width: kernel width for the exponential kernel.
                If None, defaults to sqrt (number of columns) * 0.75
            kernel: similarity kernel that takes euclidean distances and kernel
                width as input and outputs weights in (0,1). If None, defaults to
                an exponential kernel.
            verbose: if true, print local prediction values from linear model
            class_names: list of class names, ordered according to whatever the
                classifier is using. If not present, class names will be '0',
                '1', ...
            feature_selection: feature selection method. can be
                'forward_selection', 'lasso_path', 'none' or 'auto'.
                See function 'explain_instance_with_data' in lime_base.py for
                details on what each of the options does.
            discretize_continuous: if True, all non-categorical features will
                be discretized into quartiles.
            discretizer: only matters if discretize_continuous is True
                and data is not sparse. Options are 'quartile', 'decile',
                'entropy' or a BaseDiscretizer instance.
            sample_around_instance: if True, will sample continuous features
                in perturbed samples from a normal centered at the instance
                being explained. Otherwise, the normal is centered on the mean
                of the feature data.
            random_state: an integer or numpy.RandomState that will be used to
                generate random numbers. If None, the random state will be
                initialized using the internal numpy seed.
            training_data_stats: a dict object having the details of training data
                statistics. If None, training data information will be used, only matters
                if discretize_continuous is True. Must have the following keys:
                "means", "mins", "maxs", "stds", "feature_values",
                "feature_frequencies"
            split_expression: Regex string or callable. If regex string, will be used with re.split.
                If callable, the function should return a list of tokens.
            bow: if True (bag of words), will perturb input data by removing
                all occurrences of individual words or characters.
                Explanations will be in terms of these words. Otherwise, will
                explain in terms of word-positions, so that a word may be
                important the first time it appears and unimportant the second.
                Only set to false if the classifier uses word order in some way
                (bigrams, etc), or if you set char_level=True.
            mask_string: String used to mask tokens or characters if bow=False
                if None, will be 'UNKWORDZ' if char_level=False, chr(0)
                otherwise.
            char_level: a boolean identifying that we treat each character
                as an independent occurrence in the string
        Raises:
            ValueError: if text_features is None, or if discretizer is not a
                supported value.
            Exception: if training_data_stats is given but lacks required keys.
        """
        self.random_state = check_random_state(random_state)
        self.mode = mode
        self.sample_around_instance = sample_around_instance
        self.training_data_stats = training_data_stats
        # Encoder for the categorical columns; unknown categories seen later
        # are encoded as all-zero rows instead of raising.
        self.encoder = sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')
        self.vocabulary = None
        self.bow = bow
        self.mask_string = mask_string
        self.split_expression = split_expression
        self.char_level = char_level
        # Original column order, kept so perturbed samples can be handed to
        # the model in the layout it was trained on.
        self.column_names = training_data.columns

        # Check and raise proper error if stats are supplied in non-discretized path
        if self.training_data_stats:
            self.validate_training_data_stats(self.training_data_stats)

        if categorical_features is None:
            categorical_features = []
        categorical_features = list(categorical_features)
        # The text column is handled separately, never as a categorical one.
        if text_features in categorical_features:
            categorical_features.remove(text_features)
        self.text_features = text_features

        if text_features is None:
            raise ValueError(
                "text_features must be the label of the text column in "
                "training_data")

        # Work on a re-indexed copy: the one-hot frame built below has a fresh
        # 0..n-1 index, so both frames must share it for the column-wise
        # concat to align. Doing this out of place keeps the caller's
        # DataFrame (and its index) untouched.
        training_data = training_data.reset_index(drop=True)
        no_text_df = training_data.drop(text_features, axis=1)
        self.encoder.fit(no_text_df.loc[:, categorical_features])
        ohe_columns = self.encoder.get_feature_names_out(categorical_features)
        training_data_ohe = pd.DataFrame(
            self.encoder.transform(no_text_df.loc[:, categorical_features]).toarray(),
            columns=ohe_columns)
        numerical_df = no_text_df.drop(categorical_features, axis=1)
        # Layout of the encoded matrix: numerical columns first, then the
        # one-hot columns.
        training_data = pd.concat([numerical_df, training_data_ohe], axis=1)

        self.feature_names_nontext = list(training_data.columns)

        self.numerical_feat_name = numerical_df.columns
        self.numerical_feat = training_data.columns.get_indexer(self.numerical_feat_name)
        self.original_cat_feat = categorical_features
        # NOTE(review): the `categorical_names` argument is not used; the
        # attribute holds the categorical column labels — confirm intent.
        self.categorical_names = categorical_features
        # Positions of the one-hot columns inside the encoded matrix.
        ohe_positions = training_data.columns.get_indexer(ohe_columns)
        self.categorical_features = ohe_positions
        self.cat_features_ohe = ohe_positions.copy()

        training_data = training_data.to_numpy()

        if feature_names is None:
            # One extra name for the text feature.
            feature_names = [str(i) for i in range(training_data.shape[1] + 1)]
        self.feature_names = list(feature_names)

        self.discretizer = None
        if discretize_continuous and not sp.sparse.issparse(training_data):
            # Set the discretizer if training data stats are provided
            if self.training_data_stats:
                discretizer = StatsDiscretizer(
                    training_data, self.categorical_features,
                    self.feature_names_nontext, labels=training_labels,
                    data_stats=self.training_data_stats,
                    random_state=self.random_state)

            if discretizer == 'quartile':
                self.discretizer = QuartileDiscretizer(
                        training_data, self.categorical_features,
                        self.feature_names_nontext, labels=training_labels,
                        random_state=self.random_state)
            elif discretizer == 'decile':
                self.discretizer = DecileDiscretizer(
                        training_data, self.categorical_features,
                        self.feature_names_nontext, labels=training_labels,
                        random_state=self.random_state)
            elif discretizer == 'entropy':
                self.discretizer = EntropyDiscretizer(
                        training_data, self.categorical_features,
                        self.feature_names_nontext, labels=training_labels,
                        random_state=self.random_state)
            elif isinstance(discretizer, BaseDiscretizer):
                self.discretizer = discretizer
            else:
                raise ValueError('''Discretizer must be 'quartile',''' +
                                 ''' 'decile', 'entropy' or a''' +
                                 ''' BaseDiscretizer instance''')
            # From here on every non-text column index is listed as
            # categorical (presumably because discretization replaces the
            # continuous columns by bin ids — same as upstream LIME tabular).
            self.categorical_features = list(range(training_data.shape[1]))

            # Get the discretized_training_data when the stats are not provided
            if self.training_data_stats is None:
                discretized_training_data = self.discretizer.discretize(
                    training_data)

        if kernel_width is None:
            kernel_width = np.sqrt(training_data.shape[1]) * .75
        kernel_width = float(kernel_width)

        if kernel is None:
            def kernel(d, kernel_width):
                return np.sqrt(np.exp(-(d ** 2) / kernel_width ** 2))

        kernel_fn = partial(kernel, kernel_width=kernel_width)

        self.feature_selection = feature_selection
        self.base = LimeBase(kernel_fn, verbose, random_state=self.random_state)
        self.class_names = class_names

        # Though set has no role to play if training data stats are provided
        self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
        self.scaler.fit(training_data)
        self.feature_values = {}
        self.feature_frequencies = {}

        # Record, per categorical column, the observed values and their
        # relative frequencies, and neutralise scaling for those columns.
        for feature in self.categorical_features:
            if training_data_stats is None:
                if self.discretizer is not None:
                    column = discretized_training_data[:, feature]
                else:
                    column = training_data[:, feature]

                feature_count = collections.Counter(column)
                values, frequencies = map(list, zip(*(sorted(feature_count.items()))))
            else:
                values = training_data_stats["feature_values"][feature]
                frequencies = training_data_stats["feature_frequencies"][feature]

            self.feature_values[feature] = values
            self.feature_frequencies[feature] = (np.array(frequencies) /
                                                 float(sum(frequencies)))
            self.scaler.mean_[feature] = 0
            self.scaler.scale_[feature] = 1

    @staticmethod
    def convert_and_round(values):
        return ['%.2f' % v for v in values]

    @staticmethod
    def validate_training_data_stats(training_data_stats):
        """
            Method to validate the structure of training data stats
        """
        stat_keys = list(training_data_stats.keys())
        valid_stat_keys = ["means", "mins", "maxs", "stds", "feature_values", "feature_frequencies"]
        missing_keys = list(set(valid_stat_keys) - set(stat_keys))
        if len(missing_keys) > 0:
            raise Exception("Missing keys in training_data_stats. Details: %s" % (missing_keys))

    def explain_instance(self,
                         data_row,
                         predict_fn,
                         labels=(1,),
                         top_labels=None,
                         num_features=10,
                         num_samples=5000,
                         distance_metric='euclidean',
                         model_regressor=None,
                         sampling_method='gaussian'):
        """Generates explanations for a prediction.
        First, we generate neighborhood data by randomly perturbing features
        from the instance (see __data_inverse). We then learn locally weighted
        linear models on this neighborhood data to explain each of the classes
        in an interpretable way (see lime_base.py).
        Args:
            data_row: 1d numpy array or scipy.sparse matrix, corresponding to a row
            predict_fn: prediction function. For classifiers, this should be a
                function that takes a numpy array and outputs prediction
                probabilities. For regressors, this takes a numpy array and
                returns the predictions. For ScikitClassifiers, this is
                `classifier.predict_proba()`. For ScikitRegressors, this
                is `regressor.predict()`. The prediction function needs to work
                on multiple feature vectors (the vectors randomly perturbed
                from the data_row).
            labels: iterable with labels to be explained.
            top_labels: if not None, ignore labels and produce explanations for
                the K labels with highest prediction probabilities, where K is
                this parameter.
            num_features: maximum number of features present in explanation
            num_samples: size of the neighborhood to learn the linear model
            distance_metric: the distance metric to use for weights.
            model_regressor: sklearn regressor to use in explanation. Defaults
                to Ridge regression in LimeBase. Must have model_regressor.coef_
                and 'sample_weight' as a parameter to model_regressor.fit()
            sampling_method: Method to sample synthetic data. Defaults to Gaussian
                sampling. Can also use Latin Hypercube Sampling.
        Returns:
            An Explanation object (see explanation.py) with the corresponding
            explanations.
        """
        data_row = pd.DataFrame(data_row).T
        data_row.reset_index(inplace = True, drop = True)
        text_only = data_row[self.text_features][0]
        no_text_df = data_row.drop(self.text_features, axis = 1) # drop the text feature
        data_row_ohe = pd.DataFrame(self.encoder.transform(no_text_df.loc[:,self.original_cat_feat]).toarray(), # encode and transform to dataframe our categorical features
                                         columns = self.encoder.get_feature_names_out(self.original_cat_feat))
        data_row = pd.concat([no_text_df.drop(self.original_cat_feat, axis = 1), data_row_ohe], axis = 1) # merge encoded categorical features with numerical ones

        data_row = data_row.T.to_numpy()
        if sp.sparse.issparse(data_row) and not sp.sparse.isspmatrix_csr(data_row):
            # Preventative code: if sparse, convert to csr format if not in csr format already
            data_row = data_row.tocsr()

        # -> TEXT ADDITION
        indexed_string = (IndexedCharacters(
            text_only, bow=self.bow, mask_string=self.mask_string)
                          if self.char_level else
                          IndexedString(text_only, bow=self.bow,
                                        split_expression=self.split_expression,
                                        mask_string=self.mask_string))
        
        
        
        data, inverse, data_text, inverse_data_text = self.__data_inverse(data_row, indexed_string, num_samples, sampling_method)
        
        if sp.sparse.issparse(data):
            # Note in sparse case we don't subtract mean since data would become dense
            scaled_data = data.multiply(self.scaler.scale_)
            # Multiplying with csr matrix can return a coo sparse matrix
            if not sp.sparse.isspmatrix_csr(scaled_data):
                scaled_data = scaled_data.tocsr()
        else:
            scaled_data = (data - self.scaler.mean_) / self.scaler.scale_
        
        scaled_data = np.concatenate((scaled_data, data_text), axis = 1)
        distances = sklearn.metrics.pairwise_distances(
                scaled_data,
                scaled_data[0].reshape(1, -1),
                metric=distance_metric
        ).ravel()

        inverse_cat = pd.DataFrame(self.encoder.inverse_transform(inverse[:,self.cat_features_ohe]), columns = self.original_cat_feat)
        inverse_num = pd.DataFrame(inverse[:, self.numerical_feat], columns = self.numerical_feat_name)
        inverse_data_text = pd.DataFrame(inverse_data_text, columns = [self.text_features])
        inverse_cat.reset_index(inplace = True, drop = True)
        inverse_num.reset_index(inplace = True, drop = True)
        inverse_data_text.reset_index(inplace = True, drop = True)
        inverse = pd.concat([inverse_cat, inverse_num, inverse_data_text], axis = 1)

        inverse = inverse[self.column_names]
        yss = predict_fn(inverse)

        # for classification, the model needs to provide a list of tuples - classes
        # along with prediction probabilities
        if self.mode == "classification":
            if len(yss.shape) == 1:
                raise NotImplementedError("LIME does not currently support "
                                          "classifier models without probability "
                                          "scores. If this conflicts with your "
                                          "use case, please let us know: "
                                          "https://github.com/datascienceinc/lime/issues/16")
            elif len(yss.shape) == 2:
                if self.class_names is None:
                    self.class_names = [str(x) for x in range(yss[0].shape[0])]
                else:
                    self.class_names = list(self.class_names)
                if not np.allclose(yss.sum(axis=1), 1.0):
                    warnings.warn("""
                    Prediction probabilties do not sum to 1, and
                    thus does not constitute a probability space.
                    Check that you classifier outputs probabilities
                    (Not log probabilities, or actual class predictions).
                    """)
            else:
                raise ValueError("Your model outputs "
                                 "arrays with {} dimensions".format(len(yss.shape)))

        # for regression, the output should be a one-dimensional array of predictions
        else:
            try:
                if len(yss.shape) != 1 and len(yss[0].shape) == 1:
                    yss = np.array([v[0] for v in yss])
                assert isinstance(yss, np.ndarray) and len(yss.shape) == 1
            except AssertionError:
                raise ValueError("Your model needs to output single-dimensional \
                    numpyarrays, not arrays of {} dimensions".format(yss.shape))

            predicted_value = yss[0]
            min_y = min(yss)
            max_y = max(yss)

            # add a dimension to be compatible with downstream machinery
            yss = yss[:, np.newaxis]

        feature_names = copy.deepcopy(self.feature_names)
        if feature_names is None:
            feature_names = [str(x) for x in range(data_row.shape[0])]

        if sp.sparse.issparse(data_row):
            values = self.convert_and_round(data_row.data)
            feature_indexes = data_row.indices
        else:
            values = self.convert_and_round(data_row)
            feature_indexes = None

        for i in self.categorical_features:
            if self.discretizer is not None and i in self.discretizer.lambdas:
                continue
            name = self.feature_names_nontext[i]

            for cat_col in self.categorical_names:
              if bool(re.search(cat_col, name)): 
                name = cat_col
            #if i in self.cat_features_ohe:
            #    print(i)
                #name = self.categorical_names[i][name]
            feature_names[i] = '%s=%s' % (name, self.feature_names_nontext[i])
            values[i] = 'True'
        if self.discretizer is None:
            for i in self.numerical_feat:
              feature_names[i] = self.numerical_feat_name[i]
        categorical_features = self.categorical_features
        

        discretized_feature_names = None
        if self.discretizer is not None:
            categorical_features = range(data.shape[1])
            discretized_instance = self.discretizer.discretize(data_row[:,0])
            discretized_feature_names = copy.deepcopy(feature_names)
            for f in self.discretizer.names:
                discretized_feature_names[f] = self.discretizer.names[f][int(
                        discretized_instance[f])]
        
        
        domain_mapper = TextMixedDomainMapper(feature_names,
                                          values,
                                          scaled_data[0], indexed_string = indexed_string,
                                          categorical_features=categorical_features,
                                          discretized_feature_names=discretized_feature_names,
                                          feature_indexes=feature_indexes)

        ret_exp = Explanation(domain_mapper,
                                          mode=self.mode,
                                          class_names=self.class_names)
        if self.mode == "classification":
            ret_exp.predict_proba = yss[0]
            if top_labels:
                labels = np.argsort(yss[0])[-top_labels:]
                ret_exp.top_labels = list(labels)
                ret_exp.top_labels.reverse()
        else:
            ret_exp.predicted_value = predicted_value
            ret_exp.min_value = min_y
            ret_exp.max_value = max_y
            labels = [0]
        #return scaled_data, yss, distances, labels, num_features
        for label in labels:
            (ret_exp.intercept[label],
             ret_exp.local_exp[label],
             ret_exp.score[label],
             ret_exp.local_pred[label]) = self.base.explain_instance_with_data(
                    scaled_data,
                    yss,
                    distances,
                    label,
                    num_features,
                    model_regressor=model_regressor,
                    feature_selection=self.feature_selection)

        if self.mode == "regression":
            ret_exp.intercept[1] = ret_exp.intercept[0]
            ret_exp.local_exp[1] = [x for x in ret_exp.local_exp[0]]
            ret_exp.local_exp[0] = [(i, -1 * j) for i, j in ret_exp.local_exp[1]]

        return ret_exp

    def __data_inverse(self,
                       data_row,
                       indexed_string,
                       num_samples,
                       sampling_method):
        """Generate a perturbed neighborhood (tabular + text) around one instance.

        Tabular part. When no discretizer is configured, every column is first
        filled with standard-normal draws ('gaussian', or 'lhs' samples mapped
        through the normal ppf) and rescaled with ``self.scaler.scale_`` and
        either ``self.scaler.mean_`` or the instance itself
        (``self.sample_around_instance``). Categorical columns are then
        overwritten: values are drawn from ``self.feature_values`` with
        probabilities ``self.feature_frequencies``; ``data`` receives a 0/1
        flag telling whether the drawn value equals the instance's value, and
        ``inverse`` receives the drawn value itself. When a discretizer is
        configured, all columns are treated as categorical and ``inverse`` is
        mapped back with ``self.discretizer.undiscretize``.

        Text part. For every perturbed sample a random number of words is
        switched off; ``data_text`` is the resulting 0/1 word mask and
        ``inverse_data_text`` the corresponding strings.

        Args:
            data_row: the instance to perturb. In the dense path it is read
                as ``data_row[:, 0]`` and its feature count is taken from
                ``data_row.shape[0]``, i.e. a column-shaped 2-D array is
                expected (NOTE(review): the sparse path uses ``shape[1]`` but
                the shared ``data_row[:, 0]`` lines below do not look
                sparse-aware - confirm before relying on sparse input).
            indexed_string: text wrapper providing ``num_words()``,
                ``raw_string()`` and ``inverse_removing(indices)``.
            num_samples: size of the neighborhood to learn the linear model
            sampling_method: 'gaussian' or 'lhs'; any other value emits a
                UserWarning and falls back to Gaussian sampling.
        Returns:
            A tuple (data, inverse, data_text, inverse_data_text), where:
                data: num_samples * K matrix, where categorical features
                are encoded with either 0 (not equal to the corresponding value
                in data_row) or 1. The first row is the original instance.
                inverse: same as data, except the categorical features are not
                binary, but categorical (as the original data)
                data_text: num_samples * doc_size array of 0/1 flags, 0
                marking a word removed from that sample; row 0 is all ones.
                inverse_data_text: list of num_samples strings; element 0 is
                the raw text, the others have the flagged words removed.
        """
        is_sparse = sp.sparse.issparse(data_row)
        if is_sparse:
            num_cols = data_row.shape[1]
            data = sp.sparse.csr_matrix((num_samples, num_cols), dtype=data_row.dtype)
        else:
            num_cols = data_row.shape[0]
            data = np.zeros((num_samples, num_cols))
        # Default: every column is categorical (this is the discretizer case);
        # narrowed to self.categorical_features below when there is no discretizer.
        categorical_features = range(num_cols)
        if self.discretizer is None:
            instance_sample = data_row
            scale = self.scaler.scale_
            mean = self.scaler.mean_
            if is_sparse:
                # Perturb only the non-zero values
                non_zero_indexes = data_row.nonzero()[1]
                num_cols = len(non_zero_indexes)
                instance_sample = data_row[:, non_zero_indexes]
                scale = scale[non_zero_indexes]
                mean = mean[non_zero_indexes]

            if sampling_method == 'gaussian':
                data = self.random_state.normal(0, 1, num_samples * num_cols
                                                ).reshape(num_samples, num_cols)
                data = np.array(data)
            elif sampling_method == 'lhs':
                # lhs(...) samples are pushed through the standard-normal ppf
                # column by column (means are all 0, stdvs all 1).
                data = lhs(num_cols, samples=num_samples
                           ).reshape(num_samples, num_cols)
                means = np.zeros(num_cols)
                stdvs = np.array([1]*num_cols)
                for i in range(num_cols):
                    data[:, i] = norm(loc=means[i], scale=stdvs[i]).ppf(data[:, i])
                data = np.array(data)
            else:
                warnings.warn('''Invalid input for sampling_method.
                                 Defaulting to Gaussian sampling.''', UserWarning)
                data = self.random_state.normal(0, 1, num_samples * num_cols
                                                ).reshape(num_samples, num_cols)
                data = np.array(data)

            # Undo the standardisation: centre either on the instance or on the
            # training mean.
            # NOTE(review): instance_sample keeps data_row's shape; if data_row is
            # column-shaped, `data * scale + instance_sample` may not broadcast
            # row-wise as intended - confirm before enabling sample_around_instance.
            if self.sample_around_instance:
                data = data * scale + instance_sample
            else:
                data = data * scale + mean
            if is_sparse:
                if num_cols == 0:
                    data = sp.sparse.csr_matrix((num_samples,
                                                 data_row.shape[1]),
                                                dtype=data_row.dtype)
                else:
                    # Rebuild a CSR matrix in which every row has values exactly at
                    # the instance's non-zero column positions.
                    indexes = np.tile(non_zero_indexes, num_samples)
                    indptr = np.array(
                        range(0, len(non_zero_indexes) * (num_samples + 1),
                              len(non_zero_indexes)))
                    data_1d_shape = data.shape[0] * data.shape[1]
                    data_1d = data.reshape(data_1d_shape)
                    data = sp.sparse.csr_matrix(
                        (data_1d, indexes, indptr),
                        shape=(num_samples, data_row.shape[1]))
            categorical_features = self.categorical_features
            first_row = data_row
        else:
            first_row = self.discretizer.discretize(data_row.T)[0]
        
        
        # Row 0 of the neighborhood is always the unperturbed instance.
        data[0] = data_row.copy()[:,0]
        inverse = data.copy()
        for column in categorical_features:
            values = self.feature_values[column]
            freqs = self.feature_frequencies[column]
            inverse_column = self.random_state.choice(values, size=num_samples,
                                                      replace=True, p=freqs)
            # 1 where the sampled value matches the instance, 0 otherwise.
            binary_column = (inverse_column == first_row[column]).astype(int)
            binary_column[0] = 1
            inverse_column[0] = data[0, column]
            data[:, column] = binary_column
            inverse[:, column] = inverse_column
        if self.discretizer is not None:
            inverse[1:] = self.discretizer.undiscretize(inverse[1:])
        inverse[0] = data_row[:,0]
        # TEXT RANDOMIZATION
        doc_size = indexed_string.num_words()
        # One removal count per perturbed sample, requested from
        # randint(1, doc_size + 1); with an exclusive upper bound (numpy
        # RandomState semantics - assumed) that is 1..doc_size words, so a sample
        # may lose every word.
        sample_text = self.random_state.randint(1, doc_size + 1, num_samples - 1)
        data_text = np.ones((num_samples, doc_size))
        data_text[0] = np.ones(doc_size)
        features_range_text = range(doc_size)
        inverse_data_text = [indexed_string.raw_string()]
        for i, size in enumerate(sample_text, start=1):
            # Pick `size` distinct word positions to switch off in sample i.
            inactive_text = self.random_state.choice(features_range_text, size,
                                                replace=False)
            data_text[i, inactive_text] = 0
            inverse_data_text.append(indexed_string.inverse_removing(inactive_text))
        
        return data, inverse, data_text, inverse_data_text

## LIME Explanation

In [15]:
def heading_clean(text):
    """Strip ``[** ... **]`` bracketed spans and a fixed set of header labels.

    The labels ("Admission Date:", "Discharge Date:", "Date of Birth:",
    "Name:", "Unit No:") are matched case-insensitively. Surrounding
    whitespace is left untouched.
    """
    # Bracketed placeholders go first, then the header labels.
    without_brackets = re.sub(r'\[\*\*(.+?)\*\*\]', "", text)
    return re.sub(
        r'(Admission Date:)|(Discharge Date:)|(Date of Birth:)|(Name:)|(Unit No:)',
        "",
        without_brackets,
        flags=re.I,
    )

In [16]:
class AutogluonWrapper:
    """Adapter that exposes a predictor's ``predict_proba`` as a plain array function.

    Attributes:
        ag_model: the wrapped predictor; must provide
            ``predict_proba(X, as_multiclass=True)``.
        feature_names: column names used when the input is not already a
            DataFrame.
    """

    def __init__(self, predictor, feature_names):
        self.feature_names = feature_names
        self.ag_model = predictor

    def predict_binary_prob(self, X):
        """Return the predicted class probabilities for ``X`` as a numpy array.

        ``X`` may be a pandas Series (treated as a single row), a DataFrame
        (passed through unchanged) or anything ``pd.DataFrame`` accepts
        together with ``self.feature_names`` as columns.
        """
        if isinstance(X, pd.Series):
            # A single row: turn it into a 1 x n_features array first.
            X = X.values.reshape(1, -1)
        if isinstance(X, pd.DataFrame):
            frame = X
        else:
            frame = pd.DataFrame(X, columns=self.feature_names)
        probabilities = self.ag_model.predict_proba(frame, as_multiclass=True)
        return np.array(probabilities)

In [17]:
def return_weights(exp):
    """Split an explanation's ``as_list()`` pairs into weights and labels.

    Returns:
        (weights, labels): two lists in the order given by ``exp.as_list()``;
        weights come from position 1 of each pair, labels from position 0.
    """
    pairs = exp.as_list()
    exp_weight = []
    for pair in pairs:
        exp_weight.append(pair[1])
    exp_labels = []
    for pair in pairs:
        exp_labels.append(pair[0])
    return exp_weight, exp_labels

### Run 1

#### Load Dataset

In [18]:
# PARAMETERS
# Switches for this run; the tags derived below select the data, model and
# checkpoint files that belong to this configuration.

preprocessing = True # set to true if we want to clean and perform some preprocessing
preproc_heavier = True # set to True if we want a heavier preprocessing
do_discretization = False # set to True if we want to discretize numerical features
model_explained = 'multimodal' # either "multimodal" or "text_mixed"
lime_discharge_only = False # set to True if we want to explain only discharge note features
lemmatization = False # set to True if we want to lemmatize
lasso_selection = True # set to True if we want lasso selection


# Conditional expressions keep every tag a built-in str; np.where on a Python
# bool would wrap each value in a 0-d numpy array.
preproc_tag_2 = '_heavier' if preproc_heavier else ''
preproc_tag = f'_preproc{preproc_tag_2}' if preprocessing else preproc_tag_2
if lemmatization:
    preproc_tag = f'{preproc_tag}_lemmatization'
discretization_tag = '' if do_discretization else '_no_discr'
if lime_discharge_only:
    # if we explain discharge only, we override discretization/not use it
    discretization_tag = '_disch_only'
if not lasso_selection:
    discretization_tag = f'{discretization_tag}_no_lasso'


# Feature-selection mode handed to the LIME explainer.
feat_selection = 'lasso_path' if lasso_selection else 'auto'

In [19]:
if not preprocessing:
  print('Unprocessed Dataframe')
  # Start from the raw CSV export.
  file = '/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/df_mixed_discharge.csv'
  df = pd.read_csv(file, low_memory=False)
  # Remove the columns that are exempt from the analysis.
  df = df.drop(columns=['Unnamed: 0', 'HADM_ID', 'subject_id', 'icu_los'])
  # Selection criterion: age of at least 18 and a length of stay of at least 1.
  df = df[(df['age'] >= 18) & (df['los'] >= 1)]
  # Per-column proportion of missing values (kept for inspection only).
  missing = pd.DataFrame(df.isna().mean(), columns=['proportions'])
  # Drop three further columns (noted by the author as having more than 20 % missing values).
  df = df.drop(columns=['albumin_min', 'patientweight', 'type_stay'])
  # Keep the un-imputed frame, then fill the gaps by interpolation.
  df_copy = df.copy()
  df = df_copy.interpolate()
  # Tukey fences computed on the length of stay.
  y = df['los']
  Q1, Q3 = np.percentile(y, [25, 75])
  IQR = Q3 - Q1
  LF = Q1 - 1.5 * IQR
  UF = Q3 + 1.5 * IQR
  print(f'First quartile = {Q1:.3f}, Third Quartile = {Q3:.3f}, Interquartile Interval = {IQR:.3f}')
  print(f'Lower Fence = {LF:.3f}, Upper Fence = {UF:.3f}')
  # Binary target: True when the length of stay exceeds the upper fence.
  df['los_cat'] = df['los'] > UF
else:
  # Already-preprocessed frame; the file name carries the preprocessing tag.
  df = pd.read_feather(f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/df_los{preproc_tag}')
  print('Dataframe Loaded')

Dataframe Loaded


In [20]:
# 80/20 train/test split, stratified on the target and seeded for repeatability.
df_train, df_test = train_test_split(
    df,
    train_size=0.80,
    stratify=df['los_cat'],
    random_state=42,
)

In [21]:
# Sanity check: (rows, columns) of the training and test splits.
df_train.shape, df_test.shape

((31284, 48), (7821, 48))

#### Load Model

In [22]:

# Model parameters.
# Both the preprocessed and the unprocessed variant resolve to the same
# directory pattern (the older '/content/drive/MyDrive/AutoGluon/models/...'
# location was immediately overwritten and never used), so the path is built once.
save_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/models/{model_explained}{preproc_tag}'
print('With Preprocessing' if preprocessing else 'Without Preprocessing')

# The predictor class is imported lazily so only the needed AutoGluon module is loaded.
if model_explained == 'multimodal':
    from autogluon.tabular import TabularPredictor
    # The saved model may come from a different AutoGluon release than the installed one.
    predictor = TabularPredictor.load(save_path, require_version_match=False)
elif model_explained == 'text_mixed':
    from autogluon.text import TextPredictor
    predictor = TextPredictor.load(save_path)
else:
    # Fail loudly instead of silently reusing a predictor left over from an earlier run.
    raise ValueError(
        f"model_explained must be 'multimodal' or 'text_mixed', got {model_explained!r}")

With Preprocessing



	Predictor Version: 0.4.2
	Current Version:   0.5.0



In [23]:
# Test features: everything except the target column.
x_test = df_test.drop(columns=["los_cat"])

In [24]:
# Wrap the predictor so LIME can call it on raw arrays using the test column names.
autogluon_wrap = AutogluonWrapper(predictor, x_test.columns.tolist())

In [25]:
# Smoke test: class probabilities for the first test row (shown as the cell output).
autogluon_wrap.predict_binary_prob(x_test.iloc[0])

array([[0.98838687, 0.01161316]])

#### LIME

In [26]:
# Resume support: reload the explanation weights/labels saved by an earlier
# session so the batch loop can continue where it stopped.
# NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
weights_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_weights{preproc_tag}{discretization_tag}.pkl'
labels_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_labels{preproc_tag}{discretization_tag}.pkl'

# load weights
# Only a missing file means "start fresh"; any other failure (e.g. a corrupt
# pickle) is raised so existing results are not silently overwritten later.
try:
    with open(weights_path, 'rb') as handle:
        weights = pickle.load(handle)
except FileNotFoundError:
    weights = [] # initialize an empty list if no existing file is present
    print('New Weight List')
else:
    print('Weights loaded')
    print(len(weights))
# load labels
try:
    with open(labels_path, 'rb') as handle:
        labels = pickle.load(handle)
except FileNotFoundError:
    labels = [] # initialize an empty list if no existing file is present
    print('New Label List')
else:
    print('Label Loaded')
    print(len(labels))

# Both lists are appended in lockstep, so a length mismatch (e.g. a session
# interrupted between the two saves) is repaired by keeping the common prefix.
if len(weights) != len(labels):
    common_len = min(len(weights), len(labels))
    print(f'Checkpoint mismatch: {len(weights)} weights vs {len(labels)} labels; keeping first {common_len}')
    weights = weights[:common_len]
    labels = labels[:common_len]

Weights loaded
7821
Label Loaded
7821


In [27]:
# Display the file stem (no directory, no extension) used for this run's weights checkpoint.
f'{model_explained}_weights{preproc_tag}{discretization_tag}'

'multimodal_weights_preproc_heavier_no_discr'

In [28]:
# Training features: everything except the target column.
x_train = df_train.drop(columns=["los_cat"])

In [29]:
# Pick the explainer: text-only when only the discharge note is explained,
# otherwise the mixed tabular + text explainer fitted on the training features.
if not lime_discharge_only:
  lime_explainer = LimeTextMixed(
      x_train,
      categorical_features=x_train.select_dtypes('object').columns,
      text_features='discharge',
      class_names=['short', 'long'],
      random_state=42,
      feature_selection=feat_selection,
      discretize_continuous=do_discretization,
  )
else:
  lime_explainer = LimeMultimodalExplainer(
      class_names=['short', 'long'],
      random_state=42,
      feature_selection=feat_selection,
  )

In [30]:
# Trial explanation of the first test row, using 500 perturbed samples.
prova2 = lime_explainer.explain_instance(
    x_test.iloc[0],
    predict_fn=autogluon_wrap.predict_binary_prob,
    num_samples=500,
)

In [31]:
# Show the (feature, weight) pairs of the trial explanation.
prova2.as_list()

[('urea_n_max', 0.015417082737139081),
 ('platelets_min', -0.01512770870872719),
 ('secondari', -0.013025940641583583),
 ('1211am', -0.011942241061790896),
 ('temp_min', -0.010133563005910144),
 ('automobil', -0.009783896087753785),
 ('magnesium_max', 0.007867075859787655),
 ('urea_n_min', -0.007812710546362268),
 ('platelets_max', 0.007480910101929542),
 ('sofa', -0.006110308942085342)]

In [32]:
# Batch bookkeeping for the resumable explanation loop.
iter_count = 0

steps = 500  # number of test rows explained per batch
minimum = len(weights)  # rows already explained: resume from here
maximum = len(x_test)  # one past the last test row to explain
# Start position of every remaining batch.
range_list = list(np.arange(minimum, maximum, steps))

In [33]:
# Explain every remaining test row in batches, checkpointing after each batch.
if maximum != len(weights):
    weights_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_weights{preproc_tag}{discretization_tag}.pkl'
    labels_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_labels{preproc_tag}{discretization_tag}.pkl'
    for i, min_r in enumerate(range_list):
        # The last batch has no successor start, so it runs to the end of x_test.
        max_r = range_list[i + 1] if i + 1 < len(range_list) else maximum
        print("Range: {} to {}".format(min_r, max_r))
        batch_pred = x_test.iloc[min_r:max_r]
        # Iterate over the rows of the current batch
        for index, row in batch_pred.iterrows():

            #Get explanation
            exp = lime_explainer.explain_instance(row,
                                                  autogluon_wrap.predict_binary_prob,
                                                  num_features=100,
                                                  num_samples=500)

            #Get weights
            exp_weight, exp_labels = return_weights(exp)
            weights.append(exp_weight)
            labels.append(exp_labels)
        iter_count += steps
        # Save after every batch, whatever `steps` is set to, so an interrupted
        # session loses at most one batch of work.
        with open(weights_path, 'wb') as f:
            pickle.dump(weights, f)
        with open(labels_path, 'wb') as f:
            pickle.dump(labels, f)
        print("Output Saved, Maximum Reached" if max_r == maximum else "Output Saved")

### Run 2

#### Load Dataset

In [34]:
# PARAMETERS
# Switches for this run; the tags derived below select the data, model and
# checkpoint files that belong to this configuration.

preprocessing = True # set to true if we want to clean and perform some preprocessing
preproc_heavier = True # set to True if we want a heavier preprocessing
do_discretization = False # set to True if we want to discretize numerical features
model_explained = 'multimodal' # either "multimodal" or "text_mixed"
lime_discharge_only = False # set to True if we want to explain only discharge note features
lemmatization = True # set to True if we want to lemmatize
lasso_selection = True # set to True if we want lasso selection


# Conditional expressions keep every tag a built-in str; np.where on a Python
# bool would wrap each value in a 0-d numpy array.
preproc_tag_2 = '_heavier' if preproc_heavier else ''
preproc_tag = f'_preproc{preproc_tag_2}' if preprocessing else preproc_tag_2
if lemmatization:
    preproc_tag = f'{preproc_tag}_lemmatization'
discretization_tag = '' if do_discretization else '_no_discr'
if lime_discharge_only:
    # if we explain discharge only, we override discretization/not use it
    discretization_tag = '_disch_only'
if not lasso_selection:
    discretization_tag = f'{discretization_tag}_no_lasso'


# Feature-selection mode handed to the LIME explainer.
feat_selection = 'lasso_path' if lasso_selection else 'auto'

In [35]:
if not preprocessing:
  print('Unprocessed Dataframe')
  # Start from the raw CSV export.
  file = '/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/df_mixed_discharge.csv'
  df = pd.read_csv(file, low_memory=False)
  # Remove the columns that are exempt from the analysis.
  df = df.drop(columns=['Unnamed: 0', 'HADM_ID', 'subject_id', 'icu_los'])
  # Selection criterion: age of at least 18 and a length of stay of at least 1.
  df = df[(df['age'] >= 18) & (df['los'] >= 1)]
  # Per-column proportion of missing values (kept for inspection only).
  missing = pd.DataFrame(df.isna().mean(), columns=['proportions'])
  # Drop three further columns (noted by the author as having more than 20 % missing values).
  df = df.drop(columns=['albumin_min', 'patientweight', 'type_stay'])
  # Keep the un-imputed frame, then fill the gaps by interpolation.
  df_copy = df.copy()
  df = df_copy.interpolate()
  # Tukey fences computed on the length of stay.
  y = df['los']
  Q1, Q3 = np.percentile(y, [25, 75])
  IQR = Q3 - Q1
  LF = Q1 - 1.5 * IQR
  UF = Q3 + 1.5 * IQR
  print(f'First quartile = {Q1:.3f}, Third Quartile = {Q3:.3f}, Interquartile Interval = {IQR:.3f}')
  print(f'Lower Fence = {LF:.3f}, Upper Fence = {UF:.3f}')
  # Binary target: True when the length of stay exceeds the upper fence.
  df['los_cat'] = df['los'] > UF
else:
  # Already-preprocessed frame; the file name carries the preprocessing tag.
  df = pd.read_feather(f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/df_los{preproc_tag}')
  print('Dataframe Loaded')

Dataframe Loaded


In [36]:
# 80/20 train/test split, stratified on the target and seeded for repeatability.
df_train, df_test = train_test_split(
    df,
    train_size=0.80,
    stratify=df['los_cat'],
    random_state=42,
)

In [37]:
# Sanity check: (rows, columns) of the training and test splits.
df_train.shape, df_test.shape

((31284, 49), (7821, 49))

#### Load Model

In [38]:

# Model parameters.
# Both the preprocessed and the unprocessed variant resolve to the same
# directory pattern (the older '/content/drive/MyDrive/AutoGluon/models/...'
# location was immediately overwritten and never used), so the path is built once.
save_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/models/{model_explained}{preproc_tag}'
print('With Preprocessing' if preprocessing else 'Without Preprocessing')

# The predictor class is imported lazily so only the needed AutoGluon module is loaded.
if model_explained == 'multimodal':
    from autogluon.tabular import TabularPredictor
    # The saved model may come from a different AutoGluon release than the installed one.
    predictor = TabularPredictor.load(save_path, require_version_match=False)
elif model_explained == 'text_mixed':
    from autogluon.text import TextPredictor
    predictor = TextPredictor.load(save_path)
else:
    # Fail loudly instead of silently reusing a predictor left over from an earlier run.
    raise ValueError(
        f"model_explained must be 'multimodal' or 'text_mixed', got {model_explained!r}")

With Preprocessing


In [39]:
# Test features: everything except the target column.
x_test = df_test.drop(columns=["los_cat"])

In [40]:
# Wrap the predictor so LIME can call it on raw arrays using the test column names.
autogluon_wrap = AutogluonWrapper(predictor, x_test.columns.tolist())

In [None]:
# Smoke test: class probabilities for the first test row (shown as the cell output).
autogluon_wrap.predict_binary_prob(x_test.iloc[0])

array([[0.97434306, 0.02565696]])

#### LIME

In [None]:
# Resume support: reload the explanation weights/labels saved by an earlier
# session so the batch loop can continue where it stopped.
# NOTE: pickle.load can execute arbitrary code - only load files written by this notebook.
weights_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_weights{preproc_tag}{discretization_tag}.pkl'
labels_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_labels{preproc_tag}{discretization_tag}.pkl'

# load weights
# Only a missing file means "start fresh"; any other failure (e.g. a corrupt
# pickle) is raised so existing results are not silently overwritten later.
try:
    with open(weights_path, 'rb') as handle:
        weights = pickle.load(handle)
except FileNotFoundError:
    weights = [] # initialize an empty list if no existing file is present
    print('New Weight List')
else:
    print('Weights loaded')
    print(len(weights))
# load labels
try:
    with open(labels_path, 'rb') as handle:
        labels = pickle.load(handle)
except FileNotFoundError:
    labels = [] # initialize an empty list if no existing file is present
    print('New Label List')
else:
    print('Label Loaded')
    print(len(labels))

# Both lists are appended in lockstep, so a length mismatch (e.g. a session
# interrupted between the two saves) is repaired by keeping the common prefix.
if len(weights) != len(labels):
    common_len = min(len(weights), len(labels))
    print(f'Checkpoint mismatch: {len(weights)} weights vs {len(labels)} labels; keeping first {common_len}')
    weights = weights[:common_len]
    labels = labels[:common_len]

In [None]:
# Training features: everything except the target column.
x_train = df_train.drop(columns=["los_cat"])

In [None]:
# Pick the explainer: text-only when only the discharge note is explained,
# otherwise the mixed tabular + text explainer fitted on the training features.
if not lime_discharge_only:
  lime_explainer = LimeTextMixed(
      x_train,
      categorical_features=x_train.select_dtypes('object').columns,
      text_features='discharge',
      class_names=['short', 'long'],
      random_state=42,
      feature_selection=feat_selection,
      discretize_continuous=do_discretization,
  )
else:
  lime_explainer = LimeMultimodalExplainer(
      class_names=['short', 'long'],
      random_state=42,
      feature_selection=feat_selection,
  )

In [None]:
# Trial explanation of the first test row, using 500 perturbed samples.
prova2 = lime_explainer.explain_instance(
    x_test.iloc[0],
    predict_fn=autogluon_wrap.predict_binary_prob,
    num_samples=500,
)

In [None]:
# Show the (feature, weight) pairs of the trial explanation.
prova2.as_list()

In [None]:
# Batch bookkeeping for the resumable explanation loop.
iter_count = 0

steps = 500  # number of test rows explained per batch
minimum = len(weights)  # rows already explained: resume from here
maximum = len(x_test)  # one past the last test row to explain
# Start position of every remaining batch.
range_list = list(np.arange(minimum, maximum, steps))

In [None]:
# Explain every remaining test row in batches, checkpointing after each batch.
if maximum != len(weights):
    weights_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_weights{preproc_tag}{discretization_tag}.pkl'
    labels_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_labels{preproc_tag}{discretization_tag}.pkl'
    for i, min_r in enumerate(range_list):
        # The last batch has no successor start, so it runs to the end of x_test.
        max_r = range_list[i + 1] if i + 1 < len(range_list) else maximum
        print("Range: {} to {}".format(min_r, max_r))
        batch_pred = x_test.iloc[min_r:max_r]
        # Iterate over the rows of the current batch
        for index, row in batch_pred.iterrows():

            #Get explanation
            exp = lime_explainer.explain_instance(row,
                                                  autogluon_wrap.predict_binary_prob,
                                                  num_features=100,
                                                  num_samples=500)

            #Get weights
            exp_weight, exp_labels = return_weights(exp)
            weights.append(exp_weight)
            labels.append(exp_labels)
        iter_count += steps
        # Save after every batch, whatever `steps` is set to, so an interrupted
        # session loses at most one batch of work.
        with open(weights_path, 'wb') as f:
            pickle.dump(weights, f)
        with open(labels_path, 'wb') as f:
            pickle.dump(labels, f)
        print("Output Saved, Maximum Reached" if max_r == maximum else "Output Saved")

### Run 3

#### Load Dataset

In [None]:
# PARAMETERS
# Switches for this run; the tags derived below select the data, model and
# checkpoint files that belong to this configuration.

preprocessing = True # set to true if we want to clean and perform some preprocessing
preproc_heavier = True # set to True if we want a heavier preprocessing
do_discretization = True # set to True if we want to discretize numerical features
model_explained = 'text_mixed' # either "multimodal" or "text_mixed"
lime_discharge_only = False # set to True if we want to explain only discharge note features
lemmatization = True # set to True if we want to lemmatize
lasso_selection = True # set to True if we want lasso selection


# Conditional expressions keep every tag a built-in str; np.where on a Python
# bool would wrap each value in a 0-d numpy array.
preproc_tag_2 = '_heavier' if preproc_heavier else ''
preproc_tag = f'_preproc{preproc_tag_2}' if preprocessing else preproc_tag_2
if lemmatization:
    preproc_tag = f'{preproc_tag}_lemmatization'
discretization_tag = '' if do_discretization else '_no_discr'
if lime_discharge_only:
    # if we explain discharge only, we override discretization/not use it
    discretization_tag = '_disch_only'
if not lasso_selection:
    discretization_tag = f'{discretization_tag}_no_lasso'


# Feature-selection mode handed to the LIME explainer.
feat_selection = 'lasso_path' if lasso_selection else 'auto'

In [None]:
if not preprocessing:
  print('Unprocessed Dataframe')
  # Start from the raw CSV export.
  file = '/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/df_mixed_discharge.csv'
  df = pd.read_csv(file, low_memory=False)
  # Remove the columns that are exempt from the analysis.
  df = df.drop(columns=['Unnamed: 0', 'HADM_ID', 'subject_id', 'icu_los'])
  # Selection criterion: age of at least 18 and a length of stay of at least 1.
  df = df[(df['age'] >= 18) & (df['los'] >= 1)]
  # Per-column proportion of missing values (kept for inspection only).
  missing = pd.DataFrame(df.isna().mean(), columns=['proportions'])
  # Drop three further columns (noted by the author as having more than 20 % missing values).
  df = df.drop(columns=['albumin_min', 'patientweight', 'type_stay'])
  # Keep the un-imputed frame, then fill the gaps by interpolation.
  df_copy = df.copy()
  df = df_copy.interpolate()
  # Tukey fences computed on the length of stay.
  y = df['los']
  Q1, Q3 = np.percentile(y, [25, 75])
  IQR = Q3 - Q1
  LF = Q1 - 1.5 * IQR
  UF = Q3 + 1.5 * IQR
  print(f'First quartile = {Q1:.3f}, Third Quartile = {Q3:.3f}, Interquartile Interval = {IQR:.3f}')
  print(f'Lower Fence = {LF:.3f}, Upper Fence = {UF:.3f}')
  # Binary target: True when the length of stay exceeds the upper fence.
  df['los_cat'] = df['los'] > UF
else:
  # Already-preprocessed frame; the file name carries the preprocessing tag.
  df = pd.read_feather(f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/df_los{preproc_tag}')
  print('Dataframe Loaded')

In [None]:
# 80/20 train/test split, stratified on the binary `los_cat` target, with a fixed seed
df_train, df_test = train_test_split(
    df,
    train_size=0.80,
    stratify=df['los_cat'],
    random_state=42,
)

In [None]:
# Display the shape of the training and the test split side by side.
df_train.shape, df_test.shape

#### Load Model

In [None]:

# Model parameters.
# The preprocessed and the raw runs load from the same path pattern; only the
# log message differs, so the path is built once.
save_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/models/{model_explained}{preproc_tag}'
print('With Preprocessing' if preprocessing else 'Without Preprocessing')

# Load the AutoGluon predictor class that matches the model being explained.
if model_explained == 'multimodal':
  from autogluon.tabular import TabularPredictor
  predictor = TabularPredictor.load(save_path, require_version_match = False)
elif model_explained == 'text_mixed':
  from autogluon.text import TextPredictor
  predictor = TextPredictor.load(save_path)
else:
  # Fail loudly: otherwise `predictor` would be undefined, or silently left
  # over from an earlier run of this notebook.
  raise ValueError(f"model_explained must be 'multimodal' or 'text_mixed', got {model_explained!r}")

In [None]:
# Test features: every column of the test split except the `los_cat` target
x_test = df_test.drop(columns=["los_cat"])

In [None]:
# Wrap the loaded predictor together with the test feature column names
# (AutogluonWrapper is defined elsewhere in this notebook).
autogluon_wrap = AutogluonWrapper(predictor, x_test.columns.tolist())

In [None]:
# Smoke test: run the wrapped prediction function on the first test row
autogluon_wrap.predict_binary_prob(x_test.iloc[0, :])

#### LIME

In [None]:
# Resume support: reload the LIME weights/labels stored for this configuration.
# Only a missing file means "start from scratch"; any other failure (for example
# a truncated pickle) is raised instead of silently restarting from an empty list
# that would later overwrite the stored results.
# NOTE: pickle.load executes arbitrary code from the file -- these paths must only
# ever point at files written by this notebook's own checkpoint step.
# load weights
try:
    with open(f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_weights{preproc_tag}{discretization_tag}.pkl', 'rb') as handle:
        weights = pickle.load(handle)
except FileNotFoundError:
    weights = [] # start with an empty list when no file exists yet
    print('New Weight List')
else:
    print('Weights loaded')
    print(len(weights))
# load labels
try:
    with open(f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_labels{preproc_tag}{discretization_tag}.pkl', 'rb') as handle:
        labels = pickle.load(handle)
except FileNotFoundError:
    labels = [] # start with an empty list when no file exists yet
    print('New Label List')
else:
    print('Label Loaded')
    print(len(labels))

In [None]:
# Training features: every column of the training split except the `los_cat` target
x_train = df_train.drop(columns=["los_cat"])

In [None]:
# Build the LIME explainer: the mixed text/tabular explainer by default, or the
# multimodal explainer when only the discharge note is to be explained.
if not lime_discharge_only:
  lime_explainer = LimeTextMixed(
      x_train,
      categorical_features = x_train.select_dtypes('object').columns,
      text_features = 'discharge',
      class_names = ['short', 'long'],
      random_state = 42,
      feature_selection = feat_selection,
      discretize_continuous = do_discretization,
  )
else:
  lime_explainer = LimeMultimodalExplainer(
      class_names = ['short', 'long'],
      random_state = 42,
      feature_selection = feat_selection,
  )

In [None]:
# Trial run: explain the first test row with 500 perturbed samples
prova2 = lime_explainer.explain_instance(
    x_test.iloc[0],
    predict_fn = autogluon_wrap.predict_binary_prob,
    num_samples = 500,
)

In [None]:
# Display the trial explanation through its as_list() view
# (presumably (feature, weight) pairs, as in LIME's Explanation.as_list -- confirm).
prova2.as_list()

In [None]:
# Batch plan for the explanation loop.
steps = 500              # number of test rows per batch
minimum = len(weights)   # resume after the explanations that are already stored
maximum = len(x_test)    # stop at the end of the test set
range_list = [*np.arange(minimum, maximum, steps)]  # start offset of every batch

iter_count = 0  # running counter the loop uses to decide when to checkpoint

In [None]:
# Explain every test row that has no stored explanation yet, batch by batch,
# checkpointing the accumulated weights/labels to Drive.
# NOTE: `range_list` and `iter_count` are computed in the batch-plan cell. After an
# interrupted run, re-run that cell before this one; otherwise the loop restarts
# from the old offset and rows already appended to `weights` are explained again.
if maximum != len(weights):
  weights_file = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_weights{preproc_tag}{discretization_tag}.pkl'
  labels_file = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_labels{preproc_tag}{discretization_tag}.pkl'
  for i, min_r in enumerate(range_list):
    # every batch ends where the next one starts; the last batch ends at `maximum`
    max_r = range_list[i+1] if i + 1 < len(range_list) else maximum
    print("Range: {} to {}".format(min_r, max_r))
    batch_pred = x_test.iloc[min_r:max_r]
    # iterate over the rows of the current batch
    for index, row in batch_pred.iterrows():
      # get the explanation
      exp = lime_explainer.explain_instance(row,
                                            autogluon_wrap.predict_binary_prob,
                                            num_features=100,
                                            num_samples = 500)
      # get the weights and their labels
      exp_weight, exp_labels = return_weights(exp)
      weights.append(exp_weight)
      labels.append(exp_labels)
    iter_count += steps
    last_batch = max_r == maximum
    # checkpoint after the final batch, and whenever the counter hits a multiple of 500
    if last_batch or iter_count % 500 == 0:
      with open(weights_file, 'wb') as f:
        pickle.dump(weights, f)
      with open(labels_file, 'wb') as f:
        pickle.dump(labels, f)
      print("Output Saved, Maximum Reached" if last_batch else "Output Saved")

### Run 4

#### Load Dataset

In [None]:
# PARAMETERS

preprocessing = True # set to true if we want to clean and perform some preprocessing
preproc_heavier = False # set to True if we want a heavier preprocessing
do_discretization = True # set to True if we want to discretize numerical features
model_explained = 'text_mixed' # either "multimodal" or "text_mixed"
lime_discharge_only = False # set to True if we want to explain only discharge note features
lemmatization = False # set to True if we want to lemmatize
lasso_selection = True # set to True if we want lasso selection


# File-name suffixes identifying this configuration. Built with plain conditional
# expressions so every tag is a real `str`; np.where on Python scalars returns a
# 0-d ndarray, which only looks like a string when it is formatted.
preproc_tag_2 = '_heavier' if preproc_heavier else ''
preproc_tag = f'_preproc{preproc_tag_2}' if preprocessing else preproc_tag_2
if lemmatization:
  preproc_tag = f'{preproc_tag}_lemmatization'
discretization_tag = '' if do_discretization else '_no_discr'
if lime_discharge_only:
  # if we explain discharge only, we override discretization/not use it
  discretization_tag = '_disch_only'
if not lasso_selection:
  discretization_tag = f'{discretization_tag}_no_lasso'


# value passed as `feature_selection` to the LIME explainers
feat_selection = 'lasso_path' if lasso_selection else 'auto'

In [None]:
# Build `df`: reload the cached preprocessed table, or rebuild it from the raw CSV export.
if not preprocessing:
  print('Unprocessed Dataframe')
  # read the raw export
  file = '/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/df_mixed_discharge.csv'
  df = pd.read_csv(file, low_memory=False)
  # remove the columns that are excluded from the analysis
  df = df.drop(columns=['Unnamed: 0', 'HADM_ID', 'subject_id', 'icu_los'])
  # cohort filter: age of at least 18 and a length of stay of at least 1
  df = df.loc[df['age'].ge(18) & df['los'].ge(1)]
  # per-column share of missing values, kept in `missing` for inspection
  missing = pd.DataFrame(df.isna().mean(), columns=['proportions'])
  # hard-coded drop list (presumably the columns above 20 % missing in `missing` -- confirm)
  df = df.drop(columns=['albumin_min', 'patientweight', 'type_stay'])
  # keep the un-imputed frame in `df_copy`, then fill gaps with DataFrame.interpolate() defaults
  df_copy = df.copy()
  df = df_copy.interpolate()
  # Tukey fences computed on length of stay
  y = df['los']
  Q1 = np.percentile(y, 25)
  Q3 = np.percentile(y, 75)
  IQR = Q3 - Q1
  LF, UF = Q1 - 1.5*IQR, Q3 + 1.5*IQR
  print(f'First quartile = {Q1:.3f}, Third Quartile = {Q3:.3f}, Interquartile Interval = {IQR:.3f}')
  print(f'Lower Fence = {LF:.3f}, Upper Fence = {UF:.3f}')
  # binary target: True when the length of stay lies above the upper fence
  df['los_cat'] = df['los'].gt(UF)
else:
  df = pd.read_feather(f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/df_los{preproc_tag}')
  print('Dataframe Loaded')

In [None]:
# 80/20 train/test split, stratified on the binary `los_cat` target, with a fixed seed
df_train, df_test = train_test_split(
    df,
    train_size=0.80,
    stratify=df['los_cat'],
    random_state=42,
)

In [None]:
# Display the shape of the training and the test split side by side.
df_train.shape, df_test.shape

#### Load Model

In [None]:

# Model parameters.
# The preprocessed and the raw runs load from the same path pattern; only the
# log message differs, so the path is built once.
save_path = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/models/{model_explained}{preproc_tag}'
print('With Preprocessing' if preprocessing else 'Without Preprocessing')

# Load the AutoGluon predictor class that matches the model being explained.
if model_explained == 'multimodal':
  from autogluon.tabular import TabularPredictor
  predictor = TabularPredictor.load(save_path, require_version_match = False)
elif model_explained == 'text_mixed':
  from autogluon.text import TextPredictor
  predictor = TextPredictor.load(save_path)
else:
  # Fail loudly: otherwise `predictor` would be undefined, or silently left
  # over from an earlier run of this notebook.
  raise ValueError(f"model_explained must be 'multimodal' or 'text_mixed', got {model_explained!r}")

In [None]:
# Test features: every column of the test split except the `los_cat` target
x_test = df_test.drop(columns=["los_cat"])

In [None]:
# Wrap the loaded predictor together with the test feature column names
# (AutogluonWrapper is defined elsewhere in this notebook).
autogluon_wrap = AutogluonWrapper(predictor, x_test.columns.tolist())

In [None]:
# Smoke test: run the wrapped prediction function on the first test row
autogluon_wrap.predict_binary_prob(x_test.iloc[0, :])

#### LIME

In [None]:
# Resume support: reload the LIME weights/labels stored for this configuration.
# Only a missing file means "start from scratch"; any other failure (for example
# a truncated pickle) is raised instead of silently restarting from an empty list
# that would later overwrite the stored results.
# NOTE: pickle.load executes arbitrary code from the file -- these paths must only
# ever point at files written by this notebook's own checkpoint step.
# load weights
try:
    with open(f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_weights{preproc_tag}{discretization_tag}.pkl', 'rb') as handle:
        weights = pickle.load(handle)
except FileNotFoundError:
    weights = [] # start with an empty list when no file exists yet
    print('New Weight List')
else:
    print('Weights loaded')
    print(len(weights))
# load labels
try:
    with open(f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_labels{preproc_tag}{discretization_tag}.pkl', 'rb') as handle:
        labels = pickle.load(handle)
except FileNotFoundError:
    labels = [] # start with an empty list when no file exists yet
    print('New Label List')
else:
    print('Label Loaded')
    print(len(labels))

In [None]:
# Training features: every column of the training split except the `los_cat` target
x_train = df_train.drop(columns=["los_cat"])

In [None]:
# Build the LIME explainer: the mixed text/tabular explainer by default, or the
# multimodal explainer when only the discharge note is to be explained.
if not lime_discharge_only:
  lime_explainer = LimeTextMixed(
      x_train,
      categorical_features = x_train.select_dtypes('object').columns,
      text_features = 'discharge',
      class_names = ['short', 'long'],
      random_state = 42,
      feature_selection = feat_selection,
      discretize_continuous = do_discretization,
  )
else:
  lime_explainer = LimeMultimodalExplainer(
      class_names = ['short', 'long'],
      random_state = 42,
      feature_selection = feat_selection,
  )

In [None]:
# Trial run: explain the first test row with 500 perturbed samples
prova2 = lime_explainer.explain_instance(
    x_test.iloc[0],
    predict_fn = autogluon_wrap.predict_binary_prob,
    num_samples = 500,
)

In [None]:
# Display the trial explanation through its as_list() view
# (presumably (feature, weight) pairs, as in LIME's Explanation.as_list -- confirm).
prova2.as_list()

In [None]:
# Batch plan for the explanation loop.
steps = 500              # number of test rows per batch
minimum = len(weights)   # resume after the explanations that are already stored
maximum = len(x_test)    # stop at the end of the test set
range_list = [*np.arange(minimum, maximum, steps)]  # start offset of every batch

iter_count = 0  # running counter the loop uses to decide when to checkpoint

In [None]:
# Explain every test row that has no stored explanation yet, batch by batch,
# checkpointing the accumulated weights/labels to Drive.
# NOTE: `range_list` and `iter_count` are computed in the batch-plan cell. After an
# interrupted run, re-run that cell before this one; otherwise the loop restarts
# from the old offset and rows already appended to `weights` are explained again.
if maximum != len(weights):
  weights_file = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_weights{preproc_tag}{discretization_tag}.pkl'
  labels_file = f'/content/drive/MyDrive/MIMIC-III Text Mining/LOS/data/feature_importance/{model_explained}_labels{preproc_tag}{discretization_tag}.pkl'
  for i, min_r in enumerate(range_list):
    # every batch ends where the next one starts; the last batch ends at `maximum`
    max_r = range_list[i+1] if i + 1 < len(range_list) else maximum
    print("Range: {} to {}".format(min_r, max_r))
    batch_pred = x_test.iloc[min_r:max_r]
    # iterate over the rows of the current batch
    for index, row in batch_pred.iterrows():
      # get the explanation
      exp = lime_explainer.explain_instance(row,
                                            autogluon_wrap.predict_binary_prob,
                                            num_features=100,
                                            num_samples = 500)
      # get the weights and their labels
      exp_weight, exp_labels = return_weights(exp)
      weights.append(exp_weight)
      labels.append(exp_labels)
    iter_count += steps
    last_batch = max_r == maximum
    # checkpoint after the final batch, and whenever the counter hits a multiple of 500
    if last_batch or iter_count % 500 == 0:
      with open(weights_file, 'wb') as f:
        pickle.dump(weights, f)
      with open(labels_file, 'wb') as f:
        pickle.dump(labels, f)
      print("Output Saved, Maximum Reached" if last_batch else "Output Saved")