# Feature Importance

### Loading Libraries

In [1]:
# Randomness
import random

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
from pandas import Timestamp

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.io as pio
%matplotlib inline

# Date & Time
from datetime import datetime, timedelta

# Typing
from typing import Tuple, List, Dict, Union, Optional, Any, Generator

# Scikit-Learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_roc_curve

# Scientific Statistical Python
from scipy.stats import jarque_bera

## Feature Importance with Substitution Effects

### Mean Decrease Impurity (MDI)
#### MDI Feature Importance

In [2]:
def feat_imp_MDI(fit: Any, featNames: np.ndarray) -> pd.DataFrame:
    df0 = {i: tree.feature_importances_ for i, tree in enumerate(fit.estimators_)}
    df0 = pd.DataFrame.from_dict(df0, orient='index')
    df0.columns = featNames
    df0 = df0.replace(0, np.nan)    # because max_features=1
    imp = pd.concat({'mean': df0.mean(), 'std': df0.std() * df0.shape[0] ** (-0.5)}, axis=1)
    imp /= imp['mean'].sum()
    return imp

### Mean Decrease Accuracy
#### MDA Feature Importance

In [3]:
def feat_imp_MDA(
    clf: Any, X: pd.DataFrame, y: pd.Series, cv: int, sample_weight: pd.Series,
    t1: pd.Series, pctEmbargo: float, scoring: str = 'neg_log_loss') -> Tuple[pd.DataFrame, float]:

    if scoring not in ['neg_log_loss', 'accuracy']:
        raise Exception('wrong scoring method.')
    cvGen = PurgedKFold(n_splits=cv, t1=t1, pctEmbargo=pctEmbargo)    # purged cv
    scr0, scr1 = pd.Series(), pd.DataFrame(columns=X.columns, dtype=object)
    
    for i, (train, test) in enumerate(cvGen.split(X=X)):
        X0, y0, w0 = X.iloc[train, :], y.iloc[train], sample_weight.iloc[train]
        X1, y1, w1 = X.iloc[test, :], y.iloc[test], sample_weight.iloc[test]
        fit = clf.fit(X=X0, y=y0, sample_weight=w0.values)
        if scoring == 'neg_log_loss':
            prob = fit.predict_proba(X1)
            scr0.loc[i] = -log_loss(y1, prob, sample_weight=w1.values, labels=clf.classes_)
        else:
            pred = fit.predict(X1)
            scr0.loc[i] = accuracy_score(y1, pred, sample_weight=w1.values)
        for j in X.columns:
            X1_ = X1.copy(deep=True)
            np.random.shuffle(X1_[j].values)    # permutation of a single column
            if scoring == 'neg_log_loss':
                prob = fit.predict_proba(X1_)
                scr1.loc[i, j] = -log_loss(y1, prob, sample_weight=w1.values, labels=clf.classes_)
            else:
                pred = fit.predict(X1_)
                scr1.loc[i, j] = accuracy_score(y1, pred, sample_weight=w1.values)
    imp = (-scr1).add(scr0, axis=0)
    if scoring == 'neg_log_loss':
        imp = imp / -scr1
    else:
        imp = imp / (1.0 - scr1)
    imp = pd.concat({'mean': imp.mean(), 'std': imp.std() * imp.shape[0] ** (-0.5)}, axis=1)
    return imp, scr0.mean()

## Feature Importance without Substitution Effects

### Single Feature Importance
#### SFI Implementation

In [8]:
from sklearn.model_selection import KFold

In [9]:
def aux_feat_imp_SFI(
    featNames: np.ndarray, clf: Any, trnsX: pd.DataFrame, cont: pd.DataFrame, scoring: str, cvGen: PurgedKFold) -> pd.DataFrame:
    
    imp = pd.DataFrame(columns=['mean', 'std'], dtype=object)
    for featName in featNames:
        df0 = cvScore(clf, X=trnsX[[featName]], y=cont['bin'], sample_weight=cont['w'], scoring=scoring, cvGen=cvGen)
        imp.loc[featName, 'mean'] = df0.mean()
        imp.loc[featName, 'std'] = df0.std() * df0.shape[0] ** (-0.5)
    return imp

NameError: name 'PurgedKFold' is not defined