### Import Libraries

In [18]:
import pandas as pd
import numpy as np

from pandas.tseries.offsets import BDay
from sklearn.metrics import log_loss, accuracy_score
from sklearn.datasets import make_classification 
from scipy.stats import weightedtau

### Build on Validation Notebook 

In [5]:
%run validation.ipynb

### Mean Decrease Impurity

In [None]:
def feature_importance_mdi(fit, feature_names):
    """
    Feature importance based on in-sample (IS) mean impurity reduction.

    Parameters
    ----------
    fit : fitted ensemble exposing ``estimators_``; each estimator must
        expose ``feature_importances_``.
    feature_names : sequence of labels, one per feature, used as the
        columns of the per-estimator importance table.

    Returns
    -------
    pandas.DataFrame indexed by feature name with columns ``'mean'`` and
    ``'std'`` (standard error across estimators), both scaled so that the
    ``'mean'`` column sums to 1.
    """

    # one row per estimator, one column per feature
    per_tree = {i: tree.feature_importances_ for i, tree in enumerate(fit.estimators_)}
    df0 = pd.DataFrame.from_dict(per_tree, orient='index')
    df0.columns = feature_names
    # Zeros are treated as "feature not used by this estimator" and excluded
    # from the averages (the original note says: because max_features = 1).
    df0 = df0.replace(0, np.nan)
    imp = pd.concat({'mean': df0.mean(), 'std': df0.std() * df0.shape[0] ** -0.5}, axis=1)
    imp /= imp['mean'].sum()

    return imp

### Mean Decrease Accuracy

In [7]:
def _mda_score(fit, X, y, w, scoring, labels):
    # Score one fitted model on (X, y) with sample weights w using the chosen metric.
    if scoring == 'neg_log_loss':
        prob = fit.predict_proba(X)
        return -log_loss(y, prob, sample_weight=w.values, labels=labels)
    pred = fit.predict(X)
    return accuracy_score(y, pred, sample_weight=w.values)


def feature_importance_mda(clf, X, y, cv, sample_weight, t1, pct_embargo, scoring='neg_log_loss'):
    """
    Feature importance based on out-of-sample (OOS) score reduction.

    For every purged CV fold the classifier is fitted on the training part,
    scored on the test part, and then re-scored once per feature with that
    feature's test column randomly permuted.

    Parameters
    ----------
    clf : classifier with ``fit(X, y, sample_weight)``, ``predict``,
        ``predict_proba`` and ``classes_``.
    X : pandas.DataFrame of features.
    y : pandas.Series of labels (positionally aligned with ``X``).
    cv : number of splits passed to ``PurgedKFold`` as ``n_splits``.
    sample_weight : pandas.Series of weights (positionally aligned with ``X``).
    t1 : passed through to ``PurgedKFold``.
    pct_embargo : passed through to ``PurgedKFold``.
    scoring : ``'neg_log_loss'`` or ``'accuracy'``.

    Returns
    -------
    (imp, mean_score) where ``imp`` is a DataFrame indexed by feature with
    columns ``'mean'`` and ``'std'`` (standard error across folds) and
    ``mean_score`` is the mean un-permuted fold score.

    Raises
    ------
    ValueError if ``scoring`` is not one of the supported methods.
    """

    if scoring not in ['neg_log_loss', 'accuracy']:
        raise ValueError('wrong scoring method')

    cv_gen = PurgedKFold(n_splits=cv, t1=t1, pct_embargo=pct_embargo)  # purged cv
    # scr_0: baseline score per fold; scr_1: permuted score per fold and feature
    scr_0, scr_1 = pd.Series(dtype=float), pd.DataFrame(columns=X.columns)
    for i, (train, test) in enumerate(cv_gen.split(X=X)):
        X0, y0, w0 = X.iloc[train, :], y.iloc[train], sample_weight.iloc[train]
        X1, y1, w1 = X.iloc[test, :], y.iloc[test], sample_weight.iloc[test]
        fit = clf.fit(X=X0, y=y0, sample_weight=w0.values)
        scr_0.loc[i] = _mda_score(fit, X1, y1, w1, scoring, clf.classes_)
        for j in X.columns:
            X1_ = X1.copy(deep=True)
            # permutation of a single column; shuffle a private copy and assign
            # it back so the result does not rely on .values being a writable view
            shuffled = X1_[j].values.copy()
            np.random.shuffle(shuffled)
            X1_[j] = shuffled
            scr_1.loc[i, j] = _mda_score(fit, X1_, y1, w1, scoring, clf.classes_)

    # the frame starts out empty (object dtype); make the arithmetic below numeric
    scr_1 = scr_1.astype(float)
    imp = (-scr_1).add(scr_0, axis=0)  # baseline minus permuted score
    if scoring == 'neg_log_loss':
        imp = imp / -scr_1
    else:
        imp = imp / (1.0 - scr_1)
    imp = pd.concat({'mean': imp.mean(), 'std': imp.std() * imp.shape[0] ** -0.5}, axis=1)

    return imp, scr_0.mean()

### Single Feature Importance

In [8]:
def feature_importance_sfi(feature_names, clf, trns_X, cont, scoring, cv_gen):
    """
    Single feature importance: score each feature on its own.

    For every name in ``feature_names`` the one-column frame
    ``trns_X[[name]]`` is passed to ``cv_score`` (defined outside this
    block) together with labels ``cont['bin']`` and weights ``cont['w']``.

    Returns a DataFrame indexed by feature name with columns ``'mean'``
    (mean of the returned scores) and ``'std'`` (their standard deviation
    times ``n ** -0.5``, where ``n`` is the number of returned scores).
    """

    summary = pd.DataFrame(columns=['mean', 'std'])
    for name in feature_names:
        single_column = trns_X[[name]]
        scores = cv_score(
            clf,
            X=single_column,
            y=cont['bin'],
            sample_weight=cont['w'],
            scoring=scoring,
            cv_gen=cv_gen,
        )
        n_scores = scores.shape[0]
        summary.loc[name, 'mean'] = scores.mean()
        summary.loc[name, 'std'] = scores.std() * n_scores ** -0.5

    return summary

### Orthogonal Features

In [10]:
def get_e_vec(dot, var_threshold):
    """
    Compute eigenvalues/eigenvectors of a symmetric dot-product matrix and
    keep only the leading components.

    Parameters
    ----------
    dot : square, symmetric pandas.DataFrame (its index labels the rows of
        the returned eigenvector table).
    var_threshold : cumulative share of the eigenvalue sum at which to cut.

    Returns
    -------
    (e_val, e_vec): ``e_val`` is a Series of eigenvalues in descending
    order labelled ``PC_1, PC_2, ...``; ``e_vec`` is a DataFrame whose
    columns are the matching eigenvectors. Components are kept up to and
    including the first one whose cumulative share reaches ``var_threshold``.
    """

    # compute e_vec from dot product matrix
    e_val, e_vec = np.linalg.eigh(dot)
    idx = e_val.argsort()[::-1]  # arguments for sorting e_val desc
    e_val, e_vec = e_val[idx], e_vec[:, idx]

    # label the components PC_1..PC_n (no eigenvalue filtering is done here)
    e_val = pd.Series(e_val, index=['PC_' + str(i + 1) for i in range(e_val.shape[0])])
    e_vec = pd.DataFrame(e_vec, index=dot.index, columns=e_val.index)

    # reduce dimension | form principal components
    cum_var = e_val.cumsum() / e_val.sum()
    dim = cum_var.values.searchsorted(var_threshold)
    e_val, e_vec = e_val.iloc[:dim + 1], e_vec.iloc[:, :dim + 1]

    return e_val, e_vec

def orthogonal_features(df_X, var_threshold):
    """
    Given a DataFrame ``df_X`` of features, compute orthogonal features.

    Each column is standardized (column mean subtracted, divided by column
    standard deviation), the dot-product matrix of the standardized data is
    handed to ``get_e_vec`` with ``var_threshold``, and the standardized
    data is projected onto the returned eigenvectors.

    Returns the projection as produced by ``np.dot``.
    """

    # standardize
    centered = df_X.sub(df_X.mean(), axis=1)
    df_Z = centered.div(df_X.std(), axis=1)

    feature_labels = df_X.columns
    gram = pd.DataFrame(np.dot(df_Z.T, df_Z), index=feature_labels, columns=feature_labels)

    _, loadings = get_e_vec(gram, var_threshold)

    return np.dot(df_Z, loadings)
    

### Weighted Kendall's Tau Example (Between Feature Importance and PCA Inverse)

In [16]:
# Example importances for four features, and the PCA rank assigned to each.
feature_importance = np.array([0.55, 0.33, 0.07, 0.05])
pca_rank = np.array([1, 2, 4, 3])

# Invert the ranks so that a smaller rank number gives a larger value.
pca_inverse = np.power(pca_rank, -1.0)

# First element of the result is the weighted tau correlation.
weightedtau(feature_importance, pca_inverse)[0]

0.8133333333333331

In [21]:
def get_test_data(n_features=40, n_informative=10, n_redundant=10, n_samples=10000):
    """
    Generate a random dataset for a classification problem.

    Parameters
    ----------
    n_features, n_informative, n_redundant, n_samples : passed through to
        ``make_classification`` (called with ``random_state=0`` and
        ``shuffle=False``).

    Returns
    -------
    (trns_X, cont):
        ``trns_X`` is a DataFrame of features indexed by business days
        ending today, with columns named ``I_*``, ``R_*`` and ``N_*``
        (``n_informative``, ``n_redundant`` and the remaining columns, in
        that order).
        ``cont`` is a DataFrame on the same index with columns ``'bin'``
        (labels), ``'w'`` (equal weights ``1 / n_samples``) and ``'t1'``
        (a copy of the index values).
    """

    trns_X, cont = make_classification(n_samples=n_samples, n_features=n_features,
                                       n_informative=n_informative, n_redundant=n_redundant,
                                       random_state=0, shuffle=False)
    # business-day index of length n_samples ending today
    df0 = pd.date_range(end=pd.Timestamp.today(), periods=n_samples, freq=BDay())
    trns_X, cont = pd.DataFrame(trns_X, index=df0), pd.Series(cont, index=df0).to_frame('bin')
    df0 = ['I_' + str(i) for i in range(n_informative)] + ['R_' + str(i) for i in range(n_redundant)]
    # every remaining column gets an N_ name
    df0 += ['N_' + str(i) for i in range(n_features - len(df0))]
    trns_X.columns = df0
    cont['w'] = 1.0 / cont.shape[0]
    cont['t1'] = pd.Series(cont.index, index=cont.index)

    return trns_X, cont