# Which feature is most important for the gradient boosting classifier to determine tinnitus occurrence?
We used three different methods to determine the feature importance:
- correlation
- permutation importance
- gini importance

In [1]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import joblib
from sklearn.inspection import permutation_importance
from scipy.stats import pointbiserialr

In [2]:
# add src to path so the notebook can import utilities.py
import sys
sys.path.append('C:\\Users\\joa24jm\\Documents\\tinnitus-country\\src\\')

from d00_utils import utilities as u

In [3]:
#%% read in df
# project root; the data and result paths below are built from it
p_loc = 'C:/Users/joa24jm/Documents/tinnitus-country/'

# processed dataset that the features and the target are taken from
processed_csv = p_loc + 'data/03_processed/df_equal_splits.csv'
df = pd.read_csv(processed_csv)

In [4]:
# feature columns, grouped by kind
country_cols = ['AT', 'CA', 'CH', 'DE', 'GB', 'IT', 'NL', 'NO', 'RU', 'US']
season_cols = ['autumn', 'spring', 'summer', 'winter']
demographic_cols = ['Male', 'year_of_birth']
ema_cols = ['question4', 'question5', 'question6', 'question7']  # EMAs
features = country_cols + season_cols + demographic_cols + ema_cols

# inputs are the listed feature columns, the target is question1
X = df[features]
y = df['question1']

# 70/30 train/test split, shuffled with a fixed seed and stratified on y
split_kwargs = {
    'test_size': 0.3,
    'random_state': 42,
    'shuffle': True,
    'stratify': y,
}
x_train, x_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

In [5]:
# read in trained model
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source
model_file = p_loc + 'results/04_models/best_estimator/gb.pkl'
clf = joblib.load(model_file)



In [6]:
# show the loaded estimator and the hyperparameters it was configured with
clf

GradientBoostingClassifier(learning_rate=0.5, max_depth=10, max_features=0.5,
                           random_state=42, subsample=1, verbose=1)

In [6]:
# set up result dataframe: one row per feature, no columns yet
# (the importance measures are added as columns in the cells below)
res = pd.DataFrame(index = features)

## Gini importance


In [7]:
# impurity-based importances stored on the fitted model, converted to a plain list
# (stored under the 'gini' column of the results table)
gini_importances = clf.feature_importances_.tolist()

In [8]:
# NOTE(review): the list is assigned by position, so this assumes the model was
# fitted on the columns in the same order as `features` — TODO confirm
res['gini'] = gini_importances

## Permutation importance

In [9]:
# permutation importance of the loaded model on the held-out test split:
# 10 repeats per feature, with a fixed seed so the result is reproducible
r = permutation_importance(
    clf,
    x_test,
    y_test,
    n_repeats=10,
    random_state=42,
)

In [10]:
# mean importance over the repeats, as a plain list
permutation_importances = r.importances_mean.tolist()
# assigned by position: relies on x_test having its columns in `features` order
res['permutation'] = permutation_importances

## Correlation
- categorical-categorical: Cramér's V
- categorical-continuous : Point-biserial correlation

In [12]:
# define feature types
# (the type decides which correlation measure is used for a feature)
categorical_features = [
    'AT', 'CA', 'CH', 'DE', 'GB', 'IT', 'NL', 'NO', 'RU', 'US',  # countries
    'autumn', 'spring', 'summer', 'winter',                      # season
    'Male',                                                      # demographics
]
continuous_features = [
    'year_of_birth',
    'question4', 'question5', 'question6', 'question7',
]

# the spelling 'continous' is a runtime value and is kept exactly as it was
feature_types = dict.fromkeys(categorical_features, 'categorical')
feature_types.update(dict.fromkeys(continuous_features, 'continous'))

In [13]:
# calculate the correlation between each feature and the target
# - categorical features: u.cramers_corrected_stat on the feature/target
#   contingency table (presumably a bias-corrected Cramer's V — confirm in utilities.py)
# - all other features: point-biserial correlation coefficient
# The values are collected first and assigned as one float Series, so the
# column gets a numeric dtype; a column initialised with None and filled
# cell by cell would stay object dtype.
correlations = {}
for col, kind in feature_types.items():
    if kind == 'categorical':
        c = u.cramers_corrected_stat(pd.crosstab(X[col], y))
    else:
        # pointbiserialr returns (coefficient, p-value); only the coefficient is kept
        c = pointbiserialr(X[col].values, y.values)[0]
    correlations[col] = c

# aligned on the index of res; a feature without an entry ends up as NaN
res['correlations'] = pd.Series(correlations, dtype=float)
        

## Rank feature importances

In [14]:
# Rank every importance measure by absolute magnitude (rank 1 = largest value).
# The measure columns are selected by name, so the result does not depend on
# the column order of `res` and the cell can be re-run after the *_rank
# columns already exist.
measures = ['gini', 'permutation', 'correlations']
ranks = res[measures].astype(float).abs().rank(ascending = False)
for measure in measures:
    res[measure + '_rank'] = ranks[measure]

In [15]:
# display the table: importance values as percentages with two decimals,
# ranks as whole numbers
measure_cols = ['gini', 'permutation', 'correlations']
column_formats = {name: '{:.2%}' for name in measure_cols}
column_formats.update({name + '_rank': '{:.0f}' for name in measure_cols})
res.style.format(column_formats)

Unnamed: 0,gini,permutation,correlations,gini_rank,permutation_rank,correlations_rank
AT,0.35%,0.19%,2.88%,20,20,14
CA,0.59%,0.85%,4.37%,19,16,11
CH,1.65%,1.26%,9.38%,14,13,5
DE,2.37%,1.99%,3.64%,9,9,13
GB,0.92%,0.78%,0.00%,16,17,20
IT,0.62%,0.35%,7.61%,18,19,7
NL,0.93%,0.99%,0.25%,15,15,17
NO,0.84%,0.38%,7.51%,17,18,8
RU,2.19%,1.09%,13.41%,10,14,3
US,2.10%,2.26%,7.77%,11,7,6


In [16]:
# save results
# res.to_csv(p_loc + 'results/01_tables/feature_importance.csv')