# Which feature is most important for the gradient boosting classifier to determine tinnitus occurrence?
We used three different methods to determine the feature importance:
- correlation
- permutation importance
- gini importance

In [1]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
import joblib
from sklearn.inspection import permutation_importance
from scipy.stats import pointbiserialr

In [2]:
# Add the project's `src` directory to the import path so the notebook can
# import d00_utils/utilities.py.
# The project root can be overridden with the TINNITUS_COUNTRY_ROOT environment
# variable; when it is unset, the original machine-specific location is used.
import os
import sys

project_root = (os.environ.get('TINNITUS_COUNTRY_ROOT')
                or 'C:\\Users\\joa24jm\\Documents\\tinnitus-country')
src_dir = os.path.join(project_root, 'src')
# Avoid stacking duplicate entries when the cell is run more than once.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from d00_utils import utilities as u

In [3]:
#%% read in df
# Project root used to build every data/model path below. It can be overridden
# with the TINNITUS_COUNTRY_ROOT environment variable; when it is unset, the
# original machine-specific location is used.
import os

p_loc = (os.environ.get('TINNITUS_COUNTRY_ROOT')
         or 'C:/Users/joa24jm/Documents/tinnitus-country/')
# Later cells concatenate `p_loc + 'relative/path'`, so keep a trailing separator.
if not p_loc.endswith(('/', '\\')):
    p_loc += '/'

df = pd.read_csv(p_loc + 'data/03_processed/df_equal_splits.csv')

## Gradient Boosting **Classifier** `Tinnitus Yes or No`

In [4]:
# Feature columns, grouped by what they describe.
country_cols = ['AT', 'CA', 'CH', 'DE', 'GB', 'IT', 'NL', 'NO', 'RU', 'US']
season_cols = ['autumn', 'spring', 'summer', 'winter']
demographic_cols = ['Male', 'year_of_birth']
ema_cols = ['question4', 'question5', 'question6', 'question7']
features = country_cols + season_cols + demographic_cols + ema_cols

# Predictors are the selected feature columns; the target is `question1`.
X = df[features]
y = df['question1']

# 70/30 shuffled split with a fixed seed, stratified on the target.
split_options = dict(test_size=0.3, random_state=42, shuffle=True, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# read in trained model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
clf = joblib.load(p_loc + 'results/04_models/best_estimator/gb.pkl')



In [6]:
# Display the loaded estimator so its hyperparameters appear in the output.
clf

GradientBoostingClassifier(learning_rate=0.5, max_depth=10, max_features=0.5,
                           random_state=42, subsample=1, verbose=1)

In [7]:
# set up result dataframe: one row per feature, no columns yet;
# each importance measure below is added as its own column.
res = pd.DataFrame(index = features)

### Gini importance


In [8]:
# Feature name -> impurity-based importance reported by the classifier.
# NOTE(review): names and scores are paired by position, so this assumes the
# model was fitted on columns in the same order as x_test — confirm.
gini_importances = {
    name: score
    for name, score in zip(x_test.columns, clf.feature_importances_.tolist())
}

In [9]:
# Look up every row label of `res` (a feature name) in the name -> importance mapping.
res['gini_clf'] = res.index.map(gini_importances)

### Permutation importance

In [10]:
# Permutation importance of the classifier on the held-out test split:
# 10 shuffles per feature, with a fixed seed for repeatability.
r = permutation_importance(clf, x_test, y_test, n_repeats = 10, random_state = 42)

In [11]:
# Feature name -> mean permutation importance over the repeats, then stored
# in the result table by row label.
permutation_importances = {
    name: mean_importance
    for name, mean_importance in zip(x_test.columns, r.importances_mean.tolist())
}
res['permutation_clf'] = res.index.map(permutation_importances)

### Correlation
- categorical-categorical: Cramer's V
- categorical-continuous : Point Biserial

In [12]:
# define feature types
# Country, season and 'Male' columns are labelled categorical; year of birth
# and the question columns are labelled continuous. The label value
# 'continous' is kept exactly as spelled because other cells compare against it.
feature_types = {
    **dict.fromkeys(
        ['AT', 'CA', 'CH', 'DE', 'GB', 'IT', 'NL', 'NO', 'RU', 'US',
         'autumn', 'spring', 'summer', 'winter',
         'Male'],
        'categorical'),
    **dict.fromkeys(
        ['year_of_birth', 'question4', 'question5', 'question6', 'question7'],
        'continous'),
}

In [13]:
# Correlation of every feature with the classification target `y`:
#   - categorical feature: u.cramers_corrected_stat on the feature/target
#     contingency table (presumably bias-corrected Cramér's V — see utilities.py)
#   - any other feature:   point-biserial correlation coefficient
# The column is initialised with float NaN rather than None so that it keeps a
# numeric dtype for the later abs()/rank()/percentage formatting.
res['correlations_clf'] = float('nan')
for col, kind in feature_types.items():
    if kind == 'categorical':
        c = u.cramers_corrected_stat(pd.crosstab(X[col], y))
    else:
        c = pointbiserialr(X[col].values, y.values)[0]
    res.loc[col, 'correlations_clf'] = c
        

### Rank features importances

In [14]:
# Rank the features per measure by absolute value (rank 1 = largest).
# The three score columns are selected explicitly: ranking the whole frame only
# works while `res` has exactly three columns, so the cell failed when re-run
# after the rank columns had been added. The cast to float makes the ranking
# independent of an object-dtype score column.
clf_score_cols = ['gini_clf', 'permutation_clf', 'correlations_clf']
res[['gini_clf_rank', 'permutation_clf_rank', 'correlations_clf_rank']] = (
    res[clf_score_cols].astype(float).abs().rank(ascending = False)
)

In [15]:
# Show scores as percentages with two decimals and ranks as whole numbers.
res.style.format({
    **{col: '{:.2%}' for col in ('gini_clf', 'permutation_clf', 'correlations_clf')},
    **{col: '{:.0f}' for col in ('gini_clf_rank', 'permutation_clf_rank', 'correlations_clf_rank')},
})

Unnamed: 0,gini_clf,permutation_clf,correlations_clf,gini_clf_rank,permutation_clf_rank,correlations_clf_rank
AT,0.35%,0.19%,2.88%,20,20,14
CA,0.59%,0.85%,4.37%,19,16,11
CH,1.65%,1.26%,9.38%,14,13,5
DE,2.37%,1.99%,3.64%,9,9,13
GB,0.92%,0.78%,0.00%,16,17,20
IT,0.62%,0.35%,7.61%,18,19,7
NL,0.93%,0.99%,0.25%,15,15,17
NO,0.84%,0.38%,7.51%,17,18,8
RU,2.19%,1.09%,13.41%,10,14,3
US,2.10%,2.26%,7.77%,11,7,6


## Gradient Boosting **Regressor** `Tinnitus Loudness`

In [16]:
# Load the data set for the regressor section; this replaces the `df` that was
# loaded for the classifier section.
df = pd.read_csv(p_loc + 'data/03_processed/df_equal_splits_with_age_with_question2_question_3.csv')

In [17]:
# Add an (initially empty) 'age' row to the result table, because the regressor
# features contain 'age' where the classifier features had 'year_of_birth'.
# DataFrame.append was removed in pandas 2.0; reindexing produces the same
# NaN-filled row, and the guard keeps the cell from adding a duplicate 'age'
# row when it is run again.
if 'age' not in res.index:
    res = res.reindex(res.index.tolist() + ['age'])

In [18]:
# Feature columns, grouped by what they describe ('age' is used here instead
# of 'year_of_birth').
country_cols = ['AT', 'CA', 'CH', 'DE', 'GB', 'IT', 'NL', 'NO', 'RU', 'US']
season_cols = ['autumn', 'spring', 'summer', 'winter']
demographic_cols = ['Male', 'age']
ema_cols = ['question4', 'question5', 'question6', 'question7']
features = country_cols + season_cols + demographic_cols + ema_cols

# Predictors are the selected feature columns; the target is `question2`.
X = df[features]
y = df['question2']

# 70/30 shuffled split with a fixed seed (no stratification is requested here).
split_options = dict(test_size=0.3, random_state=42, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, **split_options)

In [19]:
# read in trained model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
reg = joblib.load(p_loc + 'results/04_models/best_estimator/gb_regressor/gb_21_07_12_16_34.pkl')

In [20]:
# Display the loaded estimator so its hyperparameters appear in the output.
reg

GradientBoostingRegressor(learning_rate=0.5, max_depth=10, max_features=0.75,
                          random_state=42, subsample=1, verbose=1)

### Gini importance

In [21]:
# Feature name -> impurity-based importance reported by the regressor, then
# stored in the result table by row label.
# NOTE(review): names and scores are paired by position, so this assumes the
# model was fitted on columns in the same order as x_test — confirm.
gini_importances = {
    name: score
    for name, score in zip(x_test.columns, reg.feature_importances_.tolist())
}
res['gini_reg'] = res.index.map(gini_importances)

In [22]:
# Inspect the name -> importance mapping built from the regressor.
gini_importances

{'AT': 0.004005770607977503,
 'CA': 0.0023793270916620235,
 'CH': 0.008985429871409852,
 'DE': 0.019754157129697185,
 'GB': 0.011135356496379294,
 'IT': 0.0023984020939284098,
 'NL': 0.006994795145202882,
 'NO': 0.0029809487858589064,
 'RU': 0.001564617451743012,
 'US': 0.009645271600897254,
 'autumn': 0.013330201279390191,
 'spring': 0.010962208755617035,
 'summer': 0.013191562459933963,
 'winter': 0.015881838317825237,
 'Male': 0.033155737636983845,
 'age': 0.30578059782433986,
 'question4': 0.07361243581241782,
 'question5': 0.047075259419085226,
 'question6': 0.2731327285608401,
 'question7': 0.14403335365881031}

In [23]:
# NOTE(review): repeats the `res['gini_reg']` assignment already made in the
# cell that built `gini_importances` for the regressor; harmless but redundant.
res['gini_reg'] = res.index.map(gini_importances)

In [24]:
# Show the regressor's gini column. 'year_of_birth' has no entry in
# `gini_importances` (the regressor features use 'age'), so it maps to NaN.
res['gini_reg']

AT               0.004006
CA               0.002379
CH               0.008985
DE               0.019754
GB               0.011135
IT               0.002398
NL               0.006995
NO               0.002981
RU               0.001565
US               0.009645
autumn           0.013330
spring           0.010962
summer           0.013192
winter           0.015882
Male             0.033156
year_of_birth         NaN
question4        0.073612
question5        0.047075
question6        0.273133
question7        0.144033
age              0.305781
Name: gini_reg, dtype: float64

### Permutation importance


In [25]:
# Permutation importance of the regressor on the held-out test split
# (10 shuffles per feature, fixed seed), stored by feature name.
r = permutation_importance(
    reg,
    x_test,
    y_test,
    n_repeats=10,
    random_state=42,
)
permutation_importances = {
    name: mean_importance
    for name, mean_importance in zip(x_test.columns, r.importances_mean.tolist())
}
res['permutation_reg'] = res.index.map(permutation_importances)

In [26]:
# define feature types
# note that 'year_of_birth' has become 'age'
# Country, season and 'Male' columns are labelled categorical; age and the
# question columns are labelled continuous. The label value 'continous' is
# kept exactly as spelled because other cells compare against it.
feature_types = {
    **dict.fromkeys(
        ['AT', 'CA', 'CH', 'DE', 'GB', 'IT', 'NL', 'NO', 'RU', 'US',
         'autumn', 'spring', 'summer', 'winter',
         'Male'],
        'categorical'),
    **dict.fromkeys(
        ['age', 'question4', 'question5', 'question6', 'question7'],
        'continous'),
}

### Correlation
- categorical-continuous  : Point Biserial
- continuous-continuous : SpearmanR

In [27]:
# as loudness has a non-gaussian distribution, the spearmanr correlation coefficient applies.
from scipy.stats import spearmanr

In [28]:
# Intermediate view: classifier columns plus the regressor's gini and
# permutation columns added so far.
res

Unnamed: 0,gini_clf,permutation_clf,correlations_clf,gini_clf_rank,permutation_clf_rank,correlations_clf_rank,gini_reg,permutation_reg
AT,0.003525,0.001909,0.028785,20.0,20.0,14.0,0.004006,0.008727
CA,0.005936,0.00855,0.043684,19.0,16.0,11.0,0.002379,0.004958
CH,0.016477,0.012649,0.093836,14.0,13.0,5.0,0.008985,0.026791
DE,0.023666,0.019906,0.036434,9.0,9.0,13.0,0.019754,0.080127
GB,0.009213,0.007804,0.0,16.0,17.0,20.0,0.011135,0.029519
IT,0.006245,0.003479,0.076135,18.0,19.0,7.0,0.002398,0.006719
NL,0.00928,0.009877,0.002537,15.0,15.0,17.0,0.006995,0.019488
NO,0.008392,0.003834,0.075149,17.0,18.0,8.0,0.002981,0.00566
RU,0.021857,0.010935,0.134062,10.0,14.0,3.0,0.001565,0.007452
US,0.021026,0.022582,0.077724,11.0,7.0,6.0,0.009645,0.033802


In [29]:
# Correlation of every feature with the regression target `y`:
#   - categorical feature: point-biserial correlation coefficient
#   - continuous feature:  Spearman rank correlation coefficient
# The column is initialised with float NaN rather than None so that it keeps a
# numeric dtype for the later abs()/rank()/percentage formatting.
res['correlations_reg'] = float('nan')
for col, kind in feature_types.items():
    if kind == 'categorical':
        c = pointbiserialr(X[col].values, y.values)[0]
    elif kind == 'continous':
        c, _ = spearmanr(X[col].values, y.values)
    else:
        # Name the offending feature and skip it. Previously execution fell
        # through and wrote the preceding feature's coefficient for it.
        print(f'Variable {col} has unsupported type {kind!r}. Must be either categorical or continous')
        continue

    res.loc[col, 'correlations_reg'] = c

In [30]:
# Intermediate view: the table now also holds the `correlations_reg` column.
res

Unnamed: 0,gini_clf,permutation_clf,correlations_clf,gini_clf_rank,permutation_clf_rank,correlations_clf_rank,gini_reg,permutation_reg,correlations_reg
AT,0.003525,0.001909,0.028785,20.0,20.0,14.0,0.004006,0.008727,-0.035759
CA,0.005936,0.00855,0.043684,19.0,16.0,11.0,0.002379,0.004958,-0.046657
CH,0.016477,0.012649,0.093836,14.0,13.0,5.0,0.008985,0.026791,-0.088333
DE,0.023666,0.019906,0.036434,9.0,9.0,13.0,0.019754,0.080127,0.099763
GB,0.009213,0.007804,0.0,16.0,17.0,20.0,0.011135,0.029519,0.04852
IT,0.006245,0.003479,0.076135,18.0,19.0,7.0,0.002398,0.006719,-0.013719
NL,0.00928,0.009877,0.002537,15.0,15.0,17.0,0.006995,0.019488,-0.10345
NO,0.008392,0.003834,0.075149,17.0,18.0,8.0,0.002981,0.00566,-0.037403
RU,0.021857,0.010935,0.134062,10.0,14.0,3.0,0.001565,0.007452,-0.017414
US,0.021026,0.022582,0.077724,11.0,7.0,6.0,0.009645,0.033802,0.04079


In [35]:
import math
# Check whether the 'age' row is still missing its classifier gini importance
# (the classifier features contained 'year_of_birth', not 'age').
math.isnan(res.loc['age', 'gini_clf'])

True

In [36]:
# Merge the 'year_of_birth' row into the 'age' row: every value that 'age' is
# missing is taken from 'year_of_birth', then the 'year_of_birth' row is removed.
# - pd.isna also handles None, which math.isnan rejects with a TypeError.
# - The guard makes the cell a no-op once 'year_of_birth' has been dropped,
#   instead of raising KeyError on a second run.
if 'year_of_birth' in res.index:
    for col in res.columns:
        if pd.isna(res.loc['age', col]):
            res.loc['age', col] = res.loc['year_of_birth', col]

    res = res.drop(index = 'year_of_birth')


In [38]:
# View the table after the 'year_of_birth' row has been folded into 'age'.
res

Unnamed: 0,gini_clf,permutation_clf,correlations_clf,gini_clf_rank,permutation_clf_rank,correlations_clf_rank,gini_reg,permutation_reg,correlations_reg
AT,0.003525,0.001909,0.028785,20.0,20.0,14.0,0.004006,0.008727,-0.035759
CA,0.005936,0.00855,0.043684,19.0,16.0,11.0,0.002379,0.004958,-0.046657
CH,0.016477,0.012649,0.093836,14.0,13.0,5.0,0.008985,0.026791,-0.088333
DE,0.023666,0.019906,0.036434,9.0,9.0,13.0,0.019754,0.080127,0.099763
GB,0.009213,0.007804,0.0,16.0,17.0,20.0,0.011135,0.029519,0.04852
IT,0.006245,0.003479,0.076135,18.0,19.0,7.0,0.002398,0.006719,-0.013719
NL,0.00928,0.009877,0.002537,15.0,15.0,17.0,0.006995,0.019488,-0.10345
NO,0.008392,0.003834,0.075149,17.0,18.0,8.0,0.002981,0.00566,-0.037403
RU,0.021857,0.010935,0.134062,10.0,14.0,3.0,0.001565,0.007452,-0.017414
US,0.021026,0.022582,0.077724,11.0,7.0,6.0,0.009645,0.033802,0.04079


In [39]:
# Rank the features per regressor measure by absolute value (rank 1 = largest).
reg_score_cols = ['gini_reg', 'permutation_reg', 'correlations_reg']
reg_rank_cols = ['gini_reg_rank', 'permutation_reg_rank', 'correlations_reg_rank']
reg_ranks = res[reg_score_cols].abs().rank(ascending=False)
res[reg_rank_cols] = reg_ranks

In [40]:
# Show scores as percentages with two decimals and ranks as whole numbers,
# for both the classifier and the regressor columns.
res.style.format({
    **{col: '{:.2%}' for col in ('gini_clf', 'permutation_clf', 'correlations_clf',
                                 'gini_reg', 'permutation_reg', 'correlations_reg')},
    **{col: '{:.0f}' for col in ('gini_clf_rank', 'permutation_clf_rank', 'correlations_clf_rank',
                                 'gini_reg_rank', 'permutation_reg_rank', 'correlations_reg_rank')},
})

Unnamed: 0,gini_clf,permutation_clf,correlations_clf,gini_clf_rank,permutation_clf_rank,correlations_clf_rank,gini_reg,permutation_reg,correlations_reg,gini_reg_rank,permutation_reg_rank,correlations_reg_rank
AT,0.35%,0.19%,2.88%,20,20,14,0.40%,0.87%,-3.58%,16,16,16
CA,0.59%,0.85%,4.37%,19,16,11,0.24%,0.50%,-4.67%,19,20,11
CH,1.65%,1.26%,9.38%,14,13,5,0.90%,2.68%,-8.83%,14,14,7
DE,2.37%,1.99%,3.64%,9,9,13,1.98%,8.01%,9.98%,7,7,6
GB,0.92%,0.78%,0.00%,16,17,20,1.11%,2.95%,4.85%,11,12,9
IT,0.62%,0.35%,7.61%,18,19,7,0.24%,0.67%,-1.37%,18,18,20
NL,0.93%,0.99%,0.25%,15,15,17,0.70%,1.95%,-10.34%,15,15,5
NO,0.84%,0.38%,7.51%,17,18,8,0.30%,0.57%,-3.74%,17,19,14
RU,2.19%,1.09%,13.41%,10,14,3,0.16%,0.75%,-1.74%,20,17,19
US,2.10%,2.26%,7.77%,11,7,6,0.96%,3.38%,4.08%,13,11,12


In [None]:
# save results
# res.to_csv(p_loc + 'results/01_tables/feature_importance_with_regression.csv')