In [None]:
def FIscores(directory):
    # directory: the pass for a csv data file.
    # Environment Settings

    !pip install minepy

    print(__doc__)
    import numpy as np
    import pandas as pd
    %matplotlib inline
    import seaborn as sns
    import matplotlib.pyplot as plt

    from sklearn.datasets import fetch_openml
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.impute import SimpleImputer
    from sklearn.inspection import permutation_importance
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    print('')
    print('□ The pass of the dataset for the analysis is as follows:\n\t {}'.format(directory))
    print('')

    df = pd.read_csv(directory, encoding= 'unicode_escape')
    print('')
    print('□ Number of colums and rows of the dataset:')
    print(df.shape)
    print('')

    pd.set_option('display.max_rows', 1000)
    nullcounts = df.isnull().sum().values

    if max(nullcounts) == 0:
        print('◇ All the cells were not empty.')
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]
    else:
        print('◇ Some cells were empty.')
        print(nullcounts)
        print('Non-zero integers are the number of empty cells for each variable.\n⇒　They were filled with the median value of the same row.')
        df2 = df.fillna(df.median())
        X = df2.iloc[:, :-1]
        y = df2.iloc[:, -1]

    print('')
    print('□ The dataset is expressed as X and y.')
    print('')
    print('')    
    feature_names = X.columns
    feature_types = X.dtypes
    feature_kinds = list(map(lambda t: len(set(t)), X.values.transpose()))
    all_names=X.columns

    fis = ['feature importance score']

    # Univariate Feature Importance for categorical target
    print('◆ Feature Importance with F stastistic computed')
    print('')
    from sklearn.feature_selection import f_regression

    f1 = pd.Series(f_regression(X, y)[0], index = X.columns)
    fi_fstatistic = f1 / f1.sum() * 100
    fi_fstatistic_pd = pd.DataFrame(fi_fstatistic, columns=['FI with F-statistic'])
    fi_fstatistic_val = fi_fstatistic.values
    # print('◇ Feature Importance with F statistic in the original order')
    # print('')
    # display(fi_fstatistic_pd)
    #  fi_fstatistic_pd.to_csv('./fi_fstatistic.csv')

    print('')

    print('◆ Feature Importance with Maximal Information Coefficient computed')
    print('')
    from minepy import MINE
    def get_mic(X, y):
        mine = MINE()
        mine.compute_score(X, y)
        return mine.mic()
    f2 = X.apply(lambda feature: get_mic(feature, y))
    fi_MIC = f2 / f2.sum() * 100
    fi_MIC_pd = pd.DataFrame(fi_MIC, columns=['FI with MIC'])
    fi_MIC_val = fi_MIC.values

    # print('◇ Feature Importance with MIC (Maximal Information Coefficient) in the original order')
    # print('')
    # display(fi_MIC_pd)
    # fi_MIC_pd.to_csv('./fi_MIC.csv')
    print('')

    print('◆ Impurity Reduction with Random Forest computed')
    print('')
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier().fit(X, y)
    fi_IR = pd.Series(rf.feature_importances_ * 100, index = X.columns)
    fi_IR_pd = pd.DataFrame(fi_IR, columns=['FI with IR'])
    fi_IR_val = fi_IR.values


    print('')
    # display(fi_IR_pd)
    # fi_IR_pd.to_csv('./fi_IR.csv')

    print('◆ Split Count with XGBoost computed')
    print('')    
    from xgboost import XGBClassifier
    xgb = XGBClassifier().fit(X, y)
    f3 = pd.Series(xgb.get_booster().get_score(importance_type='weight'))
    fi_split = f3 / f3.sum() * 100
    fi_split_pd0 = pd.DataFrame(fi_split, columns=['FI with Split Count'])
    fi_split_val0 = fi_split.values
    part_names = fi_split_pd0.index

    # display(fi_split_pd0)
    # fi_split_pd0.to_csv('./fi_split.csv')

    bag =[]
    for item in all_names:
        if item in part_names:
            pos = list(part_names).index(item)
            bag.append(fi_split_val0[pos])
        else:
            bag.append(0)
    fi_split_pd = pd.DataFrame(bag, columns=['FI with Split Count'])
    fi_split_pd.index = all_names
    fi_split_val = fi_split_pd.values

    # display(fi_split_pd)
    # fi_split_pd.to_csv('./fi_split.csv')


    print('')
    print('◆ Coverage with XGBoost computed')
    print('')   

    from xgboost import XGBClassifier
    f4 = pd.Series(xgb.get_booster().get_score(importance_type='cover'))
    fi_coverage = f4 / f4.sum() * 100
    fi_coverage_pd0 = pd.DataFrame(fi_coverage, columns = ['FI with Coverage'])
    fi_coverage_val0 = fi_coverage.values
    part_names = fi_coverage_pd0.index


    bag =[]
    for item in all_names:
        if item in part_names:
            pos = list(part_names).index(item)
            bag.append(fi_coverage_val0[pos])
        else:
            bag.append(0)
    fi_coverage_pd = pd.DataFrame(bag, columns=['FI with Coverage'])
    fi_coverage_pd.index = all_names
    fi_coverage_val = fi_coverage_pd.values    

    # display(fi_coverage_pd)
    # fi_coverage_pd.to_csv('./fi_coverage.csv')


    print('')
    print('◆ Permutation Importance test computed')
    print('')       

    types = []
    i = 0
    categorical_items = []
    numerical_items = []

    # category data: 0, numric data: 1
    for cont in feature_types:
        if cont == 'float64' or 'int64':
            numerical_items.append(feature_names[i])
            i += 1
        else:
            categorical_items.append(feature_names[i])
            i += 1


    categorical_columns = categorical_items
    numerical_columns = numerical_items

    X = X[categorical_columns + numerical_columns]

    categorical_encoder = OneHotEncoder(handle_unknown='ignore')
    numerical_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='mean'))
    ])

    preprocessing = ColumnTransformer(
        [('cat', categorical_encoder, categorical_columns),
        ('num', numerical_pipe, numerical_columns)])

    rf = Pipeline([
        ('preprocess', preprocessing),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    rf.fit(X, y)

    # Impurity-based Permutation Importance test on Training data
    ohe = (rf.named_steps['preprocess']
            .named_transformers_['cat'])
    if not categorical_columns == []:
        feature_names = ohe.get_feature_names_out(input_features=categorical_columns)
        feature_names = np.r_[feature_names, numerical_columns]
    else: 
        feature_names = np.r_[feature_names]

    originalnames=feature_names
    originalscores=rf.named_steps['classifier'].feature_importances_

    # print('■ Feature Importances in the ORIGINAL order')
    fi_permutation0 = np.array([originalnames,originalscores]).transpose()
    feature = ['variable', 'FI with Permutation']
    fi_permutation_pd = pd.DataFrame(fi_permutation0, columns=feature)
    fi_permutation_pd.set_index('variable', inplace=True)
    fi_permutation_val = fi_permutation_pd.values
    # fi_permutation.to_csv('./fi_permutation.csv')
    # display(result3)

    print('')
    print('')

    if len(fi_fstatistic_pd)==len(fi_MIC_pd)==len(fi_split_pd)==len(fi_coverage_pd)==len(fi_permutation_pd):
        all_fi = pd.concat([fi_fstatistic_pd, fi_MIC_pd, fi_IR_pd, fi_split_pd, fi_coverage_pd, fi_permutation_pd], axis=1)
        all_fi_val = all_fi.values
    else:
        print('The lengths of all augments for concat must coincide.')


    print('■ The result of Feature Importances with 6 different methods')
    display(all_fi)
    print('')
    print('◇ All the feature importances are expored in the root directory as all_feature_importances.csv')
    all_fi.to_csv('./all_feature_importances.csv')

    print('')
    print('')

In [None]:
# Change this to the path of the file that contains the data. Required data structure: numeric data only. The rightmost column is the target variable; all other columns are explanatory variables.
directory = 'drive/MyDrive/Kataoka/data/newMaxGD2.csv'

In [None]:
# Run the analysis on the configured CSV; the scores are displayed and written to ./all_feature_importances.csv.
FIscores(directory)

Collecting minepy
  Downloading minepy-1.2.5.tar.gz (495 kB)
[?25l[K     |▋                               | 10 kB 20.1 MB/s eta 0:00:01[K     |█▎                              | 20 kB 22.3 MB/s eta 0:00:01[K     |██                              | 30 kB 13.7 MB/s eta 0:00:01[K     |██▋                             | 40 kB 10.7 MB/s eta 0:00:01[K     |███▎                            | 51 kB 8.8 MB/s eta 0:00:01[K     |████                            | 61 kB 8.9 MB/s eta 0:00:01[K     |████▋                           | 71 kB 8.2 MB/s eta 0:00:01[K     |█████▎                          | 81 kB 8.9 MB/s eta 0:00:01[K     |██████                          | 92 kB 8.3 MB/s eta 0:00:01[K     |██████▋                         | 102 kB 7.6 MB/s eta 0:00:01[K     |███████▎                        | 112 kB 7.6 MB/s eta 0:00:01[K     |████████                        | 122 kB 7.6 MB/s eta 0:00:01[K     |████████▋                       | 133 kB 7.6 MB/s eta 0:00:01[K     |█████

Unnamed: 0,FI with F-statistic,FI with MIC,FI with IR,FI with Split Count,FI with Coverage,FI with Permutation
ï»¿BlankRBxM,0.089787,2.121202,0.845006,0.490196,3.004662,0.010278
Men,1.110153,0.324488,0.558604,0.0,0.0,0.002018
ageDATEDIF,0.401123,2.328301,1.59924,0.0,0.0,0.009999
localization3RenalCortex1innner2middle3outer,0.018924,0.03728,0.124603,0.0,0.0,0.000867
arteriolosclerosisG03,9.705273,2.425102,1.632801,2.45098,6.090855,0.031563
intimaThickenedG03,5.130943,1.601467,2.774264,3.921569,3.625853,0.018685
interstitialInflammationPercent,0.491709,0.383914,0.800447,0.0,0.0,0.003939
interstitialfibrosisPercent,0.273369,0.869677,1.017867,0.0,0.0,0.006982
Glomuruli,0.853944,2.128785,1.372074,0.490196,3.377224,0.01309
GSPercent,0.008899,1.132936,0.815622,0.0,0.0,0.012821



◇ All the feature importances are expored in the root directory as all_feature_importances.csv


