In [None]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Cap how much of a DataFrame pandas renders in cell output:
# at most 10 rows and 60 columns before it truncates the display.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 60


In [None]:
# Project root relative to this notebook, and the CSV file read from it.
ROOT = ".."
filename = ROOT + "/data/TC_ReCo_detail_descriptor.csv"
df = pd.read_csv(filename)

# Column that is predicted, and the columns used as model inputs.
TARGET_NAME = 'Tc'
DESCRIPTOR_NAMES = [
    'C_R', 'C_T', 'vol_per_atom', 'Z', 'f4', 'd5',
    'L4f', 'S4f', 'J4f', '(g-1)J4f', '(2-g)J4f',
]


In [None]:
from sklearn.preprocessing import StandardScaler

# Raw descriptor matrix and target vector pulled out of the loaded table.
Xraw = df[DESCRIPTOR_NAMES].values
y = df[TARGET_NAME].values

# Fit the scaler on the raw descriptors and apply it to the same data;
# only the inputs are scaled, the target is used as-is.
scaler = StandardScaler()
X = scaler.fit(Xraw).transform(Xraw)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Random forest with 100 trees, fitted on the full scaled data set.
# `random_state` is pinned so repeated runs build the same forest.
rf = RandomForestRegressor(n_estimators=100, random_state=1).fit(X, y)
rf


In [None]:
from importance_misc import plot_importance

# One row per descriptor, paired with the fitted forest's built-in
# `feature_importances_` value for that descriptor.
df_rf_imp = pd.DataFrame({
    "descriptor": DESCRIPTOR_NAMES,
    "importance": rf.feature_importances_,
})

# Project helper; receives the table and the names of its two columns.
plot_importance(df_rf_imp, "descriptor", "importance")


In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the random forest: 30 shuffles per descriptor,
# with a fixed seed. Note it is scored on the same X, y the forest was
# fitted on, not on held-out data.
perm_imp = permutation_importance(rf, X, y, random_state=20, n_repeats=30)


In [None]:
# Tabulate the mean permutation importance next to each descriptor name,
# using whatever result `perm_imp` currently holds, and plot it.
df_perm = pd.DataFrame({
    "importances_mean": perm_imp["importances_mean"],
    "descriptor": DESCRIPTOR_NAMES,
})
plot_importance(df_perm, "descriptor", "importances_mean")


In [None]:
from sklearn.linear_model import LinearRegression

# Linear model with an intercept, fitted on the same scaled X and y.
lr = LinearRegression(fit_intercept=True).fit(X, y)

# The magnitude of each fitted coefficient is the quantity that gets plotted.
df_lr_coef = pd.DataFrame({
    "descriptor": DESCRIPTOR_NAMES,
    "abs(coef)": np.absolute(lr.coef_),
})
plot_importance(df_lr_coef, "descriptor", "abs(coef)")


In [None]:
# Permutation importance of the linear model, with the same settings
# (30 repeats, seed 20) and the same X, y it was fitted on.
# NOTE: this rebinds `perm_imp`, replacing the random-forest result
# computed earlier in the notebook.
perm_imp = permutation_importance(lr, X, y, random_state=20, n_repeats=30)


In [None]:
# Rebuild the importance table from the current `perm_imp` and plot it.
# This rebinds `df_perm`, so the earlier table of the same name is replaced.
df_perm = pd.DataFrame({
    "importances_mean": perm_imp["importances_mean"],
    "descriptor": DESCRIPTOR_NAMES,
})
plot_importance(df_perm, "descriptor", "importances_mean")
