In [None]:
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
import warnings
from sklearn.model_selection import cross_val_score, KFold
import pandas as pd
import itertools
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt
import os
import progressbar 

%matplotlib inline

# Keep DataFrame reprs compact: at most 10 rows and 60 columns are displayed.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 60
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Columns used as regression inputs (descriptors) and the column used as the
# regression target.
DESCRIPTOR_NAMES = [
    'C_R', 'C_T', 'vol_per_atom', 'Z', 'f4', 'd5',
    'L4f', 'S4f', 'J4f', '(g-1)J4f', '(2-g)J4f',
]
TARGET_NAME = 'Tc'

# The data directory is resolved relative to the notebook's working directory.
ROOT=".."
df = pd.read_csv(f"{ROOT}/data/TC_ReCo_detail_descriptor.csv")


In [None]:
# Pull the descriptor columns (one column per descriptor) and the target
# column out of the DataFrame as NumPy arrays.
Xraw = df[DESCRIPTOR_NAMES].to_numpy()
y = df[TARGET_NAME].to_numpy()

In [None]:
from sklearn.preprocessing import StandardScaler
# Rescale the descriptor columns with StandardScaler, fitted on the full data
# set. `scaler` is kept so the same transform can be re-applied later.
# NOTE(review): the scaler sees every sample before cross-validation splits
# are made further down — confirm this is acceptable for the analysis.
scaler = StandardScaler().fit(Xraw)
X = scaler.transform(Xraw)


In [None]:
def all_combinations(n: int, m: int=None):
    """Yield every non-empty index subset of ``range(n)`` up to size ``m``.

    Subsets are produced as tuples of ints, smallest size first; within one
    size they follow the order of ``itertools.combinations``.

    Args:
        n: number of items; indices are drawn from ``range(n)``.
        m: largest subset size to generate. ``None`` means ``n`` (all sizes).

    Yields:
        Tuples of indices, sizes 1 through ``m``.
    """
    largest = n if m is None else m
    indices = range(n)
    for size in range(1, largest + 1):
        yield from itertools.combinations(indices, size)

In [None]:
from sklearn.metrics import make_scorer
def make_cv_model(x, y, nfold=5, random_state=0):
    """Cross-validate a ridge regression and refit it on the full data.

    Args:
        x: feature matrix handed to the regressor (callers in this notebook
            pass a column subset of the standardized descriptors).
        y: target values.
        nfold: number of folds for the shuffled KFold split.
        random_state: seed for the KFold shuffle, so scores are reproducible.

    Returns:
        Tuple ``(mean, std, coef)``: the mean and standard deviation
        (``np.std``) of the per-fold R^2 scores, and ``coef_`` of the model
        refitted on all of ``x`` and ``y``.
    """
    kf = KFold(n_splits=nfold, shuffle=True, random_state=random_state)
    # `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, where
    # passing it raises TypeError. False was the default, so omitting it gives
    # the same model on older versions too.
    reg = Ridge(fit_intercept=True, alpha=0.001)
    scorelist = cross_val_score(
        reg, x, y, scoring=make_scorer(r2_score), cv=kf)
    mean = np.mean(scorelist)  # mean of the fold scores
    std = np.std(scorelist)  # standard deviation of the fold scores
    # cross_val_score fits clones of `reg`, so refit on all data to obtain
    # the coefficients.
    reg.fit(x, y)
    return mean, std, reg.coef_


In [None]:
# Exhaustive search: cross-validate one ridge model for every non-empty subset
# of the standardized descriptor columns and record its scores/coefficients.
P = X.shape[1]
ncombi = 2**P-1  # number of non-empty subsets of P columns
combi_list, mean_list, std_list, coef_list = [], [], [], []
bar = progressbar.ProgressBar(max_value=ncombi)
for step, icombi in enumerate(all_combinations(P), start=1):
    bar.update(step)
    combi_list.append(icombi)
    # icombi is a tuple of column indices selecting this subset's descriptors.
    mean, std, coef = make_cv_model(X[:, icombi], y)
    mean_list.append(mean)
    std_list.append(std)
    # Store the coefficients as a flat 1-D array.
    coef_list.append(coef.ravel())

mean_list = np.array(mean_list)
std_list = np.array(std_list)


In [None]:
from all_combinations_misc import plot_r2_hist
# One row per descriptor subset, ranked best-first by mean cross-validated
# R^2, with a fresh 0..N-1 index so the row position equals the rank.
df_score = (
    pd.DataFrame({
        "combination": combi_list,
        "score_mean": mean_list,
        "score_std": std_list,
        "coef": coef_list,
    })
    .sort_values(by="score_mean", ascending=False)
    .reset_index(drop=True)
)
plot_r2_hist(df_score,xlim=(-0.5,1))


In [None]:
from all_combinations_misc import calculate_coeffix
# Expand each model's coefficient vector into one column per descriptor.
# NOTE(review): presumably descriptors absent from a combination get 0 (the
# relevance cell below filters on `== 0`) — confirm in all_combinations_misc.
coeffixlist = calculate_coeffix(
    DESCRIPTOR_NAMES, df_score["combination"].values, df_score["coef"].values)
df_coef = pd.DataFrame(coeffixlist, columns=DESCRIPTOR_NAMES)
# Join scores and per-descriptor coefficients side by side, then rank by
# mean R^2 (best first) with a fresh index.
df_result = (
    pd.concat([df_score, df_coef], axis=1)
    .sort_values(by="score_mean", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Rank plot of the best models: mean cross-validated R^2 against rank index,
# with the fold-to-fold standard deviation as error bars.
#
# USE_PANDAS_PLOT selects between the plain pandas/matplotlib version and the
# project helper. The original hard-coded `if True:`, which left the helper
# branch unreachable without editing the condition itself.
USE_PANDAS_PLOT = True
# Number of top-ranked rows of df_result to draw.
N_TOP_MODELS = 200
if USE_PANDAS_PLOT:
    fig, ax = plt.subplots()  # get the matplotlib figure and axes
    df_result.iloc[:N_TOP_MODELS, :].plot(y="score_mean", yerr="score_std", ax=ax)
    ax.set_xlabel("index")  # x-axis label
    ax.set_ylabel("$R^2$")  # y-axis label
    fig.tight_layout()
else:
    from all_combinations_misc import plot_index_r2
    plot_index_r2(df_result.iloc[:N_TOP_MODELS,], y="score_mean", yerr="score_std",
                  xlabel="index", ylabel="$R^2$")

In [None]:
from all_combinations_misc import plot_weight_diagram
# Two weight diagrams side by side, drawn with nmax=50 (left) and nmax=200
# (right). NOTE(review): nmax is presumably the number of top-ranked models
# shown — confirm in all_combinations_misc.
fig, axes = plt.subplots(1,2, figsize=(10,3))
for panel, top_n in zip(axes, (50, 200)):
    plot_weight_diagram(df_result, DESCRIPTOR_NAMES, nmax=top_n, ax=panel)

In [None]:
from all_combinations_misc import make_and_plot_block_weight_list
# Score bands used to group models by mean cross-validated R^2.
# The bands are half-open intervals [lo, hi): with the original strict `>`
# lower bounds, a score exactly equal to 0.15, 0.5 or 0.7 matched no band and
# was silently left out.
# NOTE(review): assumes the helper evaluates each string with DataFrame.query
# — confirm in all_combinations_misc.
querylist = ["score_mean<0.15", "score_mean>=0.15 and score_mean<0.5",
             "score_mean>=0.5 and score_mean<0.7", "score_mean>=0.7"]
make_and_plot_block_weight_list(df_result, DESCRIPTOR_NAMES, querylist)


In [None]:
from all_combinations_misc import make_indicator_diagram
from all_combinations_misc import make_all_ind_by_index
# Region layout handed to the helpers: region ids 0..5 with a size of 200.
# NOTE(review): presumably each region is a window of `regionsize` consecutive
# ranked models — confirm in all_combinations_misc.
regionsize = 200
regions = list(range(6))

df_indicator_diagram = make_indicator_diagram(df_result, DESCRIPTOR_NAMES)
df_imp_by_index = make_all_ind_by_index(
    df_indicator_diagram, DESCRIPTOR_NAMES, regions, regionsize)

from all_combinations_misc import plot_df_imp_by_index
plot_df_imp_by_index(
    df_imp_by_index, DESCRIPTOR_NAMES, regions, regionsize)

In [None]:
from all_combinations_misc import plot_importance

# For each descriptor, measure how far the best mean R^2 drops when only
# models whose column for that descriptor equals 0 are considered.
# NOTE(review): a 0 in the descriptor column presumably means the descriptor
# was not part of the combination — confirm against calculate_coeffix.
global_max = df_result["score_mean"].max()
relv = [
    [descriptor,
     global_max - df_result.loc[df_result[descriptor] == 0, "score_mean"].max()]
    for descriptor in DESCRIPTOR_NAMES
]
df_relv = pd.DataFrame(relv, columns=["descriptor", "diffR2"])
plot_importance(df_relv, x="descriptor", y="diffR2")