### Discovery of Superionic Solid-State Electrolytes for Li-Ion Batteries via Machine Learning

#### Module

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import plot_confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier      


import shap

#### Surrogate Model

In [None]:
# Repeated stratified hold-out evaluation of the surrogate classifiers.
#
# Each model is refit on 100 different 80/20 stratified splits of
# (DB_X, DB_y), one per random seed. For every run this cell records:
#   * hold-out predictions and positive-class probabilities (PRED, PROBA),
#   * predictions and probabilities on TEST_DB (TEST_PRED, TEST_PROBA),
#   * the fitted model's feature importances (FI, indexed by DB_X.columns),
# with one column per run named '<model>_<seed>', plus the per-model
# accuracy / precision / recall / F1 over the 100 runs (ACC, PRECISION,
# RECALL, F1; one column per model, one row per seed).
#
# `rf`, `lgbm`, `DB_X`, `DB_y` and `TEST_DB` must already be defined.
models = [rf, lgbm]

models_name = ['RF', 'LGBM']

# Columns are gathered in plain dicts and turned into DataFrames once at the
# end; growing a DataFrame with pd.concat on every iteration re-copies all
# previously stored columns each time.
pred_cols = {}
proba_cols = {}
test_pred_cols = {}
test_proba_cols = {}
fi_cols = {}

acc_cols = {}
precision_cols = {}
recall_cols = {}
f1_cols = {}

for clf_n, clf in zip(models_name, models):

    A_result = []
    P_result = []
    R_result = []
    F_result = []

    for i in range(100):

        # The seed doubles as the run identifier so every split is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(
            DB_X, DB_y, train_size=0.8, random_state=i, stratify=DB_y)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_proba = clf.predict_proba(X_test)[:, 1]  # probability of class index 1

        test_pred = clf.predict(TEST_DB)
        test_proba = clf.predict_proba(TEST_DB)[:, 1]

        run_name = '{}_{}'.format(clf_n, i)

        pred_cols[run_name] = y_pred
        proba_cols[run_name] = y_proba
        # Importances are read from the model fitted in *this* run; the
        # previous code referenced an undefined name `fi`.
        fi_cols[run_name] = clf.feature_importances_

        test_pred_cols[run_name] = test_pred
        test_proba_cols[run_name] = test_proba

        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        A_result.append(acc)
        P_result.append(precision)
        R_result.append(recall)
        F_result.append(f1)

        # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
        # ConfusionMatrixDisplay.from_estimator is its replacement if the
        # environment is upgraded.
        plot_confusion_matrix(clf, X_test, y_test, cmap='Blues')
        plt.show()
        print('Model: {} - Random_state : {} - ACC: {:.4f} | Prec: {:.4f} | Recall: {:.4f} | F1: {:.4f} '.format(
            clf_n, i, acc, precision, recall, f1))

    acc_cols[clf_n] = A_result
    precision_cols[clf_n] = P_result
    recall_cols[clf_n] = R_result
    f1_cols[clf_n] = F_result

# Dict insertion order fixes the column order: all runs of the first model,
# then all runs of the second.
PRED = pd.DataFrame(pred_cols)
PROBA = pd.DataFrame(proba_cols)
TEST_PRED = pd.DataFrame(test_pred_cols)
TEST_PROBA = pd.DataFrame(test_proba_cols)
FI = pd.DataFrame(fi_cols, index=DB_X.columns)

ACC = pd.DataFrame(acc_cols)
PRECISION = pd.DataFrame(precision_cols)
RECALL = pd.DataFrame(recall_cols)
F1 = pd.DataFrame(f1_cols)

#### SHAP Analysis

In [None]:
# Refit the random forest on the full dataset, then summarise the SHAP
# values of every feature over that same dataset.
rf_model = rf.fit(DB_X, DB_y)
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(DB_X)
shap.summary_plot(shap_values, DB_X)

In [None]:
# Refit the LightGBM classifier on the full dataset, then summarise the SHAP
# values of every feature over that same dataset.
lgbm_model = lgbm.fit(DB_X, DB_y)
explainer = shap.TreeExplainer(lgbm_model)
shap_values = explainer.shap_values(DB_X)
shap.summary_plot(shap_values, DB_X)