In [1]:
# common libraries
import pandas as pd
import numpy as np
import pickle
#import inspect
#import re

# sklearn additional libraries
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import classification_report, multilabel_confusion_matrix

# classification models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

def show_classification_report(x, y, title = None, labels = None):
    print(f'{title}')
    print(classification_report(x, y, target_names=labels))

def plot_multilabel_confusion_matrix(y_true, y_predicted, labels=None):
    vis_arr = multilabel_confusion_matrix(list(y_true), y_predicted)
    fig, ax = plt.subplots(len(vis_arr)//3+1, 3, figsize=(12, 4*(len(vis_arr)//3+1)))
    for axes, cfs_matrix, label in zip(ax.flatten(), vis_arr, labels):
            plot_multiclass_confusion_matrix(cfs_matrix, axes, label[:40], ["N", "Y"])
    fig.tight_layout()
    plt.show()

Load datasets and journals labels

In [2]:
labels = []
with open('../support_files/journals_labels.txt', 'r') as f:
    for line in f:
        labels.append(line[:-1])
labels = [l[:73] for l in labels]

In [3]:
X, y = pd.read_pickle('../data/X.pkl'), pd.read_pickle('../data/y.pkl')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=2319, 
                                                    shuffle=True, 
                                                    test_size=0.25)

### LightGBM

In [10]:
lgbm_model = LGBMClassifier(random_state=123, reg_alpha=0.25, reg_lambda=0.1)

In [11]:
lgbm_model.fit(X_train, list(y_train), 
               eval_metric='multi_logloss', 
               eval_set=[(X_test, y_test)])

[1]	valid_0's multi_logloss: 2.2759
[2]	valid_0's multi_logloss: 2.12056
[3]	valid_0's multi_logloss: 2.00778
[4]	valid_0's multi_logloss: 1.91962
[5]	valid_0's multi_logloss: 1.84808
[6]	valid_0's multi_logloss: 1.78929
[7]	valid_0's multi_logloss: 1.73716
[8]	valid_0's multi_logloss: 1.69249
[9]	valid_0's multi_logloss: 1.65285
[10]	valid_0's multi_logloss: 1.61766
[11]	valid_0's multi_logloss: 1.58489
[12]	valid_0's multi_logloss: 1.55595
[13]	valid_0's multi_logloss: 1.5304
[14]	valid_0's multi_logloss: 1.50625
[15]	valid_0's multi_logloss: 1.4834
[16]	valid_0's multi_logloss: 1.46363
[17]	valid_0's multi_logloss: 1.44531
[18]	valid_0's multi_logloss: 1.4273
[19]	valid_0's multi_logloss: 1.41094
[20]	valid_0's multi_logloss: 1.39614
[21]	valid_0's multi_logloss: 1.38142
[22]	valid_0's multi_logloss: 1.36756
[23]	valid_0's multi_logloss: 1.35438
[24]	valid_0's multi_logloss: 1.34295
[25]	valid_0's multi_logloss: 1.33155
[26]	valid_0's multi_logloss: 1.32028
[27]	valid_0's multi_logl

In [13]:
show_classification_report(list(y_train), lgbm_model.predict(X_train), 
                           title='TRAIN (LightGBM)', labels=labels)
show_classification_report(list(y_test), lgbm_model.predict(X_test), 
                           title='TEST (LightGBM)', labels=labels)

TRAIN (LightGBM)
                                                                           precision    recall  f1-score   support

                                            Bulletin of Materials Science       0.92      0.80      0.85      2520
                                          Journal of Electronic Materials       0.89      0.87      0.88      2594
                         Journal of Materials Engineering and Performance       0.90      0.87      0.89      2595
                      Journal of Materials Science: Materials in Medicine       0.92      0.94      0.93      2577
                                         Journal of Nanoparticle Research       0.88      0.90      0.89      2589
                                Journal of Sol-Gel Science and Technology       0.93      0.95      0.94      2566
Journal of Surface Investigation: X-ray, Synchrotron and Neutron Techniqu       0.93      0.97      0.95      2332
                Journal of Wuhan University of Technology-Mate

### LightGBM CrossValidation and ParametersSearch

In [5]:
lgbm_model = LGBMClassifier(random_state=122)

In [6]:
params = {'num_leaves': [5, 10, 25, 50, 100],
          'learning_rate': [0.05, 0.1, 0.25, 0.5],
          'n_estimators':[10, 50, 100, 200],
          'max_depth':[3, 5, 7, 10, 15, 20, 25], 
          'reg_alpha': [0, 0.1, 0.2, 0.25, 0.5, 1, 2, 5],
          'reg_lambda': [0, 0.1, 0.2, 0.25, 0.5, 1, 2, 5]}

In [7]:
cv = KFold(n_splits=5, 
           random_state=122, 
           shuffle=True)

In [8]:
rs = RandomizedSearchCV(lgbm_model, 
                        params, 
                        scoring='recall_macro', 
                        cv=cv, 
                        n_jobs=-1, 
                        error_score='raise', 
                        verbose=-1)

In [9]:
rs.fit(X_train, list(y_train),
       eval_metric='multi_logloss', 
       eval_set=[(X_test, y_test)])

[1]	valid_0's multi_logloss: 2.04275
[2]	valid_0's multi_logloss: 1.88612
[3]	valid_0's multi_logloss: 1.7803
[4]	valid_0's multi_logloss: 1.70396
[5]	valid_0's multi_logloss: 1.64277
[6]	valid_0's multi_logloss: 1.59652
[7]	valid_0's multi_logloss: 1.55583
[8]	valid_0's multi_logloss: 1.52222
[9]	valid_0's multi_logloss: 1.49205
[10]	valid_0's multi_logloss: 1.46411
[11]	valid_0's multi_logloss: 1.44042
[12]	valid_0's multi_logloss: 1.4196
[13]	valid_0's multi_logloss: 1.3996
[14]	valid_0's multi_logloss: 1.38459
[15]	valid_0's multi_logloss: 1.37015
[16]	valid_0's multi_logloss: 1.35633
[17]	valid_0's multi_logloss: 1.34367
[18]	valid_0's multi_logloss: 1.33166
[19]	valid_0's multi_logloss: 1.32151
[20]	valid_0's multi_logloss: 1.31058
[21]	valid_0's multi_logloss: 1.30219
[22]	valid_0's multi_logloss: 1.29343
[23]	valid_0's multi_logloss: 1.28572
[24]	valid_0's multi_logloss: 1.27928
[25]	valid_0's multi_logloss: 1.27226
[26]	valid_0's multi_logloss: 1.26713
[27]	valid_0's multi_log

[1]	valid_0's multi_logloss: 2.04241
[2]	valid_0's multi_logloss: 1.88637
[3]	valid_0's multi_logloss: 1.77978
[4]	valid_0's multi_logloss: 1.7067
[5]	valid_0's multi_logloss: 1.64714
[6]	valid_0's multi_logloss: 1.60048
[7]	valid_0's multi_logloss: 1.55825
[8]	valid_0's multi_logloss: 1.52292
[9]	valid_0's multi_logloss: 1.48999
[10]	valid_0's multi_logloss: 1.46206
[11]	valid_0's multi_logloss: 1.43992
[12]	valid_0's multi_logloss: 1.42026
[13]	valid_0's multi_logloss: 1.40067
[14]	valid_0's multi_logloss: 1.38419
[15]	valid_0's multi_logloss: 1.36864
[16]	valid_0's multi_logloss: 1.35518
[17]	valid_0's multi_logloss: 1.3436
[18]	valid_0's multi_logloss: 1.33202
[19]	valid_0's multi_logloss: 1.32139
[20]	valid_0's multi_logloss: 1.31147
[21]	valid_0's multi_logloss: 1.304
[22]	valid_0's multi_logloss: 1.29719
[23]	valid_0's multi_logloss: 1.29107
[24]	valid_0's multi_logloss: 1.28315
[25]	valid_0's multi_logloss: 1.27615
[26]	valid_0's multi_logloss: 1.26955
[27]	valid_0's multi_logl

[1]	valid_0's multi_logloss: 2.04463
[2]	valid_0's multi_logloss: 1.88939
[3]	valid_0's multi_logloss: 1.78343
[4]	valid_0's multi_logloss: 1.70883
[5]	valid_0's multi_logloss: 1.65054
[6]	valid_0's multi_logloss: 1.60284
[7]	valid_0's multi_logloss: 1.5637
[8]	valid_0's multi_logloss: 1.52957
[9]	valid_0's multi_logloss: 1.49932
[10]	valid_0's multi_logloss: 1.47204
[11]	valid_0's multi_logloss: 1.44948
[12]	valid_0's multi_logloss: 1.42927
[13]	valid_0's multi_logloss: 1.4103
[14]	valid_0's multi_logloss: 1.39299
[15]	valid_0's multi_logloss: 1.37719
[16]	valid_0's multi_logloss: 1.36359
[17]	valid_0's multi_logloss: 1.3513
[18]	valid_0's multi_logloss: 1.3391
[19]	valid_0's multi_logloss: 1.32849
[20]	valid_0's multi_logloss: 1.31812
[21]	valid_0's multi_logloss: 1.30865
[22]	valid_0's multi_logloss: 1.30063
[23]	valid_0's multi_logloss: 1.29315
[24]	valid_0's multi_logloss: 1.28552
[25]	valid_0's multi_logloss: 1.28026
[26]	valid_0's multi_logloss: 1.27375
[27]	valid_0's multi_logl

[1]	valid_0's multi_logloss: 2.04607
[2]	valid_0's multi_logloss: 1.88259
[3]	valid_0's multi_logloss: 1.77722
[4]	valid_0's multi_logloss: 1.70426
[5]	valid_0's multi_logloss: 1.64546
[6]	valid_0's multi_logloss: 1.60076
[7]	valid_0's multi_logloss: 1.55611
[8]	valid_0's multi_logloss: 1.52175
[9]	valid_0's multi_logloss: 1.49039
[10]	valid_0's multi_logloss: 1.46108
[11]	valid_0's multi_logloss: 1.43952
[12]	valid_0's multi_logloss: 1.41859
[13]	valid_0's multi_logloss: 1.40024
[14]	valid_0's multi_logloss: 1.38457
[15]	valid_0's multi_logloss: 1.36876
[16]	valid_0's multi_logloss: 1.35478
[17]	valid_0's multi_logloss: 1.34236
[18]	valid_0's multi_logloss: 1.32896
[19]	valid_0's multi_logloss: 1.31689
[20]	valid_0's multi_logloss: 1.30766
[21]	valid_0's multi_logloss: 1.29958
[22]	valid_0's multi_logloss: 1.29165
[23]	valid_0's multi_logloss: 1.28568
[24]	valid_0's multi_logloss: 1.27808
[25]	valid_0's multi_logloss: 1.27103
[26]	valid_0's multi_logloss: 1.26546
[27]	valid_0's multi_

[1]	valid_0's multi_logloss: 2.41823
[2]	valid_0's multi_logloss: 2.31198
[3]	valid_0's multi_logloss: 2.2291
[4]	valid_0's multi_logloss: 2.15966
[5]	valid_0's multi_logloss: 2.10051
[6]	valid_0's multi_logloss: 2.04748
[7]	valid_0's multi_logloss: 1.99972
[8]	valid_0's multi_logloss: 1.95747
[9]	valid_0's multi_logloss: 1.92035
[10]	valid_0's multi_logloss: 1.88606
[11]	valid_0's multi_logloss: 1.85395
[12]	valid_0's multi_logloss: 1.82459
[13]	valid_0's multi_logloss: 1.79668
[14]	valid_0's multi_logloss: 1.77132
[15]	valid_0's multi_logloss: 1.74773
[16]	valid_0's multi_logloss: 1.72594
[17]	valid_0's multi_logloss: 1.70523
[18]	valid_0's multi_logloss: 1.68607
[19]	valid_0's multi_logloss: 1.66763
[20]	valid_0's multi_logloss: 1.65065
[21]	valid_0's multi_logloss: 1.63404
[22]	valid_0's multi_logloss: 1.61788
[23]	valid_0's multi_logloss: 1.60333
[24]	valid_0's multi_logloss: 1.58884
[25]	valid_0's multi_logloss: 1.57556
[26]	valid_0's multi_logloss: 1.56244
[27]	valid_0's multi_l

In [10]:
rs.best_params_

{'reg_lambda': 0.1,
 'reg_alpha': 0,
 'num_leaves': 25,
 'n_estimators': 200,
 'max_depth': 10,
 'learning_rate': 0.05}

[6]	valid_0's multi_logloss: 1.60937
[7]	valid_0's multi_logloss: 1.56881
[8]	valid_0's multi_logloss: 1.53386
[9]	valid_0's multi_logloss: 1.50164
[10]	valid_0's multi_logloss: 1.47599
[1]	valid_0's multi_logloss: 2.30871
[2]	valid_0's multi_logloss: 2.15347
[3]	valid_0's multi_logloss: 2.03726
[4]	valid_0's multi_logloss: 1.94725
[5]	valid_0's multi_logloss: 1.87331
[6]	valid_0's multi_logloss: 1.80819
[7]	valid_0's multi_logloss: 1.75324
[8]	valid_0's multi_logloss: 1.70604
[9]	valid_0's multi_logloss: 1.66502
[10]	valid_0's multi_logloss: 1.6278
[1]	valid_0's multi_logloss: 2.32833
[2]	valid_0's multi_logloss: 2.19704
[3]	valid_0's multi_logloss: 2.10125
[4]	valid_0's multi_logloss: 2.02579
[5]	valid_0's multi_logloss: 1.96297
[6]	valid_0's multi_logloss: 1.90808
[7]	valid_0's multi_logloss: 1.86236
[8]	valid_0's multi_logloss: 1.82059
[9]	valid_0's multi_logloss: 1.78509
[10]	valid_0's multi_logloss: 1.75275
[11]	valid_0's multi_logloss: 1.72497
[12]	valid_0's multi_logloss: 1.698

[35]	valid_0's multi_logloss: 1.5156
[36]	valid_0's multi_logloss: 1.50931
[37]	valid_0's multi_logloss: 1.50259
[38]	valid_0's multi_logloss: 1.49642
[39]	valid_0's multi_logloss: 1.48985
[40]	valid_0's multi_logloss: 1.48447
[41]	valid_0's multi_logloss: 1.47851
[42]	valid_0's multi_logloss: 1.47317
[43]	valid_0's multi_logloss: 1.46762
[44]	valid_0's multi_logloss: 1.46216
[45]	valid_0's multi_logloss: 1.45736
[46]	valid_0's multi_logloss: 1.45226
[47]	valid_0's multi_logloss: 1.44697
[48]	valid_0's multi_logloss: 1.44193
[49]	valid_0's multi_logloss: 1.43722
[50]	valid_0's multi_logloss: 1.43267
[51]	valid_0's multi_logloss: 1.42783
[52]	valid_0's multi_logloss: 1.42363
[53]	valid_0's multi_logloss: 1.41949
[54]	valid_0's multi_logloss: 1.41531
[55]	valid_0's multi_logloss: 1.41163
[56]	valid_0's multi_logloss: 1.40763
[57]	valid_0's multi_logloss: 1.40324
[58]	valid_0's multi_logloss: 1.39924
[59]	valid_0's multi_logloss: 1.39578
[60]	valid_0's multi_logloss: 1.39233
[61]	valid_0'

[34]	valid_0's multi_logloss: 1.52392
[35]	valid_0's multi_logloss: 1.51659
[36]	valid_0's multi_logloss: 1.50949
[37]	valid_0's multi_logloss: 1.50259
[38]	valid_0's multi_logloss: 1.49589
[39]	valid_0's multi_logloss: 1.48951
[40]	valid_0's multi_logloss: 1.48384
[41]	valid_0's multi_logloss: 1.47779
[42]	valid_0's multi_logloss: 1.47182
[43]	valid_0's multi_logloss: 1.46688
[44]	valid_0's multi_logloss: 1.46145
[45]	valid_0's multi_logloss: 1.45616
[46]	valid_0's multi_logloss: 1.45112
[47]	valid_0's multi_logloss: 1.44605
[48]	valid_0's multi_logloss: 1.44133
[49]	valid_0's multi_logloss: 1.43648
[50]	valid_0's multi_logloss: 1.43204
[51]	valid_0's multi_logloss: 1.42766
[52]	valid_0's multi_logloss: 1.42337
[53]	valid_0's multi_logloss: 1.41899
[54]	valid_0's multi_logloss: 1.41489
[55]	valid_0's multi_logloss: 1.41123
[56]	valid_0's multi_logloss: 1.40726
[57]	valid_0's multi_logloss: 1.4034
[58]	valid_0's multi_logloss: 1.39982
[59]	valid_0's multi_logloss: 1.39602
[60]	valid_0'

[84]	valid_0's multi_logloss: 1.32728
[85]	valid_0's multi_logloss: 1.32492
[86]	valid_0's multi_logloss: 1.32286
[87]	valid_0's multi_logloss: 1.32084
[88]	valid_0's multi_logloss: 1.31889
[89]	valid_0's multi_logloss: 1.31702
[90]	valid_0's multi_logloss: 1.31487
[91]	valid_0's multi_logloss: 1.31285
[92]	valid_0's multi_logloss: 1.31069
[93]	valid_0's multi_logloss: 1.30903
[94]	valid_0's multi_logloss: 1.30726
[95]	valid_0's multi_logloss: 1.30527
[96]	valid_0's multi_logloss: 1.30347
[97]	valid_0's multi_logloss: 1.30139
[98]	valid_0's multi_logloss: 1.29976
[99]	valid_0's multi_logloss: 1.29796
[100]	valid_0's multi_logloss: 1.29632
[1]	valid_0's multi_logloss: 2.0589
[2]	valid_0's multi_logloss: 1.89872
[3]	valid_0's multi_logloss: 1.79449
[4]	valid_0's multi_logloss: 1.71838
[5]	valid_0's multi_logloss: 1.66124
[6]	valid_0's multi_logloss: 1.61369
[7]	valid_0's multi_logloss: 1.57321
[8]	valid_0's multi_logloss: 1.53736
[9]	valid_0's multi_logloss: 1.50777
[10]	valid_0's multi_

### Use the best parameters for final model

In [11]:
rs_lgbm = LGBMClassifier(random=122, **rs.best_params_, verbose = -1)

In [12]:
rs_lgbm.fit(X_train, list(y_train), eval_metric='multi_logloss')

In [13]:
show_classification_report(list(y_train), rs_lgbm.predict(X_train), 
                           title='TRAIN (LightGBM)', labels=labels)
show_classification_report(list(y_test), rs_lgbm.predict(X_test), 
                           title='TEST (LightGBM)', labels=labels)

TRAIN (LightGBM)
                                                                           precision    recall  f1-score   support

                                            Bulletin of Materials Science       0.88      0.73      0.80      2520
                                          Journal of Electronic Materials       0.84      0.81      0.83      2594
                         Journal of Materials Engineering and Performance       0.85      0.82      0.83      2595
                      Journal of Materials Science: Materials in Medicine       0.89      0.90      0.89      2577
                                         Journal of Nanoparticle Research       0.82      0.84      0.83      2589
                                Journal of Sol-Gel Science and Technology       0.90      0.91      0.90      2566
Journal of Surface Investigation: X-ray, Synchrotron and Neutron Techniqu       0.89      0.95      0.92      2332
                Journal of Wuhan University of Technology-Mate

#### Save the model

In [24]:
pickle.dump(rs_lgbm, open('../lgbm_classifier.model', 'wb'))

### catboost

In [25]:
catboost_model = CatBoostClassifier(random_seed=122, 
                                    custom_loss=['Recall'],
                                    loss_function='MultiClass',
                                    task_type='CPU', 
                                    verbose=False)

In [26]:
catboost_model.fit(X_train, y_train, 
                   eval_set=[(X_test, y_test)], 
                   plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[35]	valid_0's multi_logloss: 1.40049
[36]	valid_0's multi_logloss: 1.39425
[37]	valid_0's multi_logloss: 1.38843
[38]	valid_0's multi_logloss: 1.38257
[39]	valid_0's multi_logloss: 1.3769
[40]	valid_0's multi_logloss: 1.37199
[41]	valid_0's multi_logloss: 1.36712
[42]	valid_0's multi_logloss: 1.36228
[43]	valid_0's multi_logloss: 1.35782
[44]	valid_0's multi_logloss: 1.35316
[45]	valid_0's multi_logloss: 1.34898
[46]	valid_0's multi_logloss: 1.34492
[47]	valid_0's multi_logloss: 1.34086
[48]	valid_0's multi_logloss: 1.33677
[49]	valid_0's multi_logloss: 1.33274
[50]	valid_0's multi_logloss: 1.32897
[51]	valid_0's multi_logloss: 1.32542
[52]	valid_0's multi_logloss: 1.32221
[53]	valid_0's multi_logloss: 1.31841
[54]	valid_0's multi_logloss: 1.315
[55]	valid_0's multi_logloss: 1.31191
[56]	valid_0's multi_logloss: 1.30878
[57]	valid_0's multi_logloss: 1.30594
[58]	valid_0's multi_logloss: 1.30328
[59]	valid_0's multi_logloss: 1.30047
[60]	valid_0's multi_logloss: 1.29755
[61]	valid_0's 

[1]	valid_0's multi_logloss: 1.91259
[2]	valid_0's multi_logloss: 1.74174
[3]	valid_0's multi_logloss: 1.63522
[4]	valid_0's multi_logloss: 1.56062
[5]	valid_0's multi_logloss: 1.49997
[6]	valid_0's multi_logloss: 1.45516
[7]	valid_0's multi_logloss: 1.41662
[8]	valid_0's multi_logloss: 1.38149
[9]	valid_0's multi_logloss: 1.35765
[10]	valid_0's multi_logloss: 1.3406
[11]	valid_0's multi_logloss: 1.32354
[12]	valid_0's multi_logloss: 1.30905
[13]	valid_0's multi_logloss: 1.29639
[14]	valid_0's multi_logloss: 1.28677
[15]	valid_0's multi_logloss: 1.27669
[16]	valid_0's multi_logloss: 1.26965
[17]	valid_0's multi_logloss: 1.26428
[18]	valid_0's multi_logloss: 1.25885
[19]	valid_0's multi_logloss: 1.25321
[20]	valid_0's multi_logloss: 1.24855
[21]	valid_0's multi_logloss: 1.24615
[22]	valid_0's multi_logloss: 1.24419
[23]	valid_0's multi_logloss: 1.24249
[24]	valid_0's multi_logloss: 1.24039
[25]	valid_0's multi_logloss: 1.23832
[26]	valid_0's multi_logloss: 1.2354
[27]	valid_0's multi_lo

[1]	valid_0's multi_logloss: 1.91219
[2]	valid_0's multi_logloss: 1.74567
[3]	valid_0's multi_logloss: 1.6396
[4]	valid_0's multi_logloss: 1.56034
[5]	valid_0's multi_logloss: 1.5011
[6]	valid_0's multi_logloss: 1.45533
[7]	valid_0's multi_logloss: 1.41527
[8]	valid_0's multi_logloss: 1.38547
[9]	valid_0's multi_logloss: 1.35862
[10]	valid_0's multi_logloss: 1.33856
[11]	valid_0's multi_logloss: 1.3225
[12]	valid_0's multi_logloss: 1.30675
[13]	valid_0's multi_logloss: 1.29341
[14]	valid_0's multi_logloss: 1.28392
[15]	valid_0's multi_logloss: 1.27476
[16]	valid_0's multi_logloss: 1.2694
[17]	valid_0's multi_logloss: 1.26267
[18]	valid_0's multi_logloss: 1.25852
[19]	valid_0's multi_logloss: 1.25376
[20]	valid_0's multi_logloss: 1.25036
[21]	valid_0's multi_logloss: 1.24695
[22]	valid_0's multi_logloss: 1.24399
[23]	valid_0's multi_logloss: 1.24126
[24]	valid_0's multi_logloss: 1.23907
[25]	valid_0's multi_logloss: 1.23882
[26]	valid_0's multi_logloss: 1.23721
[27]	valid_0's multi_logl

[1]	valid_0's multi_logloss: 1.91614
[2]	valid_0's multi_logloss: 1.74832
[3]	valid_0's multi_logloss: 1.64084
[4]	valid_0's multi_logloss: 1.56557
[5]	valid_0's multi_logloss: 1.50859
[6]	valid_0's multi_logloss: 1.46136
[7]	valid_0's multi_logloss: 1.42164
[8]	valid_0's multi_logloss: 1.39154
[9]	valid_0's multi_logloss: 1.36605
[10]	valid_0's multi_logloss: 1.34489
[11]	valid_0's multi_logloss: 1.32782
[12]	valid_0's multi_logloss: 1.31181
[13]	valid_0's multi_logloss: 1.30159
[14]	valid_0's multi_logloss: 1.29123
[15]	valid_0's multi_logloss: 1.2818
[16]	valid_0's multi_logloss: 1.27473
[17]	valid_0's multi_logloss: 1.26723
[18]	valid_0's multi_logloss: 1.26093
[19]	valid_0's multi_logloss: 1.25535
[20]	valid_0's multi_logloss: 1.25081
[21]	valid_0's multi_logloss: 1.24845
[22]	valid_0's multi_logloss: 1.2447
[23]	valid_0's multi_logloss: 1.24302
[24]	valid_0's multi_logloss: 1.24024
[25]	valid_0's multi_logloss: 1.23913
[26]	valid_0's multi_logloss: 1.23684
[27]	valid_0's multi_lo

<catboost.core.CatBoostClassifier at 0x126b7b4c0>

In [28]:
show_classification_report(list(y_train), catboost_model.predict(X_train), 
                           title='TRAIN (catboost)', labels=labels)
show_classification_report(list(y_test), catboost_model.predict(X_test), 
                           title='TEST (catboost)', labels=labels)

TRAIN (catboost)
                                                                           precision    recall  f1-score   support

                                            Bulletin of Materials Science       0.69      0.57      0.63      2520
                                          Journal of Electronic Materials       0.71      0.70      0.70      2594
                         Journal of Materials Engineering and Performance       0.70      0.67      0.68      2595
                      Journal of Materials Science: Materials in Medicine       0.78      0.81      0.79      2577
                                         Journal of Nanoparticle Research       0.68      0.69      0.69      2589
                                Journal of Sol-Gel Science and Technology       0.81      0.78      0.80      2566
Journal of Surface Investigation: X-ray, Synchrotron and Neutron Techniqu       0.80      0.88      0.84      2332
                Journal of Wuhan University of Technology-Mate

In [29]:
feature_importances = pd.DataFrame({'feature': catboost_model.feature_names_, 
                                    'importance': catboost_model.feature_importances_})
feature_importances = feature_importances.sort_values('importance', ascending=False)

In [30]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(feature_importances)

               feature  importance
376              steel    2.888946
168           friction    2.872815
15               alloy    2.716157
444  words_in_abstract    2.547561
263      nanoparticles    2.464437
365                sol    2.444340
267                 nm    1.880120
85           corrosion    1.445441
409             tissue    1.320221
301           prepared    1.272738
433               wear    1.220185
174                gel    1.217113
38            behavior    1.192108
387            surface    1.180297
50                cell    1.168958
336             result    1.090750
154               film    1.061354
383            studied    1.012837
384              study    0.940908
350              shown    0.884263
250     microstructure    0.868082
286        performance    0.818120
60             coating    0.804990
394        temperature    0.771686
378           strength    0.753089
348               show    0.733721
112             device    0.726126
130           electr

In [None]:
plot_multilabel_confusion_matrix(y_test, y_pred, labels=labels)

In [31]:
pickle.dump(catboost_model, open('../catboost.model', 'wb'))