#  Featurizing special test structures and evaluating the model predictions on them in more detail

## Import packages

In [1]:
import pandas as pd 
from glob import glob
import os 
from pathlib import Path

# own packages
from mine_mof_oxstate.featurize import GetFeatures, FeatureCollector

# plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sys
sys.path.append('../machine_learn_oxstates')
# ml
from joblib import dump, load


Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)

DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.
DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


## Calculate Features

Identically to what we did for all structures before we will first calculate all features and save them in pickle files.

In [20]:
special_test_structures = glob('../../test_structures/*/*.cif')

In [21]:
special_test_structures

['../../test_structures/blind/ME01_P1.cif',
 '../../test_structures/new_showcases/MUZCEQ .cif',
 '../../test_structures/new_showcases/NEZPOA.cif',
 '../../test_structures/new_showcases/OLIKUR.cif',
 '../../test_structures/new_showcases/GEPTUR.cif',
 '../../test_structures/new_showcases/PBE.cif',
 '../../test_structures/new_showcases/FIVYEP01.cif',
 '../../test_structures/new_showcases/TUZGUQ.cif',
 '../../test_structures/new_showcases/UDACEH.cif',
 '../../test_structures/new_showcases/KOCLEW.cif',
 '../../test_structures/new_showcases/MOYZAC.cif',
 '../../test_structures/new_showcases/VIFLOL.cif',
 '../../test_structures/new_showcases/ASABUR.cif',
 '../../test_structures/new_showcases/PEXSAO.cif',
 '../../test_structures/new_showcases/GASMUK.cif',
 '../../test_structures/new_showcases/TAXZIC.cif',
 '../../test_structures/new_showcases/DOVBIB.cif',
 '../../test_structures/new_showcases/FAQLIV.cif',
 '../../test_structures/new_showcases/CAVWEC.cif',
 '../../test_structures/new_showcases/

In [22]:
from pymatgen import Structure
s = Structure.from_file( '../../test_structures/blind/ME01_P1.cif')


In [23]:
test_features_dict = {}

already_featurized = [Path(s).stem for s in glob("features/*.pkl")]
for s in special_test_structures:
    name = Path(s).stem
    if (name not in already_featurized) & (name != 'mix_fe_mo'):
        print(name)
        gf = GetFeatures.from_file(s, 'features')
        gf.run_featurization()

PBE


INFO:Featurize:iterating over 6 metal sites

CrystalNN: distance cutoffs set but no oxidation states specified on sites! For better results, set the site oxidation states in the structure.


CrystalNN: cannot locate an appropriate radius, covalent or atomic radii will be used, this can lead to non-optimal results.



In [None]:
rl = FeatureCollector.create_dict_for_feature_table('features/HEQVUU.pkl')

## Collect features 

In [24]:
import numpy as np 
features_dict = { }

features = glob('features/*.pkl')

for feature in features:
    try:
        rl = FeatureCollector.create_dict_for_feature_table(feature)
        #print(rl)
        features = []
        for d in rl:
            features.append(d['feature'])
        features = np.vstack(features)
        features = FeatureCollector._select_features(['crystal_nn_fingerprint', 'local_property_stats', 'row', 'column'], features)
        features_dict[Path(feature).stem] = features
    except Exception as e:
        print(e)

DEBUG:FeatureCollector:the feature names are ['wt CN_1', 'sgl_bd CN_1', 'wt CN_2', 'L-shaped CN_2', 'water-like CN_2', 'bent 120 degrees CN_2', 'bent 150 degrees CN_2', 'linear CN_2', 'wt CN_3', 'trigonal planar CN_3', 'trigonal non-coplanar CN_3', 'T-shaped CN_3', 'wt CN_4', 'square co-planar CN_4', 'tetrahedral CN_4', 'rectangular see-saw-like CN_4', 'see-saw-like CN_4', 'trigonal pyramidal CN_4', 'wt CN_5', 'pentagonal planar CN_5', 'square pyramidal CN_5', 'trigonal bipyramidal CN_5', 'wt CN_6', 'hexagonal planar CN_6', 'octahedral CN_6', 'pentagonal pyramidal CN_6', 'wt CN_7', 'hexagonal pyramidal CN_7', 'pentagonal bipyramidal CN_7', 'wt CN_8', 'body-centered cubic CN_8', 'hexagonal bipyramidal CN_8', 'wt CN_9', 'q2 CN_9', 'q4 CN_9', 'q6 CN_9', 'wt CN_10', 'q2 CN_10', 'q4 CN_10', 'q6 CN_10', 'wt CN_11', 'q2 CN_11', 'q4 CN_11', 'q6 CN_11', 'wt CN_12', 'cuboctahedral CN_12', 'q2 CN_12', 'q4 CN_12', 'q6 CN_12', 'wt CN_13', 'wt CN_14', 'wt CN_15', 'wt CN_16', 'wt CN_17', 'wt CN_18', 

## Load model and scaler to make predictions

In [95]:
#model = load('/home/kevin/Dropbox/proj62_guess_oxidation_states/_backup/models/20190924-082740_ensemble_0.joblib')
#scaler = load('/home/kevin/Dropbox/proj62_guess_oxidation_states/_backup/models/scaler_0.joblib')

model  = load('../models_large/votingclassifier.joblib')
scaler = load('../models_large/scaler.joblib')

In [69]:
len(features_dict['andres'][0])

124

In [85]:
for k in features_dict.keys():
    X = scaler.transform(features_dict[k])
    prediction = model.predict(X)
    print('Predicted {} for {}'.format(prediction, k))

Predicted [3 3 3 3 3 3] for GASMUK
Predicted [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] for TAXZIC
Predicted [3] for DOVBIB
Predicted [2 2 2 2 3 3] for andres
Predicted [4 4 4 4 4 4 4 4] for FAQLIV
Predicted [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 6 6 6 6 6 6 6 6] for CAVWEC
Predicted [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3] for BUPVEP
Predicted [6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] for PIZHAJ
Predicted [2 2 2 2 2 2 2 2 2 2 2 2] for JIZJAF
Predicted [1 1 1 1 2 2 2 2] for ACRNCU01
Predicted [3 3 3 3 3 3 3 3 2 2] for DAQDUN
Predicted [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2] for ORIVUI
Predicted [2 2 2 2] for YAMLOQ
Predicted [3 3 3 3] for IDIWIB
Predicted [2 2 2 2] for MIFQOJ
Predicted [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3] for COKNOH01
Predicted [1 1 1 1] for EQIZAF
Predicted [2 2 2 2 2 2 2 2] for NEZPUG
Predicted [3 3 3 3 3 3 3 3 3 3 3 3] for AQONAW
Predicted [2 2 2 2 2 2 2 2 2 2 2 2] for JIZJIN
Predicted

## Load the holdout set 

In [26]:
import numpy as np

In [72]:
X_holdout = np.load('../../_backup/models_large/data/valid/features.npy')

In [73]:
y_holdout = np.load('../../_backup/models_large/data/valid/labels.npy')

In [74]:
import pickle
with open('../../_backup/models_large/data/valid/names.pkl', 'rb') as fh:
    names = pickle.load(fh)

In [86]:
X_holdout_transf = scaler.transform(X_holdout)
predictions = model.predict(X_holdout_transf)

In [87]:
X_holdout_transf.shape

(39973, 124)

In [104]:
model.predict_proba(X_holdout_transf[500:501])

array([[0.05054956, 0.73278269, 0.18271083, 0.01407691, 0.00984449,
        0.01003551]])

In [None]:
model.predict(X_holdout_transf)

In [105]:
y_holdout[500]

2

In [89]:
counter = 0
for i, name in enumerate(names):
    if predictions[i] != y_holdout[i]:
        print('Predicted {} for {}, in CSD {}'.format(predictions[i], name, y_holdout[i]))
        counter +=1 

Predicted 3 for XETHAF, in CSD 5
Predicted 2 for CUFMOG, in CSD 3
Predicted 2 for BABBUC, in CSD 3
Predicted 2 for SALBOX, in CSD 3
Predicted 3 for XETHAF, in CSD 5
Predicted 2 for CEKROB, in CSD 3
Predicted 3 for RAHCUA, in CSD 2
Predicted 4 for WUDLIQ, in CSD 2
Predicted 2 for OLELAS, in CSD 1
Predicted 1 for ODAJOV, in CSD 3
Predicted 3 for JEFSUJ, in CSD 2
Predicted 2 for JOHKEW, in CSD 1
Predicted 2 for JOHKEW, in CSD 1
Predicted 3 for XETHAF, in CSD 5
Predicted 2 for SALBOX, in CSD 3
Predicted 4 for WUDLIQ, in CSD 2
Predicted 3 for UCEGOY, in CSD 2
Predicted 2 for JARMEU10, in CSD 1
Predicted 4 for WUDLIQ, in CSD 2
Predicted 1 for PEPVAI, in CSD 2
Predicted 2 for ERUNUA, in CSD 3
Predicted 2 for SALBOX, in CSD 3
Predicted 2 for HUWHOW, in CSD 3
Predicted 4 for WUDLIQ, in CSD 2
Predicted 1 for ODAJOV, in CSD 3
Predicted 2 for ERUNUA, in CSD 3
Predicted 2 for CUFMOG, in CSD 3
Predicted 1 for FIGZUS, in CSD 2
Predicted 3 for JEFSUJ, in CSD 2
Predicted 1 for PEPVAI, in CSD 2
Predicte

In [90]:
counter/len(y_holdout) * 100 

0.10507092287293923

## Try some explanations

In [16]:
import lime 
import lime.lime_tabular 
import pickle 

with open('../data/helper/feature_names.pkl', 'rb') as fh: 
    features = pickle.load(fh)

# explainer = lime.lime_tabular(model, feature_names= )

In [64]:
X_train = np.load('../../_backup/data/features/features.npy')
#indices = np.random.choice(range(len(X_train)), 500)

In [65]:
X_train = scaler.transform(X_train)

In [69]:
np.argmin(np.sum(np.abs(X_train-scaler.transform(features_dict['ME01_P1'])[2]), axis=-1))

10308

In [None]:
# but lime would need soft voting ...
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=features, class_names=[1, 2, 3, 4, 5, 6])

In [None]:
with open('../../_backup/data/helper/names.pkl', 'rb') as fh: 
    names = pickle.load(fh)

In [70]:
names[10308]

'FAZQIH'