# Featurizing special test structures and evaluating the model predictions on them in more detail

## Import packages

In [1]:
import pandas as pd 
from glob import glob
import os 
from pathlib import Path

# own packages
from mine_mof_oxstate.featurize import GetFeatures, FeatureCollector

# plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import sys
sys.path.append('../machine_learn_oxstates')
# ml
from joblib import dump, load

__init__.py: CACHEDIR=/home/kevin/.cache/matplotlib
font_manager.py: Using fontManager instance from /home/kevin/.cache/matplotlib/fontlist-v310.json
pyplot.py: Loaded backend module://ipykernel.pylab.backend_inline version unknown.
pyplot.py: Loaded backend module://ipykernel.pylab.backend_inline version unknown.
pyplot.py: Loaded backend module://ipykernel.pylab.backend_inline version unknown.


## Calculate Features

As we did for all structures before, we first calculate all features and save them in pickle files.

In [2]:
# Glob pattern matching every CIF file in the showcase directory.
# NOTE(review): this is an absolute, machine-specific path — the cell only finds files on that machine.
showcase_pattern = '/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/test_structures/new_showcases/*.cif'
special_test_structures = glob(showcase_pattern)

In [3]:
# Show the matched paths so the glob result can be checked by eye.
special_test_structures

['/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/test_structures/new_showcases/SIBHER.cif',
 '/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/test_structures/new_showcases/KOCLEW.cif',
 '/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/test_structures/new_showcases/EQIZAF.cif',
 '/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/test_structures/new_showcases/ACRNCU01.cif',
 '/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/test_structures/new_showcases/AQONAW.cif']

In [None]:
test_features_dict = {}

# Stems of the pickle files already present in features/, so those structures are not recomputed.
already_featurized = [Path(pkl).stem for pkl in glob("features/*.pkl")]

for s in special_test_structures:
    name = Path(s).stem
    # Skip structures that already have a pickle file, and the 'mix_fe_mo' entry.
    if name in already_featurized or name == 'mix_fe_mo':
        continue
    print(name)
    # Run the featurization for this CIF file with 'features' as the second argument.
    gf = GetFeatures(s, 'features')
    gf.run_featurization()

featurize.py: could not load /home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/test_structures/new_showcases/SIBHER.cif


SIBHER
KOCLEW


featurize.py: iterating over 6 metal sites

CrystalNN: distance cutoffs set but no oxidation states specified on sites! For better results, set the site oxidation states in the structure.


CrystalNN: cannot locate an appropriate radius, covalent or atomic radii will be used, this can lead to non-optimal results.



In [10]:
# Build the feature table for one pickle file (presumably a manual spot check).
# NOTE(review): the output recorded for this cell is a TypeError and its execution
# count (10) is out of sequence — confirm whether this cell is still needed.
rl = FeatureCollector.create_dict_for_feature_table('features/HEQVUU.pkl')

TypeError: string indices must be integers

## Collect features 

In [None]:
import numpy as np

# Feature groups handed to FeatureCollector._select_features for every structure.
selected_feature_names = ['crystal_nn_fingerprint', 'ward_prb', 'row', 'column']

# Maps structure name (pickle file stem) -> selected feature array.
features_dict = {}

# List of feature pickle paths. The per-structure array below uses its own name
# so that `features` is not rebound while it is being iterated.
features = glob('features/*.pkl')

for feature in features:
    try:
        rl = FeatureCollector.create_dict_for_feature_table(feature)
        # Stack the 'feature' value of every entry in rl, one row per entry.
        structure_features = np.vstack([d['feature'] for d in rl])
        structure_features = FeatureCollector._select_features(selected_feature_names, structure_features)
        features_dict[Path(feature).stem] = structure_features
    except Exception as e:
        # Best effort: skip files that cannot be processed, but report which file failed.
        print('Could not collect features from {}: {}'.format(feature, e))

## Load model and scaler to make predictions

In [34]:
# Earlier model/scaler location, left commented out:
#model = load('/home/kevin/Dropbox/proj62_guess_oxidation_states/_backup/models/20190924-082740_ensemble_0.joblib')
#scaler = load('/home/kevin/Dropbox/proj62_guess_oxidation_states/_backup/models/scaler_0.joblib')

# Load the model and the scaler whose transform() is applied to the features before predict().
# joblib.load unpickles the file, so only load files from a trusted source.
model  = load('../models/20190925-213752_ensemble_0.joblib')
scaler = load('../models/scaler_0.joblib')

In [37]:
# Scale each structure's feature array and print the model's prediction for it.
for k, structure_features in features_dict.items():
    X = scaler.transform(structure_features)
    prediction = model.predict(X)
    print('Predicted {} for {}'.format(prediction, k))

Predicted [3 3 3 3 3 3 3 3] for HEQVUU
Predicted [3 3 3 3 3 3 3 3] for QAMTEG
Predicted [2 2 2 2 2 2 2 2 2 2 2 2] for JIZJUZ
Predicted [3 3] for ZITMUN
Predicted [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2] for ORIVUI
Predicted [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2] for GUVZII
Predicted [2 2 2 2 2 2 2 2 2 2 2 2] for JIZJEJ
Predicted [2 2 2 2] for yamloq
Predicted [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3] for GUVZEE
Predicted [2 2 2 2 2 2 2 2 2 2 2 2] for JIZJOT
Predicted [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3] for ZITFIU
Predicted [3 3 3 3] for IDIWIB
Predicted [2 2 2 2] for MAHSUK01
Predicted [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2] for QIDFOB
Predicted [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2] for COKNOH01
Predicted [2 2 2 2 2 2 2 2 2 2 2 2] for JIZJAF
Predicted [1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2] for KAJZIH
Predicted [1 1 1 1] for EQIZAF
Predicted [4 4 4 4] for IDIWOH
Predicted [2 2 2 2 2 2 2 2 2 2 2 2] for JIZJIN
Predicted [2 2 1 1] for ME01_P1
Predicted [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3] for HEQWAB
P

## Load the holdout set 

In [1]:
import numpy as np

In [5]:
# Holdout features; they are passed through scaler.transform before prediction further below.
X_holdout = np.load('../holdout/features.npy')

In [14]:
# Holdout labels; compared element-wise with the model predictions further below.
y_holdout = np.load('../holdout/labels.npy')

In [6]:
import pickle
# Names of the holdout entries; indexed in parallel with the predictions and labels further below.
# pickle.load can execute arbitrary code, so only open files from a trusted source.
with open('../holdout/names.pkl', 'rb') as fh:
    names = pickle.load(fh)

In [13]:
# Scale the holdout features with the loaded scaler, then predict with the loaded model.
# NOTE(review): the output recorded for this cell lists "Predicted ... for ..." lines that
# this code does not print — the output is presumably from an earlier version of the cell.
X_holdout_transf = scaler.transform(X_holdout)
predictions = model.predict(X_holdout_transf)

Predicted 6 for BETGUF
Predicted 2 for XUZQAM
Predicted 2 for VEDYAF
Predicted 2 for RILVOZ
Predicted 2 for NUPTIB
Predicted 2 for BILFAE
Predicted 2 for LUPPIV18
Predicted 2 for ALIVEX
Predicted 1 for DIFYUP
Predicted 1 for YALGAU
Predicted 4 for KATPUQ
Predicted 2 for ALIDUV
Predicted 2 for COJWUT
Predicted 3 for LUTFIQ
Predicted 2 for MIVCUQ
Predicted 3 for NAXJAY
Predicted 1 for KAJGOT
Predicted 2 for QEWDIH
Predicted 2 for VEDYOT
Predicted 3 for LAGSUJ
Predicted 2 for EDEDUN01
Predicted 2 for FIGHAF
Predicted 2 for KOMFIC
Predicted 2 for NELJOD
Predicted 2 for TIJGOJ
Predicted 2 for KOCWOP
Predicted 3 for SAKJAP
Predicted 2 for FALQIU
Predicted 2 for TEDWIL04
Predicted 3 for ISOROX01
Predicted 3 for FIDXAS
Predicted 3 for ITUWIE
Predicted 3 for MUXHIW01
Predicted 3 for ROLREQ
Predicted 2 for DOZMOV
Predicted 2 for GIGYUR
Predicted 1 for CUGZOT
Predicted 3 for CUVZAV
Predicted 3 for BOCXEZ
Predicted 2 for RUJWOJ
Predicted 1 for TISMAK
Predicted 3 for HULQIP
Predicted 3 for JOBXEG
P

In [17]:
# Report every holdout entry whose prediction differs from the stored label and count them.
counter = 0
for i, name in enumerate(names):
    predicted, reported = predictions[i], y_holdout[i]
    if predicted == reported:
        continue
    print('Predicted {} for {}, in CSD {}'.format(predicted, name, reported))
    counter += 1

Predicted 1 for EQIZAF, in CSD 2
Predicted 2 for ACRNCU01, in CSD 1
Predicted 3 for BUHVOP, in CSD 4
Predicted 2 for IKUQAI, in CSD 3
Predicted 2 for AQONAW, in CSD 3
Predicted 3 for TAGCAH, in CSD 4
Predicted 3 for SIBHER, in CSD 4
Predicted 2 for TEYLOB, in CSD 1
Predicted 2 for QOZQIF, in CSD 3
Predicted 3 for LEXBOI, in CSD 2
Predicted 2 for NESTEK, in CSD 4
Predicted 3 for CIRTAB, in CSD 2
Predicted 2 for KOCLEW, in CSD 4
Predicted 3 for HULNOR, in CSD 2
Predicted 2 for DOVBIB, in CSD 3
Predicted 1 for PEXSAO, in CSD 2
Predicted 2 for GASMUK, in CSD 3
Predicted 3 for FEZKED, in CSD 4
Predicted 2 for MOYZAC, in CSD 3
Predicted 2 for UDACEH, in CSD 3
Predicted 2 for MIFQOJ, in CSD 3
Predicted 2 for KOBBEI, in CSD 3
Predicted 2 for PEFHIS, in CSD 1
Predicted 3 for LIBDAB, in CSD 2
Predicted 1 for ASABUR, in CSD 3
Predicted 2 for MUZCEQ, in CSD 3
Predicted 6 for CIXVOU, in CSD 5
Predicted 2 for LECKIN, in CSD 3
Predicted 2 for SOGYUI, in CSD 3
Predicted 2 for SOYFAO, in CSD 3
Predicte

In [18]:
# Number of holdout entries whose prediction differs from the stored label.
counter

119