# %%
import gzip
import os
import pickle
import urllib.request
import warnings
from glob import glob
import numpy as np
import pandas as pd
from aviary.wren.utils import get_aflow_label_from_spglib
from pymatgen.analysis.phase_diagram import PatchedPhaseDiagram
from pymatgen.core import Composition, Structure
from pymatgen.entries.compatibility import (
    MaterialsProject2020Compatibility,
    MaterialsProjectCompatibility,
)
from pymatgen.entries.computed_entries import ComputedStructureEntry
from pymatviz import density_scatter
from pymatviz.utils import save_fig
from tqdm import tqdm
from matbench_discovery import FIGS, today
from matbench_discovery.data import DATA_FILES
from matbench_discovery.energy import get_e_form_per_atom
from matbench_discovery.plots import pio
try:
    import gdown
except ImportError:
    print(
        "gdown not installed. Needed for downloading WBM initial + relaxed structures "
        "from Google Drive."
    )
"""
Dataset generated with DFT and published in Jan 2021 as
"Predicting stable crystalline compounds using chemical similarity"
https://nature.com/articles/s41524-020-00481-6
"""
module_dir = os.path.dirname(__file__)
warnings.filterwarnings("ignore", category=UserWarning, module="pymatgen")
assert pio.templates.default == "plotly_dark+global"
# %% links to Google Drive files received via email from 1st author Hai-Chen Wang
# on 2021-06-15, containing initial and relaxed structures
google_drive_ids = {
    1: "1ZUgtYwrfZn_P8bULWRtTXepyAxHVxS5C",
    2: "1-3uu2AcARJxH7GReteGVASZTuttFGiW_",
    3: "1hc5BvDiFfTu_tc5F8m7ONSw2OgL9vN6o",
    4: "1aMYxG5YJUgMHpbWmHpzL4hRfmP26UQqh",
    5: "17kQt2r78ReWle4PhEIOXG7w7BFdezGM1",
}
# %%
for step, file_id in google_drive_ids.items():
    file_path = f"{module_dir}/raw/wbm-structures-step-{step}.json.bz2"

    if os.path.exists(file_path):
        print(f"{file_path} already exists, skipping")
        continue

    print(f"{step=}")
    gdown.download(f"https://drive.google.com/u/0/uc?id={file_id}", file_path)
# %%
summary_path = f"{module_dir}/raw/wbm-summary.txt"
if not os.path.exists(summary_path):
    summary_id_file = "1639IFUG7poaDE2uB6aISUOi65ooBwCIg"
    summary_url = f"https://drive.google.com/u/0/uc?id={summary_id_file}"
    gdown.download(summary_url, summary_path)
# %%
json_paths = sorted(glob(f"{module_dir}/raw/wbm-structures-step-*.json.bz2"))
step_lens = (61848, 52800, 79205, 40328, 23308)
# step 3 has 79,211 initial structures but only 79,205 ComputedStructureEntries
# i.e. 6 extra structures which have missing energy, volume, etc. in the summary file
bad_struct_ids = (70802, 70803, 70825, 70826, 70828, 70829)
# step 5 has 2 missing initial structures: 23166, 23294
assert len(json_paths) == len(step_lens), "Mismatch in WBM steps and JSON files"
wbm_struct_json_checksums = (
    -7815922250032563359,
    -86268461085685423,
    -7707371069320539066,
    -3579196048285845088,
    -248039116266365352,
)
dfs_wbm_structs = {}
for json_path in json_paths:
    step = int(json_path.split(".json.bz2")[0][-1])
    assert step in range(1, 6)

    if step in dfs_wbm_structs:
        print(f"{step=} {json_path=} already loaded.")
        continue

    print(f"{step=}")
    df = pd.read_json(json_path).T

    # we hash only the index for speed
    # could use joblib.hash(df) to hash the whole df but it's slow
    checksum = pd.util.hash_pandas_object(df.index).sum()
    assert checksum == wbm_struct_json_checksums[step - 1], "bad JSON file checksum"

    if step == 3:
        df = df.drop(index=[f"step_3_{struct_id}" for struct_id in bad_struct_ids])
        # re-index after dropping bad structures to get the same indices as the summary
        # file where IDs are consecutive, i.e. step_3_70801 is followed by step_3_70802,
        # not step_3_70804, etc.
        # df.index = [f"step_3_{idx + 1}" for idx in range(len(df))]

    step_len = step_lens[step - 1]
    assert len(df) == step_len, f"bad len for {step=}: {len(df)} != {step_len}"
    dfs_wbm_structs[step] = df
# NOTE step 5 is missing 2 initial structures, see nan_init_structs_ids below
assert dict(dfs_wbm_structs[5].isna().sum()) == {"opt": 0, "org": 2}
assert list(dfs_wbm_structs[5].query("org.isna()").index) == [
    "step_5_23165",
    "step_5_23293",
]
# %%
df_wbm = pd.concat(dfs_wbm_structs.values())
assert len(df_wbm) == sum(step_lens)
def increment_wbm_material_id(wbm_id: str) -> str:
    """Maps step_1_0, step_1_1, ... onto wbm-1-1, wbm-1-2, ..."""
    try:
        prefix, step_num, material_num = wbm_id.split("_")
    except ValueError:
        print(f"bad {wbm_id=}")
        return wbm_id

    msg = f"bad {wbm_id=}, {prefix=} {step_num=} {material_num=}"
    assert prefix == "step", msg
    assert step_num.isdigit(), msg
    assert material_num.isdigit(), msg

    return f"wbm-{step_num}-{int(material_num) + 1}"
df_wbm.index = df_wbm.index.map(increment_wbm_material_id)
df_wbm.index.name = "material_id"
assert df_wbm.index[0] == "wbm-1-1"
assert df_wbm.index[-1] == "wbm-5-23308"
df_wbm["initial_structure"] = df_wbm.pop("org")
df_wbm["final_structure"] = df_wbm.pop("opt")
assert list(df_wbm.columns) == ["initial_structure", "final_structure"]
# %% download WBM ComputedStructureEntries from
# https://archive.materialscloud.org/record/2021.68
mat_cloud_url = "https://archive.materialscloud.org/record/file?record_id=840"
for filename in (
    "README.txt",
    # same file as summary_path above except it lacks the material ID column
    # "summary.txt.bz2",
    *(f"step_{step}.json.bz2" for step in range(1, 6)),
):
    file_path = f"{module_dir}/raw/wbm-cse-{filename.lower().replace('_', '-')}"
    if os.path.exists(file_path):
        print(f"{file_path} already exists, skipping")
        continue

    urllib.request.urlretrieve(f"{mat_cloud_url}&filename={filename}", file_path)
# %%
cse_step_paths = sorted(glob(f"{module_dir}/raw/wbm-cse-step-*.json.bz2"))
assert len(cse_step_paths) == 5
"""
There is a discrepancy of 6 entries between the files on Materials Cloud containing the
ComputedStructureEntries (CSE) and those on Google Drive containing initial+relaxed
structures. The CSE files contain 257,489 entries but the summary file on materials
cloud and the org/opt (initial/relaxed) structures both have 257,495 rows. The
additional structures seem to be "step_3_70802", "step_3_70803", "step_3_70825",
"step_3_70826", "step_3_70828", and "step_3_70829". By removing these entries, the
compositions between the Materials Cloud and Google Drive data align.
"""
dfs_wbm_cses = {}
for json_path in cse_step_paths:
    step = int(json_path.split(".json.bz2")[0][-1])
    print(f"{step=}")
    assert step in range(1, 6)

    if step in dfs_wbm_cses:
        print(f"{json_path=} already loaded.")
        continue

    df = pd.read_json(json_path)
    step_len = step_lens[step - 1]
    dfs_wbm_cses[step] = df
    assert len(df) == step_len, f"{step=}: {len(df)} != {step_len}"
# %%
df_wbm["computed_structure_entry"] = pd.concat(dfs_wbm_cses.values()).to_numpy()
for mat_id, cse in df_wbm.computed_structure_entry.items():
    # needed to ensure MaterialsProjectCompatibility can process the entries
    cse["parameters"]["run_type"] = (
        "GGA+U" if cse["parameters"]["is_hubbard"] else "GGA"
    )
    cse["entry_id"] = mat_id
    assert cse["entry_id"].startswith("wbm-")
assert pd.Series(
    cse["parameters"]["run_type"] for cse in tqdm(df_wbm.computed_structure_entry)
).value_counts().to_dict() == {"GGA": 248481, "GGA+U": 9008}
# make sure only 2 materials have missing initial structures with expected IDs
nan_init_structs_ids = ["wbm-5-23166", "wbm-5-23294"]
assert list(df_wbm.query("initial_structure.isna()").index) == nan_init_structs_ids
# drop the two materials with missing initial structures
df_wbm = df_wbm.drop(index=nan_init_structs_ids)
# %% get composition from CSEs
df_wbm["composition_from_cse"] = [
ComputedStructureEntry.from_dict(cse).composition
for cse in tqdm(df_wbm.computed_structure_entry)
]
df_wbm["composition_from_final_struct"] = [
Structure.from_dict(struct).composition for struct in tqdm(df_wbm.final_structure)
]
# all but 1 composition matches between CSE and final structure
# mismatching ID: wbm-1-37977 which becomes equal on reduction:
# CSE Comp: Ag4 Bi4 O12
# final structure Comp: Ag16 Bi16 O48
df_mismatch = df_wbm.query("composition_from_cse != composition_from_final_struct")
assert len(df_mismatch) == 1
assert df_mismatch.index[0] == "wbm-1-37977"
assert (
    df_mismatch.iloc[0].composition_from_cse.reduced_composition
    == df_mismatch.iloc[0].composition_from_final_struct.reduced_composition
)
df_wbm.pop("composition_from_final_struct") # not needed anymore
# %% randomly sample structures and ensure they match between CSE and final structure
n_samples = 1000
for row in tqdm(df_wbm.sample(n_samples).itertuples(), total=n_samples):
    struct_final = Structure.from_dict(row.final_structure)
    struct_from_cse = Structure.from_dict(row.computed_structure_entry["structure"])
    assert struct_final.matches(struct_from_cse), f"structure mismatch for {row.Index=}"

    # and check that initial and final compositions match
    struct_init = Structure.from_dict(row.initial_structure)
    assert (
        struct_init.composition == struct_final.composition
    ), f"composition mismatch for {row.Index=}"
# %% extract alphabetical formula from CSEs (will be used as ground-truth formulas since
# more informative than reduced formulas found in df_summary)
df_wbm["formula_from_cse"] = [
x.alphabetical_formula for x in df_wbm.pop("composition_from_cse")
]
# %%
col_map = {
    "# comp": "formula",
    "nsites": "n_sites",
    "vol": "volume",
    "e": "uncorrected_energy",
    "e_form": "e_form_per_atom_wbm",
    "e_hull": "e_above_hull_wbm",
    "gap": "bandgap_pbe",
    "id": "material_id",
}
# WBM summary was shared twice, once on Google Drive, once on Materials Cloud
# download both and check them for consistency
df_summary = pd.read_csv(
    f"{module_dir}/raw/wbm-summary.txt", sep="\t", names=col_map.values()
).set_index("material_id")
df_summary_bz2 = pd.read_csv(
    f"{mat_cloud_url}&filename=summary.txt.bz2", sep="\t"
).rename(columns=col_map)
# duplicate Ga3Ru2U3 step_3_28147 (1st one is wbm-2-18689) has 0 volume in
# df_summary_bz2 vs 155.41 in df_summary
query_str = "volume > 0 & formula != 'Ga3Ru2U3'"
pd.testing.assert_frame_equal(
    df_summary.reset_index(drop=True).query(query_str),
    df_summary_bz2.reset_index(drop=True).query(query_str),
)
assert sum(df_summary.index == "None") == 6
# the 'None' materials have 0 volume, energy, n_sites, bandgap, etc.
assert (
    df_summary[df_summary.index == "None"].drop(columns=["formula"]) == 0
).all().all()
assert len(df_summary.query("volume > 0")) == len(df_wbm) + len(nan_init_structs_ids)
# make sure dropping materials with 0 volume removes exactly 6 materials, the same ones
# listed in bad_struct_ids above
assert all(
    df_summary.reset_index().query("volume == 0").index.values - sum(step_lens[:2])
    == bad_struct_ids
)
df_summary.index = df_summary.index.map(increment_wbm_material_id) # format IDs
# drop materials with id='None' and missing initial structures
df_summary = df_summary.drop(index=[*nan_init_structs_ids, "None"])
# the 8403 material IDs in step 3 with final number larger than any of the ones in
# bad_struct_ids are now misaligned between df_summary and df_wbm
# the IDs in df_summary are consecutive while the IDs in df_wbm skip over the numbers in
# bad_struct_ids. we fix this with fix_bad_struct_index_mismatch() by mapping the IDs in
# df_wbm to the ones in df_summary so that both indices become consecutive.
assert sum(df_summary.index != df_wbm.index) == 8403
def fix_bad_struct_index_mismatch(material_id: str) -> str:
    """Decrement material IDs in step 3 by the number of IDs with smaller final number
    in bad_struct_ids. This should fix the index mismatch between df_summary and df_wbm.
    """
    _, step_num, mat_num = material_id.split("-")
    step_num, mat_num = int(step_num), int(mat_num)

    if step_num == 3:
        mat_num -= sum(mat_num > idx + 1 for idx in bad_struct_ids)

    return f"wbm-{step_num}-{mat_num}"
# don't accidentally apply the fix twice
if sum(df_summary.index != df_wbm.index) != 0:
    df_wbm.index = df_wbm.index.map(fix_bad_struct_index_mismatch)
# check that the index mismatch is fixed
assert sum(df_summary.index != df_wbm.index) == 0
# update ComputedStructureEntry entry_ids to match material_ids
for mat_id, cse in df_wbm.computed_structure_entry.items():
    entry_id = cse["entry_id"]
    if mat_id != entry_id:
        print(f"{mat_id=} != {entry_id=}, updating entry_id to mat_id")
        cse["entry_id"] = mat_id
# sort the elements in each formula alphabetically
df_summary["alph_formula"] = [
    Composition(x).alphabetical_formula for x in df_summary.formula
]
# alphabetical formula and original formula differ due to spaces, number 1 after element
# symbols (FeO vs Fe1 O1), and element order (FeO vs OFe)
assert sum(df_summary.alph_formula != df_summary.formula) == 257_483
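
# minimal illustration of the differences listed above (example formulas, not from the
# dataset): alphabetical_formula sorts elements and appends explicit amounts
assert Composition("FeO").alphabetical_formula == "Fe1 O1"
assert Composition("OFe").alphabetical_formula == "Fe1 O1"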
df_summary["formula"] = df_summary.pop("alph_formula")
# %% write initial structures and computed structure entries to compressed json
for fname, cols in (
    ("computed-structure-entries", ["computed_structure_entry"]),
    ("init-structs", ["initial_structure"]),
    (
        "computed-structure-entries+init-structs",
        ["initial_structure", "computed_structure_entry"],
    ),
):
    cols = ["formula_from_cse", *cols]
    df_wbm[cols].reset_index().to_json(f"{module_dir}/{today}-wbm-{fname}.json.bz2")
# %%
# df_summary and df_wbm formulas differ because summary formulas are reduced while
# df_wbm formulas are not (e.g. Ac6 U2 vs Ac3 U1 in summary). unreduced is more
# informative so we use it.
assert sum(df_summary.formula != df_wbm.formula_from_cse) == 114_273
assert sum(df_summary.formula == df_wbm.formula_from_cse) == 143_214
df_summary.formula = df_wbm.formula_from_cse
# fix bad energy which is 0 in df_summary but a more realistic -63.68 in CSE
df_summary.at["wbm-2-18689", "uncorrected_energy"] = df_wbm.loc[
"wbm-2-18689"
].computed_structure_entry["energy"]
# NOTE careful with ComputedEntries as object vs as dicts, the meaning of keys changes:
# cse.energy == cse.uncorrected_energy + cse.correction
# whereas
# cse.as_dict()["energy"] == cse.uncorrected_energy
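# minimal sketch of the note above, checked on the first entry (no corrections have
# been applied yet at this point, so correction == 0 and both energies coincide)
first_cse = ComputedStructureEntry.from_dict(df_wbm.computed_structure_entry.iloc[0])
assert first_cse.energy == first_cse.uncorrected_energy + first_cse.correction
assert first_cse.as_dict()["energy"] == first_cse.uncorrected_energy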
# %% scatter plot summary energies vs CSE energies
df_summary["uncorrected_energy_from_cse"] = [
cse["energy"] for cse in tqdm(df_wbm.computed_structure_entry)
]
# check CSE and summary energies are consistent, only exceeding 0.1 eV difference twice
diff_e_cse_e_summary = (
    df_summary.uncorrected_energy - df_summary.uncorrected_energy_from_cse
)
assert (
    diff_e_cse_e_summary.max() < 0.15 and sum(diff_e_cse_e_summary > 0.1) == 2
), df_summary.query("uncorrected_energy - uncorrected_energy_from_cse > 0.1")
density_scatter(df_summary.uncorrected_energy, df_summary.uncorrected_energy_from_cse)
# %% remove suspicious formation energy outliers
e_form_cutoff = 5
n_too_stable = sum(df_summary.e_form_per_atom_wbm < -e_form_cutoff)
print(f"{n_too_stable = }") # n_too_stable = 502
n_too_unstable = sum(df_summary.e_form_per_atom_wbm > e_form_cutoff)
print(f"{n_too_unstable = }") # n_too_unstable = 22
fig = df_summary.hist(
    x="e_form_per_atom_wbm", backend="plotly", log_y=True, range_x=[-5.5, 5.5]
)
fig_compressed = False
fig.add_vline(x=e_form_cutoff, line=dict(dash="dash"))
fig.add_vline(x=-e_form_cutoff, line=dict(dash="dash"))
fig.add_annotation(
    **dict(x=0, y=1, yref="paper", yshift=20),
    text=f"<b>dataset cropped to within +/- {e_form_cutoff} eV/atom</b>",
    showarrow=False,
)
x_axis_title = "WBM uncorrected formation energy (eV/atom)"
fig.update_layout(xaxis_title=x_axis_title, margin=dict(l=10, r=10, t=40, b=10))
fig.update_yaxes(fixedrange=True) # disable zooming y-axis
fig.show(
    config=dict(
        modeBarButtonsToRemove=["lasso2d", "select2d", "autoScale2d", "toImage"],
        displaylogo=False,
    )
)
# %%
# no need to store all 250k x values in the plot (that leads to a 1.7 MB file);
# subsampling every 10th point is enough to see the distribution
if not fig_compressed:
    fig_compressed = True
    # keep only every 10th data point, round to 3 decimal places to reduce file size
    fig.data[0].x = [round(x, 3) for x in fig.data[0].x[::10]]
# recommended to upload SVG to vecta.io/nano afterwards for compression
img_path = f"{FIGS}/hist-wbm-e-form-per-atom"
# save_fig(fig, f"{img_path}.svg", width=800, height=300)
save_fig(fig, f"{img_path}.svelte")
# %%
assert len(df_summary) == len(df_wbm) == 257_487
query_str = f"{-e_form_cutoff} < e_form_per_atom_wbm < {e_form_cutoff}"
dropped_ids = sorted(set(df_summary.index) - set(df_summary.query(query_str).index))
assert len(dropped_ids) == 502 + 22
assert dropped_ids[:3] == "wbm-1-12142 wbm-1-12143 wbm-1-12144".split()
assert dropped_ids[-3:] == "wbm-5-9121 wbm-5-9211 wbm-5-934".split()
df_summary = df_summary.query(query_str)
df_wbm = df_wbm.loc[df_summary.index]
# make sure we dropped the expected number of materials (502 + 22 = 524)
assert len(df_summary) == len(df_wbm) == 257_487 - 502 - 22
# %%
for mat_id, cse in df_wbm.computed_structure_entry.items():
    assert mat_id == cse["entry_id"], f"{mat_id} != {cse['entry_id']}"
df_wbm["cse"] = [
ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
]
# raw WBM ComputedStructureEntries have no energy corrections applied:
assert all(cse.uncorrected_energy == cse.energy for cse in df_wbm.cse)
# summary and CSE n_sites match
assert all(df_summary.n_sites == [len(cse.structure) for cse in df_wbm.cse])
# entries are corrected in-place by default, so we apply the legacy corrections first
# and then leave the new corrections in place below
# having both old and new corrections allows updating predictions from older models
# like MEGNet that were trained on an MP release predating the new corrections, by
# subtracting the old corrections and adding the new ones
entries_old_corr = MaterialsProjectCompatibility().process_entries(
    df_wbm.cse, clean=True, verbose=True
)
assert len(entries_old_corr) == len(df_wbm), f"{len(entries_old_corr)=} {len(df_wbm)=}"
# extract legacy MP energy corrections into df_wbm
df_wbm["e_correction_per_atom_mp_legacy"] = [
    cse.correction_per_atom for cse in df_wbm.cse
]
# clean up legacy corrections and apply new corrections
entries_new_corr = MaterialsProject2020Compatibility().process_entries(
    df_wbm.cse, clean=True, verbose=True
)
assert len(entries_new_corr) == len(df_wbm), f"{len(entries_new_corr)=} {len(df_wbm)=}"
n_corrected = sum(cse.uncorrected_energy != cse.energy for cse in df_wbm.cse)
assert n_corrected == 100_930, f"{n_corrected=} expected 100,930"
df_summary["e_correction_per_atom_mp2020"] = [
cse.correction_per_atom for cse in df_wbm.cse
]
assert df_summary.e_correction_per_atom_mp2020.mean().round(4) == -0.1069
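
# hypothetical sketch of the correction swap described above: df_preds_legacy is a
# placeholder for formation energy predictions (eV/atom) from a model trained on
# legacy-corrected MP energies; subtracting old and adding new corrections moves them
# onto the MP2020 scheme
# df_preds_mp2020 = (
#     df_preds_legacy
#     - df_wbm.e_correction_per_atom_mp_legacy
#     + df_summary.e_correction_per_atom_mp2020
# )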
# %%
with gzip.open(DATA_FILES.mp_patched_phase_diagram, "rb") as zip_file:
    ppd_mp: PatchedPhaseDiagram = pickle.load(zip_file)
# %% calculate e_above_hull for each material
# this loop needs the warnings.filterwarnings() call above, else the flood of pymatgen
# warnings crashes the Jupyter kernel
# takes ~20 min at 200 it/s for the 250k entries in WBM
each_col = "e_above_hull_mp2020_corrected_ppd_mp"
assert each_col not in df_summary
for mat_id, cse in tqdm(df_wbm.cse.items(), total=len(df_wbm)):
    assert mat_id == cse.entry_id, f"{mat_id=} != {cse.entry_id=}"
    assert cse.entry_id in df_summary.index, f"{cse.entry_id=} not in df_summary"

    e_above_hull = ppd_mp.get_e_above_hull(cse, allow_negative=True)
    df_summary.at[cse.entry_id, each_col] = e_above_hull
# %% calculate formation energies from CSEs wrt MP elemental reference energies
# first make sure source and target dfs have matching indices
assert sum(df_wbm.index != df_summary.index) == 0
e_form_col = "e_form_per_atom_uncorrected"
assert e_form_col not in df_summary
for row in tqdm(df_wbm.itertuples(), total=len(df_wbm)):
    mat_id, cse, formula = row.Index, row.cse, row.formula_from_cse
    assert mat_id == cse.entry_id, f"{mat_id=} != {cse.entry_id=}"
    assert mat_id in df_summary.index, f"{mat_id=} not in df_summary"

    entry_like = dict(composition=formula, energy=cse.uncorrected_energy)
    e_form = get_e_form_per_atom(entry_like)
    e_form_ppd = ppd_mp.get_form_energy_per_atom(cse)
    correction = cse.correction_per_atom

    # make sure the two ways of computing formation energy,
    # PatchedPhaseDiagram.get_form_energy_per_atom() and the standalone
    # get_e_form_per_atom(), agree
    assert (
        abs(e_form - (e_form_ppd - correction)) < 1e-4
    ), f"{mat_id=}: {e_form=:.5} != {e_form_ppd - correction=:.5}"

    df_summary.at[cse.entry_id, e_form_col] = e_form
# %%
df_init_struct = pd.read_json(
    f"{module_dir}/2022-10-19-wbm-init-structs.json.bz2"
).set_index("material_id")
wyckoff_col = "wyckoff_spglib"
if wyckoff_col not in df_summary:
    df_summary[wyckoff_col] = None

for idx, struct in tqdm(
    df_init_struct.initial_structure.items(), total=len(df_init_struct)
):
    if not pd.isna(df_summary.at[idx, wyckoff_col]):
        continue  # Aflow label already computed
    try:
        struct = Structure.from_dict(struct)
        df_summary.at[idx, wyckoff_col] = get_aflow_label_from_spglib(struct)
    except Exception as exc:
        print(f"{idx=} {exc=}")
assert df_summary[wyckoff_col].isna().sum() == 0
# %% site-stats.json.gz was generated by scripts/compute_struct_fingerprints.py
df_fp = pd.read_json(f"{module_dir}/site-stats.json.gz").set_index("material_id")
init_fp_col = "initial_site_stats_fingerprint"
final_fp_col = "final_site_stats_fingerprint"
fp_diff_col = "site_stats_fingerprint_init_final_norm_diff"
df_fp[fp_diff_col] = (
    df_fp[final_fp_col].map(np.array) - df_fp[init_fp_col].map(np.array)
).map(np.linalg.norm)
df_fp[fp_diff_col].hist(bins=100, backend="plotly")
# %% write final summary data to disk (yeah!)
df_summary.round(6).to_csv(f"{module_dir}/{today}-wbm-summary.csv")
# %% read summary data from disk
df_summary = pd.read_csv(f"{module_dir}/2022-10-19-wbm-summary.csv").set_index(
"material_id"
)
# %% read WBM initial structures and computed structure entries from disk
df_wbm = pd.read_json(
    f"{module_dir}/2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
).set_index("material_id")
df_wbm["cse"] = [
ComputedStructureEntry.from_dict(x) for x in tqdm(df_wbm.computed_structure_entry)
]