In [1]:
import os
import re
import json
import multiprocessing as mp

import numpy as np
import pandas as pd

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed

from pymatgen.core import Structure
from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.io.vasp.inputs import Potcar, Incar
from pymatgen.io.vasp.outputs import Outcar, Vasprun

In [2]:
# Notebook outputs + shared helpers
# When the kernel was started inside a `notebooks/` folder the project root is
# its parent directory; otherwise the current working directory is the root.
if os.path.basename(os.getcwd()) == 'notebooks':
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
else:
    PROJECT_ROOT = os.getcwd()

OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'outputs')
STRUCT_EXTXYZ_DIR = os.path.join(OUTPUT_DIR, 'structures_extxyz')
STRUCT_VASP_DIR = os.path.join(OUTPUT_DIR, 'structures_vasp')

# makedirs creates intermediate folders too, so OUTPUT_DIR itself is created here
os.makedirs(STRUCT_EXTXYZ_DIR, exist_ok=True)
os.makedirs(STRUCT_VASP_DIR, exist_ok=True)

print('PROJECT_ROOT:', PROJECT_ROOT)
print('OUTPUT_DIR:', OUTPUT_DIR)

# Parallel backend used throughout this notebook
PARALLEL_BACKEND = 'process'  # 'process' or 'thread'

# Helper: infer solute from POTCAR nelectrons map JSON
# - For pure Ti: returns 'Ti'
# - For Ti+X: returns 'X'
# - If multiple non-Ti keys: returns 'X+Y+...'

def infer_solute(potcar_map_json: str):
    """Derive a solute label from a JSON-encoded ``{element: nelectrons}`` map.

    Returns:
        'Ti'      when the input is falsy (None / empty string) or the map has
                  no key other than 'Ti';
        'X'       when exactly one non-Ti key is present;
        'X+Y+...' (keys sorted alphabetically) when several non-Ti keys exist;
        None      when the input cannot be decoded as JSON or does not decode
                  to a dict (this includes a float NaN cell coming from pandas).
    """
    if not potcar_map_json:
        return 'Ti'

    try:
        decoded = json.loads(potcar_map_json)
    except Exception:
        # Not valid JSON, or not a str/bytes object at all
        return None

    if not isinstance(decoded, dict):
        return None

    non_ti = sorted(str(el) for el in decoded if str(el) != 'Ti')
    if not non_ti:
        return 'Ti'
    # A single-element join yields that element unchanged
    return '+'.join(non_ti)

PROJECT_ROOT: /root/TiGB_Project/TiGBProject_AlV_Data
OUTPUT_DIR: /root/TiGB_Project/TiGBProject_AlV_Data/outputs


In [32]:
# Parse ALL VASP calculation folders under the full TiGB data directory
# Windows path given: E:\Onedrive\TiGB-Project\Data
# WSL path typically: /mnt/e/OneDrive/TiGB-Project/Data
#
# The location can be overridden without editing the notebook by setting the
# TIGB_DATA_ROOT environment variable before starting the kernel; the Windows
# path below is only the default used when that variable is unset or empty.

DATA_ROOT_INPUT = os.environ.get('TIGB_DATA_ROOT') or r"E:\Onedrive\TiGB-Project\Data"


def _normalize_data_root(p: str) -> str:
    """Translate a Windows drive path into its WSL mount path.

    ``E:\\dir\\sub`` and ``E:/dir/sub`` both become ``/mnt/e/dir/sub`` (the
    drive letter is lower-cased). Any other input is returned with backslashes
    replaced by forward slashes and is otherwise unchanged.

    Args:
        p: Path string, in Windows or POSIX notation.

    Returns:
        The path written with forward slashes.
    """
    # Accept either separator after the drive colon: paths copied from tools
    # that already use forward slashes (E:/...) must be remapped as well.
    drive_match = re.match(r"^([A-Za-z]):[\\/]", p)
    if drive_match:
        drive = drive_match.group(1).lower()
        # Drop the "X:" prefix, unify separators, and avoid a double slash after the mount point
        rest = p[2:].replace('\\', '/').lstrip('/')
        return f"/mnt/{drive}/{rest}"
    return p.replace('\\', '/')


# Resolve the configured input to a directory that exists in this environment
data_root = _normalize_data_root(DATA_ROOT_INPUT)

if not os.path.isdir(data_root):
    # Common casing / mount-name fallback: retry with the "OneDrive" spelling
    data_root_alt = data_root.replace('/Onedrive/', '/OneDrive/')
    if os.path.isdir(data_root_alt):
        data_root = data_root_alt
    else:
        # Neither spelling exists: stop here rather than walking a missing tree
        raise FileNotFoundError(f"Data root not found: {data_root} (from {DATA_ROOT_INPUT})")

print('DATA_ROOT:', data_root)


def discover_vasp_calc_dirs(root: str):
    """Walk ``root`` and return, sorted, every directory that looks like a VASP run.

    A directory qualifies when it contains ``vasprun.xml``, or when it contains
    ``OUTCAR`` together with a structure file (``CONTCAR``, ``POSCAR`` or any
    ``*.vasp`` file).
    """

    def _looks_like_calc(filenames):
        # Decide from the file names alone; nothing is opened at this stage
        names = set(filenames)
        if 'vasprun.xml' in names:
            return True
        if 'OUTCAR' not in names:
            return False
        return (
            'CONTCAR' in names
            or 'POSCAR' in names
            or any(fname.endswith('.vasp') for fname in filenames)
        )

    # A set removes any repeated paths; sorting gives a stable order
    matches = {
        dirpath
        for dirpath, _subdirs, filenames in os.walk(root)
        if _looks_like_calc(filenames)
    }
    return sorted(matches)


# Scan the data root once; this list is what the parallel parse submits as jobs
all_calc_dirs = discover_vasp_calc_dirs(data_root)
print(f"Discovered {len(all_calc_dirs)} calc directories")


def parse_calc_folder(calc_dir: str) -> dict:
    """Parse one VASP calculation folder into a flat dict (one dataframe row).

    Reads the module-level ``data_root`` to build the relative path, so it must
    be set before this function is called.

    Keys always present: ``job_name``, ``top_level``, ``relpath``, ``path``,
    ``energy_eV``, ``forces_eV_A``, ``stress_kbar_3x3`` (the last three are
    None when nothing could be read).

    Keys added only when the corresponding data could be read:
    ``structure_file``, ``num_atoms``, ``formula``, ``volume``, ``density``,
    ``pmg_structure_json``, ``ase_atoms_json``, ``energy_per_atom_eV``,
    ``incar`` and ``incar_<key>`` for a few selected tags,
    ``potcar_nelectrons_map``, ``potcar_total_nelectrons``, ``parse_errors``.

    Failures of the individual parsers are not raised; their messages are
    collected and joined with ' | ' under ``parse_errors``.

    Args:
        calc_dir: Path of the calculation directory.

    Returns:
        The row dict described above.
    """
    # Create a stable job name from relative path
    rel = os.path.relpath(calc_dir, data_root)
    job_name = rel.replace(os.sep, '-')

    # Try to infer higher-level grouping
    parts = rel.split(os.sep)
    top_level = parts[0] if parts else None

    row_data = {
        'job_name': job_name,
        'top_level': top_level,
        'relpath': rel,
        'path': calc_dir,
    }

    # Messages of every parser that failed; attached to the row at the end
    parse_errors = []

    # --- Structure ---
    structure = None
    structure_file = None

    # prefer CONTCAR/POSCAR, else any *.vasp file (skip starters if possible)
    struct_candidates = []
    for name in ['CONTCAR', 'POSCAR']:
        p = os.path.join(calc_dir, name)
        if os.path.exists(p):
            struct_candidates.append((name, p))

    if not struct_candidates:
        # Only the alphabetically first non-"starter" *.vasp file is considered
        vasp_files = [f for f in os.listdir(calc_dir) if f.endswith('.vasp') and 'starter' not in f]
        vasp_files = sorted(vasp_files)
        if vasp_files:
            struct_candidates.append((vasp_files[0], os.path.join(calc_dir, vasp_files[0])))

    # First candidate that parses wins; failures are recorded and the next one is tried.
    # NOTE(review): if CONTCAR fails to parse and POSCAR is used instead, the stored
    # structure may not match the last ionic step whose forces are taken from
    # vasprun.xml below — confirm this is acceptable for downstream use.
    for name, p in struct_candidates:
        try:
            structure = Structure.from_file(p)
            structure_file = name
            break
        except Exception as e:
            parse_errors.append(f"structure:{name}:{e}")

    if structure is not None:
        row_data['structure_file'] = structure_file
        row_data['num_atoms'] = len(structure)
        row_data['formula'] = structure.formula
        row_data['volume'] = structure.volume
        row_data['density'] = structure.density

        try:
            # Use pymatgen's Monty JSON serializer to preserve numpy arrays (e.g. selective_dynamics)
            row_data['pmg_structure_json'] = structure.to_json()
        except Exception as e:
            parse_errors.append(f"pmg_structure_json:{e}")

        try:
            # default=str stringifies any value json cannot encode natively
            atoms = AseAtomsAdaptor.get_atoms(structure)
            row_data['ase_atoms_json'] = json.dumps(atoms.as_dict(), default=str)
        except Exception as e:
            parse_errors.append(f"ase_atoms_json:{e}")

    # --- Prefer VASPRUN for energy/forces/stress ---
    # Initialised to None so these keys exist on every row
    row_data['energy_eV'] = None
    row_data['forces_eV_A'] = None
    row_data['stress_kbar_3x3'] = None

    vasprun_path = os.path.join(calc_dir, 'vasprun.xml')
    if os.path.exists(vasprun_path):
        try:
            vr = Vasprun(vasprun_path, parse_dos=False, parse_eigen=False)
            row_data['energy_eV'] = float(vr.final_energy)

            try:
                # Forces and stress are taken from the last ionic step only
                last = vr.ionic_steps[-1]
                row_data['forces_eV_A'] = np.array(last.get('forces')).tolist() if last.get('forces') is not None else None
                row_data['stress_kbar_3x3'] = np.array(last.get('stress')).tolist() if last.get('stress') is not None else None
            except Exception as e:
                parse_errors.append(f"vasprun:ionic_steps:{e}")

            # fall back structure from vasprun
            if structure is None:
                try:
                    structure = vr.final_structure
                    row_data['structure_file'] = 'vasprun.xml'
                    row_data['num_atoms'] = len(structure)
                    row_data['formula'] = structure.formula
                    row_data['volume'] = structure.volume
                    row_data['density'] = structure.density
                    row_data['pmg_structure_json'] = structure.to_json()
                    atoms = AseAtomsAdaptor.get_atoms(structure)
                    row_data['ase_atoms_json'] = json.dumps(atoms.as_dict(), default=str)
                except Exception as e:
                    parse_errors.append(f"vasprun:final_structure:{e}")

        except Exception as e:
            parse_errors.append(f"vasprun:{e}")

    # --- OUTCAR fallback energy-only ---
    # Used only when vasprun.xml gave no energy; forces and stress stay None on this path
    outcar_path = os.path.join(calc_dir, 'OUTCAR')
    if row_data['energy_eV'] is None and os.path.exists(outcar_path):
        try:
            outcar = Outcar(outcar_path)
            if hasattr(outcar, 'final_energy') and outcar.final_energy is not None:
                row_data['energy_eV'] = float(outcar.final_energy)
        except Exception as e:
            parse_errors.append(f"outcar:{e}")

    if row_data['energy_eV'] is not None and structure is not None and len(structure) > 0:
        row_data['energy_per_atom_eV'] = row_data['energy_eV'] / len(structure)

    # --- INCAR / POTCAR (optional; may not exist everywhere) ---
    incar_path = os.path.join(calc_dir, 'INCAR')
    if os.path.exists(incar_path):
        try:
            incar = Incar.from_file(incar_path)
            incar_dict = dict(incar)
            row_data['incar'] = json.dumps(incar_dict, default=str)
            # Promote a few tags to their own columns (incar_system, incar_encut, ...)
            for k in ['SYSTEM', 'ENCUT', 'ISPIN', 'ISMEAR', 'SIGMA', 'GGA']:
                if k in incar_dict:
                    row_data[f'incar_{k.lower()}'] = incar_dict[k]
        except Exception as e:
            parse_errors.append(f"incar:{e}")

    # POTCAR is read only when a structure is available, because the electron
    # total below is weighted by the structure's composition
    potcar_path = os.path.join(calc_dir, 'POTCAR')
    if os.path.exists(potcar_path) and structure is not None:
        try:
            potcar = Potcar.from_file(potcar_path)
            nelectrons_map = {str(ps.element): float(ps.nelectrons) for ps in potcar}
            row_data['potcar_nelectrons_map'] = json.dumps(nelectrons_map, default=str)
            # Elements of the structure that are absent from the POTCAR map are skipped
            row_data['potcar_total_nelectrons'] = sum(
                float(amt) * float(nelectrons_map[el])
                for el, amt in structure.composition.get_el_amt_dict().items()
                if el in nelectrons_map
            )
        except Exception as e:
            parse_errors.append(f"potcar:{e}")

    if parse_errors:
        row_data['parse_errors'] = ' | '.join(parse_errors)

    return row_data


# Parallel parse all calc folders
if PARALLEL_BACKEND != 'process':
    Executor = ThreadPoolExecutor
    # I/O-heavy parsing → allow more threads
    max_workers_all = min(32, 2 * (os.cpu_count() or 8))
else:
    ctx = mp.get_context('fork')

    def Executor(**kw):
        """Build a process pool bound to the 'fork' multiprocessing context."""
        return ProcessPoolExecutor(mp_context=ctx, **kw)

    # CPU-bound parsing → keep workers near CPU count
    max_workers_all = max(1, (os.cpu_count() or 8) - 1)

print(f"Backend={PARALLEL_BACKEND} max_workers_all={max_workers_all}")

# Results are collected in completion order, not submission order
rows_all = []
with Executor(max_workers=max_workers_all) as ex:
    futures = [ex.submit(parse_calc_folder, d) for d in all_calc_dirs]
    for i, fut in enumerate(as_completed(futures), start=1):
        rows_all.append(fut.result())
        # Progress line every 200 folders and once more at the very end
        if i == len(futures) or i % 200 == 0:
            print(f"Parsed {i}/{len(futures)}")

# Build dataframe
all_df = pd.DataFrame(rows_all)

# Derive solute from POTCAR map (column is all-None when no row had a POTCAR map)
all_df['solute'] = (
    all_df['potcar_nelectrons_map'].apply(infer_solute)
    if 'potcar_nelectrons_map' in all_df.columns
    else None
)

# Keep a copy of the full parsed table
all_df_all = all_df.copy()

# Keep ONLY pure Ti, Al, or V jobs (exclude mixed-solute labels like "Al+Cu"),
# then order rows by top-level folder and relative path
all_df = (
    all_df_all
    .loc[lambda frame: frame['solute'].isin(['Ti', 'Al', 'V'])]
    .loc[lambda frame: ~frame['solute'].astype(str).str.contains(r'\+', na=False)]
    .sort_values(['top_level', 'relpath'], na_position='last')
    .reset_index(drop=True)
)

print('all_df (Ti/Al/V) rows:', len(all_df))
print('all_df cols:', all_df.columns.tolist())

# Save filtered Ti/Al/V dataframe
all_pickle_path = os.path.join(OUTPUT_DIR, 'tigb_all_data_TiAlV.pkl')
all_df.to_pickle(all_pickle_path)
print('Wrote:', all_pickle_path)

# Flat CSV: same rows without the nested/bulky columns (serialized structures, forces, stress)
all_flat_cols = list(
    all_df.drop(
        columns=['pmg_structure_json', 'ase_atoms_json', 'forces_eV_A', 'stress_kbar_3x3'],
        errors='ignore',
    ).columns
)
all_flat_csv_path = os.path.join(OUTPUT_DIR, 'tigb_all_data_TiAlV_flat.csv')
all_df[all_flat_cols].to_csv(all_flat_csv_path, index=False)
print('Wrote:', all_flat_csv_path)

all_df.head()

FileNotFoundError: Data root not found: /mnt/e/Onedrive/TiGB-Project/Data (from E:\Onedrive\TiGB-Project\Data)

In [16]:
# Preview (all_df): first 10 rows of the identifying columns plus the total energy.
# When `all_df` has not been created in this kernel yet, a hint string is shown instead.
(all_df[['job_name','top_level','relpath','solute','energy_eV']].head(10) if 'all_df' in globals() else 'Run Cell 2 first')

Unnamed: 0,job_name,top_level,relpath,solute,energy_eV
0,BondOrder-VacuumGB-C1-Al-Al-25-d-0.00,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-25-d-0.00,Al,-391.850142
1,BondOrder-VacuumGB-C1-Al-Al-26-d-0.00,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-26-d-0.00,Al,-390.813759
2,BondOrder-VacuumGB-C1-Al-Al-27-d-1.38,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-27-d-1.38,Al,-391.17957
3,BondOrder-VacuumGB-C1-Al-Al-28-d-1.38,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-28-d-1.38,Al,-391.017228
4,BondOrder-VacuumGB-C1-Al-Al-29-d-2.59,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-29-d-2.59,Al,-391.087938
5,BondOrder-VacuumGB-C1-Al-Al-30-d-2.59,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-30-d-2.59,Al,-391.078324
6,BondOrder-VacuumGB-C1-Al-Al-31-d-3.83,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-31-d-3.83,Al,-392.186805
7,BondOrder-VacuumGB-C1-Al-Al-32-d-3.83,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-32-d-3.83,Al,-390.929759
8,BondOrder-VacuumGB-C1-Al-Al-33-d-5.07,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-33-d-5.07,Al,-391.078667
9,BondOrder-VacuumGB-C1-Al-Al-34-d-5.07,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-34-d-5.07,Al,-391.051983


In [24]:
# Create two filtered DataFrames based on solute content
# Goal:
# 1) Keep only systems where the POTCAR element set is a subset of {Ti, Al, V}
#    AND the inferred solute label is exactly one of {Ti, Al, V} (no mixed solutes like "Al+V").
# 2) Everything else goes to a second dataframe.

# Element symbols accepted by the "allowed" split; read by split_df_by_allowed_solutes
ALLOWED = {'Ti', 'Al', 'V'}


def potcar_elements(potcar_map_json: str):
    """Return the element symbols (as a set of str) named in a JSON POTCAR map.

    An empty set is returned when the input is falsy, cannot be decoded as
    JSON, or decodes to something other than a dict.
    """
    if not potcar_map_json:
        return set()

    try:
        decoded = json.loads(potcar_map_json)
    except Exception:
        # Invalid JSON or a non-string cell (e.g. NaN)
        return set()

    if isinstance(decoded, dict):
        return {str(symbol) for symbol in decoded}
    return set()


def solute_label(row):
    """Return the row's 'solute' value as a string, defaulting to 'Ti'.

    A missing value (key absent, None, empty string or float NaN) is mapped to
    'Ti'; anything else is passed through ``str``.

    NOTE(review): infer_solute returns 'Ti' for pure-Ti maps and None when the
    map cannot be parsed, so rows with an unparsable map are labelled 'Ti'
    here as well — confirm that is intended.
    """
    value = row.get('solute', None)
    if value is None or value == '':
        return 'Ti'
    if isinstance(value, float) and np.isnan(value):
        return 'Ti'
    return str(value)


def split_df_by_allowed_solutes(df_in: pd.DataFrame, name: str):
    """Split a parsed-calculation table into "allowed" and "other" rows and save both.

    A row is "allowed" when its POTCAR element set is non-empty and a subset of
    ALLOWED, and its solute label is a single member of ALLOWED. Every other
    row goes to the second frame. Both frames are written under OUTPUT_DIR as
    ``<name>_allowed_ti_al_v`` and ``<name>_other_solutes`` (.pkl and .csv).

    Args:
        df_in: Table to split; may be None or empty.
        name: Prefix for the output file names and the log lines.

    Returns:
        ``(df_allowed, df_other)``, or ``(None, None)`` when ``df_in`` is None
        or has no rows (nothing is written in that case).
    """
    if df_in is None or len(df_in) == 0:
        print(f"{name}: empty")
        return None, None

    helper_cols = ['_potcar_elems', '_solute_label']
    df_work = df_in.copy()
    has_potcar_map = 'potcar_nelectrons_map' in df_work.columns

    # Ensure solute exists
    if has_potcar_map and 'solute' not in df_work.columns:
        df_work['solute'] = df_work['potcar_nelectrons_map'].apply(infer_solute)

    # Temporary helper columns used only to build the mask
    if has_potcar_map:
        df_work['_potcar_elems'] = df_work['potcar_nelectrons_map'].apply(potcar_elements)
    else:
        df_work['_potcar_elems'] = [set()] * len(df_work)
    df_work['_solute_label'] = df_work.apply(solute_label, axis=1)

    labels = df_work['_solute_label']
    allowed_only = df_work['_potcar_elems'].apply(lambda s: len(s) > 0 and s.issubset(ALLOWED))
    single_allowed_solute = labels.isin(ALLOWED) & (~labels.str.contains(r'\+', na=False))
    keep = allowed_only & single_allowed_solute

    df_allowed = df_work[keep].drop(columns=helper_cols)
    df_other = df_work[~keep].drop(columns=helper_cols)

    # Write outputs: pickles first, then CSVs
    allowed_pkl = os.path.join(OUTPUT_DIR, f"{name}_allowed_ti_al_v.pkl")
    other_pkl = os.path.join(OUTPUT_DIR, f"{name}_other_solutes.pkl")
    df_allowed.to_pickle(allowed_pkl)
    df_other.to_pickle(other_pkl)

    allowed_csv = os.path.join(OUTPUT_DIR, f"{name}_allowed_ti_al_v.csv")
    other_csv = os.path.join(OUTPUT_DIR, f"{name}_other_solutes.csv")
    df_allowed.to_csv(allowed_csv, index=False)
    df_other.to_csv(other_csv, index=False)

    print(f"{name}: allowed={len(df_allowed)} other={len(df_other)}")
    print('Wrote:', allowed_pkl)
    print('Wrote:', other_pkl)

    return df_allowed, df_other


# Apply to all_df. The parsing cell already restricts `all_df` to Ti/Al/V
# solutes and keeps the unfiltered table as `all_df_all`.
# NOTE(review): with that pre-filtered input the "other" frame is expected to
# be empty; pass `all_df_all` here if the split of the full table is wanted.
if 'all_df' in globals():
    all_df_allowed, all_df_other = split_df_by_allowed_solutes(all_df, name='tigb_all')

# Preview. Jupyter renders only the value of a cell's final top-level
# expression; an expression nested inside an `if` block is evaluated but never
# shown, so the preview is written as one conditional expression.
(
    all_df_allowed[['job_name', 'solute', 'potcar_nelectrons_map']].head(10)
    if 'all_df_allowed' in globals() and all_df_allowed is not None
    else 'No preview available'
)

tigb_all: allowed=870 other=0
Wrote: /root/TiGB_Project/TiGBProject_AlV_Data/outputs/tigb_all_allowed_ti_al_v.pkl
Wrote: /root/TiGB_Project/TiGBProject_AlV_Data/outputs/tigb_all_other_solutes.pkl


In [25]:
# First 10 rows of the allowed split: identifiers, solute label and total energy
all_df_allowed[["job_name","top_level","relpath","solute","energy_eV"]].head(10)

Unnamed: 0,job_name,top_level,relpath,solute,energy_eV
0,BondOrder-VacuumGB-C1-Al-Al-25-d-0.00,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-25-d-0.00,Al,-391.850142
1,BondOrder-VacuumGB-C1-Al-Al-26-d-0.00,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-26-d-0.00,Al,-390.813759
2,BondOrder-VacuumGB-C1-Al-Al-27-d-1.38,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-27-d-1.38,Al,-391.17957
3,BondOrder-VacuumGB-C1-Al-Al-28-d-1.38,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-28-d-1.38,Al,-391.017228
4,BondOrder-VacuumGB-C1-Al-Al-29-d-2.59,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-29-d-2.59,Al,-391.087938
5,BondOrder-VacuumGB-C1-Al-Al-30-d-2.59,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-30-d-2.59,Al,-391.078324
6,BondOrder-VacuumGB-C1-Al-Al-31-d-3.83,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-31-d-3.83,Al,-392.186805
7,BondOrder-VacuumGB-C1-Al-Al-32-d-3.83,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-32-d-3.83,Al,-390.929759
8,BondOrder-VacuumGB-C1-Al-Al-33-d-5.07,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-33-d-5.07,Al,-391.078667
9,BondOrder-VacuumGB-C1-Al-Al-34-d-5.07,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-34-d-5.07,Al,-391.051983


In [26]:
# First 10 rows of the allowed split: serialized structure next to the POTCAR electron map
all_df_allowed[['job_name','pmg_structure_json','solute','potcar_nelectrons_map']].head(10)

Unnamed: 0,job_name,pmg_structure_json,solute,potcar_nelectrons_map
0,BondOrder-VacuumGB-C1-Al-Al-25-d-0.00,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"
1,BondOrder-VacuumGB-C1-Al-Al-26-d-0.00,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"
2,BondOrder-VacuumGB-C1-Al-Al-27-d-1.38,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"
3,BondOrder-VacuumGB-C1-Al-Al-28-d-1.38,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"
4,BondOrder-VacuumGB-C1-Al-Al-29-d-2.59,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"
5,BondOrder-VacuumGB-C1-Al-Al-30-d-2.59,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"
6,BondOrder-VacuumGB-C1-Al-Al-31-d-3.83,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"
7,BondOrder-VacuumGB-C1-Al-Al-32-d-3.83,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"
8,BondOrder-VacuumGB-C1-Al-Al-33-d-5.07,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"
9,BondOrder-VacuumGB-C1-Al-Al-34-d-5.07,"{""@module"": ""pymatgen.core.structure"", ""@class...",Al,"{""Ti"": 4.0, ""Al"": 3.0}"


In [27]:
# Distinct top-level folders present in `subset`.
# NOTE(review): in the visible notebook `subset` is only assigned in the
# extxyz-export cell further down, so this cell presumably fails on a fresh
# kernel run top to bottom — confirm and reorder if so.
subset.top_level.unique()

array(['BondOrder-VacuumGB', 'RGS-1sol', 'Segregation-1sol',
       'Segregation-1sol-addvac', 'SegregationProfile', 'Slab-1sol'],
      dtype=object)

In [28]:
# All `subset` rows that belong to the BondOrder-VacuumGB top-level folder.
# NOTE(review): relies on `subset`, which in the visible notebook is only
# assigned in the extxyz-export cell further down — confirm the cell order.
subset[subset["top_level"] == "BondOrder-VacuumGB"]

Unnamed: 0,job_name,top_level,relpath,path,structure_file,num_atoms,formula,volume,density,pmg_structure_json,...,incar_system,incar_encut,incar_ispin,incar_ismear,incar_sigma,incar_gga,potcar_nelectrons_map,potcar_total_nelectrons,parse_errors,solute
0,BondOrder-VacuumGB-C1-Al-Al-25-d-0.00,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-25-d-0.00,/mnt/e/Onedrive/TiGB-Project/Data/BondOrder-Va...,CONTCAR,52,Ti51 Al1,1362.612768,3.007854,"{""@module"": ""pymatgen.core.structure"", ""@class...",...,Al-25-d-0.00,450.0,1,1,0.2,Pe,"{""Ti"": 4.0, ""Al"": 3.0}",207.0,,Al
1,BondOrder-VacuumGB-C1-Al-Al-26-d-0.00,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-26-d-0.00,/mnt/e/Onedrive/TiGB-Project/Data/BondOrder-Va...,CONTCAR,52,Ti51 Al1,1362.612768,3.007854,"{""@module"": ""pymatgen.core.structure"", ""@class...",...,Al-26-d-0.00,450.0,1,1,0.2,Pe,"{""Ti"": 4.0, ""Al"": 3.0}",207.0,,Al
2,BondOrder-VacuumGB-C1-Al-Al-27-d-1.38,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-27-d-1.38,/mnt/e/Onedrive/TiGB-Project/Data/BondOrder-Va...,CONTCAR,52,Ti51 Al1,1362.612768,3.007854,"{""@module"": ""pymatgen.core.structure"", ""@class...",...,Al-27-d-1.38,450.0,1,1,0.2,Pe,"{""Ti"": 4.0, ""Al"": 3.0}",207.0,,Al
3,BondOrder-VacuumGB-C1-Al-Al-28-d-1.38,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-28-d-1.38,/mnt/e/Onedrive/TiGB-Project/Data/BondOrder-Va...,CONTCAR,52,Ti51 Al1,1362.612768,3.007854,"{""@module"": ""pymatgen.core.structure"", ""@class...",...,Al-28-d-1.38,450.0,1,1,0.2,Pe,"{""Ti"": 4.0, ""Al"": 3.0}",207.0,,Al
4,BondOrder-VacuumGB-C1-Al-Al-29-d-2.59,BondOrder-VacuumGB,BondOrder-VacuumGB/C1/Al/Al-29-d-2.59,/mnt/e/Onedrive/TiGB-Project/Data/BondOrder-Va...,CONTCAR,52,Ti51 Al1,1362.612768,3.007854,"{""@module"": ""pymatgen.core.structure"", ""@class...",...,Al-29-d-2.59,450.0,1,1,0.2,Pe,"{""Ti"": 4.0, ""Al"": 3.0}",207.0,,Al
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,BondOrder-VacuumGB-T2-V-V-49-d-5.58,BondOrder-VacuumGB,BondOrder-VacuumGB/T2/V/V-49-d-5.58,/mnt/e/Onedrive/TiGB-Project/Data/BondOrder-Va...,CONTCAR,64,Ti63 V1,1898.310816,2.682462,"{""@module"": ""pymatgen.core.structure"", ""@class...",...,V-49-d-5.58,450.0,1,1,0.2,Pe,"{""Ti"": 4.0, ""V"": 5.0}",257.0,,V
208,BondOrder-VacuumGB-T2-V-V-50-d-6.29,BondOrder-VacuumGB,BondOrder-VacuumGB/T2/V/V-50-d-6.29,/mnt/e/Onedrive/TiGB-Project/Data/BondOrder-Va...,CONTCAR,64,Ti63 V1,1898.310816,2.682462,"{""@module"": ""pymatgen.core.structure"", ""@class...",...,V-50-d-6.29,450.0,1,1,0.2,Pe,"{""Ti"": 4.0, ""V"": 5.0}",257.0,,V
209,BondOrder-VacuumGB-T2-V-V-51-d-6.29,BondOrder-VacuumGB,BondOrder-VacuumGB/T2/V/V-51-d-6.29,/mnt/e/Onedrive/TiGB-Project/Data/BondOrder-Va...,CONTCAR,64,Ti63 V1,1898.310816,2.682462,"{""@module"": ""pymatgen.core.structure"", ""@class...",...,V-51-d-6.29,450.0,1,1,0.2,Pe,"{""Ti"": 4.0, ""V"": 5.0}",257.0,,V
210,BondOrder-VacuumGB-T2-V-V-52-d-6.97,BondOrder-VacuumGB,BondOrder-VacuumGB/T2/V/V-52-d-6.97,/mnt/e/Onedrive/TiGB-Project/Data/BondOrder-Va...,CONTCAR,64,Ti63 V1,1898.310816,2.682462,"{""@module"": ""pymatgen.core.structure"", ""@class...",...,V-52-d-6.97,450.0,1,1,0.2,Pe,"{""Ti"": 4.0, ""V"": 5.0}",257.0,,V


In [31]:
# Distinct top-level folders present in the Ti/Al/V-filtered table
all_df.top_level.unique()

array(['BondOrder-VacuumGB', 'RGS-1sol', 'Segregation-1sol',
       'Segregation-1sol-addvac', 'SegregationProfile', 'Slab-1sol'],
      dtype=object)

In [None]:
# Write extxyz files for ALL rows in all_df that contain ONLY Ti, V, Al
# Write to the repo's data/ folder, organized by topmost level directory
# Attach whatever data is available (energy, forces, stress) - don't require all fields

from ase.io import write as ase_write

DATA_OUT_ROOT = os.path.join(PROJECT_ROOT, 'data')
os.makedirs(DATA_OUT_ROOT, exist_ok=True)

# Fail early when the parsed table lacks a column the export needs
required_cols = {'top_level', 'job_name', 'pmg_structure_json'}
missing = required_cols.difference(all_df.columns)
if missing:
    raise ValueError(f"all_df is missing required columns: {sorted(missing)}")

# Only require structure to exist (for composition check)
mask = all_df['pmg_structure_json'].notna()
subset = all_df.loc[mask].copy()

print(f'Total rows with structure: {len(subset)} / {len(all_df)}')

# Check composition: only Ti, V, Al allowed
ALLOWED_ELEMENTS = {'Ti', 'Al', 'V'}

def structure_has_only_ti_al_v(pmg_json_str):
    """Tell whether a serialized structure contains only elements from ALLOWED_ELEMENTS.

    Returns False for a falsy input, for a structure with no elements, and
    whenever deserialization or the element check raises.
    """
    if not pmg_json_str:
        return False
    try:
        parsed = Structure.from_str(pmg_json_str, fmt='json')
        symbols = {str(el) for el in parsed.composition.elements}
        return bool(symbols) and symbols.issubset(ALLOWED_ELEMENTS)
    except Exception:
        # Any failure is treated as "not a Ti/Al/V-only structure"
        return False

composition_mask = subset['pmg_structure_json'].apply(structure_has_only_ti_al_v)
subset = subset[composition_mask].copy()

print(f'Rows with Ti/Al/V-only composition: {len(subset)}')
print(f'Skipped {len(all_df[mask]) - len(subset)} rows with non-Ti/Al/V elements')


def _has_value(value) -> bool:
    """Return True unless ``value`` is None or a float NaN.

    pandas stores a missing entry of a numeric column as NaN rather than None,
    so an ``is not None`` test alone lets missing energies through.
    """
    if value is None:
        return False
    if isinstance(value, float) and np.isnan(value):
        return False
    return True


n_written = 0
n_skipped = 0

for _, row in subset.iterrows():
    # Fall back to 'unknown' for any missing label (None, '' or NaN) so that
    # os.path.join always receives a string
    top = row.get('top_level')
    if not isinstance(top, str) or not top:
        top = 'unknown'
    out_dir = os.path.join(DATA_OUT_ROOT, top)
    os.makedirs(out_dir, exist_ok=True)

    job_name = row['job_name']
    out_path = os.path.join(out_dir, f"{job_name}.extxyz")

    try:
        # pmg_structure_json is written via Structure.to_json() (Monty JSON)
        pmg_struct = Structure.from_str(row['pmg_structure_json'], fmt='json')
        atoms = AseAtomsAdaptor.get_atoms(pmg_struct)

        # Attach metadata
        atoms.info['job_name'] = job_name
        atoms.info['top_level'] = top
        atoms.info['solute'] = row.get('solute')

        # Attach energy if available (a NaN energy means "not parsed" and is left out)
        energy = row.get('energy_eV')
        if _has_value(energy):
            atoms.info['energy'] = float(energy)

        # Attach forces if available; a conversion failure is reported, not
        # silently dropped, and the file is still written without forces
        forces = row.get('forces_eV_A')
        if _has_value(forces):
            try:
                atoms.arrays['forces'] = np.array(forces, dtype=float)
            except Exception as e:
                print(f"Warning: could not attach forces for {job_name}: {e}")

        # Attach stress if available
        stress = row.get('stress_kbar_3x3')
        if _has_value(stress):
            atoms.info['stress_kbar_3x3'] = stress

        ase_write(out_path, atoms, format='extxyz')
        n_written += 1
    except Exception as e:
        print(f"Error writing {out_path}: {e}")
        n_skipped += 1
        continue

print(f"Wrote {n_written} extxyz files under: {DATA_OUT_ROOT}")
if n_skipped > 0:
    print(f"Skipped {n_skipped} rows due to write errors")

Ti/Al/V rows with energy+forces+stress: 766 / 870
Wrote 766 extxyz files under: /root/TiGB_Project/TiGBProject_AlV_Data/data


In [None]:
# Write key dataframes to CSV and pickle under the repo's data/ folder

# Same value as in the extxyz-export cell; assigned again here so this cell
# does not depend on that one having been run.
DATA_OUT_ROOT = os.path.join(PROJECT_ROOT, 'data')
os.makedirs(DATA_OUT_ROOT, exist_ok=True)


def write_df_to_data(name: str, df: pd.DataFrame):
    """Save ``df`` under DATA_OUT_ROOT as ``<name>.pkl`` and ``<name>.csv``.

    ``None`` is ignored silently. Anything that is not a DataFrame, or a
    DataFrame without rows, is skipped with a message. Returns None.
    """
    if df is None:
        return

    writable = isinstance(df, pd.DataFrame) and len(df) != 0
    if not writable:
        print(f"Skip {name}: empty or not a DataFrame")
        return

    pkl_path = os.path.join(DATA_OUT_ROOT, f"{name}.pkl")
    csv_path = os.path.join(DATA_OUT_ROOT, f"{name}.csv")

    # Pickle keeps the nested Python objects; the CSV is written without the index
    df.to_pickle(pkl_path)
    df.to_csv(csv_path, index=False)

    for written_path in (pkl_path, csv_path):
        print(f"Wrote {name} -> {written_path}")


# (output file stem, notebook variable) pairs, exported in this order:
#   - main Ti/Al/V subset dataframe
#   - full unfiltered dataframe (if kept)
#   - allowed/other splits (if they exist)
# A variable that does not exist in this kernel is simply skipped.
for out_name, var_name in (
    ('tigb_all_data_TiAlV', 'all_df'),
    ('tigb_all_data_FULL', 'all_df_all'),
    ('tigb_all_allowed_ti_al_v', 'all_df_allowed'),
    ('tigb_all_other_solutes', 'all_df_other'),
):
    if var_name in globals():
        write_df_to_data(out_name, globals()[var_name])

print('Done writing dataframes into:', DATA_OUT_ROOT)