## Dataset combination and filtering
### 01/10/25

In [1]:
#### Load libs
from __future__ import annotations

import json
import math
import os
import re
import sqlite3
from functools import reduce

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from ase.db import connect
from pymatgen.core import Structure
from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from scipy.stats import pearsonr
from tqdm import tqdm

## Preprocessing, filtering, validation and merge

In [2]:
# --- Main Configuration ---
# Module-level settings read by the processing functions and the main workflow below.

# --- INPUT FILES ---
# Original, unprocessed databases
C2DB_PATH = "c2db.db"
MP2D_PATH = "2dmatpedia_final.db" # 2DMatPedia source database

# --- INTERMEDIATE FILES ---
# Databases after filtering and processing
# (each is deleted and rebuilt whenever its source database is processed)
FILTERED_C2DB_PATH = "c2db_filtered.db"
FILTERED_MP2D_PATH = "2dmatpedia_filtered.db"

# --- FINAL OUTPUT FILE ---
# Deleted and rebuilt by the combination step.
FINAL_COMBINED_DB_PATH = "final_combined_database.db"

# --- Filtering and Validation Criteria ---
# Rows whose stability value is strictly greater than this are dropped.
EHULL_THRESHOLD = 0.1  # eV/atom
# Structures with any interatomic distance strictly below this are dropped.
MIN_INTERATOMIC_DISTANCE = 0.5  # Angstroms (Å)


def process_and_filter_database(input_db_path, output_db_path, stability_key, property_rename_map=None):
    """
    Filter one ASE database into a new, cleaned ASE database.

    For every row of ``input_db_path`` the function, in order:

    1. renames key-value-pair keys according to ``property_rename_map``;
    2. drops the row if its ``stability_key`` value exceeds ``EHULL_THRESHOLD``
       (rows that lack the key, or store ``None``, are kept);
    3. converts the structure to pymatgen's primitive standard cell
       (``symprec=0.1``) and drops it if any pairwise distance is below
       ``MIN_INTERATOMIC_DISTANCE``;
    4. drops the row if an identical (formula, CIF text) pair was already seen;
    5. writes the standardized atoms, the renamed key-value pairs and the
       row's ``data`` to ``output_db_path``.

    A summary of the counts is printed at the end.

    Args:
        input_db_path: Path of the database to read. If the file does not
            exist, an error is printed and the function returns immediately.
        output_db_path: Path of the database to create. An existing file at
            this path is deleted first.
        stability_key: Key-value-pair name holding the stability value. It is
            looked up *after* renaming, so pass the new name.
        property_rename_map: Optional ``{old_key: new_key}`` mapping. Keys not
            listed keep their name.

    Returns:
        None.
    """
    print(f"\n--- Processing database: {input_db_path} ---")

    if not os.path.exists(input_db_path):
        print(f"ERROR: Input database '{input_db_path}' not found. Skipping.")
        return

    if os.path.exists(output_db_path):
        os.remove(output_db_path)

    db_in = connect(input_db_path)

    seen_structures_fingerprints = set()
    stats = {
        'read': 0,
        'removed_instability': 0,
        'removed_invalid_geom': 0,
        'removed_processing_error': 0,
        'removed_duplicates': 0,
        'written': 0,
    }

    # Using the output database as a context manager groups all writes into a
    # single managed connection, matching what combine_databases does.
    with connect(output_db_path) as db_out:
        for row in tqdm(db_in.select(), total=len(db_in), desc=f"Filtering {os.path.basename(input_db_path)}"):
            stats['read'] += 1
            kvp = dict(row.key_value_pairs)

            # STEP 1: RENAME PROPERTIES
            # Keys missing from the map keep their original name.
            # NOTE(review): if a row holds both an old key and its new name,
            # the later item in iteration order wins.
            if property_rename_map:
                kvp = {property_rename_map.get(key, key): value for key, value in kvp.items()}

            # STEP 2: STABILITY FILTERING
            # Uses the (possibly renamed) key; missing/None values pass through.
            stability_value = kvp.get(stability_key)
            if stability_value is not None and stability_value > EHULL_THRESHOLD:
                stats['removed_instability'] += 1
                continue

            # STEP 3: GEOMETRY STANDARDIZATION AND VALIDATION
            try:
                atoms = row.toatoms()
                structure_pmg = AseAtomsAdaptor.get_structure(atoms)
                sga = SpacegroupAnalyzer(structure_pmg, symprec=0.1)
                standardized_structure = sga.get_primitive_standard_structure()

                atoms_too_close = False
                n_sites = len(standardized_structure)
                if n_sites > 1:
                    # Upper triangle (k=1) = each unordered pair of sites once.
                    distances = standardized_structure.distance_matrix[np.triu_indices(n_sites, k=1)]
                    atoms_too_close = distances.size > 0 and np.min(distances) < MIN_INTERATOMIC_DISTANCE
            except Exception:
                # Best-effort: a row that cannot be converted/standardized is
                # skipped, but counted separately so the report does not
                # present it as a too-short interatomic distance.
                stats['removed_processing_error'] += 1
                continue

            if atoms_too_close:
                stats['removed_invalid_geom'] += 1
                continue

            # STEP 4: DUPLICATE FILTERING
            # The fingerprint is an exact text comparison, so only structures
            # whose standardized CIF output is identical count as duplicates.
            # NOTE(review): a tolerance-based comparison may be needed if
            # near-identical structures should also be merged.
            fingerprint = (standardized_structure.formula, standardized_structure.to(fmt="cif"))
            if fingerprint in seen_structures_fingerprints:
                stats['removed_duplicates'] += 1
                continue
            seen_structures_fingerprints.add(fingerprint)

            # STEP 5: SAVE TO NEW DATABASE
            # The standardized cell and the renamed key-value pairs are stored.
            final_atoms = AseAtomsAdaptor.get_atoms(standardized_structure)
            db_out.write(final_atoms, key_value_pairs=kvp, data=row.get('data'))
            stats['written'] += 1

    # --- Print Report for this database ---
    print(f"\n--- Report for {input_db_path} ---")
    print(f"Total structures read: {stats['read']}")
    print(f"Removed due to instability ('{stability_key}' > {EHULL_THRESHOLD} eV/atom): {stats['removed_instability']}")
    print(f"Removed due to invalid geometry (dist < {MIN_INTERATOMIC_DISTANCE} Å): {stats['removed_invalid_geom']}")
    print(f"Removed due to structure processing errors: {stats['removed_processing_error']}")
    print(f"Removed due to being duplicates: {stats['removed_duplicates']}")
    print("-" * 30)
    print(f"Total structures saved to '{output_db_path}': {stats['written']}")


def combine_databases(source_db_paths, final_db_path):
    """
    Combine multiple ASE databases into a single new database.

    Every row of every source database is copied (atoms, key-value pairs and
    ``data``) into ``final_db_path`` in the order the sources are listed. No
    de-duplication is performed across the sources.

    Args:
        source_db_paths: Iterable of database paths to copy from. A path that
            does not exist is reported with a warning and skipped.
        final_db_path: Path of the database to create. An existing file at
            this path is deleted first.

    Returns:
        Total number of rows written to ``final_db_path``.
    """
    print(f"\n--- Combining filtered databases into {final_db_path} ---")
    if os.path.exists(final_db_path):
        os.remove(final_db_path)

    total_written = 0
    with connect(final_db_path) as db_out:
        for db_path in source_db_paths:
            if not os.path.exists(db_path):
                print(f"WARNING: Filtered database '{db_path}' not found for combination. Skipping.")
                continue

            db_in = connect(db_path)
            count = 0
            # Passing the row count as `total` lets tqdm draw a real progress
            # bar with an ETA, as process_and_filter_database already does.
            for row in tqdm(db_in.select(), total=len(db_in), desc=f"Combining {os.path.basename(db_path)}"):
                db_out.write(row.toatoms(), key_value_pairs=row.key_value_pairs, data=row.get('data'))
                count += 1
            print(f"Added {count} entries from '{db_path}'.")
            total_written += count

    return total_written

# --- Main Execution Workflow ---
if __name__ == "__main__":
    # Key renames applied to the 2DMatPedia rows; C2DB is processed without
    # a rename map.
    mp2d_rename_map = {
        'bandgap': 'gap',
        'decomposition_energy': 'ehull',
        'total_magnetization': 'magmom_u'
    }

    # One entry per source database: (input path, filtered output path,
    # rename map). Order matters: it is both the processing order and the
    # order in which the filtered databases are combined.
    filter_jobs = [
        (C2DB_PATH, FILTERED_C2DB_PATH, None),
        (MP2D_PATH, FILTERED_MP2D_PATH, mp2d_rename_map),
    ]

    # --- STAGES 1 & 2: filter each source database ---
    # The stability key is 'ehull' for both: the lookup happens after
    # renaming, so the 2DMatPedia rows are checked under the new name.
    for source_path, filtered_path, rename_map in filter_jobs:
        process_and_filter_database(
            input_db_path=source_path,
            output_db_path=filtered_path,
            stability_key='ehull',
            property_rename_map=rename_map
        )

    # --- STAGE 3: merge the filtered databases into the final one ---
    filtered_paths = [filtered_path for _, filtered_path, _ in filter_jobs]
    final_count = combine_databases(
        source_db_paths=filtered_paths,
        final_db_path=FINAL_COMBINED_DB_PATH
    )

    # --- Final Summary ---
    print("\n--- Workflow Complete ---")
    print(f"The final combined database '{FINAL_COMBINED_DB_PATH}' was successfully created.")
    print(f"It contains a total of {final_count} high-quality, unique structures with harmonized properties.")


--- Processing database: c2db.db ---


Filtering c2db.db: 100%|██████████| 16905/16905 [01:04<00:00, 261.81it/s]



--- Report for c2db.db ---
Total structures read: 16905
Removed due to instability ('ehull' > 0.1 eV/atom): 11217
Removed due to invalid geometry (dist < 0.5 Å): 0
Removed due to being duplicates: 0
------------------------------
Total structures saved to 'c2db_filtered.db': 5688

--- Processing database: 2dmatpedia_final.db ---


Filtering 2dmatpedia_final.db: 100%|██████████| 6351/6351 [00:35<00:00, 177.57it/s]



--- Report for 2dmatpedia_final.db ---
Total structures read: 6351
Removed due to instability ('ehull' > 0.1 eV/atom): 3631
Removed due to invalid geometry (dist < 0.5 Å): 0
Removed due to being duplicates: 0
------------------------------
Total structures saved to '2dmatpedia_filtered.db': 2720

--- Combining filtered databases into final_combined_database.db ---


Combining c2db_filtered.db: 5688it [00:06, 908.81it/s] 


Added 5688 entries from 'c2db_filtered.db'.


Combining 2dmatpedia_filtered.db: 2720it [00:02, 1089.52it/s]

Added 2720 entries from '2dmatpedia_filtered.db'.

--- Workflow Complete ---
The final combined database 'final_combined_database.db' was successfully created.
It contains a total of 8408 high-quality, unique structures with harmonized properties.



