In [None]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import random
from pymatgen.core.structure import Structure
from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
import ase

In [None]:
# Read the original DFT dataset and, in the same step, add a 'flipped'
# marker column that is the empty string for every original (unflipped) row
df = pd.read_csv('../data/DFT_data.csv').assign(flipped='')

# Preview the first few rows
df.head()

In [None]:
# Define helper functions for structure analysis and flipping

def is_structure_invertible(structure):
    """
    Check whether flipping a structure along z maps it back onto itself.

    The structure is flipped with ``flip_struct`` and every original site must
    coincide with a site of the same species in the flipped structure. All
    three fractional coordinates are compared, modulo lattice periodicity.
    If every site has such a partner, flipping does not create a meaningfully
    different structure.

    Args:
        structure: pymatgen Structure object

    Returns:
        bool: True if structure is invertible (symmetric in z), False otherwise
    """
    # Flip the structure using Peter's method
    flipped_structure = flip_struct(structure)

    # Tolerance for coordinate comparison (fractional units)
    tol = 0.01

    # NOTE(review): because flip_struct maps z -> 1 - z, this detects mirror
    # symmetry about the fractional planes z = 0 / z = 0.5 only. A slab that is
    # symmetric about another height would be reported as non-symmetric --
    # confirm this matches how the slabs in the dataset are positioned.
    #
    # Need to compare all atoms against all atoms (could be a different ordering)
    for orig_site in structure:
        found_match = False
        for flipped_site in flipped_structure:
            # Only atoms of the same element can be partners
            if orig_site.specie != flipped_site.specie:
                continue

            # Fractional displacement wrapped to the nearest periodic image.
            # z MUST be part of the comparison: x and y are untouched by the
            # flip, so without z every site trivially matches its own image.
            delta = (np.asarray(orig_site.frac_coords, dtype=float)
                     - np.asarray(flipped_site.frac_coords, dtype=float))
            delta -= np.round(delta)

            if np.all(np.abs(delta) < tol):
                found_match = True
                break

        if not found_match:
            return False

    return True

def flip_struct(structure):
    """
    Turn a structure upside down (Peter's method for flipping slabs).

    Every fractional z-coordinate is replaced by ``1 - z``; the x and y
    fractional coordinates, the lattice and the species/occupancies are taken
    from the input unchanged. The input structure itself is not modified.

    Args:
        structure: pymatgen Structure object

    Returns:
        flipped Structure object
    """
    # NOTE(review): only lattice, species and coordinates are handed to the new
    # Structure -- site properties, if any, are presumably not carried over.
    mirrored_coords = [[fx, fy, 1 - fz] for fx, fy, fz in structure.frac_coords]
    return Structure(structure.lattice, structure.species_and_occu, mirrored_coords)

# Define function to flip the slab structure using the functions above
def flip_slab(struct_dict):
    """
    Flip a slab given in dictionary form and return the result as a dictionary.

    The dictionary is turned into a pymatgen Structure, checked with
    ``is_structure_invertible`` (a notice is printed when that check reports
    z-symmetry), flipped with ``flip_struct`` and serialised back with
    ``as_dict``. The structure is flipped regardless of the check's outcome.

    Args:
        struct_dict: Dictionary representation of a pymatgen Structure

    Returns:
        Dictionary representation of the flipped structure
    """
    slab = Structure.from_dict(struct_dict)

    # A z-symmetric slab flips onto itself: tell the user, but carry on
    if is_structure_invertible(slab):
        print("Structure is symmetric in z-direction, flipping may not create a new structure")

    # Flip with Peter's function and hand back the dict form
    return flip_struct(slab).as_dict()

In [None]:
# Build the dataframe that will hold the flipped structures: an independent
# copy of the original rows whose 'flipped' flag is the string 'flipped'
df_flipped = df.assign(flipped='flipped')

# Bookkeeping used while flipping: total number of rows to process, and
# running counts of symmetric structures and of failures
total_structures = len(df_flipped)
symmetric_count, error_count = 0, 0

# Add error handling to make the process more robust
def safely_flip_structure(slab_str, index=None):
    """
    Flip one serialized slab, never raising: on any failure the input string
    is returned unchanged.

    Updates the module-level counters ``symmetric_count`` (structures that
    ``is_structure_invertible`` reports as z-symmetric and that were flipped
    successfully) and ``error_count`` (structures whose flip failed).

    Args:
        slab_str: String holding the dict representation of a pymatgen Structure
        index: Optional running position, used for progress/error messages

    Returns:
        str: ``str()`` of the flipped structure's dict, or ``slab_str`` itself
        if anything went wrong
    """
    global symmetric_count, error_count
    try:
        # Show progress every 100 structures
        if index is not None and index % 100 == 0:
            print(f"Processing structure {index}/{total_structures}...")

        # NOTE(review): eval() executes arbitrary code contained in the CSV
        # cell. Only run this on trusted data; consider ast.literal_eval if
        # the stored dicts are plain Python literals.
        struct_dict = eval(slab_str)
        structure = Structure.from_dict(struct_dict)

        # Check symmetry once and flip directly. Going through flip_slab()
        # would rebuild the Structure, repeat the (quadratic) symmetry check
        # and print a notice for every symmetric row.
        invertible = is_structure_invertible(structure)
        flipped_dict = flip_struct(structure).as_dict()

        # Count only after the flip succeeded; symmetric slabs are still flipped
        if invertible:
            symmetric_count += 1

        return str(flipped_dict)
    except Exception as e:
        error_count += 1
        # Always surface the first few failures so they are not lost between
        # the every-100th-row reports
        if error_count <= 5 or (index is not None and index % 100 == 0):
            print(f"Error flipping structure {index}: {e}")
        return slab_str  # Return original if flipping fails

# Apply the flip operation to each slab structure with error handling.
# The running position is passed explicitly so the progress and error messages
# report the real row number (a lambda default argument would always pass 0
# and print the progress line for every single row).
print("Flipping structures - this may take some time...")
flipped_slabs = []
flip_ok = []
for pos, slab_str in enumerate(df_flipped['slab']):
    errors_before = error_count
    flipped_slabs.append(safely_flip_structure(slab_str, pos))
    # safely_flip_structure bumps error_count (and returns the original
    # string) whenever a flip fails
    flip_ok.append(error_count == errors_before)

df_flipped['slab'] = flipped_slabs
flip_ok = np.array(flip_ok, dtype=bool)

# Swap top and bottom work functions -- but only for rows whose slab was
# actually flipped. A row that kept its original slab must keep its original
# work functions, otherwise the labels would contradict the structure.
df_flipped.loc[flip_ok, ['WF_top', 'WF_bottom']] = (
    df_flipped.loc[flip_ok, ['WF_bottom', 'WF_top']].to_numpy()
)

# Rows that could not be flipped still hold the original data, so they must
# not carry the 'flipped' flag
df_flipped.loc[~flip_ok, 'flipped'] = ''

# Print summary
print(f"Flipping complete! Processed {total_structures} structures")
print(f"Symmetric structures: {symmetric_count}")
print(f"Errors encountered: {error_count}")
unflipped_rows = int((~flip_ok).sum())
if unflipped_rows:
    print(f"Rows left unflipped (work functions not swapped): {unflipped_rows}")

In [None]:
# Stack the original rows on top of their flipped counterparts; the index is
# rebuilt so the combined frame is numbered 0..N-1
df_augmented = pd.concat([df, df_flipped], axis=0, ignore_index=True)

# Report the row counts before and after augmentation, then preview the result
print(f"Original data size: {len(df)}")
print(f"Augmented data size: {len(df_augmented)}")
df_augmented.head()

In [None]:
# Persist the augmented dataset (original + flipped rows) without the index column
df_augmented.to_csv(path_or_buf='../data/DFT_data_augmented_peter.csv', index=False)
print("Saved augmented dataset to ../data/DFT_data_augmented_peter.csv")

In [None]:
# Let's verify the flipping worked correctly by examining one example
# Choose a sample structure from the original dataset
sample_idx = 0  # First structure


def _print_structure_summary(struct):
    """Print lattice lengths, angles, volume and Cartesian z statistics of ``struct``."""
    lattice = struct.lattice
    z_coords = [site.coords[2] for site in struct]
    print(f"Lattice parameters: a={lattice.a:.4f}, b={lattice.b:.4f}, c={lattice.c:.4f}")
    print(f"Angles: alpha={lattice.alpha:.2f}°, beta={lattice.beta:.2f}°, gamma={lattice.gamma:.2f}°")
    print(f"Volume: {struct.volume:.2f} Å³")
    print(f"Z-range: {min(z_coords):.4f} - {max(z_coords):.4f}")
    print(f"First 3 atomic z-coordinates: {z_coords[:3]}")


# Get structures from before and after flipping
# NOTE(review): eval() executes whatever code the CSV cell contains -- only
# use on trusted data.
original_str = df.iloc[sample_idx]['slab']
original_struct = Structure.from_dict(eval(original_str))

# Apply our flip function directly to check
try:
    flipped_dict = flip_slab(eval(original_str))
    flipped_struct = Structure.from_dict(flipped_dict)
    flipping_successful = True
except Exception as e:
    print(f"Error during verification: {e}")
    flipping_successful = False

if flipping_successful:
    # Print information about both structures
    print("ORIGINAL STRUCTURE:")
    _print_structure_summary(original_struct)

    print("\nFLIPPED STRUCTURE (using Peter's method):")
    _print_structure_summary(flipped_struct)

    # Compare work functions
    original_row = df.iloc[sample_idx]
    flipped_row = df_flipped.iloc[sample_idx]
    print("\nWORK FUNCTIONS:")
    print(f"Original - Top: {original_row['WF_top']:.6f}, Bottom: {original_row['WF_bottom']:.6f}")
    print(f"Should be - Top: {original_row['WF_bottom']:.6f}, Bottom: {original_row['WF_top']:.6f}")

    # Show what is actually stored for the flipped row and check it against
    # the expectation instead of only printing the expectation
    print(f"Stored in df_flipped - Top: {flipped_row['WF_top']:.6f}, Bottom: {flipped_row['WF_bottom']:.6f}")
    swap_verified = bool(
        np.isclose(flipped_row['WF_top'], original_row['WF_bottom'])
        and np.isclose(flipped_row['WF_bottom'], original_row['WF_top'])
    )
    print(f"Swap verified: {swap_verified}")

In [None]:
# Add functionality to save structures as CIF files for visual inspection
import os
from pymatgen.io.cif import CifWriter

# Output folder for the example CIF files; created on demand, and an
# already existing folder is not an error
cif_dir = "../data/cif_examples_peter"
os.makedirs(name=cif_dir, exist_ok=True)

# Function to save a structure as CIF
def save_as_cif(structure, filename):
    """
    Write ``structure`` to ``filename`` using ``CifWriter`` and print the path.

    Args:
        structure: structure object accepted by ``CifWriter``
        filename: destination path of the CIF file
    """
    CifWriter(structure).write_file(filename)
    print(f"Saved: {filename}")

# Select a few interesting examples for visual inspection
# We'll look for structures with non-perpendicular z-axis (beta/alpha angles far from 90°)
MAX_ROWS_TO_SCAN = 1000      # only the first rows of the dataset are inspected
MAX_SAMPLES = 5              # stop once this many examples have been found
ANGLE_TOLERANCE_DEG = 5      # alpha/beta must deviate from 90° by more than this

samples = []
skipped_rows = []  # (row position, exception) for rows that could not be inspected

# Look through a subset of structures
for i in range(min(MAX_ROWS_TO_SCAN, len(df))):
    row = df.iloc[i]
    try:
        # NOTE(review): eval() executes whatever code the CSV cell contains --
        # only use on trusted data.
        struct = Structure.from_dict(eval(row['slab']))
        # Check if this is an interesting case (non-perpendicular z-axis)
        alpha = struct.lattice.alpha
        beta = struct.lattice.beta

        # Is the z-axis not perpendicular to x/y plane?
        if abs(alpha - 90) > ANGLE_TOLERANCE_DEG or abs(beta - 90) > ANGLE_TOLERANCE_DEG:
            mpid = row['mpid']
            miller = row['miller']
            term = row['term']
            samples.append((i, struct, f"{mpid}_{miller}_{term}"))

            # Once we have a few examples, stop
            if len(samples) >= MAX_SAMPLES:
                break
    except Exception as e:
        # Keep scanning (best effort), but remember the failure so that a
        # systematic problem such as a missing column does not go unnoticed
        skipped_rows.append((i, e))
        continue

print(f"Found {len(samples)} interesting structures for visual inspection")
if skipped_rows:
    first_row, first_error = skipped_rows[0]
    print(f"Skipped {len(skipped_rows)} rows that could not be processed "
          f"(first failure at row {first_row}: {first_error!r})")

# Write every selected structure to disk twice: as found in the dataset and
# after flipping, so the two can be compared side by side
for idx, struct, name in samples:
    # Original structure
    save_as_cif(struct, os.path.join(cif_dir, f"{name}.cif"))

    # Flipped structure; a failure here is reported and the loop moves on
    try:
        flipped_struct = flip_struct(struct)
        save_as_cif(flipped_struct, os.path.join(cif_dir, f"{name}_flipped.cif"))

        # Print key information: lattice angles before and after the flip
        print(f"\nStructure {name}:")
        print(f"  Original - alpha: {struct.lattice.alpha:.2f}°, beta: {struct.lattice.beta:.2f}°, gamma: {struct.lattice.gamma:.2f}°")
        print(f"  Flipped  - alpha: {flipped_struct.lattice.alpha:.2f}°, beta: {flipped_struct.lattice.beta:.2f}°, gamma: {flipped_struct.lattice.gamma:.2f}°")
    except Exception as e:
        print(f"Error flipping structure {name}: {e}")

print(f"\nCIF files have been saved to {cif_dir}")
print("You can open these files in visualization software like VESTA to inspect them")

In [None]:
# Additional analysis: Compare properties between original and flipped structures
import matplotlib.pyplot as plt

# Select a subset of structures for analysis
sample_count = 100
analysis_samples = []
analysis_errors = []  # (row position, exception) for rows that could not be analysed

print("Analyzing property differences between original and flipped structures...")

# Process a subset of the data
for i in range(min(sample_count, len(df))):
    try:
        orig_row = df.iloc[i]
        flipped_row = df_flipped.iloc[i]

        # Get original properties
        orig_wf_top = orig_row['WF_top']
        orig_wf_bottom = orig_row['WF_bottom']
        cleavage = orig_row['cleavage_energy']

        # Get flipped properties (after swapping)
        flipped_wf_top = flipped_row['WF_top']
        flipped_wf_bottom = flipped_row['WF_bottom']

        # Only keep rows whose work functions really were swapped
        if abs(orig_wf_top - flipped_wf_bottom) < 1e-6 and abs(orig_wf_bottom - flipped_wf_top) < 1e-6:
            analysis_samples.append({
                'mpid': orig_row['mpid'],
                'miller': orig_row['miller'],
                'term': orig_row['term'],
                'orig_wf_diff': orig_wf_top - orig_wf_bottom,
                'flipped_wf_diff': flipped_wf_top - flipped_wf_bottom,
                'cleavage': cleavage
            })
    except Exception as e:
        # Best effort: skip the row, but record why so that systematic
        # problems (e.g. a missing column) are visible afterwards
        analysis_errors.append((i, e))
        continue

if analysis_errors:
    first_row, first_error = analysis_errors[0]
    print(f"Skipped {len(analysis_errors)} rows due to errors "
          f"(first failure at row {first_row}: {first_error!r})")

# Plot WF differences using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 6))
orig_diffs = [sample['orig_wf_diff'] for sample in analysis_samples]
flipped_diffs = [sample['flipped_wf_diff'] for sample in analysis_samples]

# Original vs flipped differences should be opposite (negative correlation),
# i.e. the points should fall on the dashed reference line
ax.scatter(orig_diffs, flipped_diffs, alpha=0.7)
ax.plot([-1, 1], [1, -1], 'r--', label='Perfect negative correlation')
ax.set_xlabel('Original WF Difference (Top - Bottom)')
ax.set_ylabel('Flipped WF Difference (Top - Bottom)')
ax.set_title("Work Function Differences: Original vs Flipped (Peter's Method)")
ax.grid(True, alpha=0.3)
ax.legend()
fig.tight_layout()

# Print summary statistics
print(f"\nAnalyzed {len(analysis_samples)} structures")

if analysis_samples:
    print(f"Average absolute WF difference (original): {np.mean(np.abs(orig_diffs)):.4f} eV")
    print(f"Average absolute WF difference (flipped): {np.mean(np.abs(flipped_diffs)):.4f} eV")
else:
    # np.mean of an empty list would only produce NaN plus a RuntimeWarning
    print("No structures available for analysis - skipping average WF differences")

# A correlation coefficient needs at least two points and a non-zero spread in
# both series; otherwise np.corrcoef returns NaN and emits warnings
if len(analysis_samples) >= 2 and np.std(orig_diffs) > 0 and np.std(flipped_diffs) > 0:
    correlation = np.corrcoef(orig_diffs, flipped_diffs)[0, 1]
    print(f"Correlation between original and flipped differences: {correlation:.4f}")
else:
    correlation = float('nan')
    print("Correlation between original and flipped differences: undefined "
          "(needs at least two samples with non-zero spread)")
print("Expected correlation for perfect flipping: -1.0")