In [None]:
import pandas as pd
import numpy as np
from pymatgen.core.structure import Structure
import matplotlib.pyplot as plt

In [None]:
# Read the original DFT table and add a 'flipped' marker column that is the
# empty string for every original (un-flipped) row
df = pd.read_csv('../data/DFT_data.csv').assign(flipped='')

# Preview the leading rows
df.head()

In [None]:
# Compare how many rows are flagged sym_vac == True with how many rows have
# WF_top and WF_bottom equal within a relative tolerance of 1e-5
sym_vac_is_true = df['sym_vac'] == True
wf_top_matches_bottom = np.isclose(df['WF_top'], df['WF_bottom'], rtol=1e-5)

sym_vac_true_count = len(df[sym_vac_is_true])
wf_equal_count = len(df[wf_top_matches_bottom])

print(f"Number of rows with sym_vac=True: {sym_vac_true_count}")
print(f"Number of rows with WF_top ≈ WF_bottom: {wf_equal_count}")
print(f"Difference: {abs(sym_vac_true_count - wf_equal_count)}")

# Overlap: rows that satisfy both conditions at once
sym_vac_and_wf_equal = len(df[sym_vac_is_true & wf_top_matches_bottom])
print(f"Number of rows with both sym_vac=True AND WF_top ≈ WF_bottom: {sym_vac_and_wf_equal}")

In [None]:
# Define the simple flipping function
def flip_struct(structure):
    """
    Return an upside-down version of a structure.

    Each site's third fractional coordinate z is replaced by 1 - z. The
    lattice and the per-site species/occupancies are passed through
    unchanged. All work is done on a copy of the input.

    Args:
        structure: pymatgen Structure object

    Returns:
        a new Structure built from the same lattice and species with the
        mirrored fractional coordinates
    """
    working_copy = structure.copy()
    site_species = working_copy.species_and_occu
    mirrored_coords = working_copy.frac_coords
    # Reflect the third fractional coordinate of every site, one row at a time.
    for site_index in range(len(mirrored_coords)):
        mirrored_coords[site_index][2] = 1 - mirrored_coords[site_index][2]
    return Structure(working_copy.lattice, site_species, mirrored_coords)

In [None]:
# Keep only the rows flagged sym_vac == False; rows flagged as symmetric are
# left out because flipping them won't make a difference
sym_vac_is_false = df['sym_vac'] == False
df_non_symmetric = df[sym_vac_is_false].copy()

print(f"Total structures: {len(df)}")
print(f"Non-symmetric structures (sym_vac=False): {len(df_non_symmetric)}")

# Sanity check: rows flagged non-symmetric whose two work functions still
# agree within a relative tolerance of 1e-5
wf_values_agree = np.isclose(df['WF_top'], df['WF_bottom'], rtol=1e-5)
edge_cases = len(df[sym_vac_is_false & wf_values_agree])
print(f"Edge cases (sym_vac=False but WF_top ≈ WF_bottom): {edge_cases}")

In [None]:
# Build a flipped copy of every non-symmetric structure.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end. DataFrame.append was removed in pandas 2.0, so calling it inside the
# try-block raised an AttributeError that the broad except then hid; growing a
# frame row by row is also quadratic.
flipped_records = []
failed_flips = []  # (index label, error text) for every structure that could not be flipped

# Track progress
total_structures = len(df_non_symmetric)
successful_flips = 0

print(f"Processing {total_structures} non-symmetric structures...")

# df_non_symmetric keeps the index labels of the rows it was filtered from, so
# progress is counted with a running position rather than the label.
for position, (idx, row) in enumerate(df_non_symmetric.iterrows()):
    # Show progress every 100 structures
    if position % 100 == 0:
        print(f"Processing {position}/{total_structures}...")

    try:
        # SECURITY NOTE: eval() executes whatever text is stored in the 'slab'
        # column. Only run this cell on data files you trust.
        struct_dict = eval(row['slab'])
        structure = Structure.from_dict(struct_dict)

        # Flip the structure
        flipped_structure = flip_struct(structure)

        # Start from a copy of the original row so all other columns carry over
        flipped_row = row.copy()
        flipped_row['slab'] = str(flipped_structure.as_dict())
        flipped_row['flipped'] = 'flipped'

        # Swap top and bottom work functions to match the flipped geometry
        flipped_row['WF_top'], flipped_row['WF_bottom'] = row['WF_bottom'], row['WF_top']

        flipped_records.append(flipped_row.to_dict())
        successful_flips += 1
    except Exception as e:
        # Best effort: skip this structure (the original is not kept either),
        # but remember every failure so none of them goes unreported.
        failed_flips.append((idx, f"{type(e).__name__}: {e}"))

# columns=df.columns keeps the original column order and yields a correctly
# shaped empty frame when nothing could be flipped.
df_flipped = pd.DataFrame(flipped_records, columns=df.columns)

print(f"Flipping complete! Successfully flipped {successful_flips}/{total_structures} structures")
if failed_flips:
    print(f"{len(failed_flips)} structures could not be flipped. First errors:")
    for failed_idx, error_text in failed_flips[:5]:
        print(f"Error flipping structure {failed_idx}: {error_text}")

In [None]:
# Stack the flipped rows underneath the original rows and renumber the index
frames_to_stack = [df, df_flipped]
df_augmented = pd.concat(frames_to_stack, ignore_index=True)

# Report the row count of each piece and of the combined table, then preview it
print(f"Original data size: {df.shape[0]}")
print(f"Flipped data size: {df_flipped.shape[0]}")
print(f"Augmented data size: {df_augmented.shape[0]}")
df_augmented.head()

In [None]:
# Write the combined (original + flipped) table to CSV, leaving out the index
augmented_csv_path = '../data/DFT_data_augmented_simple.csv'
df_augmented.to_csv(augmented_csv_path, index=False)
print("Saved augmented dataset to ../data/DFT_data_augmented_simple.csv")

In [None]:
# Verify the flipping worked correctly by examining one example
if len(df_flipped) > 0:
    # Choose a sample structure from the flipped dataset
    sample_idx = 0  # First structure in flipped set
    flipped_sample = df_flipped.iloc[sample_idx]

    # Find the first original row with the same 'mpid', 'miller' and 'term'.
    # NOTE(review): this assumes those three columns identify a slab uniquely;
    # if they do not, the first match may not be the row the sample came from.
    #
    # The matched row itself is kept: df.iterrows() yields index *labels*,
    # which are only valid for positional df.iloc when df has a default
    # 0..n-1 index.
    original_idx = None
    original_row = None
    for idx, row in df.iterrows():
        if (row['mpid'] == flipped_sample['mpid'] and
                row['miller'] == flipped_sample['miller'] and
                row['term'] == flipped_sample['term']):
            original_idx = idx
            original_row = row
            break

    if original_row is not None:
        # SECURITY NOTE: eval() executes whatever text is stored in the 'slab'
        # column. Only run this cell on data files you trust.
        original_struct = Structure.from_dict(eval(original_row['slab']))
        flipped_struct = Structure.from_dict(eval(flipped_sample['slab']))

        # Print the same geometric summary for both structures
        for heading, struct in (("ORIGINAL STRUCTURE:", original_struct),
                                ("\nFLIPPED STRUCTURE:", flipped_struct)):
            # Cartesian z-coordinate of every site, computed once per structure
            z_coords = [site.coords[2] for site in struct]
            print(heading)
            print(f"Lattice parameters: a={struct.lattice.a:.4f}, b={struct.lattice.b:.4f}, c={struct.lattice.c:.4f}")
            print(f"Z-range: {min(z_coords):.4f} - {max(z_coords):.4f}")
            print(f"First 3 atomic z-coordinates: {z_coords[:3]}")

        # Compare work functions
        print("\nWORK FUNCTIONS:")
        print(f"Original - Top: {original_row['WF_top']:.6f}, Bottom: {original_row['WF_bottom']:.6f}")
        print(f"Flipped  - Top: {flipped_sample['WF_top']:.6f}, Bottom: {flipped_sample['WF_bottom']:.6f}")

        # The flipped row should carry the original's top value as its bottom
        # value and vice versa (compared with an absolute tolerance of 1e-6)
        top_became_bottom = abs(original_row['WF_top'] - flipped_sample['WF_bottom']) < 1e-6
        bottom_became_top = abs(original_row['WF_bottom'] - flipped_sample['WF_top']) < 1e-6
        if top_became_bottom and bottom_became_top:
            print("\n✅ Work functions were properly swapped!")
        else:
            print("\n⚠️ Work functions may not have been properly swapped!")
    else:
        print("Could not find matching original structure for verification.")
else:
    print("No flipped structures to verify.")

In [None]:
# Side-by-side histograms of WF_top - WF_bottom for the original table (left)
# and the augmented table (right)
wf_diffs = df['WF_top'] - df['WF_bottom']
wf_diffs_aug = df_augmented['WF_top'] - df_augmented['WF_bottom']

fig, (ax_original, ax_augmented) = plt.subplots(1, 2, figsize=(14, 6))

panels = (
    (ax_original, wf_diffs, 'Original Work Function Differences'),
    (ax_augmented, wf_diffs_aug, 'Augmented Work Function Differences'),
)
for ax, diffs, panel_title in panels:
    ax.hist(diffs, bins=50, alpha=0.7)
    # Dashed red reference line at zero difference
    ax.axvline(x=0, color='r', linestyle='--')
    ax.set_xlabel('WF_top - WF_bottom (eV)')
    ax.set_ylabel('Count')
    ax.set_title(panel_title)
    ax.grid(alpha=0.3)

fig.tight_layout()
fig.savefig('../data/wf_differences_histogram.png', dpi=300)
plt.show()