In [2]:

# Golden Profiles Analysis: 1,109 High-Quality Soil Profiles
# Focus on profiles that appear in ALL 5 tables for maximum data integrity

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Notebook banner: announces the analysis scope and goal between two rules.
banner_rule = "=" * 50
for banner_line in (
    "🌟 GOLDEN PROFILES ANALYSIS",
    banner_rule,
    "Working with 1,109 profiles that appear in ALL tables",
    "Goal: Create clean, linked database with composite keys",
    banner_rule,
):
    print(banner_line)

🌟 GOLDEN PROFILES ANALYSIS
Working with 1,109 profiles that appear in ALL tables
Goal: Create clean, linked database with composite keys


In [3]:
# SECTION 1: LOAD DATA AND FILTER TO GOLDEN PROFILES
# ============================================================================

print("\n📂 SECTION 1: LOADING DATA AND FILTERING TO GOLDEN PROFILES")
print("-" * 60)

# Machine-specific locations live here only: every input path below is derived
# from THESIS_DIR, so pointing the notebook at another checkout is a one-line change.
THESIS_DIR = "/Users/inesschwartz/GreenDataScience/Thesis"
SOIL_DB_DIR = f"{THESIS_DIR}/tables_soil_database"

# Load original data files
print("Loading original data files...")
samples = pd.read_excel(f"{SOIL_DB_DIR}/AmostrasAngolaTerrario.xlsx")
analyses = pd.read_excel(f"{SOIL_DB_DIR}/Horizontes Analises.xlsx")
# Morphology is read from the cleaned CSV export instead of the original
# workbook (tables_soil_database/Horizontes_Morfologia.xlsx).
morphology = pd.read_csv(f"{THESIS_DIR}/tables_clean/morpho_cleaned.csv")

profile_loc = pd.read_excel(f"{SOIL_DB_DIR}/Perfis_local.xlsx")
soil_profile = pd.read_excel(f"{SOIL_DB_DIR}/Perfis_solo.xlsx")
elemental_analyses = pd.read_excel(f"{SOIL_DB_DIR}/Data XRF Angola_inicial.xlsx")

print("✅ Original data loaded successfully")

# Load golden profiles list
# The ids are compared against the text column 'profile_clean', so force the
# CSV column to be read as text even if every id happens to look numeric.
golden_profiles_df = pd.read_csv(
    f"{THESIS_DIR}/profile_matching_results/profiles_in_all_tables.csv",
    dtype={'profile': str},
)
golden_profiles = golden_profiles_df['profile'].tolist()

print(f"✅ Loaded {len(golden_profiles)} golden profiles")
print(f"Sample golden profiles: {golden_profiles[:5]}")

# Function to standardize profile names (consistent with your analysis)
def standardize_profile(profile_series):
    """Return profile identifiers normalized to one comparable form.

    Every non-missing value is converted to text, has '/' replaced by '_',
    is stripped of surrounding whitespace, cut to its first 20 characters
    and upper-cased (for example ' 185a/58 ' becomes '185A_58').

    Missing values (NaN/None) stay missing. Casting with ``astype(str)`` alone
    turns them into the literal text 'NAN', which ``dropna()``, ``nunique()``
    and ``set()`` would then treat as a real profile id.

    Parameters
    ----------
    profile_series : pandas.Series
        Raw profile identifiers.

    Returns
    -------
    pandas.Series
        Normalized identifiers on the same index as the input.
    """
    normalized = (profile_series
                  .astype(str)
                  .str.replace('/', '_', regex=False)  # '/' is a literal, not a pattern
                  .str.strip()
                  .str[:20]
                  .str.upper())
    # Re-mask the positions that were missing in the input.
    return normalized.where(profile_series.notna())

# Apply standardization to all tables
print("\n🧹 Standardizing profile names...")


def _filter_to_golden(table, profile_column, label):
    """Tag `table` with 'profile_clean' and return its golden-profile rows.

    The standardized id column is written onto `table` itself (in place), the
    before/after row counts are printed under `label`, and the rows whose id
    is listed in the notebook-level `golden_profiles` are returned as a copy.

    Returns
    -------
    tuple
        (row count of `table`, DataFrame copy restricted to golden profiles).
    """
    table['profile_clean'] = standardize_profile(table[profile_column])
    golden_rows = table[table['profile_clean'].isin(golden_profiles)].copy()
    print(f"  📊 {label}: {len(table):,} → {len(golden_rows):,} records")
    return len(table), golden_rows


# The profile id column is spelled 'Perfil' in some source tables and 'PERFIL' in others.
samples_original_count, samples_golden = _filter_to_golden(samples, 'Perfil', 'Samples')
analyses_original_count, analyses_golden = _filter_to_golden(analyses, 'PERFIL', 'Analyses')
morphology_original_count, morphology_golden = _filter_to_golden(morphology, 'Perfil', 'Morphology')
profile_loc_original_count, profile_loc_golden = _filter_to_golden(profile_loc, 'PERFIL', 'Profile_loc')
soil_profile_original_count, soil_profile_golden = _filter_to_golden(soil_profile, 'Perfil', 'Soil_profile')

print("✅ Golden profile filtering complete!")


📂 SECTION 1: LOADING DATA AND FILTERING TO GOLDEN PROFILES
------------------------------------------------------------
Loading original data files...
✅ Original data loaded successfully
✅ Loaded 1109 golden profiles
Sample golden profiles: ['100_58', '100_59', '100_61', '100_63', '101A_58']

🧹 Standardizing profile names...
  📊 Samples: 14,715 → 5,694 records
  📊 Analyses: 7,847 → 5,519 records
  📊 Morphology: 13,035 → 6,081 records
  📊 Profile_loc: 4,321 → 1,109 records
  📊 Soil_profile: 2,518 → 1,109 records
✅ Golden profile filtering complete!


In [6]:
# SECTION 1: LOAD DATA AND FILTER TO GOLDEN PROFILES
# ============================================================================

import pandas as pd

print("\n📂 SECTION 1: LOADING DATA AND FILTERING TO GOLDEN PROFILES")
print("-" * 60)

# Machine-specific locations live here only: every input path below is derived
# from THESIS_DIR, so pointing the notebook at another checkout is a one-line change.
THESIS_DIR = "/Users/inesschwartz/GreenDataScience/Thesis"
SOIL_DB_DIR = f"{THESIS_DIR}/tables_soil_database"

# Load original data files
print("Loading original data files...")
samples = pd.read_excel(f"{SOIL_DB_DIR}/AmostrasAngolaTerrario.xlsx")
analyses = pd.read_excel(f"{SOIL_DB_DIR}/Horizontes Analises.xlsx")
# Morphology is read from the cleaned CSV export instead of the original
# workbook (tables_soil_database/Horizontes_Morfologia.xlsx).
morphology = pd.read_csv(f"{THESIS_DIR}/tables_clean/morpho_cleaned.csv")

profile_loc = pd.read_excel(f"{SOIL_DB_DIR}/Perfis_local.xlsx")
soil_profile = pd.read_excel(f"{SOIL_DB_DIR}/Perfis_solo.xlsx")
elemental_analyses = pd.read_excel(f"{SOIL_DB_DIR}/Data XRF Angola_inicial.xlsx")

print("✅ Original data loaded successfully")

# Load golden profiles list
# The ids are compared against the text column 'profile_clean', so force the
# CSV column to be read as text even if every id happens to look numeric.
golden_profiles_df = pd.read_csv(
    f"{THESIS_DIR}/profile_matching_results/profiles_in_all_tables.csv",
    dtype={'profile': str},
)
golden_profiles = golden_profiles_df['profile'].tolist()

print(f"✅ Loaded {len(golden_profiles)} golden profiles")
print(f"Sample golden profiles: {golden_profiles[:5]}")

# Function to standardize profile names (consistent with your analysis)
def standardize_profile(profile_series):
    """Return profile identifiers normalized to one comparable form.

    Every non-missing value is converted to text, has '/' replaced by '_',
    is stripped of surrounding whitespace, cut to its first 20 characters
    and upper-cased (for example ' 185a/58 ' becomes '185A_58').

    Missing values (NaN/None) stay missing. Casting with ``astype(str)`` alone
    turns them into the literal text 'NAN', which ``dropna()``, ``nunique()``
    and ``set()`` would then treat as a real profile id.

    Parameters
    ----------
    profile_series : pandas.Series
        Raw profile identifiers.

    Returns
    -------
    pandas.Series
        Normalized identifiers on the same index as the input.
    """
    normalized = (profile_series
                  .astype(str)
                  .str.replace('/', '_', regex=False)  # '/' is a literal, not a pattern
                  .str.strip()
                  .str[:20]
                  .str.upper())
    # Re-mask the positions that were missing in the input.
    return normalized.where(profile_series.notna())

# Apply standardization to all tables
print("\n🧹 Standardizing profile names...")


def _filter_to_golden(table, profile_column, label):
    """Tag `table` with 'profile_clean' and return its golden-profile rows.

    The standardized id column is written onto `table` itself (in place), the
    before/after row counts are printed under `label`, and the rows whose id
    is listed in the notebook-level `golden_profiles` are returned as a copy.

    Returns
    -------
    tuple
        (row count of `table`, DataFrame copy restricted to golden profiles).
    """
    table['profile_clean'] = standardize_profile(table[profile_column])
    golden_rows = table[table['profile_clean'].isin(golden_profiles)].copy()
    print(f"  📊 {label}: {len(table):,} → {len(golden_rows):,} records")
    return len(table), golden_rows


# The profile id column is spelled 'Perfil' in some source tables and 'PERFIL' in others.
samples_original_count, samples_golden = _filter_to_golden(samples, 'Perfil', 'Samples')
analyses_original_count, analyses_golden = _filter_to_golden(analyses, 'PERFIL', 'Analyses')
morphology_original_count, morphology_golden = _filter_to_golden(morphology, 'Perfil', 'Morphology')
profile_loc_original_count, profile_loc_golden = _filter_to_golden(profile_loc, 'PERFIL', 'Profile_loc')
soil_profile_original_count, soil_profile_golden = _filter_to_golden(soil_profile, 'Perfil', 'Soil_profile')

print("✅ Golden profile filtering complete!")

# SECTION 2: PROFILE COVERAGE ANALYSIS
# ============================================================================

def _profile_id_set(frame):
    """Return the distinct non-missing 'profile_clean' values of `frame` as a set."""
    return set(frame['profile_clean'].dropna())


def _report_table_coverage(reference_ids, table_ids, table_name, icon):
    """Print one table's coverage of `reference_ids`; return (missing ids, coverage %)."""
    missing_ids = reference_ids - table_ids
    present_count = len(reference_ids) - len(missing_ids)
    # With no reference profiles the ratio is undefined; report 0.0 instead of
    # raising ZeroDivisionError.
    coverage = present_count / len(reference_ids) * 100 if reference_ids else 0.0

    print(f"\n{icon} {table_name.upper()} DataFrame:")
    print(f"Coverage: {coverage:.1f}% ({present_count}/{len(reference_ids)})")
    if missing_ids:
        print(f"❌ Missing profiles ({len(missing_ids)}): {sorted(missing_ids)}")
    else:
        print(f"✅ All profiles from profile_loc are present in {table_name}")
    return missing_ids, coverage


def check_profile_coverage(profile_loc_df=None, analyses_df=None,
                           morphology_df=None, samples_df=None):
    """Report how well analyses, morphology and samples cover the profile_loc profiles.

    Profiles are compared through the standardized 'profile_clean' column.
    A text report is printed and the underlying sets and percentages are returned.

    Parameters
    ----------
    profile_loc_df, analyses_df, morphology_df, samples_df : pandas.DataFrame, optional
        Frames carrying a 'profile_clean' column. Any argument left as None
        falls back to the notebook-level frame of the same role
        (profile_loc_golden, analyses_golden, morphology_golden, samples_golden),
        so calling the function with no arguments behaves as before.

    Returns
    -------
    dict
        'profiles_in_<table>' (sets of ids), 'missing_in_<table>' (reference
        ids absent from that table), 'coverage_<table>' (percent, 0.0 when the
        reference set is empty), 'completely_missing' (reference ids absent
        from all three tables) and 'extra_in_<table>' (ids not in the
        reference set), for <table> in analyses, morphology, samples; plus
        'profiles_in_loc'.
    """
    # Resolve defaults at call time so the notebook globals are only required
    # when the caller did not pass a frame.
    if profile_loc_df is None:
        profile_loc_df = profile_loc_golden
    if analyses_df is None:
        analyses_df = analyses_golden
    if morphology_df is None:
        morphology_df = morphology_golden
    if samples_df is None:
        samples_df = samples_golden

    print("\n📊 SECTION 2: PROFILE COVERAGE ANALYSIS")
    print("-" * 60)

    print("Extracting standardized profile identifiers from each DataFrame...")

    profiles_in_loc = _profile_id_set(profile_loc_df)
    # Insertion order fixes the order of every per-table report section below.
    table_ids = {
        'analyses': _profile_id_set(analyses_df),
        'morphology': _profile_id_set(morphology_df),
        'samples': _profile_id_set(samples_df),
    }

    print(f"\nTotal profiles in profile_loc_golden: {len(profiles_in_loc)}")
    for table_name, ids in table_ids.items():
        print(f"Total profiles in {table_name}_golden: {len(ids)}")

    if not profiles_in_loc:
        print("⚠️  profile_loc contains no profiles; coverage percentages are reported as 0.0%")

    print("\n" + "=" * 60)
    print("PROFILE COVERAGE ANALYSIS")
    print("=" * 60)

    header_icons = {'analyses': '📈', 'morphology': '🔬', 'samples': '🧪'}
    missing = {}
    coverage = {}
    for table_name, ids in table_ids.items():
        missing[table_name], coverage[table_name] = _report_table_coverage(
            profiles_in_loc, ids, table_name, header_icons[table_name])

    print("\n" + "=" * 60)
    print("📋 SUMMARY")
    print("=" * 60)

    if not any(missing.values()):
        print("🎉 ALL profiles from profile_loc are present in ALL three DataFrames!")
    else:
        print("⚠️  Some profiles are missing from one or more DataFrames:")
        for table_name, missing_ids in missing.items():
            if missing_ids:
                print(f"  - {len(missing_ids)} profiles missing from {table_name}")

    # Reference profiles that appear in none of the three tables.
    completely_missing = profiles_in_loc.difference(*table_ids.values())
    if completely_missing:
        print(f"\n🚨 Profiles completely missing from all DataFrames ({len(completely_missing)}): {sorted(completely_missing)}")
    else:
        print("\n✅ No profiles are completely missing from all DataFrames")

    # Profiles present in a target table but absent from profile_loc.
    print("\n" + "=" * 60)
    print("🔍 ADDITIONAL INSIGHTS")
    print("=" * 60)

    extra_icons = {'analyses': '📊', 'morphology': '🔬', 'samples': '🧪'}
    extra = {}
    for table_name, ids in table_ids.items():
        extra[table_name] = ids - profiles_in_loc
        if extra[table_name]:
            preview = sorted(extra[table_name])[:10]
            more_marker = '...' if len(extra[table_name]) > 10 else ''
            print(f"{extra_icons[table_name]} {table_name.capitalize()} has {len(extra[table_name])} extra profiles not in profile_loc: {preview}{more_marker}")

    return {
        'profiles_in_loc': profiles_in_loc,
        'profiles_in_analyses': table_ids['analyses'],
        'profiles_in_morphology': table_ids['morphology'],
        'profiles_in_samples': table_ids['samples'],
        'missing_in_analyses': missing['analyses'],
        'missing_in_morphology': missing['morphology'],
        'missing_in_samples': missing['samples'],
        'coverage_analyses': coverage['analyses'],
        'coverage_morphology': coverage['morphology'],
        'coverage_samples': coverage['samples'],
        'completely_missing': completely_missing,
        'extra_in_analyses': extra['analyses'],
        'extra_in_morphology': extra['morphology'],
        'extra_in_samples': extra['samples']
    }

# Run the coverage check
if __name__ == "__main__":
    results = check_profile_coverage()

    # Closing banner for the run.
    print("\n" + "=" * 60)
    print("🏁 FINAL STATUS")
    print("=" * 60)

    # Headline numbers: size of the reference set and how many of its profiles
    # each table lacks.
    total_profiles = len(results['profiles_in_loc'])
    missing_analyses, missing_morphology, missing_samples = (
        len(results[result_key])
        for result_key in ('missing_in_analyses', 'missing_in_morphology', 'missing_in_samples')
    )

    if not (missing_analyses or missing_morphology or missing_samples):
        print("🎯 SUCCESS: All profiles from profile_loc are present in analyses, morphology, and samples DataFrames!")
    else:
        print(f"📊 RESULTS: Out of {total_profiles} profiles from profile_loc:")
        # One status line per table, in the same order as the coverage report.
        for table_label, missing_count in (
            ('analyses', missing_analyses),
            ('morphology', missing_morphology),
            ('samples', missing_samples),
        ):
            if missing_count > 0:
                print(f"  ❌ {missing_count} missing from {table_label} ({100*missing_count/total_profiles:.1f}%)")
            else:
                print(f"  ✅ All present in {table_label}")


📂 SECTION 1: LOADING DATA AND FILTERING TO GOLDEN PROFILES
------------------------------------------------------------
Loading original data files...
✅ Original data loaded successfully
✅ Loaded 1109 golden profiles
Sample golden profiles: ['100_58', '100_59', '100_61', '100_63', '101A_58']

🧹 Standardizing profile names...
  📊 Samples: 14,715 → 5,694 records
  📊 Analyses: 7,847 → 5,519 records
  📊 Morphology: 10,434 → 5,328 records
  📊 Profile_loc: 4,321 → 1,109 records
  📊 Soil_profile: 2,518 → 1,109 records
✅ Golden profile filtering complete!

📊 SECTION 2: PROFILE COVERAGE ANALYSIS
------------------------------------------------------------
Extracting standardized profile identifiers from each DataFrame...

Total profiles in profile_loc_golden: 1109
Total profiles in analyses_golden: 1109
Total profiles in morphology_golden: 1100
Total profiles in samples_golden: 1109

PROFILE COVERAGE ANALYSIS

📈 ANALYSES DataFrame:
Coverage: 100.0% (1109/1109)
✅ All profiles from profile_loc a

In [4]:
# Distinct standardized profile ids in each unfiltered source table, followed
# by the size of the golden list for comparison.
for table_label, source_table in (
    ("samples", samples),
    ("analyses", analyses),
    ("morphology", morphology),
    ("profiles_loc", profile_loc),
):
    distinct_ids = source_table['profile_clean'].nunique()
    print(f"Unique profiles in original {table_label}: {distinct_ids}")

print(f"Golden profiles count: {len(golden_profiles)}")

Unique profiles in original samples: 3133
Unique profiles in original analyses: 1592
Unique profiles in original morphology: 2218
Unique profiles in original profiles_loc: 4321
Golden profiles count: 1109


In [8]:
import pandas as pd

# Load original data files
print("Loading original data files...")
# Both workbooks sit in the same directory; keep the machine-specific path in
# one place so it can be repointed with a single edit.
SOIL_DB_DIR = "/Users/inesschwartz/GreenDataScience/Thesis/tables_soil_database"
analyses = pd.read_excel(f"{SOIL_DB_DIR}/Horizontes Analises.xlsx")
profile_loc = pd.read_excel(f"{SOIL_DB_DIR}/Perfis_local.xlsx")

# Function to standardize profile names (same as your existing function)
def standardize_profile(profile_series):
    """Return profile identifiers normalized to one comparable form.

    Every non-missing value is converted to text, has '/' replaced by '_',
    is stripped of surrounding whitespace, cut to its first 20 characters
    and upper-cased (for example ' 185a/58 ' becomes '185A_58').

    Missing values (NaN/None) stay missing. Casting with ``astype(str)`` alone
    turns them into the literal text 'NAN', which ``dropna()``, ``nunique()``
    and ``set()`` would then treat as a real profile id.

    Parameters
    ----------
    profile_series : pandas.Series
        Raw profile identifiers.

    Returns
    -------
    pandas.Series
        Normalized identifiers on the same index as the input.
    """
    normalized = (profile_series
                  .astype(str)
                  .str.replace('/', '_', regex=False)  # '/' is a literal, not a pattern
                  .str.strip()
                  .str[:20]
                  .str.upper())
    # Re-mask the positions that were missing in the input.
    return normalized.where(profile_series.notna())

# Standardize profile names
print("\n🧹 Standardizing profile names...")
for source_table in (analyses, profile_loc):
    # Both tables keep the raw id in a column named 'PERFIL'.
    source_table['profile_clean'] = standardize_profile(source_table['PERFIL'])

# Distinct standardized ids per table (missing values dropped).
unique_analyses_profiles = set(analyses['profile_clean'].dropna())
unique_profile_loc_profiles = set(profile_loc['profile_clean'].dropna())

print(f"\nUnique profiles in analyses: {len(unique_analyses_profiles)}")
print(f"Unique profiles in profile_loc: {len(unique_profile_loc_profiles)}")

# Ids that occur in analyses but have no row in profile_loc.
missing_coordinates = unique_analyses_profiles.difference(unique_profile_loc_profiles)

section_rule = "=" * 70
print("\n" + section_rule)
print("🗺️  PROFILES IN ANALYSES MISSING COORDINATE DATA")
print(section_rule)

if not missing_coordinates:
    print("✅ ALL profiles in analyses have corresponding coordinate data in profile_loc!")
else:
    analyses_total = len(unique_analyses_profiles)
    missing_total = len(missing_coordinates)

    print(f"\n❌ PROFILES IN ANALYSES BUT NOT IN PROFILE_LOC ({missing_total} profiles):")
    print("   These profiles have analytical data but are missing coordinate information")
    print("-" * 70)

    # Sorted so the listing and the saved CSV have a stable order.
    missing_sorted = sorted(missing_coordinates)

    for i, profile in enumerate(missing_sorted, 1):
        print(f"{i:3d}. {profile}")

    covered_pct = (analyses_total - missing_total) / analyses_total * 100
    print(f"\n📊 Summary: {missing_total} out of {analyses_total} analyses profiles are missing coordinates")
    print(f"📊 Coverage: {covered_pct:.1f}% of analyses profiles have coordinates")

    # Persist the list of ids so it can be consulted outside the notebook.
    output_file = "/Users/inesschwartz/GreenDataScience/Thesis/analyses_profiles_missing_coordinates.csv"
    missing_df = pd.DataFrame({'profile_missing_coordinates': missing_sorted})
    missing_df.to_csv(output_file, index=False)
    print(f"\n💾 Missing profiles saved to: {output_file}")

    # Pair raw and standardized spellings for the first ten ids.
    print("\n" + section_rule)
    print("🔍 EXAMPLES WITH ORIGINAL NAMES")
    print(section_rule)

    print("\nFirst 10 profiles missing coordinates (showing original names):")
    in_first_ten = analyses['profile_clean'].isin(missing_sorted[:10])
    missing_examples = analyses[in_first_ten]
    example_profiles = missing_examples[['PERFIL', 'profile_clean']].drop_duplicates()

    for _, row in example_profiles.iterrows():
        print(f"  Original: '{row['PERFIL']}' → Standardized: '{row['profile_clean']}'")

# Reverse direction: ids that have a profile_loc row but never occur in analyses.
extra_in_profile_loc = unique_profile_loc_profiles.difference(unique_analyses_profiles)

print("\n" + "=" * 70)
print("📍 ADDITIONAL INFO: PROFILES WITH COORDINATES BUT NO ANALYSES")
print("=" * 70)

if not extra_in_profile_loc:
    print("✅ ALL profiles in profile_loc have corresponding analytical data!")
else:
    extra_total = len(extra_in_profile_loc)
    print(f"\n🗺️  PROFILES IN PROFILE_LOC BUT NOT IN ANALYSES ({extra_total} profiles):")
    print("   These profiles have coordinate data but no analytical data")
    print("-" * 70)

    extra_sorted = sorted(extra_in_profile_loc)

    # List only the first twenty ids, then say how many were left out.
    preview_limit = 20
    for i, profile in enumerate(extra_sorted[:preview_limit], 1):
        print(f"{i:3d}. {profile}")

    if extra_total > preview_limit:
        print(f"   ... and {extra_total - preview_limit} more")

    print(f"\n📊 Summary: {extra_total} profiles have coordinates but no analytical data")

# Overall summary: set sizes, their overlap, and the follow-up actions.
print("\n" + "=" * 70)
print("📋 OVERALL SUMMARY")
print("=" * 70)

overlap = len(unique_analyses_profiles.intersection(unique_profile_loc_profiles))

for summary_line in (
    f"📊 Profiles in analyses: {len(unique_analyses_profiles)}",
    f"🗺️  Profiles in profile_loc: {len(unique_profile_loc_profiles)}",
    f"🔗 Profiles in BOTH: {overlap}",
    f"❌ Analyses profiles missing coordinates: {len(missing_coordinates)}",
    f"📍 Coordinate profiles missing analyses: {len(extra_in_profile_loc)}",
):
    print(summary_line)

print("\n🎯 ACTION NEEDED:")
if not missing_coordinates:
    print("   → ✅ All analyses profiles have coordinate data!")
else:
    for action_line in (
        f"   → Find coordinate data for {len(missing_coordinates)} analyses profiles",
        "   → Check for typos/formatting issues in profile names",
        "   → Verify if these profiles should have coordinate data",
    ):
        print(action_line)

Loading original data files...

🧹 Standardizing profile names...

Unique profiles in analyses: 1592
Unique profiles in profile_loc: 4321

🗺️  PROFILES IN ANALYSES MISSING COORDINATE DATA

❌ PROFILES IN ANALYSES BUT NOT IN PROFILE_LOC (11 profiles):
   These profiles have analytical data but are missing coordinate information
----------------------------------------------------------------------
  1. 101_68
  2. 185A_58
  3. 1_51
  4. 31_6
  5. 37_55(BAC)
  6. 4_62-IIAA
  7. 507_67
  8. 630N_69
  9. 93_55(BAC)
 10. 94_68
 11. MJ 26

📊 Summary: 11 out of 1592 analyses profiles are missing coordinates
📊 Coverage: 99.3% of analyses profiles have coordinates

💾 Missing profiles saved to: /Users/inesschwartz/GreenDataScience/Thesis/analyses_profiles_missing_coordinates.csv

🔍 EXAMPLES WITH ORIGINAL NAMES

First 10 profiles missing coordinates (showing original names):
  Original: '185a/58' → Standardized: '185A_58'
  Original: '101/68' → Standardized: '101_68'
  Original: '507/67' → Standard