# Onotological analysis of EcoSIM variables.

In [1]:
#download the github repo

import subprocess

def git_clone(repository_url, destination_directory=None):
    command = ['git', 'clone', repository_url]
    if destination_directory:
        command.append(destination_directory)
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode == 0:
        print("Repository cloned successfully.")
    else:
        print(f"Error cloning repository: {result.stderr}")

# Example usage:
git_clone("git@github.com:jinyun1tang/EcoSIM.git", "EcoSIM_code")


#Copy .F90 files into .txt files
import os
import shutil
from pathlib import Path

def copy_f90_to_txt(source_dir, dest_dir, exclude_files=None,include_files=None):
    """Copy .F90 files from source_dir to dest_dir as .txt files, excluding specified files."""
    
    source_path = Path(source_dir)
    dest_path = Path(dest_dir)
    exclude_files = exclude_files or []
    include_files = include_files or []
    print(f"Processing dir:{source_dir}")
    # Create destination directory
    dest_path.mkdir(parents=True, exist_ok=True)
    
    # Process all .F90 files (case insensitive)
    if not include_files:
        for pattern in ["*.F90", "*.f90"]:
            for f90_file in source_path.glob(pattern):
                if f90_file.stem not in exclude_files:
                    dest_file = dest_path / (f90_file.stem + ".txt")
                    shutil.copy2(f90_file, dest_file)
                    print(f"Copied: {f90_file.name} -> {dest_file.name}")
                else:
                    print(f"Excluded: {f90_file.name}")
    else:
        for pattern in ["*.F90", "*.f90"]:
            for f90_file in source_path.glob(pattern):
                if f90_file.stem in include_files:
                    dest_file = dest_path / (f90_file.stem + ".txt")
                    shutil.copy2(f90_file, dest_file)
                    print(f"Copied: {f90_file.name} -> {dest_file.name}")        

# Usage example

copy_f90_to_txt(
        source_dir="./EcoSIM_code/f90src/Ecosim_datatype/",
        dest_dir="./txt/", 
        exclude_files=["BalanceCheckDataType","EcoSIMCtrlDataType","EcoSIMHistMod"]
    )

copy_f90_to_txt(
        source_dir="./EcoSIM_code/f90src/Modelpars/",
        dest_dir="./txt/", 
        exclude_files=["EcoSiMParDataMod","TracerPropMod"]
    )

copy_f90_to_txt(
        source_dir="./EcoSIM_code/f90src/Utils/",
        dest_dir="./txt/", 
        include_files=["EcoSimConst"]
    )


Repository cloned successfully.
Processing dir:./EcoSIM_code/f90src/Ecosim_datatype/
Copied: SoilPropertyDataType.F90 -> SoilPropertyDataType.txt
Copied: FertilizerDataType.F90 -> FertilizerDataType.txt
Copied: CanopyRadDataType.F90 -> CanopyRadDataType.txt
Copied: SnowDataType.F90 -> SnowDataType.txt
Copied: SOMDataType.F90 -> SOMDataType.txt
Copied: SoilWaterDataType.F90 -> SoilWaterDataType.txt
Copied: AqueChemDatatype.F90 -> AqueChemDatatype.txt
Excluded: EcoSIMCtrlDataType.F90
Copied: ClimForcDataType.F90 -> ClimForcDataType.txt
Copied: SedimentDataType.F90 -> SedimentDataType.txt
Excluded: BalanceCheckDataType.F90
Copied: SoilHeatDataType.F90 -> SoilHeatDataType.txt
Excluded: EcoSIMHistMod.F90
Copied: SoilPhysDataType.F90 -> SoilPhysDataType.txt
Copied: SurfSoilDataType.F90 -> SurfSoilDataType.txt
Copied: LandSurfDataType.F90 -> LandSurfDataType.txt
Copied: PlantTraitDataType.F90 -> PlantTraitDataType.txt
Copied: SoilBGCDataType.F90 -> SoilBGCDataType.txt
Copied: ChemTranspDataTy

In [2]:
# get name of all files need to be parsed.
import os
from pathlib import Path

# Method 1: One-liner with os.listdir
directory = "./txt"
txt_files = [f for f in os.listdir(directory) if f.lower().endswith('.txt')]

# Method 2: One-liner with pathlib
txt_files = [f.name for f in Path(directory).glob("*.txt")]

# Method 3: One-liner with glob
import glob
txt_files = [os.path.basename(f) for f in glob.glob(os.path.join(directory, "*.txt"))]

print(txt_files)

['EcosimBGCFluxType.txt', 'SoluteParMod.txt', 'GrosubPars.txt', 'FlagDataType.txt', 'ChemTracerParsMod.txt', 'EcoSimSumDataType.txt', 'PlantDataRateType.txt', 'IrrigationDataType.txt', 'CanopyDataType.txt', 'LandSurfDataType.txt', 'PlantTraitDataType.txt', 'ChemTranspDataType.txt', 'SoilBGCDataType.txt', 'GridDataType.txt', 'MicrobialDataType.txt', 'PlantMgmtDataType.txt', 'RootDataType.txt', 'SurfLitterDataType.txt', 'NitroPars.txt', 'SedimentDataType.txt', 'EcoSimConst.txt', 'ClimForcDataType.txt', 'SurfSoilDataType.txt', 'MicBGCPars.txt', 'SoilHeatDataType.txt', 'SoilPhysDataType.txt', 'SoilPropertyDataType.txt', 'SnowDataType.txt', 'FertilizerDataType.txt', 'CanopyRadDataType.txt', 'SOMDataType.txt', 'AqueChemDatatype.txt', 'SoilWaterDataType.txt']


# example data to extract patterns
  real(r8),target,allocatable ::  canopy_growth_pft(:,:,:)                   !canopy structural growth rate [gC/h]
  
  real(r8) :: RMAX       !maximum hourly radiation,	[MJ m-2 h-1]
  
  integer,target,allocatable ::  iPlantGrainType_pft(:,:,:)                  !grain type (below or above-ground), e.g. potato and onion are below,
  
  real(r8), PARAMETER :: DPH2O=6.5E-09_r8                !equilbrium constant for H2O=H(+)+OH(-), [mol^2 m^-6]   

In [3]:
# parse variables int varname, description
import re
import pandas as pd
import csv

import re

def extract_variable_info_with_termination(line):
    """
    Extract variable name, description, and unit from Fortran variable declaration lines.
    Returns 'TERMINATE' if line contains 'contains'.
    
    Args:
        line (str): Line containing variable declaration
        
    Returns:
        tuple: (variable_name, description, unit) or None if no match, or 'TERMINATE' if contains found
    """
    
    # Check if line contains 'contains' (case insensitive)
    found_strings = [s for s in ['contains','terminate'] if s in line.lower()]
    if found_strings:
        return 'TERMINATE'
    
    # Remove leading/trailing whitespace
    line = line.strip()
    
    # Enhanced pattern to match various Fortran variable declarations
    # Pattern to match variable declarations
    pattern = r'^\s*(real(?:\([^)]*\))?|integer)(?:\s*,\s*[^:]*?)?\s*::\s*([^!\s=\(]+)(?:\([^)]*\))?(?:\s*=\s*[^!]*)?\s*!(.*)$'
    
    match = re.match(pattern, line.strip(), re.IGNORECASE)
    
    if match:
        variable_name = match.group(2).strip()
        comment_part = match.group(3).strip()
        
        # Extract unit from square brackets
        unit_match = re.search(r'\[([^\]]*)\]', comment_part)
        unit = unit_match.group(1).strip() if unit_match else ""
        
        # Extract description
        if unit_match:
            description = comment_part[:unit_match.start()].strip()
        else:
            description = comment_part.strip()
        
        # Clean description
        description = re.sub(r'[,\s]+$', '', description).strip()
        description = re.sub(r'\s+', ' ', description)
        
        return variable_name, description, unit
    
    return None

def extract_variables_simple_with_termination(file_path):
    """
    Extract variables from a file, stopping when 'contains' is encountered.
    """
    
    variables = []
    
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            for line_num, line in enumerate(file, 1):
                result = extract_variable_info_with_termination(line)
                
                if result == 'TERMINATE':
                    print(f"  Encountered 'contains' at line {line_num}. Stopping parsing.")
                    break
                elif result:
                    variables.append(result)
#                    print(f"  {result[0]}, {result[1]}, {result[2]}")
        
        return variables
        
    except Exception as e:
        print(f"  Error processing file {file_path}: {e}")
        return []
# Usage
# Initialize list to collect all variables
all_variables = []
k=0
for file_path in txt_files:
    k=k+1
    print('Parsing file [%d]: %s'%(k,file_path))
    print('-'*50)
    variables = extract_variables_simple_with_termination('./txt/'+file_path)
    for var_name, description, unit in variables:
        all_variables.append({
            'File Name':file_path,
            'Variable Name': var_name,
            'Description': description,
            'Unit': unit
        })
# Write all variables to CSV file
if all_variables:
    df = pd.DataFrame(all_variables)
    df.to_csv('new_EcoSIM_variables.csv', index=False, encoding='utf-8')
    print(f'\nAll variables saved to all_variables.csv')
    print(f'Total variables: {len(all_variables)}')
    print(f'Files processed: {k}')
else:
    print('No variables found to save.')    

# Extract all Variable Name entries
ecosim_new_variables = [item['Variable Name'] for item in all_variables]
ecosim_new_descriptions=[item['Description'] for item in all_variables]
print(ecosim_new_variables[:5])

Parsing file [1]: EcosimBGCFluxType.txt
--------------------------------------------------
  Encountered 'contains' at line 39. Stopping parsing.
Parsing file [2]: SoluteParMod.txt
--------------------------------------------------
  Encountered 'contains' at line 138. Stopping parsing.
Parsing file [3]: GrosubPars.txt
--------------------------------------------------
  Encountered 'contains' at line 99. Stopping parsing.
Parsing file [4]: FlagDataType.txt
--------------------------------------------------
  Encountered 'contains' at line 32. Stopping parsing.
Parsing file [5]: ChemTracerParsMod.txt
--------------------------------------------------
  Encountered 'contains' at line 56. Stopping parsing.
Parsing file [6]: EcoSimSumDataType.txt
--------------------------------------------------
  Encountered 'contains' at line 45. Stopping parsing.
Parsing file [7]: PlantDataRateType.txt
--------------------------------------------------
  Encountered 'contains' at line 132. Stopping pa

In [4]:
#read EcoSIM.xlsx that holds variables from the old version.
import pandas as pd


def create_ecosim_dict(file_path="EcoSIM.xlsx"):
    """
    Function to create EcoSIM dictionary and return two vectors of strings.
    
    Returns:
        tuple: (ecosim_dict, names_vector, descriptions_vector)
               - ecosim_dict: Dictionary mapping names to descriptions
               - names_vector: List of EcoSIM Other Names
               - descriptions_vector: List of corresponding descriptions
    """
    
    try:
        # Read Excel file, skip second row
        df = pd.read_excel(file_path, skiprows=[1])
        
        # Extract required columns and remove NaN variable names
        subset_df = df[['EcoSIM Other Names', 'Description']].dropna(subset=['EcoSIM Other Names'])
        
        # Create vectors of strings
        nm_vector = subset_df['EcoSIM Other Names'].astype(str).tolist()
        names_vector= [var.replace('ECOSIM:', '') for var in nm_vector]

        descriptions_strings = subset_df['Description'].fillna('').astype(str).tolist()
        keywords=['refers', 'presents', 'represents',' is ','denotes','stands for','indicates','this ', 'are defined']
        descriptions_vector =[s[:min([s.lower().find(k.lower()) for k in keywords if k.lower() in s.lower()] or [len(s)])].strip() for s in descriptions_strings]

        
        # Create dictionary
        ecosim_dict = dict(zip(names_vector, descriptions_vector))
        
        print(f"Created dictionary with {len(ecosim_dict)} variables")
        print(f"Names vector length: {len(names_vector)}")
        print(f"Descriptions vector length: {len(descriptions_vector)}")
        
        return ecosim_dict, names_vector, descriptions_vector
        
    except Exception as e:
        print(f"Error: {e}")
        return None, [], []
# Usage:
ecosim_dict, ecosim_old_names_vector, ecosim_old_descriptions_vector = create_ecosim_dict("EcoSIM.xlsx")


def clean_paired_vectors(vec_1, vec_2):
    """
    Simple and efficient function to clean paired vectors.
    
    Args:
        vec_1, vec_2 (list): Input vectors of same length
        
    Returns:
        tuple: (cleaned_vec_1, cleaned_vec_2)
    """
    
    # Common units to remove
    units_to_remove = {
        'oC', 'kg', 'uM', 'm2', 'g', 'mg', 'cm', 'mm', 'km', 'L', 'mL',
        'Pa', 'kPa', 'MPa', 'mol', 'mmol', 'umol', 'ppmv', 'ppb', 'pH',
        'm', 's', 'h', 'd', 'yr', 'C', 'K', 'F', 'J', 'kJ', 'MJ', 'W',
        'kW', 'V', 'A', 'Hz', 'bar', 'atm', '%', 'pct','m3'
    }
    var_numeric_id=range(len(vec_1))
    
    cleaned_1, cleaned_2, cleaned_3 = [], [],[]
    
    for i in range(len(vec_1)):
        entry = str(vec_1[i]).strip()
        
        # Keep entry if it doesn't match removal criteria
        if (len(entry) > 1 and 
            '-' not in entry and 
            entry not in units_to_remove) and 'CATEGORY' not in entry:
            cleaned_1.append(var_numeric_id[i])
            cleaned_2.append(vec_1[i])
            cleaned_3.append(vec_2[i])
    
    return cleaned_1, cleaned_2, cleaned_3

#remove units
ecosim_old_varid_vector_clean,ecosim_old_names_vector_clean, ecosim_old_descriptions_vector_clean=clean_paired_vectors(ecosim_old_names_vector, ecosim_old_descriptions_vector)
       
print(f"Names vector length: {len(ecosim_old_names_vector_clean)}")
print(f"Descriptions vector length: {len(ecosim_old_descriptions_vector_clean)}")


# Create DataFrame and write to CSV
df = pd.DataFrame({
    'Variable_ID': ecosim_old_varid_vector_clean,
    'Variable_Name': ecosim_old_names_vector_clean,
    'Description': ecosim_old_descriptions_vector_clean
})

# Write to CSV
df.to_csv('ecosim_old_clean_data.csv', index=False, encoding='utf-8')

Created dictionary with 1272 variables
Names vector length: 1274
Descriptions vector length: 1274
Names vector length: 1180
Descriptions vector length: 1180


In [5]:
#match the variable names by semantic meaning
# First install required packages:
# pip install sentence-transformers scikit-learn numpy

from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

def match_strings_semantic_similarity(vector1, vector2, top_k=3, threshold=0.3):
    """
    Match strings between two vectors using semantic similarity with Sentence Transformers.
    
    Args:
        vector1, vector2 (list): Lists of strings to match
        top_k (int): Number of top matches to return for each string
        threshold (float): Minimum similarity threshold (0-1)
        
    Returns:
        list: List of match results
    """
    
    # Load pre-trained sentence transformer model
    print("Loading sentence transformer model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast and good quality
    
    # Generate embeddings for both vectors
    print("Generating embeddings...")
    embeddings1 = model.encode(vector1)
    embeddings2 = model.encode(vector2)
    
    # Calculate cosine similarity matrix
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)
    
    # Find matches for each string in vector1
    matches = []
    
    for i, string1 in enumerate(vector1):
        # Get similarities for current string
        similarities = similarity_matrix[i]
        
        # Get top k matches above threshold
        top_indices = np.argsort(similarities)[::-1][:top_k]
        top_matches = []
        
        for idx in top_indices:
            if similarities[idx] >= threshold:
                top_matches.append({
                    'string': vector2[idx],
                    'similarity': similarities[idx],
                    'index': idx
                })
        
        matches.append({
            'source_string': string1,
            'source_index': i,
            'matches': top_matches
        })
    
    return matches

# Test with your example
#vector1 = [
#    'Snow temperature (TCSnow)',
#    'The threshold temperature for autumn leafoff/hardening',
#    'The threshold temperature for spring leafout/dehardening, often abbreviated as TCZ'
#]

#vector2 = [
#    'Temperature measurement in snow layers',
#    'Critical temperature for plant dormancy in fall',
#    'Spring awakening temperature for vegetation',
#    'Soil moisture content measurement',
#    'Canopy leaf area calculation',
#    'Root biomass estimation method'
#]

# Find semantic matches
print("Finding semantic matches...")
matches = match_strings_semantic_similarity(ecosim_old_descriptions_vector_clean, ecosim_new_descriptions, top_k=2, threshold=0.2)

# Display results
print("\nSemantic Similarity Matches:")
print("=" * 80)
ecosim_old_varid,ecosim_old_varname,ecosim_old_vardescp,ecomsim_1stbest_newvar_match,ecomsim_2ndbest_newvar_match=[],[],[],[],[]
for match in matches:
    ecosim_old_varid.append(match['source_index'])
    ecosim_old_varname.append(ecosim_old_names_vector_clean[match['source_index']])
    ecosim_old_vardescp.append(match['source_string'])
    
    if match['matches']:
        match0=(match['matches'])        
        ecomsim_1stbest_newvar_match.append(ecosim_new_variables[match0[0]['index']])
        ecomsim_2ndbest_newvar_match.append(ecosim_new_variables[match0[1]['index']])        
    else:
        ecomsim_1stbest_newvar_match.append('')
        ecomsim_2ndbest_newvar_match.append('')

# Create DataFrame and write to CSV
df = pd.DataFrame({
    'Old_Variable_ID': ecosim_old_varid,
    'Old_Variable_Name': ecosim_old_varname,
    'Description': ecosim_old_vardescp,
    'Best macth new var': ecomsim_1stbest_newvar_match,
    'Next best match new var':ecomsim_1stbest_newvar_match
})

# Write to CSV
df.to_csv('ecosim_old_new_varmatch.csv', index=False, encoding='utf-8')

  from scipy.sparse import csr_matrix, issparse


Finding semantic matches...
Loading sentence transformer model...
Generating embeddings...

Semantic Similarity Matches:


In [6]:
#match the old and new variables
import math
from collections import Counter

#variable matching use cosine similarity
def find_most_similar_strings(target_string, candidate_strings, top_k=5):
    """
    Find the most similar strings to a target string.
    
    Args:
        target_string (str): String to compare against
        candidate_strings (list): List of candidate strings
        top_k (int): Number of top matches to return
        
    Returns:
        list: List of tuples (string, similarity_score)
    """
    
    def count_chars_to_vector(input_string):
        count_vector = [0] * 36
        for char in input_string.lower():
            if char.isdigit():
                count_vector[int(char)] += 1
            elif char.isalpha():
                count_vector[ord(char) - ord('a') + 10] += 1
        return count_vector
    
    def cosine_similarity(vec1, vec2):
        dot_product = sum(a * b for a, b in zip(vec1, vec2))
        magnitude1 = math.sqrt(sum(a * a for a in vec1))
        magnitude2 = math.sqrt(sum(b * b for b in vec2))
        
        if magnitude1 > 0 and magnitude2 > 0:
            return dot_product / (magnitude1 * magnitude2)
        return 0
    
    target_vector = count_chars_to_vector(target_string)
    similarities = []
    
    for candidate in candidate_strings:
        candidate_vector = count_chars_to_vector(candidate)
        similarity = cosine_similarity(target_vector, candidate_vector)
        similarities.append((candidate, similarity))
    
    # Sort by similarity (descending) and return top k
    similarities.sort(key=lambda x: x[1], reverse=True)
    matches = " | ".join([f"{m[0]} ({m[1]:.3f})" for m in similarities[:top_k]])
    return similarities[:top_k]

# Test with EcoSIM variable names
ecosim_simmatch_1stbest_var,ecosim_simmatch_2ndbest_var=[],[]
for target in ecosim_old_varname:
    matches = find_most_similar_strings(target, ecosim_new_variables, top_k=3)
    all_strings = [item[0] for item in matches]
    ecosim_simmatch_1stbest_var.append(all_strings[0])
    ecosim_simmatch_2ndbest_var.append(all_strings[1])
    
    
df = pd.DataFrame({
    'Old_Variable_ID': ecosim_old_varid,
    'Old_Variable_Name': ecosim_old_varname,
    'Description': ecosim_old_vardescp,
    'Best semantic macth new var': ecomsim_1stbest_newvar_match,
    'Next best semantic match new var':ecomsim_1stbest_newvar_match,
    'Best similarity match new vaar':ecosim_simmatch_1stbest_var,
    'Next best similarity match new vaar':ecosim_simmatch_2ndbest_var    
})

# Write to CSV, which has the variables aligned by cosine similarity and semantic similarity
df.to_csv('ecosim_old_new_var_fullmatch.csv', index=False, encoding='utf-8')
