## 1. Filter native records from the KS file

In [1]:
import pandas as pd

file_path = "KS edited Flora_All Species20250624-Weed VS Native.xlsx"
df = pd.read_excel(file_path, sheet_name=0)

native_df = df[df["Local Status"].str.lower() == "native"]

output_path = "Flora_Native_ReportOnly.xlsx"
native_df.to_excel(output_path, index=False)

print(f"Total Record Number: {len(native_df)} ，Saved as {output_path}")

Total Record Number: 1355 ，Saved as Flora_Native_ReportOnly.xlsx


## 2. Integrate 2 synonym files provided by client (accepted names + alter names)

In [2]:
import pandas as pd

df1 = pd.read_excel("new_synonyms_found2025-06-01.xlsx", sheet_name="new_synonyms_found")
df2 = pd.read_excel("sorted_synonyms_for_trait_summary.xlsx", sheet_name="sorted_synonyms")

df_combined = pd.concat([df1, df2], ignore_index=True)

df_combined = df_combined.drop_duplicates()

out_file = "all_synonyms_merged.xlsx"
df_combined.to_excel(out_file, index=False)

print(f"The merged file is saved as {out_file}")

The merged file is saved as all_synonyms_merged.xlsx


## 3. Ensure no omission and repetition

In [3]:
import pandas as pd

df1 = pd.read_excel("new_synonyms_found2025-06-01.xlsx", sheet_name="new_synonyms_found")
df2 = pd.read_excel("sorted_synonyms_for_trait_summary.xlsx", sheet_name="sorted_synonyms")

df_combined = pd.concat([df1, df2], ignore_index=True)

# Check repetition
dup_rows = df_combined.duplicated().sum()
dup_examples = df_combined[df_combined.duplicated()].head(10)

print(f"Repetition Count: {dup_rows}")
print("Repetition 10 Examples:")
print(dup_examples)

Repetition Count: 88
Repetition 10 Examples:
              accepted_name                          alternate_names
58    Alocasia macrorrhizos  Alocasia macrorrhizos var. brisbanensis
101       Blechnum ambiguum                        Blechnum ambiguum
103       Blechnum ambiguum                        Blechnum ambiguum
133   Boronia polygalifolia                 Tetratheca oppositifolia
134   Boronia polygalifolia                    Boronia polygalifolia
135   Boronia polygalifolia                    Boronia oppositifolia
153  Brachychiton australis                      Sterculia australis
177    Callitris rhomboidea                       Frenela ventenatii
178    Callitris rhomboidea                 Callitris cupressiformis
179    Callitris rhomboidea                        Frenela australis


## 4. Capturing all possible names for native species
- According to all_synonyms_merged.xlsx, list names from col 'species_name' in Flora_Native_ReportOnly.xlsx
- Find all corresponding accepted_name & alternate_names

In [4]:
import pandas as pd

flora_path = "Flora_Native_ReportOnly.xlsx"
syn_path = "all_synonyms_merged.xlsx"

flora = pd.read_excel(flora_path, sheet_name=0)
syn = pd.read_excel(syn_path)

syn = syn.rename(columns={"alternate_names": "alt_name"})

# Unfold alternate_names
def split_alts(val):
    if pd.isna(val):
        return []
    return [x.strip() for x in str(val).replace("\n", ";").split(";") if x.strip()]

rows = []
for _, row in syn.iterrows():
    acc = str(row["accepted_name"]).strip()
    for alt in split_alts(row["alt_name"]):
        rows.append((acc, alt))

syn_expanded = pd.DataFrame(rows, columns=["accepted_name", "alt_name"])

flora_species = flora["species_name"].dropna().astype(str).str.strip().unique()

# Find all native species' accepted_name and alternate_names
results = []

for sp in flora_species:
    sp_matches = []

    # ① if accepted_name
    if sp in syn_expanded["accepted_name"].values:
        alts = syn_expanded[syn_expanded["accepted_name"] == sp]["alt_name"].unique()
        sp_matches.append((sp, "; ".join(alts)))
    # ② if any alternate_name
    if sp in syn_expanded["alt_name"].values:
        accs = syn_expanded[syn_expanded["alt_name"] == sp]["accepted_name"].unique()
        for acc in accs:
            alts = syn_expanded[syn_expanded["accepted_name"] == acc]["alt_name"].unique()
            sp_matches.append((acc, "; ".join(alts)))
    # ③ if fail，accepted_name=species_name, alternate_names=nan
    if not sp_matches:
        sp_matches.append((sp, ""))

    # Save the output
    for acc, alts in sp_matches:
        results.append({"species_name": sp, "accepted_name": acc, "alternate_names": alts})

# Convert to DataFrame
result_df = pd.DataFrame(results)

# Output
out_path = "flora_species_with_synonyms.xlsx"
result_df.to_excel(out_path, index=False)

print(f"Save as {out_path}")

Save as flora_species_with_synonyms.xlsx


## 5. Split 'alternate_names' and deduplicate

In [5]:
import pandas as pd

in_file = "flora_species_with_synonyms.xlsx"
out_file = "flora_species_with_synonyms_expanded.xlsx"

df = pd.read_excel(in_file)

# Split alternate_names
rows = []
for _, row in df.iterrows():
    acc = str(row["accepted_name"]).strip()
    species = str(row["species_name"]).strip()
    alts = str(row["alternate_names"]) if pd.notna(row["alternate_names"]) else ""

    # Split use ;
    for alt in alts.split(";"):
        alt = alt.strip()
        if alt:  
            rows.append({
                "species_name": species,
                "accepted_name": acc,
                "alternate_name": alt
            })

# Convert to DataFrame
expanded_df = pd.DataFrame(rows)

# Deduplication
expanded_df = expanded_df.drop_duplicates()

# Save
expanded_df.to_excel(out_file, index=False)

print(f"Expanded and deduplicated 'alternate_names'，{len(expanded_df)} records in total，saved as {out_file}")

Expanded and deduplicated 'alternate_names'，10199 records in total，saved as flora_species_with_synonyms_expanded.xlsx


## 6. Match Occurrence records through comprehensive species synonym names

In [6]:
import pandas as pd

occ_file = "occurrences.csv"
syn_file = "flora_species_with_synonyms_expanded.xlsx"
out_file = "occurrences_native_matched.csv"

occ = pd.read_csv(occ_file)
syn = pd.read_excel(syn_file)

# Collect all possible names
all_names = set()

for col in ["species_name", "accepted_name", "alternate_name"]:
    if col in syn.columns:
        all_names.update(syn[col].dropna().astype(str).str.strip())

# Filter occurrences
occ_matched = occ[occ["scientific_name"].isin(all_names)].copy()

print(f"Total records of occurrences: {len(occ)}")
print(f"Successful Matching of {len(occ_matched)} Records")

# Output results
occ_matched.to_csv(out_file, index=False)
print(f"Filtered results are saved as {out_file}")

Total records of occurrences: 43206
Successful Matching of 27977 Records
Filtered results are saved as occurrences_native_matched.csv
