# Paul's List Task 6 – Updated ver.
Input: `(Paul-list task)Plant_Species_Native_Exotic_Genus.xlsx`

Data sources: CBCity PCT Lists.xlsx (Cameron's big spreadsheet); combined_synonym_list.xlsx (alternate_names → accepted_name)

Output: Paul_Task6_updated.xlsx

### Load required datasets:
1. Updated Paul's list
2. Synonym list
3. Cameron's big spreadsheet

In [1]:
import pandas as pd
import re

# Updated Paul's list: read the three worksheets in a single pass so the
# workbook is opened once instead of three times.
paul_file = "(Paul-list task)Plant_Species_Native_Exotic_Genus.xlsx"
paul_sheets = pd.read_excel(
    paul_file,
    sheet_name=["Native_list", "Exotic_list", "Genus_only_list"],
)
native = paul_sheets["Native_list"]
exotic = paul_sheets["Exotic_list"]
genus = paul_sheets["Genus_only_list"]

# Synonym list: header names are trimmed and lower-cased so the two columns
# can be addressed as "alternate_names" / "accepted_name".
synonyms = pd.read_excel("combined_synonym_list.xlsx")
synonyms.columns = synonyms.columns.str.strip().str.lower()
syn_dict = dict(zip(synonyms["alternate_names"], synonyms["accepted_name"]))

# Cameron's big spreadsheet: every worksheet is loaded and tagged with its
# sheet name in "PCT_Name", then all sheets are stacked into one frame.
# The context manager closes the workbook handle once all sheets are read.
all_sheets = []
with pd.ExcelFile("CBCity PCT Lists.xlsx") as xls:
    for sheet in xls.sheet_names:
        df = pd.read_excel(xls, sheet_name=sheet)
        df.columns = df.columns.str.strip()
        df["PCT_Name"] = sheet
        all_sheets.append(df)
pct_all = pd.concat(all_sheets, ignore_index=True)

### Classify 3 key PCTs, and Other

In [2]:
def classify_region(name):
    """Return the region label for a PCT worksheet name.

    The three key PCT sheets map to "Cumberland", "Castlereagh" and "STIF";
    any other value is labelled "Other".
    """
    key_pcts = (
        ("3320-Cumb", "Cumberland"),
        ("3448-Castl", "Castlereagh"),
        ("3262-STIF", "STIF"),
    )
    for sheet_name, region in key_pcts:
        if name == sheet_name:
            return region
    return "Other"

# Label every PCT row with the region derived from its worksheet name.
pct_all["Region"] = pct_all["PCT_Name"].map(classify_region)

### Apply synonym mapping and normalize names

In [3]:
# Only the PCT species are mapped through the synonym list.
def _clean_name(value):
    """Trim, collapse runs of whitespace and lower-case a single name."""
    return re.sub(r"\s+", " ", str(value).strip()).lower()


# The species column is normalised (trimmed, single-spaced, lower-cased)
# before the lookup, so the synonym table must be normalised the same way on
# BOTH sides: raw keys containing capitals or stray spaces would never match,
# and raw accepted names would re-introduce un-normalised spellings.
# Pairs with a missing alternate or accepted name are skipped.
syn_lookup = {
    _clean_name(alternate): _clean_name(accepted)
    for alternate, accepted in syn_dict.items()
    if pd.notna(alternate) and pd.notna(accepted)
}

pct_all["Species"] = (
    pct_all["Species"]
    .astype(str)
    .str.strip()
    .str.replace(r"\s+", " ", regex=True)
    .str.lower()
    # Names without a synonym entry are kept unchanged.
    .map(lambda name: syn_lookup.get(name, name))
)

### Extract simplified growth form

In [4]:
# Pull the text inside the first pair of parentheses of "Growth Form".
# str.extract gives NaN (turned into "" below) when there is no complete
# "(...)" pair, so a cell with an opening "(" but no closing ")" no longer
# raises AttributeError as the previous re.search(...).group(1) call did.
pct_all["Simplified_Growth_Form"] = (
    pct_all["Growth Form"]
    .astype(str)
    .str.extract(r"\((.*?)\)", expand=False)
    .str.strip()
    .fillna("")
)

### Keep and rename required columns

In [5]:
# Source column -> output column, in output order. Columns mapped to
# themselves are kept under their existing name.
pct_column_map = {
    "Species": "scientific_name",
    "Region": "Region",
    "Median Cover Score": "MedianCover",
    "Frequency": "Frequency",
    "Simplified_Growth_Form": "Simplified_Growth_Form",
}
pct_data = pct_all[list(pct_column_map)].rename(columns=pct_column_map)

### Create required cols and define fill function

In [6]:
# Create columns and define the fill function
def fill_from_pct(df, pct=None):
    """Fill the PCT-derived columns of one worksheet of Paul's list.

    Rows are matched to the PCT table on a normalised copy of
    ``scientific_name`` (trimmed, inner whitespace collapsed, lower-cased).
    The ``scientific_name`` column itself is returned exactly as supplied, so
    the exported sheet keeps the original spelling and blank cells stay blank
    instead of becoming the text "nan". Rows with a missing name are never
    matched.

    Parameters
    ----------
    df : pandas.DataFrame
        Worksheet to fill; must contain a ``scientific_name`` column.
    pct : pandas.DataFrame, optional
        Lookup table with columns ``scientific_name``, ``Region``,
        ``MedianCover``, ``Frequency`` and ``Simplified_Growth_Form``. Its
        names are compared as-is against the normalised key, so they are
        expected to be normalised the same way. Defaults to the
        notebook-level ``pct_data``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns added or updated:

        * ``<Region>_MedianCover`` / ``<Region>_Frequency`` for Cumberland,
          Castlereagh and STIF: value of the first PCT row for that species
          and region; where the PCT value is missing the existing cell is
          kept ("" for newly created columns).
        * ``Other_Y``: "Y" when the species occurs in any "Other" row,
          otherwise "".
        * ``Simplified_Growth_Form``: growth form of the first PCT row for
          the species, otherwise "".
    """
    if pct is None:
        # Fall back to the table built earlier in the notebook.
        pct = pct_data

    out = df.copy()

    # Matching key only; the visible name column is left untouched.
    raw_names = out["scientific_name"]
    match_key = (
        raw_names.astype(str)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
        .str.lower()
        .where(raw_names.notna())  # missing names get no key at all
    )

    # Ensure all required columns exist
    for col in [
        "Cumberland_MedianCover", "Cumberland_Frequency",
        "Castlereagh_MedianCover", "Castlereagh_Frequency",
        "STIF_MedianCover", "STIF_Frequency",
        "Other_Y", "Simplified_Growth_Form"
    ]:
        if col not in out.columns:
            out[col] = ""

    # PCT rows without a species name can never be a valid match.
    lookup = pct.dropna(subset=["scientific_name"])

    # Fill the 3 key PCTs from the first PCT row per species and region
    for region in ("Cumberland", "Castlereagh", "STIF"):
        first_rows = (
            lookup.loc[lookup["Region"] == region]
            .drop_duplicates(subset=["scientific_name"], keep="first")
            .set_index("scientific_name")
        )
        for source, target in (
            ("MedianCover", f"{region}_MedianCover"),
            ("Frequency", f"{region}_Frequency"),
        ):
            matched = match_key.map(first_rows[source].to_dict())
            # Keep whatever the sheet already holds where the PCT has no value.
            out[target] = matched.where(matched.notna(), out[target])

    # Fill Other col
    other_species = lookup.loc[
        lookup["Region"] == "Other", "scientific_name"
    ].drop_duplicates()
    out["Other_Y"] = match_key.isin(other_species).map({True: "Y", False: ""})

    # Fill Simplified Growth Form from the first PCT row of each species
    growth_map = (
        lookup.drop_duplicates(subset=["scientific_name"])
        .set_index("scientific_name")["Simplified_Growth_Form"]
        .to_dict()
    )
    out["Simplified_Growth_Form"] = match_key.map(growth_map).fillna("")

    print(f"Finished filling {len(out)} rows for this sheet.")
    return out

### Apply and update 3 worksheets

In [7]:
# Fill the native, exotic and genus-only worksheets, in that order.
native_upd, exotic_upd, genus_upd = (
    fill_from_pct(sheet_df) for sheet_df in (native, exotic, genus)
)

Finished filling 1201 rows for this sheet.
Finished filling 684 rows for this sheet.
Finished filling 217 rows for this sheet.


### Export and save final results

In [8]:
output_file = "Paul_Task6_updated.xlsx"

# Worksheet name -> filled frame, written in this order.
sheets_to_write = {
    "Native_list": native_upd,
    "Exotic_list": exotic_upd,
    "Genus_only_list": genus_upd,
}

with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    for sheet_name, frame in sheets_to_write.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"File saved as '{output_file}'.")

File saved as 'Paul_Task6_updated.xlsx'.
