In [27]:
import pandas as pd


# Load the PAFS table into a DataFrame (replace 'pafs_data.csv' with your file path)
df_pafs = pd.read_csv('data inputs\pafs_cal_ipc_invasiveness_scoring_data.csv')

# Selecting only the relevant columns (Species and Worksheet C columns)
df_pafs_subset = df_pafs[['Species', 'Rating', 'Question 3.1 Explanation'] + [col for col in df_pafs.columns if col.startswith('Worksheet C, ')]]

# Remove the prefix "Worksheet C, " from the column names
df_pafs_subset.columns = df_pafs_subset.columns.str.replace('Worksheet C, ', '')


# Load the workbook containing the two lists of species
workbook_path = "python generated excel files\priority_lists_by_visibility_months.xlsx"
df_species_list_spring = pd.read_excel(workbook_path, sheet_name="Spring Search")  # Replace "Sheet1" with the actual sheet name
df_species_list_other = pd.read_excel(workbook_path, sheet_name="Other_Species")  # Replace "Sheet2" with the actual sheet name


# Create a new Excel writer object for the new workbook
writer = pd.ExcelWriter(output_workbook_path, engine='xlsxwriter')


# Transpose the DataFrame so that habitats are rows and species are columns
df_transposed = df_pafs_subset.set_index('Species').T.reset_index().rename(columns={'index': 'Habitat'})

# Create a new column 'Species Concat' which concatenates species where the value is 'A', 'B', 'C', or 'D'
df_transposed['Species Concat'] = df_transposed.apply(lambda row: ', '.join([col for col in row.index[1:] if row[col] in ['A', 'B', 'C', 'D']]), axis=1)

# Create separate columns for each possible value ('A', 'B', 'C', 'D', 'U', or no value)
for value in ['A', 'B', 'C', 'D', 'U', '']:
    df_transposed[f'{value} Species PAF'] = df_transposed.apply(lambda row: ', '.join([col for col in row.index[1:] if row[col] == value]), axis=1)

# Reorder the columns so that 'Species Concat' is immediately to the right of 'Habitat' and the new concat columns are immediately to the right of 'Species Concat'
column_order = ['Habitat', 'Species Concat'] + ['A Species PAF', 'B Species PAF', 'C Species PAF', 'D Species PAF', 'U Species PAF'] + [col for col in df_transposed.columns if col not in ['Habitat', 'Species Concat', 'A Species PAF', 'B Species PAF', 'C Species PAF', 'D Species PAF', 'U Species PAF']]
df_transposed = df_transposed[column_order]

# Read the tables into DataFrames
df_species_habitats = df_transposed
df_minor_ecological_types = pd.read_excel("python generated excel files\habitat_types_at_mgp.xlsx")

# Merge the tables based on the common column
merged_df = pd.merge(df_species_habitats, df_minor_ecological_types, 
                      left_on="Habitat", right_on="Minor Ecological Types", how="inner")


merged_df.rename(columns={"MGP Habitats": "Habitat, Marin Fine Veg Map"}, inplace=True)

# Reorder the columns
columns_order = ['Habitat', 'Habitat, Marin Fine Veg Map'] + [col for col in merged_df.columns if col not in ['habitat', 'Habitat, Marin Fine Veg Map']]
merged_df = merged_df[columns_order]

# Drop the redundant "Habitat" column
#merged_df.drop(columns="Habitat", inplace=True)

# Export the merged DataFrame to an Excel file
output_excel_path = "PAF species per habitat.xlsx"
merged_df.to_excel(output_excel_path, index=False)

#region without habitat

# Save the transposed DataFrame to an Excel file
#output_excel_path = 'species_habitats_transposed_with_concat_and_individual_renamed.xlsx'
#df_transposed.to_excel(output_excel_path, index=False)

# Create a list of all species
all_species = set(df_pafs['Species'])

# Create a list of species associated with a habitat
species_with_habitat = set(df_transposed['Species Concat'].str.split(', ').explode().dropna())

# Create a list of species without a habitat
species_without_habitat = list(all_species - species_with_habitat)

# Create a DataFrame for species without a habitat, considering all columns from the original PAFS DataFrame
df_species_without_habitat = df_pafs[df_pafs['Species'].isin(species_without_habitat)]

# Create a new column 'Missing Info' that records 'yes' if all columns except 'Rating' are empty for the species
df_species_without_habitat['Missing Info'] = df_species_without_habitat.drop('Rating', axis=1).isnull().all(axis=1).map({True: 'yes', False: 'no'})

# Create a new column 'Non-empty Columns' that lists which columns are not empty for the species
df_species_without_habitat['Non-empty Columns'] = df_species_without_habitat.apply(lambda row: ', '.join(row.drop(['Species', 'Rating']).index[row.drop(['Species', 'Rating']).notnull()]), axis=1)

# Save the DataFrame to an Excel file
output_excel_path_species_without_habitat = 'species_without_habitat_and_debugging_columns.xlsx'
df_species_without_habitat.to_excel(output_excel_path_species_without_habitat, index=False)
#endregion



NameError: name 'output_workbook_path' is not defined

In [46]:

import pandas as pd

# Load the PAFS table into a DataFrame (replace 'pafs_data.csv' with your file path)
df_pafs = pd.read_csv('data inputs\pafs_cal_ipc_invasiveness_scoring_data.csv')

# Load the workbook containing the two lists of species
workbook_path = "python generated excel files\priority_lists_by_visibility_months.xlsx"
df_species_list_spring = pd.read_excel(workbook_path, sheet_name="Spring_Search")  # Replace "Sheet1" with the actual sheet name
df_species_list_other = pd.read_excel(workbook_path, sheet_name="Other_Species")  # Replace "Sheet2" with the actual sheet name

# Create a new Excel writer object for the new workbook
writer = pd.ExcelWriter('species_per_habitat.xlsx', engine='openpyxl')

# Add a dummy sheet to ensure at least one sheet is visible
pd.DataFrame().to_excel(writer, index=False, header=False)

# Close the writer to save changes
writer.close()

def process_pafs_data(filter_df, output_excel_path, sheet_name):
    # Filter the PAFS data based on the merged species list
    df_pafs_filtered = df_pafs[df_pafs['Species'].isin(filter_df['Species'])]

    # Selecting only the relevant columns (Species and Worksheet C columns)
    df_pafs_subset = df_pafs_filtered[['Species', 'Rating', 'Question 3.1 Explanation'] + [col for col in df_pafs_filtered.columns if col.startswith('Worksheet C, ')]]

    # Remove the prefix "Worksheet C, " from the column names
    df_pafs_subset.columns = df_pafs_subset.columns.str.replace('Worksheet C, ', '')

    # Transpose the DataFrame so that habitats are rows and species are columns
    df_transposed = df_pafs_subset.set_index('Species').T.reset_index().rename(columns={'index': 'Habitat'})

    # Create a new column 'Species Concat' which concatenates species where the value is 'A', 'B', 'C', or 'D'
    df_transposed['Species Concat'] = df_transposed.apply(lambda row: ', '.join([col for col in row.index[1:] if row[col] in ['A', 'B', 'C', 'D']]), axis=1)

    # Create separate columns for each possible value ('A', 'B', 'C', 'D', 'U', or no value)
    for value in ['A', 'B', 'C', 'D', 'U', '']:
        df_transposed[f'{value} Species PAF'] = df_transposed.apply(lambda row: ', '.join([col for col in row.index[1:] if row[col] == value]), axis=1)

    # Reorder the columns so that 'Species Concat' is immediately to the right of 'Habitat' and the new concat columns are immediately to the right of 'Species Concat'
    column_order = ['Habitat', 'Species Concat'] + ['A Species PAF', 'B Species PAF', 'C Species PAF', 'D Species PAF', 'U Species PAF'] + [col for col in df_transposed.columns if col not in ['Habitat', 'Species Concat', 'A Species PAF', 'B Species PAF', 'C Species PAF', 'D Species PAF', 'U Species PAF']]
    df_transposed = df_transposed[column_order]

    # Read the habitat types data into a DataFrame
    df_minor_ecological_types = pd.read_excel("python generated excel files\habitat_types_at_mgp.xlsx")

    # Merge the transposed PAFS data with the habitat types data
    merged_df = pd.merge(df_transposed, df_minor_ecological_types, left_on="Habitat", right_on="Minor Ecological Types", how="inner")

    # Rename the merged column
    merged_df.rename(columns={"MGP Habitats": "Habitat, Marin Fine Veg Map"}, inplace=True)

    # Reorder the columns
    columns_order = ['Habitat', 'Habitat, Marin Fine Veg Map'] + [col for col in merged_df.columns if col not in ['Habitat', 'Habitat, Marin Fine Veg Map']]
    merged_df = merged_df[columns_order]

    with pd.ExcelWriter(output_excel_path, engine='openpyxl', mode='a') as writer:
        # Export the merged DataFrame to an Excel file
        merged_df.to_excel(writer, sheet_name=sheet_name, index=False)

# Apply the function to process PAFS data for each species list
process_pafs_data(df_species_list_spring, 'species_per_habitat.xlsx', 'Spring_Search_PAFS')
process_pafs_data(df_species_list_other, 'species_per_habitat.xlsx', 'Other_Species_PAFS')

with pd.ExcelWriter('species_per_habitat.xlsx', engine='openpyxl', mode='a') as writer:
    writer.book.remove(writer.book['Sheet1'])