In [None]:
from pathlib import Path

import psyplot.project as psy
import pandas as pd
from psy_strat.stratplot import stratplot
import matplotlib as mpl
import matplotlib.pyplot as plt

**Note:** run this notebook in the `as_data_minimal` environment.

In [None]:
# Notebook-wide matplotlib defaults: figure size and resolution.
mpl.rcParams.update({
    'figure.figsize': (18, 10),
    'figure.dpi': 150,
})

In [None]:
# Directory containing the CSV files read by this notebook.
DATA_DIR = Path('../outputs')

In [None]:
# Directory the pollen diagrams are saved to. It is created up front so that
# saving a figure does not fail when the directory does not exist yet
# (e.g. on a fresh checkout).
PLOTS_DIR = Path('../outputs/plots')
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the per-site pollen abundance table and the pollen group table.
# Both are combined on their shared `var_` column in a later cell.
pollen_df = pd.read_csv(DATA_DIR / 'site_pollen_abundance_ts.csv')
group_df = pd.read_csv(DATA_DIR / 'pollen_groups.csv')

In [None]:
# Preview the first rows of the pollen abundance table.
pollen_df.head()

In [None]:
# Preview the first rows of the pollen group table.
group_df.head()

In [None]:
# Attach the group columns to every pollen record. The left join keeps all
# rows of pollen_df, including those whose `var_` has no match in group_df.
df = pd.merge(pollen_df, group_df, on='var_', how='left')

In [None]:
# List the distinct group names present after the merge.
df['groupname'].unique()

Assumptions:
- "Undifferentiated NPP" is assumed to be NPP-ID (Non-Pollen Palynomorph), so it is excluded along with the groups below
- Exclude Nonpollen, Cyanobacteria, Microcrustaceans, Invertebrates, Nematoda (roundworms), Macrofossils, Acritarchs, Rhizopods, and Indeterminables and unknowns

In [None]:
# Group names whose rows are removed from `df` before the diagrams are built.
excluded_types = [
    'Nonpollen',
    'Cyanobacteria',
    'Microcrustaceans',
    'Invertebrates',
    'Nematoda (roundworms)',
    'Macrofossils',
    'Acritarchs',
    'Rhizopods',
    'Indeterminables and unknowns',
    'Undifferentiated NPP'
]

In [None]:
# Drop the rows belonging to excluded groups and keep only the columns that
# the rest of the notebook uses.
df = df.loc[
    ~df['groupname'].isin(excluded_types),
    ['sitename', 'varname', 'groupname', 'agebp', 'count'],
]

In [None]:
def get_grouper(sitename):
    """Build a function that maps a taxon name to its group for one site.

    Parameters
    ----------
    sitename : str
        Value of the ``sitename`` column of the module-level ``df`` whose
        rows the lookup is built from.

    Returns
    -------
    callable
        ``grouper(col)`` returning the ``groupname`` recorded for the
        ``varname`` ``col`` at this site.

    Raises
    ------
    KeyError
        Raised by the returned function when ``col`` does not occur at the
        site.
    """
    # Build the varname -> groupname lookup once per site. The returned
    # function is presumably invoked once per taxon column, so rebuilding the
    # lookup (filter + drop_duplicates + set_index) on every call is wasted work.
    varname_to_group = (
        df.loc[df['sitename'] == sitename, ['varname', 'groupname']]
        .drop_duplicates()
        .set_index('varname')['groupname']
    )

    def grouper(col):
        return varname_to_group.loc[col]

    return grouper

In [None]:
def grouper(col):
    """Return the group name recorded for the taxon ``col`` across all sites.

    The lookup is built from the distinct (varname, groupname) pairs of the
    module-level ``df``.
    """
    pairs = df[['varname', 'groupname']].drop_duplicates()
    lookup = pairs.set_index('varname')['groupname']
    return lookup.loc[col]

Navarres and Charco da Candieira have some duplicated rows, i.e. rows that share the same (sitename, agebp, varname) combination.

In [None]:
# Sites that contain repeated (sitename, agebp, varname) rows.
df.loc[df.duplicated(subset=['sitename', 'agebp', 'varname']), 'sitename'].unique()

Examine these duplicated rows in more detail:

In [None]:
# keep=False flags every member of a duplicated group, not just the repeats;
# sorting places the members of each group next to each other.
(
    df.loc[df.duplicated(subset=['sitename', 'agebp', 'varname'], keep=False)]
    .sort_values(by=['sitename', 'agebp', 'varname'])
)

These duplicated values should either be summed, or one should be selected over the other. **TODO:** confirm which is correct; for the time being, assume that they should be summed.

In [None]:
# Wide table: one row per (sitename, agebp), one column per varname.
# Summing collapses the duplicated (sitename, agebp, varname) records.
plot_df = (
    df.groupby(['sitename', 'agebp', 'varname'])['count']
    .sum()
    .unstack(level='varname')
)

In [None]:
# Overview table with one column per pollen group listing its taxa.
# The taxa are the columns of plot_df, so the transposed frame is grouped by
# its row labels; `groupby(..., axis=1)` is deprecated in recent pandas.
group2taxon = pd.DataFrame.from_dict(plot_df.T.groupby(grouper).groups, orient='index').T
# Groups have different numbers of taxa; show the padding as blanks.
group2taxon.fillna('')

In [None]:
def generate_pollen_diagram(df, sitename):
    """Draw the pollen diagram of one site and save it as a PNG file.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table of counts in the layout of ``plot_df``: the first index
        level holds the site name and each column is one taxon.
    sitename : str
        Site to plot; used to select rows of ``df`` and to build the file
        name.

    Notes
    -----
    The taxon-to-group mapping comes from ``get_grouper(sitename)``. The
    figure is written to ``PLOTS_DIR`` as
    ``<sitename in lower case>_pollen_diagram.png``.
    """
    # Use the frame passed in by the caller. Previously the argument was
    # ignored and the module-level plot_df was read instead.
    # Taxa without any value at this site are dropped before plotting.
    site_counts = df.loc[sitename].dropna(axis=1, how='all')

    sp, groupers = stratplot(
        site_counts, get_grouper(sitename),
        thresh=2.0,
        widths={'Pollen': 1.1},
        percentages=['Pollen'],
        subgroups={'Pollen': ['Trees and shrubs', 'Dwarf shrubs', 'Herbs']},
        exclude=[
            'Helophytes',
            'Vascular cryptogams (Pteridophytes)',
            'Algae',
            'Aquatics',
            'Dinoflagellates',
            'Bryophytes',
            'Pre-Quaternary type',
            'Fungi',
            'Aquatic Pteridophyta'
        ],
    )
    # Saves pyplot's current figure, which is presumably the one stratplot
    # has just created.
    plt.savefig(PLOTS_DIR / f'{sitename.lower()}_pollen_diagram.png', bbox_inches='tight')

In [None]:
# Draw and save one pollen diagram for every site in the first index level.
for sitename in plot_df.index.unique(level=0):
    generate_pollen_diagram(plot_df, sitename)