# Make pollen abundance time series

Uses the file `outputs/clean_epd_data.csv` generated by the notebook `make_lct_timeseries.ipynb` to generate pollen abundance time series for a selection of species (note, not land cover types).

Unlike `make_lct_timeseries.ipynb`, this notebook is not part of the core data pipeline. It is used to produce an illustration of what a pollen diagram at the species level looks like, modelled on Fig. 4 in [Carrión et al. 2010](https://doi.org/10.1016/j.revpalbo.2009.12.007).

In [None]:
from dataclasses import dataclass
import os
from pathlib import Path
import re
from typing import Optional

import pandas as pd

In [None]:
# Resolve the working directories relative to the current working directory:
# when the notebook runs from a directory named `pollen-abundance` the data
# folders sit one level up, otherwise they are looked up next to the cwd.
# `Path.cwd().name` is used instead of splitting on '/' so the check does not
# depend on the platform's path separator.
pwd = Path.cwd().name
in_pollen_abundance = pwd == 'pollen-abundance'
TMP_DIR = Path('../tmp') if in_pollen_abundance else Path('tmp')
OUTPUT_DIR = Path('../outputs') if in_pollen_abundance else Path('outputs')
PLOTS_DIR = OUTPUT_DIR / 'plots'
# parents=True so a missing outputs directory does not abort this cell.
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the cleaned EPD records and index them by the `sitecode` and `agebp`
# columns, with rows sorted by that index.
_epd_csv_path = OUTPUT_DIR / 'clean_epd_data.csv'
epd_data = pd.read_csv(_epd_csv_path)
epd_data = epd_data.set_index(['sitecode', 'agebp']).sort_index()

In [None]:
@dataclass
class SpeciesGroup:
    """Pair a regex identifying species with the family/group they belong to.

    Attributes:
        regex: Regular expression tested against a species name.
        desc: Name of the family/group assigned to matching species.
        note: Optional free-text note; None when no note is given.
    """
    regex: str
    desc: str
    note: Optional[str] = None

In [None]:
def species_to_group(species_name: str) -> Optional[str]:
    """Convert species name to group.

    Matching is case-insensitive. Patterns are applied with ``re.match`` and
    are therefore anchored at the start of the name, except for the deciduous
    tree genera, which may appear anywhere in the name.

    Returns the group name, or None if the species belongs to no group.

    Throws RuntimeError if more than one group matched by species.
    """
    re_maps = [
        # Negative lookahead keeps 'pinus pinaster' out of the generic group.
        SpeciesGroup(r'pinus(?!\spinaster)', 'Pinus'),
        SpeciesGroup(r'artemisia', 'Artemisia'),
        SpeciesGroup(r'ephedra', 'Ephedra'),
        SpeciesGroup(r'pinus pinaster', 'Pinus Pinaster'),
        # The genera share one group so the leading `.*` applies to all of
        # them; with a bare top-level alternation it only applied to
        # 'castanea' and the other genera matched at the start of the name
        # only.
        SpeciesGroup((r'.*(?:castanea|betula|fagaceae|fagus|'
                      r'alnus|corylus|salix|carpinus)'),
                     'Deciduous Trees'),
        SpeciesGroup(r'plantago', 'Plantago'),
        SpeciesGroup(r'quercus', 'Quercus'),
    ]

    name = species_name.lower()
    matched = [group.desc for group in re_maps if re.match(group.regex, name)]
    if len(matched) > 1:
        raise RuntimeError(f'More than one group matched {species_name}')
    return matched[0] if matched else None

def test_species_to_group():
    """Check a few known species names against their expected groups."""
    expected_groups = {
        'Pinus halepensis': 'Pinus',
        'Pinus pinaster': 'Pinus Pinaster',
        'Alnus': 'Deciduous Trees',
    }
    for species, group in expected_groups.items():
        assert species_to_group(species) == group


# Run the checks as soon as the cell is executed.
test_species_to_group()

In [None]:
# Label every record with the group of its `varname`; the label is left
# missing where no group matches.
epd_data = epd_data.assign(
    group=epd_data['varname'].apply(species_to_group)
)

In [None]:
# Preview the first records that were assigned to a group.
epd_data[epd_data['group'].notna()].head()

In [None]:
# Total pollen count per sample, a sample being one (sitecode, agebp) pair.
sample_totals = epd_data.groupby(level=['sitecode', 'agebp'])['pcount'].sum()
# Pollen count per sample and group, one column per group. groupby leaves out
# rows whose group is missing, so a sample holding none of the selected groups
# would vanish from the table; reindexing against `sample_totals` keeps such
# samples as all-zero rows.
group_counts = (
    epd_data.reset_index()
    .groupby(by=['sitecode', 'agebp', 'group'])['pcount']
    .sum()
    .unstack()
    .fillna(0)
    .reindex(sample_totals.index, fill_value=0)
)

In [None]:
# Both tables must cover the same samples in the same order. `Index.equals`
# returns False for indexes of different length, whereas an elementwise `==`
# raises instead of failing the assertion.
assert sample_totals.index.equals(group_counts.index), (
    'sample_totals and group_counts do not cover the same samples'
)

In [None]:
# Express each group's count as a percentage of the sample total and keep the
# group columns in a fixed order.
group_pct = (
    group_counts.div(sample_totals, axis=0)
    .mul(100)
    .loc[:, ['Pinus', 'Artemisia', 'Ephedra', 'Pinus Pinaster',
             'Quercus', 'Deciduous Trees', 'Plantago']]
)

In [None]:
def write_site_data(site_df: pd.DataFrame) -> None:
    """Write data for a study site to disk.

    The frame is saved as `selected_species_pol_pct.csv` inside the site's
    directory under OUTPUT_DIR, with the `sitecode` level dropped from the
    index.

    Raises RuntimeError if df does not contain exactly one site code.
    """
    site_codes = site_df.index.get_level_values(0).unique()
    if site_codes.size != 1:
        # Format the codes explicitly: adding an Index to a str concatenates
        # elementwise instead of producing a readable message.
        raise RuntimeError('Only expect one site code in df, found '
                           f'{list(site_codes)}')
    site_dir = OUTPUT_DIR / str(site_codes[0])
    # Make sure the target directory exists so the write cannot fail on a
    # missing folder.
    site_dir.mkdir(parents=True, exist_ok=True)
    (
        site_df.reset_index('sitecode', drop=True)
        .to_csv(site_dir / 'selected_species_pol_pct.csv')
    )

In [None]:
# Write one CSV per site. A plain loop is used because write_site_data is
# called only for its side effect and returns nothing to combine.
for _, site_df in group_pct.groupby(level='sitecode'):
    write_site_data(site_df)

## References

Carrión, J. S., Fernández, S., González-Sampériz, P., Gil-Romera, G., Badal, E., Carrión-Marco, Y., … Burjachs, F. (2010). Expected trends and surprises in the Lateglacial and Holocene vegetation history of the Iberian Peninsula and Balearic Islands. Review of Palaeobotany and Palynology, 162(3), 458–475. https://doi.org/10.1016/j.revpalbo.2009.12.007