In [2]:
#| code-summary: import IOOS metrics citation data helper function
# TODO: can import this from ioos_metrics. it was copy-pasted here.
# ref https://github.com/ioos/ioos_metrics/
def mbon_stats():
    """
    This function collects download statistics about MBON affiliated datasets shared with the Ocean Biodiversity
    Information System (OBIS) and the Global Biodiversity Information Framework (GBIF). The function returns a
    dataframe with rows corresponding to each dataset.
    :return:
    """
    import pandas as pd
    import pyobis
    import urllib.parse

    # collect dataset information from OBIS
    institution_id = 23070
    query = pyobis.dataset.search(instituteid=institution_id)
    df = pd.DataFrame(query.execute())
    df_obis = pd.DataFrame.from_records(df["results"])
    df_obis.columns = ['obis_' + str(col) for col in df_obis.columns]

    df_mapping = pd.DataFrame()
    base_url = 'https://api.gbif.org'
    # iterate through each OBIS dataset to gather uuid from GBIF
    # create a mapping table
    for title in df_obis['obis_title']:
        string = title
        query = '{}/v1/dataset/search?q={}'.format(base_url, urllib.parse.quote(string))
        df = pd.read_json(query, orient='index').T

        # build a DataFrame with the info we need more accessible
        df_mapping = pd.concat([df_mapping, pd.DataFrame({
            'gbif_uuid': df['results'].values[0][0]['key'],
            'title': [df['results'].values[0][0]['title']],
            'obis_id': [df_obis.loc[df_obis['obis_title']==title,'obis_id'].to_string(index=False)],
            'doi': [df['results'].values[0][0]['doi']]
        })], ignore_index=True)

    df_gbif = pd.DataFrame()
    for key in df_mapping['gbif_uuid']:

        url = 'https://api.gbif.org/v1/literature/export?format=CSV&gbifDatasetKey={}'.format(key)
        df2 = pd.read_csv(url)  # collect liturature cited information
        df2.columns = ['literature_' + str(col) for col in df2.columns]
        df2['gbif_uuid'] = key

        df_gbif = pd.concat([df2,df_gbif], ignore_index=True)

    # merge the OBIS and GBIF data frames together
    df_obis = df_obis.merge(df_mapping, on='obis_id')

    df_out = df_gbif.merge(df_obis, on='gbif_uuid')

    return df_out

In [3]:
#| code-summary: fetch the latest mbon data
stats_df = mbon_stats()

In [4]:
#| code-summary: define streamgraph function
def make_streamgraph(og_df, y_colname, plt_title):
  # subset the df to save RAM
  df_subset = og_df[[y_colname, 'literature_published']]
  # split any pipe-delimited rows in y_colname
  df = df_subset.assign(**{y_colname: df_subset[y_colname].str.split('|')}).explode(y_colname)
  # Convert the 'literature_published' column to datetime
  df['literature_published'] = pd.to_datetime(df['literature_published'])
  
  # Extract month and year for aggregation
  df['year_month'] = df['literature_published'].dt.to_period('M')
  
  # Group by year_month and title, then get the cumulative count
  df['count'] = df.groupby(y_colname).cumcount() + 1
  monthly_counts = df.groupby(['year_month', y_colname]).agg({'count': 'max'}).reset_index()
  
  # Pivot the DataFrame to have year_month as rows and titles as columns
  pivot_df = monthly_counts.pivot(index='year_month', columns=y_colname, values='count').fillna(0).cumsum()
  
  # Plotting the streamgraph
  plt.figure(figsize=(12, 8))
  plt.stackplot(pivot_df.index.to_timestamp(), pivot_df.T, labels=pivot_df.columns)
  plt.title(plt_title)
  plt.xlabel('Date')
  plt.ylabel('Cumulative Count')
  plt.xticks(rotation=45)

In [5]:
#| code-summary: streamgraph of citations per dataset
import pandas as pd
import matplotlib.pyplot as plt
import streamz

make_streamgraph(stats_df, "obis_title", f'Citation Count by MBON Dataset")
plt.show()

SyntaxError: EOL while scanning string literal (1548083018.py, line 6)

In [None]:
#| code-summary: streamgraph of citations per topic
# split up column with pipe-delimited multi-values

make_streamgraph(stats_df, "literature_topics", "MBON Dataset Contributions to Topic Citations")
plt.legend(loc='upper left')
plt.show()

In [None]:
#| code-summary: create chlopleth map 
import plotly.express as px

loc_colname = "literature_countries_of_researcher"
og_df = stats_df[[loc_colname]]

# split any pipe-delimited rows in y_colname
df = og_df.assign(**{loc_colname: og_df[loc_colname].str.split('|')}).explode(loc_colname)

# Count the occurrences of each country
country_counts = df[loc_colname].value_counts().reset_index()
country_counts.columns = ['Country', 'Count']

# Create the choropleth map with a monochrome color scale and gray for missing data
fig = px.choropleth(country_counts,
                    locations="Country",
                    locationmode='country names',
                    color="Count",
                    hover_name="Country",
                    color_continuous_scale=px.colors.sequential.Blues,
                    title='Choropleth Heatmap of # Citations in each Country"
  )

# Update layout to set color for missing data
fig.update_geos(
    showcoastlines=True, coastlinecolor="Gray",
    showland=True, landcolor="White",
    showocean=True, oceancolor="Gray",
    showlakes=True, lakecolor="Gray"
)

# Show the plot
fig.show()