In [1]:
%load_ext kedro.ipython
import warnings
# Suppress every Python warning for the rest of the session. This also hides
# deprecation warnings from pandas/Altair, so re-enable when debugging.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
# Widen pandas' display limits so large frames are not truncated when shown.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)

import altair as alt
import vl_convert as vlc
# Turn off Altair's max-rows check so charts can be built from large frames.
alt.data_transformers.disable_max_rows()

from alphafold_impact.settings import SESSION_STORE_ARGS
# Root of the session store, and the reporting folders derived from it.
PATHDIR = SESSION_STORE_ARGS['path']
SAVE_DIR = PATHDIR + '/data/08_reporting/reach/'
ECR_SAVE_DIR = PATHDIR + '/data/08_reporting/reach/ecr/'

# Make sure both output folders exist. `exist_ok=True` replaces the separate
# os.path.exists() check, which could race with another process creating the
# folder between the check and the makedirs call.
import os
for folder in [SAVE_DIR, ECR_SAVE_DIR]:
    os.makedirs(folder, exist_ok=True)

In [2]:
# Reach

In [3]:
# Load the publications table from the data catalog. Later cells read its
# "authorships", "publication_date", "id", "source", "level", "fwci",
# "citation_normalized_percentile_is_in_top_10_percent" and "primary_field"
# columns.
# NOTE(review): `catalog` is not defined in the visible cells; presumably it is
# injected by `%load_ext kedro.ipython` -- confirm.
publications = catalog.load("publications.data.outputs")

In [4]:
# Derive an "authors" list per publication: the first element of every
# authorship entry (presumably the author id -- confirm), or an empty list
# when the publication has no authorships at all.
publications["authors"] = publications["authorships"].apply(
    lambda authorships: (
        [] if authorships is None else [entry[0] for entry in authorships]
    )
)

In [5]:
# Parse the publication date, then label each row with its calendar quarter as
# a string, which is what later cells group and filter on.
publications["publication_date"] = pd.to_datetime(
    publications["publication_date"]
)
publications["quarter"] = (
    publications["publication_date"].dt.to_period(freq="Q").astype(str)
)

In [6]:
# One row per (author, source) pair: keep the columns needed downstream, expand
# the per-publication author lists into rows, and keep the first row seen for
# each pair.
# NOTE(review): the next cell reassigns `authors` from `publications` again, so
# this assignment is overwritten -- consider removing this cell.
authors = (
    publications.loc[
        :,
        [
            "id",
            "source",
            "authors",
            "level",
            "quarter",
            "fwci",
            "citation_normalized_percentile_is_in_top_10_percent",
            "primary_field",
        ],
    ]
    .explode("authors")
    .drop_duplicates(subset=["authors", "source"])
)


In [7]:
# One row per (author, source) pair, dated at the author's FIRST quarter in
# that source.
authors = (
    publications[
        [
            "id",
            "source",
            "authors",
            "level",
            "quarter",
            "fwci",
            "citation_normalized_percentile_is_in_top_10_percent",
            "primary_field"
        ]
    ]
    .explode("authors")
    # drop_duplicates keeps the first row it encounters, so order rows
    # chronologically first; otherwise a researcher is counted in whichever
    # quarter happens to come first in `publications`. "YYYYQn" strings sort
    # chronologically and the "NaT" label of undated rows sorts after them.
    # A stable sort preserves the original order within a quarter.
    .sort_values("quarter", kind="stable")
    .drop_duplicates(subset=["authors", "source"])
)

# New researchers per quarter and source (each pair appears once in `authors`).
unique_researchers_overtime = (
    authors.groupby(["quarter", "source"])["authors"]
    .nunique()
    .reset_index()
    .rename(columns={"authors": "researcher_count"})
)

# Running total per source. groupby returns the quarters in sorted order, so
# the cumulative sum runs forward in time.
unique_researchers_overtime["cumulative_researcher_count"] = (
    unique_researchers_overtime.groupby("source")["researcher_count"].cumsum()
)

# Restrict to the plotted window only AFTER accumulating, so researchers first
# seen before 2020Q1 still count towards the totals.
unique_researchers_overtime = unique_researchers_overtime[
    (unique_researchers_overtime["quarter"] >= "2020Q1")
    & (unique_researchers_overtime["quarter"] <= "2024Q1")
]

# Step chart of the cumulative researcher count, one line per source.
chart = alt.Chart(unique_researchers_overtime).mark_line(interpolate='step-after').encode(
    x='quarter:N',
    y='cumulative_researcher_count:Q',
    color='source:N',
    tooltip=['quarter:N', 'cumulative_researcher_count:Q', 'source:N']
).properties(
    title='Number of Source-Unique Researchers Over Time by Source Group'
)

# Render to PNG at 3x scale and write it to the working directory.
png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open("source_unique_researchers.png", "wb") as f:
    f.write(png_str)

In [8]:
# Lower number = higher priority when a researcher appears in several sources.
source_priority = {"af": 1, "ct_ai": 2, "ct_noai": 3, "other": 4}

# Attach the priority, order each researcher's rows by it, and keep only the
# first row per researcher, i.e. the row of their highest-priority source.
authors = (
    authors.assign(source_priority=authors["source"].map(source_priority))
    .sort_values(by=["authors", "source_priority"])
    .drop_duplicates(subset=["authors"])
)

# Researchers per quarter and source.
unique_researchers_overtime = (
    authors.groupby(["quarter", "source"])
    .agg(researcher_count=("authors", "nunique"))
    .reset_index()
)

# Running total of researchers within each source.
unique_researchers_overtime["cumulative_researcher_count"] = (
    unique_researchers_overtime.groupby("source")["researcher_count"].cumsum()
)

# Keep only the quarters from 2020Q1 to 2024Q1 (both inclusive).
in_window = unique_researchers_overtime["quarter"].between("2020Q1", "2024Q1")
unique_researchers_overtime = unique_researchers_overtime[in_window]

# Step chart of the cumulative count, one line per source.
chart = (
    alt.Chart(unique_researchers_overtime)
    .mark_line(interpolate='step-after')
    .encode(
        x=alt.X('quarter:N'),
        y=alt.Y('cumulative_researcher_count:Q'),
        color=alt.Color('source:N'),
        tooltip=['quarter:N', 'cumulative_researcher_count:Q', 'source:N'],
    )
    .properties(title='Number of Unique Researchers Over Time by Source Group')
)

# Render to PNG at 3x scale and write it to the working directory.
png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open("unique_researchers.png", "wb") as f:
    f.write(png_str)

In [12]:
# Narrow `authors` to the four fields that are plotted below.
# NOTE(review): `authors` is whatever the previous cells left behind. If the
# priority de-duplication cell ran, each researcher belongs to exactly one
# source, so the "Source-Unique" wording in the title and file name may be
# outdated -- confirm.
authors = authors[
    authors["primary_field"].isin(
        [
            "Biochemistry, Genetics and Molecular Biology",
            "Medicine",
            "Chemistry",
            "Immunology and Microbiology",
        ]
    )
]

# Researchers per quarter, source and field.
unique_researchers_overtime = (
    authors.groupby(["quarter", "source", "primary_field"])
    .agg(researcher_count=("authors", "nunique"))
    .reset_index()
)

# Running total within each (source, field) pair.
unique_researchers_overtime["cumulative_researcher_count"] = (
    unique_researchers_overtime.groupby(["source", "primary_field"])[
        "researcher_count"
    ].cumsum()
)

# Keep only the quarters from 2020Q1 to 2024Q1 (both inclusive).
in_window = unique_researchers_overtime["quarter"].between("2020Q1", "2024Q1")
unique_researchers_overtime = unique_researchers_overtime[in_window]

# Base layer: cumulative count as a step line, coloured by source.
step_line = (
    alt.Chart(unique_researchers_overtime)
    .mark_line(interpolate="step-after")
    .encode(
        x=alt.X("quarter:N", title="Quarter"),
        y=alt.Y("cumulative_researcher_count:Q", title=None),
        color=alt.Color("source:N", title="Source"),
    )
)

# Facet the base layer by field in a two-column grid; the x axis is shared and
# each panel gets its own y scale.
chart = (
    step_line.properties(
        title="Number of Source-Unique Researchers Over Time by Field",
        width=300,
        height=120,
    )
    .facet(facet="primary_field:N", columns=2, spacing=25)
    .resolve_scale(x="shared", y="independent")
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=14)
    .configure_header(titleFontSize=14, labelFontSize=12)
)

# Render to PNG at 3x scale and write it to the working directory.
png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open("source_unique_researchers_field.png", "wb") as f:
    f.write(png_str)

In [14]:
# Treat a missing top-10% flag as False, then cast explicitly to bool.
# fillna() alone can leave an object-dtype column (pandas has deprecated the
# implicit downcasting it used to perform), and the next cell uses this column
# as a boolean mask, so the dtype is pinned here instead of left to the pandas
# version.
publications["citation_normalized_percentile_is_in_top_10_percent"] = (
    publications["citation_normalized_percentile_is_in_top_10_percent"]
    .fillna(False)
    .astype(bool)
)

In [15]:
# Publications per (source, quarter).
total_publications = (
    publications.groupby(["source", "quarter"])
    .agg(total_publications=("id", "count"))
    .reset_index()
)

# Each pair's share of ALL publications (every source and quarter combined).
grand_total = total_publications["total_publications"].sum()
total_publications["share_publications"] = (
    total_publications["total_publications"] / grand_total
)

# Top-decile publications per (source, quarter); the flag column is used
# directly as a boolean mask.
is_top_decile = publications["citation_normalized_percentile_is_in_top_10_percent"]
top_decile_publications = (
    publications[is_top_decile]
    .groupby(["source", "quarter"])
    .agg(top_decile_publications=("id", "count"))
    .reset_index()
)

# Left-join so pairs without any top-decile publication are kept, with 0.
merged_data = total_publications.merge(
    top_decile_publications,
    on=["source", "quarter"],
    how="left",
).fillna(0)

# Each pair's share of ALL top-decile publications.
top_decile_total = merged_data["top_decile_publications"].sum()
merged_data["share_top_decile_publications"] = (
    merged_data["top_decile_publications"] / top_decile_total
)

# Representation ratio = top-decile share divided by overall publication share.
merged_data["representation_ratio"] = (
    merged_data["share_top_decile_publications"]
    / merged_data["share_publications"]
)

# Keep only the quarters from 2020Q1 to 2024Q1 (both inclusive); the shares
# above were computed over the full, unfiltered data.
merged_data = merged_data[merged_data["quarter"].between("2020Q1", "2024Q1")]

In [16]:
# Encodings shared by both layers: quarter on x, representation ratio on y,
# one colour per source.
base = alt.Chart(merged_data).encode(
    x=alt.X('quarter:N', title='Quarter'),
    y=alt.Y('representation_ratio:Q', title='Representation Ratio'),
    color=alt.Color('source:N', title='Source'),
)

# Points carry the tooltip; the line layer connects them per source.
scatter = base.mark_circle(size=60).encode(
    tooltip=['quarter:N', 'representation_ratio:Q', 'source:N']
)
line = base.mark_line()

chart = alt.layer(scatter, line).properties(
    title='Representation of Top Decile Publications Over Time by Source',
    width=800,
    height=400,
)

# Render to PNG at 3x scale and write it to the working directory.
png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open("repr_time.png", "wb") as f:
    f.write(png_str)