In [1]:
%load_ext kedro.ipython

In [2]:
import numpy as np
import pandas as pd

import altair as alt
import vl_convert as vlc
alt.data_transformers.disable_max_rows()

from alphafold_impact.settings import SESSION_STORE_ARGS
PATHDIR = SESSION_STORE_ARGS['path']
SAVE_DIR = PATHDIR + '/data/08_reporting/november/'

# Make sure the reporting folder exists. `exist_ok=True` turns the call into a
# no-op when the folder is already there and removes the check-then-create race
# of a separate `os.path.exists` test.
import os
os.makedirs(SAVE_DIR, exist_ok=True)

In [3]:
# Load the "publications.data.outputs" dataset.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by `%load_ext kedro.ipython` -- confirm.
s = catalog.load("publications.data.outputs")

In [4]:
# Two-letter country code -> 'LMIC' / 'Non-LMIC' label used by every
# representation-ratio table in this notebook. Codes that are not listed here
# end up as NaN when a column is mapped through this dict.
# NOTE(review): the source and vintage of this classification are not recorded
# here. Some entries look inconsistent with each other (e.g. 'CG' and 'CI' are
# 'Non-LMIC' while 'CN' and 'CR' are 'LMIC') -- confirm against the intended
# income-group list.
country_classification = {
    'AE': 'Non-LMIC',
    'AG': 'Non-LMIC',
    'AL': 'LMIC',
    'AM': 'LMIC',
    'AO': 'LMIC',
    'AR': 'Non-LMIC',
    'AT': 'Non-LMIC',
    'AU': 'Non-LMIC',
    'AW': 'Non-LMIC',
    'AZ': 'LMIC',
    'BA': 'LMIC',
    'BB': 'Non-LMIC',
    'BD': 'LMIC',
    'BE': 'Non-LMIC',
    'BF': 'LMIC',
    'BG': 'LMIC',
    'BH': 'Non-LMIC',
    'BI': 'LMIC',
    'BJ': 'LMIC',
    'BN': 'Non-LMIC',
    'BO': 'LMIC',
    'BR': 'Non-LMIC',
    'BS': 'Non-LMIC',
    'BT': 'LMIC',
    'BW': 'LMIC',
    'BY': 'LMIC',
    'BZ': 'Non-LMIC',
    'CA': 'Non-LMIC',
    'CD': 'LMIC',
    'CF': 'LMIC',
    'CG': 'Non-LMIC',
    'CH': 'Non-LMIC',
    'CI': 'Non-LMIC',
    'CL': 'Non-LMIC',
    'CM': 'LMIC',
    'CN': 'LMIC',
    'CO': 'LMIC',
    'CR': 'LMIC',
    'CU': 'Non-LMIC',
    'CY': 'Non-LMIC',
    'CZ': 'Non-LMIC',
    'DE': 'Non-LMIC',
    'DK': 'Non-LMIC',
    'DM': 'LMIC',
    'DO': 'Non-LMIC',
    'DZ': 'LMIC',
    'EC': 'LMIC',
    'EE': 'Non-LMIC',
    'EG': 'LMIC',
    'ES': 'Non-LMIC',
    'ET': 'LMIC',
    'FI': 'Non-LMIC',
    'FJ': 'LMIC',
    'FO': 'Non-LMIC',
    'FR': 'Non-LMIC',
    'GB': 'Non-LMIC',
    'GD': 'LMIC',
    'GE': 'LMIC',
    'GF': 'Non-LMIC',
    'GH': 'LMIC',
    'GM': 'LMIC',
    'GP': 'Non-LMIC',
    'GR': 'Non-LMIC',
    'GT': 'LMIC',
    'GU': 'Non-LMIC',
    'HK': 'Non-LMIC',
    'HR': 'Non-LMIC',
    'HU': 'Non-LMIC',
    'ID': 'LMIC',
    'IE': 'Non-LMIC',
    'IL': 'Non-LMIC',
    'IN': 'LMIC',
    'IQ': 'LMIC',
    'IR': 'LMIC',
    'IS': 'Non-LMIC',
    'IT': 'Non-LMIC',
    'JM': 'LMIC',
    'JO': 'LMIC',
    'JP': 'Non-LMIC',
    'KE': 'LMIC',
    'KG': 'LMIC',
    'KH': 'LMIC',
    'KN': 'Non-LMIC',
    'KR': 'Non-LMIC',
    'KW': 'Non-LMIC',
    'KZ': 'LMIC',
    'LB': 'LMIC',
    'LK': 'LMIC',
    'LS': 'LMIC',
    'LT': 'Non-LMIC',
    'LU': 'Non-LMIC',
    'LV': 'Non-LMIC',
    'LY': 'Non-LMIC',
    'MA': 'LMIC',
    'MC': 'Non-LMIC',
    'MD': 'Non-LMIC',
    'ME': 'LMIC',
    'MG': 'LMIC',
    'MK': 'LMIC',
    'ML': 'LMIC',
    'MM': 'LMIC',
    'MN': 'LMIC',
    'MO': 'Non-LMIC',
    'MT': 'Non-LMIC',
    'MU': 'Non-LMIC',
    'MW': 'LMIC',
    'MX': 'Non-LMIC',
    'MY': 'Non-LMIC',
    'MZ': 'LMIC',
    'NC': 'Non-LMIC',
    'NE': 'LMIC',
    'NG': 'LMIC',
    'NI': 'LMIC',
    'NL': 'Non-LMIC',
    'NO': 'Non-LMIC',
    'NP': 'LMIC',
    'NZ': 'Non-LMIC',
    'OM': 'Non-LMIC',
    'PA': 'Non-LMIC',
    'PE': 'LMIC',
    'PF': 'Non-LMIC',
    'PG': 'LMIC',
    'PH': 'LMIC',
    'PK': 'LMIC',
    'PL': 'Non-LMIC',
    'PR': 'Non-LMIC',
    'PS': 'LMIC',
    'PT': 'Non-LMIC',
    'PY': 'Non-LMIC',
    'QA': 'Non-LMIC',
    'RE': 'Non-LMIC',
    'RO': 'Non-LMIC',
    'RS': 'LMIC',
    'RU': 'Non-LMIC',
    'RW': 'LMIC',
    'SA': 'Non-LMIC',
    'SD': 'LMIC',
    'SE': 'Non-LMIC',
    'SG': 'Non-LMIC',
    'SI': 'Non-LMIC',
    'SK': 'Non-LMIC',
    'SL': 'LMIC',
    'SN': 'LMIC',
    'SS': 'LMIC',
    'SY': 'LMIC',
    'TG': 'LMIC',
    'TH': 'LMIC',
    'TJ': 'LMIC',
    'TN': 'LMIC',
    'TR': 'Non-LMIC',
    'TT': 'LMIC',
    'TW': 'Non-LMIC',
    'TZ': 'LMIC',
    'UA': 'LMIC',
    'UG': 'LMIC',
    'US': 'Non-LMIC',
    'UY': 'Non-LMIC',
    'UZ': 'LMIC',
    'VE': 'LMIC',
    'VI': 'Non-LMIC',
    'VN': 'LMIC',
    'XK': 'LMIC',
    'YE': 'LMIC',
    'ZA': 'LMIC',
    'ZM': 'LMIC',
    'ZW': 'LMIC'
}

In [5]:
# Label every row as 'LMIC' / 'Non-LMIC' from its country code. Series.map
# leaves NaN for any country_code that is not a key of country_classification.
s["country_label"] = s["country_code"].map(country_classification)

In [6]:
# Display names for the raw `source` codes, used in tables and chart legends.
source_labels = dict(
    af="AF",
    ct_ai="CT AI",
    ct_noai="CT No AI",
    other="Other SB",
)

In [7]:

# Keep one row per distinct last author.
unique_researchers = s.drop_duplicates(subset=["last_author"])

# Next step: the representation ratio per source and country label. It is
# computed as a source's share among the authors of one country label divided
# by that source's share among all labelled authors.

# Row-normalised shares of LMIC / Non-LMIC authors within each source; the
# "All" margin row gives the shares over every source combined.
pd.crosstab(
    index=unique_researchers["source"],
    columns=unique_researchers["country_label"],
    normalize="index",
    margins=True,
)


country_label,LMIC,Non-LMIC
source,Unnamed: 1_level_1,Unnamed: 2_level_1
af,0.365157,0.634843
ct_ai,0.40182,0.59818
ct_noai,0.326741,0.673259
other,0.425033,0.574967
All,0.38743,0.61257


In [8]:
# Rebuild the one-row-per-last-author table from `s` instead of remapping
# `unique_researchers` in place. Mapping already-mapped labels through
# `source_labels` turns them into NaN, so the in-place version silently broke
# when this cell was run a second time.
unique_researchers = s.drop_duplicates(subset=["last_author"]).assign(
    source=lambda df: df["source"].map(source_labels)
)

# Authors per (source, country_label). groupby drops rows whose country_label
# is NaN, so unclassified countries are excluded here as well.
total_authors = (
    unique_researchers.groupby(["source", "country_label"])["id"]
    .count()
    .reset_index()
    .rename(columns={"id": "total_authors"})
)

# Authors per source over all labelled countries, and each source's overall share.
combined_total_authors = (
    unique_researchers.dropna(subset=["country_label"]).groupby("source")["id"]
    .count()
    .reset_index()
    .rename(columns={"id": "total_authors"})
)

combined_total_authors["share_source_authors"] = (
    combined_total_authors["total_authors"] /
    combined_total_authors["total_authors"].sum()
)

# Share of each source within a country label (shares sum to 1 per label).
total_authors["share_authors"] = (
    total_authors.groupby("country_label")["total_authors"]
    .transform(lambda x: x / x.sum())
)

# Merge with overall source shares
pubs_merged_data = pd.merge(
    total_authors,
    combined_total_authors[["source", "share_source_authors"]],
    on="source",
    how="left"
)

# Representation ratio: > 1 means the source is over-represented among the
# authors of that country label relative to its overall share.
pubs_merged_data["representation_ratio"] = (
    pubs_merged_data["share_authors"] /
    pubs_merged_data["share_source_authors"]
)

### For ECRs

In [9]:
# Load the "ecr.publications.quarterly" dataset; the cells below use its
# `author` and `quarter` columns.
ecrs = catalog.load("ecr.publications.quarterly")

In [10]:
ecrs = ecrs.sort_values(by=['author', 'quarter'])

# One summary row per author, taken from the end of their sorted history.
# NOTE(review): GroupBy.last() returns the last non-null value of each column,
# not the literal last row, so a missing value in the final quarter falls back
# to an earlier quarter -- confirm this is intended.
last_observation = ecrs.groupby('author').last().reset_index()

# np.select takes the first condition that holds, so the order below is a
# priority order: af, then ct_ai, then ct_noai, otherwise "other".
choices = ["af", "ct_ai", "ct_noai"]
conditions = [last_observation[name] > 0 for name in choices]
last_observation["source"] = np.select(conditions, choices, default="other")

# Attach the per-author classification to every row. Dropping any existing
# `source` column first keeps the cell re-runnable: without it a second run
# would create `source_x` / `source_y` instead of `source`.
ecrs = (
    ecrs.drop(columns=["source"], errors="ignore")
    .merge(last_observation[['author', 'source']], on='author', how='left')
)

In [11]:

# Latest-quarter row of every ECR author, with the country label added and the
# source code replaced by its display name.
unique_researchers = (
    ecrs.sort_values(by=["quarter"])
    .drop_duplicates(subset=["author"], keep="last")
    .assign(
        country_label=lambda df: df["institution_country_code"].map(country_classification),
        source=lambda df: df["source"].map(source_labels),
    )
)

In [12]:
unique_researchers = unique_researchers.copy()

# Authors per (source, country_label); rows without a country_label are left
# out because groupby drops NaN keys.
total_authors = (
    unique_researchers.groupby(["source", "country_label"])["author"]
    .count()
    .rename("total_authors")
    .reset_index()
)

# Authors per source across all labelled countries, plus each source's share
# of that total.
labelled_researchers = unique_researchers.dropna(subset=["country_label"])
combined_total_authors = (
    labelled_researchers.groupby("source")["author"]
    .count()
    .rename("total_authors")
    .reset_index()
)
source_totals = combined_total_authors["total_authors"]
combined_total_authors["share_source_authors"] = source_totals / source_totals.sum()

# Share of each source within its country label.
label_totals = total_authors.groupby("country_label")["total_authors"].transform("sum")
total_authors["share_authors"] = total_authors["total_authors"] / label_totals

# Bring the overall source shares next to the per-label shares.
ecr_merged_data = total_authors.merge(
    combined_total_authors[["source", "share_source_authors"]],
    on="source",
    how="left",
)

# Representation ratio = within-label share / overall share of the source.
ecr_merged_data["representation_ratio"] = ecr_merged_data["share_authors"].div(
    ecr_merged_data["share_source_authors"]
)

In [13]:
ecr_merged_data

Unnamed: 0,source,country_label,total_authors,share_authors,share_source_authors,representation_ratio
0,AF,LMIC,13802,0.316654,0.352901,0.897289
1,AF,Non-LMIC,23122,0.378782,0.352901,1.073339
2,CT AI,LMIC,2815,0.064583,0.062372,1.035453
3,CT AI,Non-LMIC,3711,0.060793,0.062372,0.974685
4,CT No AI,LMIC,4850,0.111272,0.12592,0.883671
5,CT No AI,Non-LMIC,8325,0.136379,0.12592,1.083064
6,Other SB,LMIC,22120,0.507491,0.458807,1.106109
7,Other SB,Non-LMIC,25885,0.424045,0.458807,0.924234


### For Established

In [14]:
# Load the "nonecr.publications.quarterly" dataset (the "Established" group in
# this notebook's headings and chart titles).
nonecrs = catalog.load("nonecr.publications.quarterly")

In [15]:
nonecrs = nonecrs.sort_values(by=['author', 'quarter'])

# One summary row per author, taken from the end of their sorted history.
# NOTE(review): GroupBy.last() returns the last non-null value of each column,
# not the literal last row -- confirm this is intended.
last_observation = nonecrs.groupby('author').last().reset_index()

# np.select takes the first condition that holds, so the order below is a
# priority order: af, then ct_ai, then ct_noai, otherwise "other".
choices = ["af", "ct_ai", "ct_noai"]
conditions = [last_observation[name] > 0 for name in choices]
last_observation["source"] = np.select(conditions, choices, default="other")

# Attach the per-author classification to every row. Dropping any existing
# `source` column first keeps the cell re-runnable: without it a second run
# would create `source_x` / `source_y` instead of `source`.
nonecrs = (
    nonecrs.drop(columns=["source"], errors="ignore")
    .merge(last_observation[['author', 'source']], on='author', how='left')
)

In [16]:
# Latest-quarter row of every established author, with the country label added
# and the source code replaced by its display name.
unique_researchers = (
    nonecrs.sort_values(by=["quarter"])
    .drop_duplicates(subset=["author"], keep="last")
    .assign(
        country_label=lambda df: df["institution_country_code"].map(country_classification),
        source=lambda df: df["source"].map(source_labels),
    )
)

In [17]:
unique_researchers = unique_researchers.copy()

# Authors per (source, country_label); rows without a country_label are left
# out because groupby drops NaN keys.
total_authors = (
    unique_researchers.groupby(["source", "country_label"])["author"]
    .count()
    .rename("total_authors")
    .reset_index()
)

# Authors per source across all labelled countries, plus each source's share
# of that total.
labelled_researchers = unique_researchers.dropna(subset=["country_label"])
combined_total_authors = (
    labelled_researchers.groupby("source")["author"]
    .count()
    .rename("total_authors")
    .reset_index()
)
source_totals = combined_total_authors["total_authors"]
combined_total_authors["share_source_authors"] = source_totals / source_totals.sum()

# Share of each source within its country label.
label_totals = total_authors.groupby("country_label")["total_authors"].transform("sum")
total_authors["share_authors"] = total_authors["total_authors"] / label_totals

# Bring the overall source shares next to the per-label shares.
nonecr_merged_data = total_authors.merge(
    combined_total_authors[["source", "share_source_authors"]],
    on="source",
    how="left",
)

# Representation ratio = within-label share / overall share of the source.
nonecr_merged_data["representation_ratio"] = nonecr_merged_data["share_authors"].div(
    nonecr_merged_data["share_source_authors"]
)

nonecr_merged_data

Unnamed: 0,source,country_label,total_authors,share_authors,share_source_authors,representation_ratio
0,AF,LMIC,36738,0.353808,0.357215,0.990461
1,AF,Non-LMIC,74446,0.358921,0.357215,1.004775
2,CT AI,LMIC,6071,0.058467,0.058245,1.003808
3,CT AI,Non-LMIC,12058,0.058134,0.058245,0.998094
4,CT No AI,LMIC,11501,0.110761,0.137217,0.807199
5,CT No AI,Non-LMIC,31208,0.150461,0.137217,1.09652
6,Other SB,LMIC,49526,0.476964,0.447322,1.066264
7,Other SB,Non-LMIC,89704,0.432484,0.447322,0.966827


In [24]:
color_scale = alt.Scale(domain=["LMIC", "Non-LMIC"], range=["#e41a1c", "#377eb8"])


def _representation_panel(data, title, show_source_axis=False):
    """Build one representation-ratio panel: points per source plus a rule at 1.

    Args:
        data: frame with `representation_ratio`, `source` and `country_label`.
        title: panel title.
        show_source_axis: draw the titled source axis (used for the left-most
            panel only); otherwise the y axis is hidden.

    Returns:
        The layered Altair chart (points + grey vertical rule at x = 1).
    """
    if show_source_axis:
        y_encoding = alt.Y("source:N", title="Source")
    else:
        y_encoding = alt.Y("source:N", title=None, axis=None)

    points = (
        alt.Chart(data)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "representation_ratio:Q",
                title="Representation Ratio",
                scale=alt.Scale(domain=[0.8, 1.2]),
            ),
            y=y_encoding,
            color=alt.Color("country_label:N", scale=color_scale, title="Country"),
        )
        .properties(title=title, width=200, height=100)
    )
    # A ratio of 1 means a source's share within the label equals its overall share.
    parity_rule = (
        alt.Chart(pd.DataFrame({"x": [1]})).mark_rule(color="gray").encode(x="x:Q")
    )
    return alt.layer(points, parity_rule).resolve_scale(x="shared")


pubs_chart = _representation_panel(
    pubs_merged_data, "Representation Ratio - Chains", show_source_axis=True
)
ecr_chart = _representation_panel(ecr_merged_data, "Representation Ratio - ECRs")
nonecr_chart = _representation_panel(nonecr_merged_data, "Representation Ratio - Non-ECRs")

# concatenate the charts
chart = (
    alt.hconcat(pubs_chart, ecr_chart, nonecr_chart)
    .resolve_axis(y="shared")
    .resolve_scale(x="shared", y="shared")
    .properties(spacing=10)
)

# Render to PNG at 3x scale and write it into the reporting folder.
png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open(f"{SAVE_DIR}repr_authors.png", "wb") as f:
    f.write(png_str)

### Labs

In [None]:
# Load the two lab panels and tag each with the depth it belongs to.
flabs = catalog.load(
    "foundational_lab.data_analysis.staggered.outputs.quarterly.primary"
).assign(depth="foundational")

alabs = catalog.load(
    "applied_lab.data_analysis.staggered.outputs.quarterly.primary"
).assign(depth="applied")

In [None]:
# Stack the foundational and applied lab rows into one frame with a fresh index.
labs = pd.concat([flabs, alabs], ignore_index=True)

In [None]:
labs = labs.sort_values(by=['pi_id', 'time'])

# One summary row per PI, taken from the end of their sorted history.
# NOTE(review): GroupBy.last() returns the last non-null value of each column,
# not the literal last row -- confirm this is intended.
last_observation = labs.groupby('pi_id').last().reset_index()

# np.select takes the first condition that holds, so the order below is a
# priority order over the cumulative counts: af, ct_ai, ct_noai, else "other".
choices = ["af", "ct_ai", "ct_noai"]
conditions = [last_observation[f"cum_{name}"] > 0 for name in choices]
last_observation["source"] = np.select(conditions, choices, default="other")

# Attach the per-PI classification to every row. Dropping any existing `source`
# column first keeps the cell re-runnable: without it a second run would create
# `source_x` / `source_y` instead of `source`.
labs = (
    labs.drop(columns=["source"], errors="ignore")
    .merge(last_observation[['pi_id', 'source']], on='pi_id', how='left')
)

In [None]:
# Integer time index -> calendar quarter label. Index 0 is 2017Q4 and the
# mapping runs from -9 (2015Q3) to 25 (2024Q1). Quarters are counted from
# year 0 so that // 4 gives the year and % 4 the zero-based quarter.
REVERSED_QUARTER_MAPPING = {
    offset: f"{quarters // 4}Q{quarters % 4 + 1}"
    for offset, quarters in ((t, 2017 * 4 + 3 + t) for t in range(-9, 26))
}

In [None]:
# Translate the integer `time` index into a quarter label; values outside the
# keys of REVERSED_QUARTER_MAPPING (-9..25) become NaN.
labs["quarter"] = labs["time"].map(REVERSED_QUARTER_MAPPING)

In [None]:
# Let pandas print up to 500 rows before truncating displayed frames.
pd.set_option('display.max_rows', 500)

### PDB submissions

In [None]:
# Mean of each PDB-related metric per (quarter, source).
average_results_over_time = (
    nonecrs.groupby(["quarter", "source"])
    .agg({"organism_rarity_mean": "mean", "mean_tmscore": "mean", "num_diseases": "mean"})
    .reset_index()
)

# keep 2021Q1 through 2024Q1 (both inclusive)
in_window = (
    (average_results_over_time["quarter"] <= "2024Q1")
    & (average_results_over_time["quarter"] >= "2021Q1")
)

# delete 2021Q1 and 2021Q2 for AF
early_af = (average_results_over_time["source"] == "af") & (
    (average_results_over_time["quarter"] == "2021Q1")
    | (average_results_over_time["quarter"] == "2021Q2")
)

# .copy() detaches the filtered rows from the aggregate, so the column
# assignments below do not raise SettingWithCopyWarning.
average_results_over_time = average_results_over_time[in_window & ~early_af].copy()

average_results_over_time["quarter"] = average_results_over_time["quarter"].astype(str)

# map the source labels
average_results_over_time["source"] = average_results_over_time["source"].map(source_labels)

# create four quarter rolling averages (fewer than four quarters are averaged
# at the start of each source's series because min_periods=1)
for metric in ("organism_rarity_mean", "mean_tmscore", "num_diseases"):
    average_results_over_time[f"{metric}_rolling"] = (
        average_results_over_time.groupby("source")[metric]
        .transform(lambda x: x.rolling(4, min_periods=1).mean())
    )

In [None]:
color_scale = alt.Scale(
    domain=["AF", "CT AI", "CT No AI", "Other SB"],
    range=["#e41a1c", "#377eb8", "#4daf4a", "#ff7f00"],
)


def _rolling_metric_panel(field, panel_title, y_title=None, y_domain=None):
    """Build one points-plus-line panel of a rolling metric by quarter and source.

    Args:
        field: column of `average_results_over_time` plotted on the y axis.
        panel_title: title of the layered chart.
        y_title: y-axis title (None hides it).
        y_domain: optional [min, max] for the y scale; when None the scale is
            left to Altair's default.

    Returns:
        The layered Altair chart (circles + line) with a shared y scale.
    """
    y_kwargs = {"title": y_title}
    if y_domain is not None:
        y_kwargs["scale"] = alt.Scale(domain=y_domain)

    # Encodings and size are identical for both layers; only the mark differs.
    base = (
        alt.Chart(average_results_over_time)
        .encode(
            x=alt.X("quarter:N", title="Quarter"),
            y=alt.Y(f"{field}:Q", **y_kwargs),
            color=alt.Color("source:N", scale=color_scale, title="Source"),
        )
        .properties(width=300, height=200)
    )
    return (
        alt.layer(base.mark_circle(size=60), base.mark_line())
        .resolve_scale(y="shared")
        .properties(title=panel_title)
    )


organism_chart = _rolling_metric_panel(
    "organism_rarity_mean_rolling",
    "Organism Rarity (4Q RA) of PDB submissions",
    y_title="Mean",
    y_domain=[0.04, 0.16],
)

tmscore_chart = _rolling_metric_panel(
    "mean_tmscore_rolling",
    "TM Score (4Q RA) of PDB submissions",
    y_domain=[0.75, 0.9],
)

diseases_chart = _rolling_metric_panel(
    "num_diseases_rolling",
    "Disease-relevant Structures (4Q RA) in PDB submissions",
)

# Panels sit side by side, each with its own y scale.
chart = (
    alt.hconcat(organism_chart, tmscore_chart, diseases_chart)
    .resolve_scale(y="independent")
    .properties(spacing=10)
)

# Render to PNG at 3x scale and write it into the reporting folder.
png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open(f"{SAVE_DIR}pdb_submissions.png", "wb") as f:
    f.write(png_str)

### Publication counts

In [None]:
nonecrs = nonecrs.sort_values(by=['author', 'quarter'])

# One summary row per author from the end of their sorted history.
# NOTE(review): GroupBy.last() takes the last non-null value per column, not
# the literal last row -- confirm this is intended.
last_observation = nonecrs.groupby('author').last().reset_index()

# First matching rule wins: strong_af, then strong_ct_ai, then strong_ct_noai;
# authors matching none are labelled "other".
choices = ["af", "ct_ai", "ct_noai"]
conditions = [last_observation[f"strong_{name}"] > 0 for name in choices]
last_observation["strong_source"] = np.select(conditions, choices, default="other")

# Copy the per-author `strong_source` onto every row of that author.
strong_nonecrs = nonecrs.merge(
    last_observation[['author', 'strong_source']],
    on='author',
    how='left',
)

In [None]:
# Mean publications per author-quarter row, by quarter, depth and strong source.
nonecr_publications = (
    strong_nonecrs.groupby(["quarter", "depth", "strong_source"])
    .agg({"num_publications": "mean"})
    .reset_index()
)

# only keep foundational; .copy() so the assignments below act on an
# independent frame instead of a slice (avoids SettingWithCopyWarning)
nonecr_publications = nonecr_publications[nonecr_publications["depth"] == "foundational"].copy()
# map the source labels
nonecr_publications["strong_source"] = nonecr_publications["strong_source"].map(source_labels)

# turn quarter into datetime
nonecr_publications["quarter"] = nonecr_publications["quarter"].astype(str)
nonecr_publications["quarter"] = pd.to_datetime(nonecr_publications["quarter"])

# keep quarters strictly between 2019Q1 and 2024Q1 (both bounds excluded)
nonecr_publications = nonecr_publications[
    (nonecr_publications["quarter"] < "2024Q1") & (nonecr_publications["quarter"] > "2019Q1")
].copy()

# rolling average over four quarters (fewer at the start, min_periods=1)
nonecr_publications["rolling_avg"] = nonecr_publications.groupby(["depth", "strong_source"])["num_publications"].transform(lambda x: x.rolling(4, min_periods=1).mean())

In [None]:
# Show the 20 rows of strong_nonecrs with the highest num_publications.
top_rows = strong_nonecrs["num_publications"].nlargest(20).index
strong_nonecrs.loc[top_rows]

In [None]:
# One fixed colour per source label.
color_scale = alt.Scale(
    domain=['AF', 'CT AI', 'CT No AI', 'Other SB'],
    range=['#e41a1c', '#377eb8', '#4daf4a', '#ff7f00']
)

# Encodings and size shared by the point layer and the line layer.
base_chart = (
    alt.Chart(nonecr_publications)
    .encode(
        x=alt.X('quarter:T', title='Quarter'),
        y=alt.Y('rolling_avg:Q', title='Number of publications', scale=alt.Scale(domain=[1.2, 1.4])),
        color=alt.Color('strong_source:N', scale=color_scale, title='Source'),
    )
    .properties(width=200, height=150)
)
scatter = base_chart.mark_circle(size=60)
line = base_chart.mark_line()

# Grey vertical rule at 2021-07-01.
# NOTE(review): the original comment said "2021Q2", but this date is the first
# day of 2021Q3 -- confirm which is intended.
rule = (
    alt.Chart(pd.DataFrame({'x': ['2021-07-01']}))
    .mark_rule(color='gray')
    .encode(x='x:T')
)

# Layer points, line and rule into a single panel.
nonecr_chart = alt.layer(scatter, line, rule).properties(
    title='Foundational Papers (4Q RA) - Established',
)

# Display the chart
nonecr_chart

In [None]:
ecrs = ecrs.sort_values(by=['author', 'quarter'])

# One summary row per author from the end of their sorted history.
# NOTE(review): GroupBy.last() takes the last non-null value per column, not
# the literal last row -- confirm this is intended.
last_observation = ecrs.groupby('author').last().reset_index()

# First matching rule wins: strong_af, then strong_ct_ai, then strong_ct_noai;
# authors matching none are labelled "other".
choices = ["af", "ct_ai", "ct_noai"]
conditions = [last_observation[f"strong_{name}"] > 0 for name in choices]
last_observation["strong_source"] = np.select(conditions, choices, default="other")

# Copy the per-author `strong_source` onto every row of that author.
strong_ecrs = ecrs.merge(
    last_observation[['author', 'strong_source']],
    on='author',
    how='left',
)

In [None]:
# Mean publications per author-quarter row, by quarter, depth and strong source.
ecr_publications = (
    strong_ecrs.groupby(["quarter", "depth", "strong_source"])
    .agg({"num_publications": "mean"})
    .reset_index()
)

# only keep foundational; .copy() so the assignments below act on an
# independent frame instead of a slice (avoids SettingWithCopyWarning)
ecr_publications = ecr_publications[ecr_publications["depth"] == "foundational"].copy()
# map the source labels
ecr_publications["strong_source"] = ecr_publications["strong_source"].map(source_labels)

# turn quarter into datetime
ecr_publications["quarter"] = ecr_publications["quarter"].astype(str)
ecr_publications["quarter"] = pd.to_datetime(ecr_publications["quarter"])

# keep quarters strictly between 2019Q1 and 2024Q1 (both bounds excluded)
ecr_publications = ecr_publications[
    (ecr_publications["quarter"] < "2024Q1") & (ecr_publications["quarter"] > "2019Q1")
].copy()

# rolling average over four quarters (fewer at the start, min_periods=1)
ecr_publications["rolling_avg"] = ecr_publications.groupby(["depth", "strong_source"])["num_publications"].transform(lambda x: x.rolling(4, min_periods=1).mean())

In [None]:
# One fixed colour per source label.
color_scale = alt.Scale(
    domain=['AF', 'CT AI', 'CT No AI', 'Other SB'],
    range=['#e41a1c', '#377eb8', '#4daf4a', '#ff7f00']
)

# x and colour encodings are the same for both layers; the y encodings differ
# only in their title (the line layer carries none).
quarter_axis = alt.X('quarter:T', title='Quarter')
source_colour = alt.Color('strong_source:N', scale=color_scale, title='Source')

scatter = (
    alt.Chart(ecr_publications)
    .mark_circle(size=60)
    .encode(
        x=quarter_axis,
        y=alt.Y('rolling_avg:Q', title='Number of publications', scale=alt.Scale(domain=[1, 1.3])),
        color=source_colour,
    )
    .properties(width=200, height=150)
)

line = (
    alt.Chart(ecr_publications)
    .mark_line()
    .encode(
        x=quarter_axis,
        y=alt.Y('rolling_avg:Q', title=None, scale=alt.Scale(domain=[1, 1.3])),
        color=source_colour,
    )
    .properties(width=200, height=150)
)

# Grey vertical rule at 2021-07-01.
rule = (
    alt.Chart(pd.DataFrame({'x': ['2021-07-01']}))
    .mark_rule(color='gray')
    .encode(x='x:T')
)

# Layer points, line and rule into a single panel.
ecr_chart = alt.layer(scatter, line, rule).properties(
    title='Foundational Papers (4Q RA) - ECRs',
)

# Display the chart
ecr_chart

In [None]:
# Put the established and ECR panels side by side, each with its own y scale.
side_by_side = alt.hconcat(nonecr_chart, ecr_chart)
chart = side_by_side.resolve_scale(y="independent").properties(spacing=10)

# Render to PNG at 3x scale and write it into the reporting folder.
png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open(f"{SAVE_DIR}pubs.png", "wb") as png_file:
    png_file.write(png_str)

### Representation Ratios

In [None]:
# This cell repeats the strong-source classification already done in the
# "Publication counts" section and rebuilds `strong_nonecrs` the same way.
nonecrs = nonecrs.sort_values(by=['author', 'quarter'])

# One summary row per author from the end of their sorted history.
# NOTE(review): GroupBy.last() takes the last non-null value per column, not
# the literal last row -- confirm this is intended.
last_observation = nonecrs.groupby('author').last().reset_index()

# First matching rule wins: strong_af, then strong_ct_ai, then strong_ct_noai;
# authors matching none are labelled "other".
choices = ["af", "ct_ai", "ct_noai"]
conditions = [last_observation[f"strong_{name}"] > 0 for name in choices]
last_observation["strong_source"] = np.select(conditions, choices, default="other")

# Copy the per-author `strong_source` onto every row of that author.
strong_nonecrs = nonecrs.merge(
    last_observation[['author', 'strong_source']],
    on='author',
    how='left',
)

In [None]:
# Total of ca_count over all rows (sanity check before computing shares).
strong_nonecrs["ca_count"].sum()

In [None]:
# Rows from 2021Q2 onwards. .copy() detaches the selection from
# `strong_nonecrs`, so adding columns to `relevant_period` later does not
# raise SettingWithCopyWarning.
relevant_period = strong_nonecrs[strong_nonecrs["quarter"] >= "2021Q2"].copy()

# Publications per (source, depth).
# NOTE(review): this groups by `source`, not the `strong_source` computed just
# before this section -- confirm which classification is intended.
total_publications = (
    relevant_period.groupby(["source", "depth"])["num_publications"]
    .sum()
    .reset_index()
    .rename(columns={"num_publications": "total_citations"})
)

# Share of all publications held by each (source, depth) pair
total_publications["share_citations"] = (
    total_publications["total_citations"]
    / total_publications["total_citations"].sum()
)

# Sum of ca_count for each (source, depth) pair (labelled "Clinical Article
# Citations" when the results are combined)
ca_publications = (
    relevant_period
    .groupby(["source", "depth"])["ca_count"]
    .sum()
    .reset_index()
    .rename(columns={"ca_count": "ca_citations"})
)

# Pairs without a ca_count total get 0 rather than NaN.
ca_merged_data = pd.merge(
    total_publications,
    ca_publications,
    on=["source", "depth"],
    how="left"
).fillna(0)

ca_merged_data["share_ca_citations"] = (
    ca_merged_data["ca_citations"]
    / ca_merged_data["ca_citations"].sum()
)

# Representation ratio: share of ca_count divided by share of publications
ca_merged_data["representation_ratio"] = (
    ca_merged_data["share_ca_citations"] / 
    ca_merged_data["share_citations"]
)

In [None]:
# Publications per (source, depth) and each pair's share of all publications.
total_publications = (
    relevant_period.groupby(["source", "depth"])["num_publications"]
    .sum()
    .rename("total_citations")
    .reset_index()
)
publication_totals = total_publications["total_citations"]
total_publications["share_citations"] = publication_totals / publication_totals.sum()

# Sum of patent_count per (source, depth).
pa_publications = (
    relevant_period.groupby(["source", "depth"])["patent_count"]
    .sum()
    .rename("pa_citations")
    .reset_index()
)

# Left-join the patent totals; pairs without a total get 0 rather than NaN.
pa_merged_data = total_publications.merge(
    pa_publications,
    on=["source", "depth"],
    how="left",
).fillna(0)

patent_totals = pa_merged_data["pa_citations"]
pa_merged_data["share_pa_citations"] = patent_totals / patent_totals.sum()

# Representation ratio: share of patent_count divided by share of publications.
pa_merged_data["representation_ratio"] = pa_merged_data["share_pa_citations"].div(
    pa_merged_data["share_citations"]
)

In [None]:
# Flag rows where the sum over all columns containing "mesh_" is positive.
# .assign builds a new frame instead of writing into `relevant_period` (a
# selection of strong_nonecrs), which avoids SettingWithCopyWarning.
mesh_columns = [col for col in relevant_period.columns if "mesh_" in col]
relevant_period = relevant_period.assign(
    has_mesh=relevant_period[mesh_columns].sum(axis=1) > 0
)

# Number of flagged rows per (source, depth)
total_publications = (
    relevant_period.groupby(["source", "depth"])["has_mesh"]
    .sum()
    .reset_index()
    .rename(columns={"has_mesh": "total_mesh"})
)

# Share of all flagged rows held by each (source, depth) pair
total_publications["share_mesh"] = (
    total_publications["total_mesh"] / total_publications["total_mesh"].sum()
)

# Sum of mesh_C per (source, depth) pair (labelled "Disease-relevant Research"
# when the results are combined)
mesh_publications = (
    relevant_period.groupby(["source", "depth"])["mesh_C"]
    .sum()
    .reset_index()
)

# Pairs without a mesh_C total get 0 rather than NaN.
mesh_merged_data = pd.merge(
    total_publications, mesh_publications, on=["source", "depth"], how="left"
).fillna(0)

mesh_merged_data["share_mesh_C"] = (
    mesh_merged_data["mesh_C"] / mesh_merged_data["mesh_C"].sum()
)

# Representation ratio: share of mesh_C divided by share of flagged rows
mesh_merged_data["representation_ratio"] = (
    mesh_merged_data["share_mesh_C"] / mesh_merged_data["share_mesh"]
)

In [None]:
# Tag each ratio table with the label it is plotted under, then stack them.
for ratio_table, group_label in (
    (ca_merged_data, "Clinical Article Citations"),
    (pa_merged_data, "Patent Citations"),
    (mesh_merged_data, "Disease-relevant Research"),
):
    ratio_table["group"] = group_label

merged_data = pd.concat(
    [ca_merged_data, pa_merged_data, mesh_merged_data],
    ignore_index=True,
)

# Replace the raw source codes with their display names.
merged_data["source"] = merged_data["source"].map(source_labels)

In [None]:
merged_data

In [None]:
# One fixed colour per source label.
color_scale = alt.Scale(
    domain=['AF', 'CT AI', 'CT No AI', 'Other SB'],
    range=['#e41a1c', '#377eb8', '#4daf4a', '#ff7f00']
)

# Only the foundational rows are plotted: one row of points per group, one
# point per source.
foundational_ratios = merged_data[merged_data["depth"]=="foundational"]

ca_chart = (
    alt.Chart(foundational_ratios)
    .mark_circle(size=60)
    .encode(
        y=alt.Y('group:N', title=None),
        x=alt.X('representation_ratio:Q', title='Representation Ratio'),
        color=alt.Color('source:N', scale=color_scale, title='Source'),
    )
    .properties(
        title='Representation of Clinical Article citations',
        width=200,
        height=100,
    )
)

ca_chart

#### Counts

In [None]:
# Mean ca_count per author-quarter row, by quarter and strong source.
nonecr_publications = strong_nonecrs.groupby(["quarter", "strong_source"]).agg({"ca_count": "mean"}).reset_index()
# map the source labels
nonecr_publications["strong_source"] = nonecr_publications["strong_source"].map(source_labels)

# turn quarter into datetime
nonecr_publications["quarter"] = nonecr_publications["quarter"].astype(str)
nonecr_publications["quarter"] = pd.to_datetime(nonecr_publications["quarter"])

# keep quarters strictly between 2019Q1 and 2024Q1; .copy() so the rolling
# column is added to an independent frame (avoids SettingWithCopyWarning)
nonecr_publications = nonecr_publications[
    (nonecr_publications["quarter"] < "2024Q1") & (nonecr_publications["quarter"] > "2019Q1")
].copy()

# rolling average over four quarters (fewer at the start, min_periods=1)
nonecr_publications["rolling_avg"] = nonecr_publications.groupby([ "strong_source"])["ca_count"].transform(lambda x: x.rolling(4, min_periods=1).mean())

In [None]:
# One fixed colour per source label
color_scale = alt.Scale(
    domain=['AF', 'CT AI', 'CT No AI', 'Other SB'],
    range=['#e41a1c', '#377eb8', '#4daf4a', '#ff7f00'],
)

# Encodings and panel size shared by the point and line layers
trend_base = alt.Chart(nonecr_publications).encode(
    x=alt.X('quarter:T', title='Quarter'),
    y=alt.Y('rolling_avg:Q', title='Mean'),
    color=alt.Color('strong_source:N', scale=color_scale, title='Source'),
).properties(width=200, height=150)

# Same data drawn twice: as dots and as connecting lines
scatter = trend_base.mark_circle(size=60)
line = trend_base.mark_line()

# Grey vertical marker at 2021-07-01 (the 2021Q2/Q3 boundary)
rule = (
    alt.Chart(pd.DataFrame({'x': ['2021-07-01']}))
    .mark_rule(color='gray')
    .encode(x='x:T')
)

# Layer dots, lines and the marker into a single panel
clinical_chart = alt.layer(scatter, line, rule).properties(
    title='Clinical Article Citations (4Q RA)',
)

# Render the panel as the cell output
clinical_chart

In [None]:
# Mean patent_count per (quarter, strong_source).
nonecr_publications = (
    strong_nonecrs.groupby(["quarter", "strong_source"])
    .agg({"patent_count": "mean"})
    .reset_index()
)

# map the source labels
nonecr_publications["strong_source"] = nonecr_publications["strong_source"].map(source_labels)

# Turn quarter into datetime, going through its string form
nonecr_publications["quarter"] = pd.to_datetime(nonecr_publications["quarter"].astype(str))

# Keep quarters strictly between the two bounds. The explicit .copy() makes the
# filtered frame independent of the unfiltered one, so adding the rolling_avg
# column below does not trigger pandas' SettingWithCopyWarning.
nonecr_publications = nonecr_publications[
    (nonecr_publications["quarter"] < "2024Q1") & (nonecr_publications["quarter"] > "2019Q1")
].copy()

# Rolling average over 4 rows within each source, in the row order left by the
# groupby above. min_periods=1 means the first rows of each source (after the
# filter) are averaged over fewer than four quarters.
nonecr_publications["rolling_avg"] = nonecr_publications.groupby(["strong_source"])["patent_count"].transform(
    lambda x: x.rolling(4, min_periods=1).mean()
)

In [None]:
# One fixed colour per source label
color_scale = alt.Scale(
    domain=['AF', 'CT AI', 'CT No AI', 'Other SB'],
    range=['#e41a1c', '#377eb8', '#4daf4a', '#ff7f00'],
)

# Encodings and panel size shared by the point and line layers
# (no y-axis title on this panel)
trend_base = alt.Chart(nonecr_publications).encode(
    x=alt.X('quarter:T', title='Quarter'),
    y=alt.Y('rolling_avg:Q', title=None),
    color=alt.Color('strong_source:N', scale=color_scale, title='Source'),
).properties(width=200, height=150)

# Same data drawn twice: as dots and as connecting lines
scatter = trend_base.mark_circle(size=60)
line = trend_base.mark_line()

# Grey vertical marker at 2021-07-01 (the 2021Q2/Q3 boundary)
rule = (
    alt.Chart(pd.DataFrame({'x': ['2021-07-01']}))
    .mark_rule(color='gray')
    .encode(x='x:T')
)

# Layer dots, lines and the marker into a single panel
patent_chart = alt.layer(scatter, line, rule).properties(
    title='Patent Citations (4Q RA)',
)

# Render the panel as the cell output
patent_chart

In [None]:
# Mean mesh_C per (quarter, strong_source).
nonecr_publications = (
    strong_nonecrs.groupby(["quarter", "strong_source"])
    .agg({"mesh_C": "mean"})
    .reset_index()
)

# map the source labels
nonecr_publications["strong_source"] = nonecr_publications["strong_source"].map(source_labels)

# Turn quarter into datetime, going through its string form
nonecr_publications["quarter"] = pd.to_datetime(nonecr_publications["quarter"].astype(str))

# Keep quarters strictly between the two bounds. The explicit .copy() makes the
# filtered frame independent of the unfiltered one, so adding the rolling_avg
# column below does not trigger pandas' SettingWithCopyWarning.
nonecr_publications = nonecr_publications[
    (nonecr_publications["quarter"] < "2024Q1") & (nonecr_publications["quarter"] > "2019Q1")
].copy()

# Rolling average over 4 rows within each source, in the row order left by the
# groupby above. min_periods=1 means the first rows of each source (after the
# filter) are averaged over fewer than four quarters.
nonecr_publications["rolling_avg"] = nonecr_publications.groupby(["strong_source"])["mesh_C"].transform(
    lambda x: x.rolling(4, min_periods=1).mean()
)

In [None]:
# One fixed colour per source label
color_scale = alt.Scale(
    domain=['AF', 'CT AI', 'CT No AI', 'Other SB'],
    range=['#e41a1c', '#377eb8', '#4daf4a', '#ff7f00'],
)

# Encodings and panel size shared by the point and line layers.
# The y scale domain is pinned to [0.02, 0.05] and the axis carries no title.
trend_base = alt.Chart(nonecr_publications).encode(
    x=alt.X('quarter:T', title='Quarter'),
    y=alt.Y('rolling_avg:Q', title=None, scale=alt.Scale(domain=[0.02, 0.05])),
    color=alt.Color('strong_source:N', scale=color_scale, title='Source'),
).properties(width=200, height=150)

# Same data drawn twice: as dots and as connecting lines
scatter = trend_base.mark_circle(size=60)
line = trend_base.mark_line()

# Grey vertical marker at 2021-07-01 (the 2021Q2/Q3 boundary)
rule = (
    alt.Chart(pd.DataFrame({'x': ['2021-07-01']}))
    .mark_rule(color='gray')
    .encode(x='x:T')
)

# Layer dots, lines and the marker into a single panel
mesh_chart = alt.layer(scatter, line, rule).properties(
    title='C-Category MeSH links (4Q RA)',
)

# Render the panel as the cell output
mesh_chart

In [None]:
# Place the three panels side by side; each panel keeps its own y scale.
side_by_side = alt.hconcat(clinical_chart, patent_chart, mesh_chart)
chart = side_by_side.resolve_scale(y='independent').properties(spacing=10)

# Render the Vega-Lite spec to PNG at 3x scale and write it into SAVE_DIR.
png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open(f"{SAVE_DIR}translational.png", "wb") as f:
    f.write(png_str)