In [1]:
%load_ext kedro.ipython
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)

import altair as alt
import vl_convert as vlc
alt.data_transformers.disable_max_rows()

from alphafold_impact.settings import SESSION_STORE_ARGS
PATHDIR = SESSION_STORE_ARGS['path']
SAVE_DIR = PATHDIR + '/data/08_reporting/reach/'
ECR_SAVE_DIR = PATHDIR + '/data/08_reporting/reach/ecr/'

# check folder, create if it doesn't exist
import os
for folder in [SAVE_DIR, ECR_SAVE_DIR]:
    if not os.path.exists(folder):
        os.makedirs(folder)

In [2]:
# Reach

In [2]:
publications = catalog.load("publications.data.outputs")

In [3]:
publications["authors"] = publications["authorships"].apply(
    lambda x: [y[0] for y in x] if x is not None else []
)

In [4]:
publications["publication_date"] = pd.to_datetime(publications["publication_date"])
publications["quarter"] = publications["publication_date"].dt.to_period("Q").astype(str)

In [6]:
authors = (
    publications[
        [
            "id",
            "source",
            "authors",
            "level",
            "quarter",
            "fwci",
            "primary_field"
        ]
    ]
    .explode("authors")
    .drop_duplicates(subset=["authors", "source"])
)


In [7]:
authors = (
    publications[
        [
            "id",
            "source",
            "authors",
            "level",
            "quarter",
            "fwci",
            "primary_field"
        ]
    ]
    .explode("authors")
    .drop_duplicates(subset=["authors", "source"])
)

unique_researchers_overtime = (
    authors.groupby(["quarter", "source"])["authors"]
    .nunique()
    .reset_index()
    .rename(columns={"authors": "researcher_count"})
)

unique_researchers_overtime["cumulative_researcher_count"] = (
    unique_researchers_overtime.groupby("source")["researcher_count"].cumsum()
)

unique_researchers_overtime = unique_researchers_overtime[
    (unique_researchers_overtime["quarter"] >= "2020Q1")
    & (unique_researchers_overtime["quarter"] <= "2024Q1")
]

chart = alt.Chart(unique_researchers_overtime).mark_line(interpolate='step-after').encode(
    x='quarter:N',
    y='cumulative_researcher_count:Q',
    color='source:N',
    tooltip=['quarter:N', 'cumulative_researcher_count:Q', 'source:N']
).properties(
    title='Number of Source-Unique Researchers Over Time by Source Group'
)

png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open("source_unique_researchers.png", "wb") as f:
    f.write(png_str)

In [8]:
# Assign priority to each source
source_priority = {"af": 1, "ct_ai": 2, "ct_noai": 3, "other": 4}
authors["source_priority"] = authors["source"].map(source_priority)

# Sort the DataFrame based on the priority
authors = authors.sort_values(by=["authors", "source_priority"])

# Drop duplicates based on authors to keep the highest priority source for each author
authors = authors.drop_duplicates(subset=["authors"])

# Group by quarter and count the number of unique researchers
unique_researchers_overtime = (
    authors.groupby(["quarter", "source"])["authors"]
    .nunique()
    .reset_index()
    .rename(columns={"authors": "researcher_count"})
)

# Calculate the cumulative count of unique researchers over time for each source
unique_researchers_overtime["cumulative_researcher_count"] = (
    unique_researchers_overtime.groupby("source")["researcher_count"].cumsum()
)

# Filter the data for the desired time range
unique_researchers_overtime = unique_researchers_overtime[
    (unique_researchers_overtime["quarter"] >= "2020Q1")
    & (unique_researchers_overtime["quarter"] <= "2024Q1")
]

chart = alt.Chart(unique_researchers_overtime).mark_line(interpolate='step-after').encode(
    x='quarter:N',
    y='cumulative_researcher_count:Q',
    color='source:N',
    tooltip=['quarter:N', 'cumulative_researcher_count:Q', 'source:N']
).properties(
    title='Number of Unique Researchers Over Time by Source Group'
)

png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open("unique_researchers.png", "wb") as f:
    f.write(png_str)

In [9]:
authors = authors[authors["primary_field"].isin(["Biochemistry, Genetics and Molecular Biology", "Medicine", "Chemistry", "Immunology and Microbiology"])]

unique_researchers_overtime = (
    authors.groupby(["quarter", "source", "primary_field"])["authors"]
    .nunique()
    .reset_index()
    .rename(columns={"authors": "researcher_count"})
)

# Calculate the cumulative count of unique researchers over time for each source and primary_field
unique_researchers_overtime["cumulative_researcher_count"] = (
    unique_researchers_overtime.groupby(["source", "primary_field"])[
        "researcher_count"
    ].cumsum()
)

# Filter the data for the desired time range
unique_researchers_overtime = unique_researchers_overtime[
    (unique_researchers_overtime["quarter"] >= "2020Q1")
    & (unique_researchers_overtime["quarter"] <= "2024Q1")
]

# Create the Altair step chart with faceting by primary_field
step_line = (
    alt.Chart(unique_researchers_overtime)
    .mark_line(interpolate="step-after")
    .encode(
        alt.X("quarter:N", title="Quarter"),
        alt.Y("cumulative_researcher_count:Q", title=None),
        alt.Color("source:N", title="Source"),
    )
)

chart = (
    step_line.properties(
        title="Number of Source-Unique Researchers Over Time by Field",
        width=300,
        height=120,
    )
    .facet(facet="primary_field:N", columns=2, spacing=25)
    .resolve_scale(x="shared", y="independent")
    .configure_title(fontSize=16)
    .configure_axis(labelFontSize=12, titleFontSize=14)
    .configure_header(titleFontSize=14, labelFontSize=12)
)

png_str = vlc.vegalite_to_png(vl_spec=chart.to_json(), scale=3)
with open("source_unique_researchers_field.png", "wb") as f:
    f.write(png_str)

In [7]:
# create mesh_C_ind for mesh_C that is 1 if mesh_C is > 0, 0 otherwise
publications["mesh_C_ind"] = publications["mesh_C"].apply(lambda x: 1 if x > 0 else 0)

In [8]:
# create variable depth that is "adjacent" if level is 0, and "downstream" if level is 1 or 2
publications["depth"] = publications["level"].apply(lambda x: "adjacent" if x == "0" else "downstream" if x in [1, 2] else "distant")

In [9]:
publications.groupby(["source", "depth"])["mesh_C_ind"].sum()


source  depth   
af      adjacent     [1;36m3097[0m
        distant     [1;36m76320[0m
ct_ai   adjacent      [1;36m644[0m
        distant     [1;36m19661[0m
ct_pp   adjacent     [1;36m1865[0m
        distant     [1;36m53541[0m
ct_sb   adjacent     [1;36m1383[0m
        distant     [1;36m49112[0m
other   adjacent     [1;36m2058[0m
        distant     [1;36m92698[0m
Name: mesh_C_ind, dtype: int64

In [10]:
publications.groupby(["source", "depth"])["mesh_C_ind"].mean()


source  depth   
af      adjacent    [1;36m0.094392[0m
        distant     [1;36m0.148382[0m
ct_ai   adjacent    [1;36m0.116582[0m
        distant     [1;36m0.122388[0m
ct_pp   adjacent    [1;36m0.286219[0m
        distant     [1;36m0.242117[0m
ct_sb   adjacent    [1;36m0.221919[0m
        distant     [1;36m0.244117[0m
other   adjacent    [1;36m0.069009[0m
        distant     [1;36m0.185471[0m
Name: mesh_C_ind, dtype: float64

In [11]:
strong_publications = publications[publications["chain_label"].isin(["strong", "partial_strong"])]
strong_publications.groupby(["source", "depth"])["mesh_C_ind"].sum()


source  depth   
af      adjacent      [1;36m685[0m
        distant     [1;36m15673[0m
ct_ai   adjacent        [1;36m0[0m
        distant      [1;36m1508[0m
ct_pp   adjacent       [1;36m13[0m
        distant      [1;36m3330[0m
ct_sb   distant      [1;36m2094[0m
other   adjacent       [1;36m39[0m
        distant      [1;36m6591[0m
Name: mesh_C_ind, dtype: int64

In [12]:
strong_publications.groupby(["source", "depth"])["mesh_C_ind"].mean()


source  depth   
af      adjacent    [1;36m0.131883[0m
        distant     [1;36m0.195275[0m
ct_ai   adjacent    [1;36m0.000000[0m
        distant     [1;36m0.155432[0m
ct_pp   adjacent    [1;36m0.541667[0m
        distant     [1;36m0.281274[0m
ct_sb   distant     [1;36m0.264528[0m
other   adjacent    [1;36m0.096296[0m
        distant     [1;36m0.211365[0m
Name: mesh_C_ind, dtype: float64

In [13]:
publications.ca_count


[1;36m0[0m          [1;36m4.0[0m
[1;36m1[0m          [1;36m0.0[0m
[1;36m2[0m          [1;36m1.0[0m
[1;36m3[0m          [1;36m0.0[0m
[1;36m4[0m          [1;36m0.0[0m
          [33m...[0m 
[1;36m1678008[0m    [1;36m0.0[0m
[1;36m1678009[0m    [1;36m0.0[0m
[1;36m1678010[0m    [1;36m0.0[0m
[1;36m1678011[0m    [1;36m0.0[0m
[1;36m1678012[0m    [1;36m0.0[0m
Name: ca_count, Length: [1;36m1678013[0m, dtype: float64

#### Clinical citations

In [14]:
publications.groupby(["source", "depth"])["ca_count"].sum()



source  depth   
af      adjacent     [1;36m22634.0[0m
        distant     [1;36m313889.0[0m
ct_ai   adjacent      [1;36m3692.0[0m
        distant      [1;36m77918.0[0m
ct_pp   adjacent      [1;36m4702.0[0m
        distant     [1;36m144460.0[0m
ct_sb   adjacent      [1;36m4726.0[0m
        distant     [1;36m135673.0[0m
other   adjacent     [1;36m16928.0[0m
        distant     [1;36m321294.0[0m
Name: ca_count, dtype: float64

In [25]:
publications.groupby(["source", "depth"])["ca_count"].mean()


source   depth   
af       adjacent    [1;36m0.004747[0m
         distant     [1;36m0.018750[0m
ct_ai    adjacent    [1;36m0.006520[0m
         distant     [1;36m0.005714[0m
ct_noai  adjacent    [1;36m0.034029[0m
         distant     [1;36m0.179203[0m
other    adjacent    [1;36m0.001401[0m
         distant     [1;36m0.018398[0m
Name: ca_count, dtype: float64

In [26]:
strong_publications.groupby(["source", "depth"])["ca_count"].sum()


source   depth   
af       adjacent      [1;36m32.0[0m
         distant     [1;36m1776.0[0m
ct_ai    adjacent       [1;36m4.0[0m
         distant       [1;36m20.0[0m
ct_noai  adjacent      [1;36m18.0[0m
         distant     [1;36m1615.0[0m
other    distant      [1;36m273.0[0m
Name: ca_count, dtype: float64

In [28]:
strong_publications.groupby(["source", "depth"])["ca_count"].mean()


source   depth   
af       adjacent    [1;36m0.009849[0m
         distant     [1;36m0.048554[0m
ct_ai    adjacent    [1;36m0.006107[0m
         distant     [1;36m0.004879[0m
ct_noai  adjacent    [1;36m0.031469[0m
         distant     [1;36m0.176503[0m
other    distant     [1;36m0.014097[0m
Name: ca_count, dtype: float64

#### Patent counts

In [29]:
publications.groupby(["source", "depth"])["patent_count"].sum()



source   depth   
af       adjacent      [1;36m79.0[0m
         distant      [1;36m461.0[0m
ct_ai    adjacent      [1;36m76.0[0m
         distant      [1;36m225.0[0m
ct_noai  adjacent     [1;36m194.0[0m
         distant     [1;36m2318.0[0m
other    adjacent     [1;36m705.0[0m
         distant     [1;36m2389.0[0m
Name: patent_count, dtype: float64

In [30]:
publications.groupby(["source", "depth"])["patent_count"].mean()


source   depth   
af       adjacent    [1;36m0.006579[0m
         distant     [1;36m0.002595[0m
ct_ai    adjacent    [1;36m0.026081[0m
         distant     [1;36m0.006429[0m
ct_noai  adjacent    [1;36m0.055476[0m
         distant     [1;36m0.029069[0m
other    adjacent    [1;36m0.070564[0m
         distant     [1;36m0.008174[0m
Name: patent_count, dtype: float64

In [31]:
strong_publications.groupby(["source", "depth"])["patent_count"].sum()


source   depth   
af       adjacent     [1;36m25.0[0m
         distant     [1;36m148.0[0m
ct_ai    adjacent      [1;36m7.0[0m
         distant      [1;36m27.0[0m
ct_noai  adjacent     [1;36m28.0[0m
         distant     [1;36m438.0[0m
other    distant     [1;36m167.0[0m
Name: patent_count, dtype: float64

In [32]:
strong_publications.groupby(["source", "depth"])["patent_count"].mean()


source   depth   
af       adjacent    [1;36m0.007695[0m
         distant     [1;36m0.004046[0m
ct_ai    adjacent    [1;36m0.010687[0m
         distant     [1;36m0.006587[0m
ct_noai  adjacent    [1;36m0.048951[0m
         distant     [1;36m0.047869[0m
other    distant     [1;36m0.008623[0m
Name: patent_count, dtype: float64

#### Patent citations

In [34]:
publications.groupby(["source", "depth"])["patent_citation"].sum()



source   depth   
af       adjacent       [1;36m9.0[0m
         distant       [1;36m88.0[0m
ct_ai    adjacent      [1;36m27.0[0m
         distant       [1;36m48.0[0m
ct_noai  adjacent     [1;36m148.0[0m
         distant     [1;36m2191.0[0m
other    adjacent     [1;36m817.0[0m
         distant     [1;36m1045.0[0m
Name: patent_citation, dtype: float64

In [35]:
publications.groupby(["source", "depth"])["patent_citation"].mean()


source   depth   
af       adjacent    [1;36m0.132353[0m
         distant     [1;36m0.251429[0m
ct_ai    adjacent    [1;36m0.551020[0m
         distant     [1;36m0.279070[0m
ct_noai  adjacent    [1;36m1.525773[0m
         distant     [1;36m1.740270[0m
other    adjacent    [1;36m3.071429[0m
         distant     [1;36m0.585434[0m
Name: patent_citation, dtype: float64

In [36]:
strong_publications.groupby(["source", "depth"])["patent_citation"].sum()


source   depth   
af       adjacent      [1;36m1.0[0m
         distant      [1;36m25.0[0m
ct_ai    adjacent      [1;36m0.0[0m
         distant      [1;36m10.0[0m
ct_noai  adjacent      [1;36m8.0[0m
         distant     [1;36m583.0[0m
other    distant      [1;36m61.0[0m
Name: patent_citation, dtype: float64

In [37]:
strong_publications.groupby(["source", "depth"])["patent_citation"].mean()


source   depth   
af       adjacent    [1;36m0.045455[0m
         distant     [1;36m0.193798[0m
ct_ai    adjacent    [1;36m0.000000[0m
         distant     [1;36m0.384615[0m
ct_noai  adjacent    [1;36m0.571429[0m
         distant     [1;36m3.068421[0m
other    distant     [1;36m0.570093[0m
Name: patent_citation, dtype: float64