# Exploration of Gateway to Research (GtR) data

In [1]:
%load_ext kedro.ipython

import pandas as pd
import altair as alt
import yaml

# Turn off Altair's maximum-row check on the data passed to charts.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# Load the three GtR datasets by name, in order: institutions, publications,
# and AlphaFold citations. `catalog` is not defined in this notebook; it is
# presumably injected by the kedro.ipython extension loaded in the first cell.
gtr_institutions, gtr_publications, gtr_citations = (
    catalog.load(dataset_name)
    for dataset_name in (
        "oa.data_processing.gtr.institutions",
        "oa.data_processing.gtr.publications",
        "oa.data_processing.gtr.citations.alphafold",
    )
)

### Publications over time

In [3]:
# Monthly publication counts from 2008 onwards, drawn as a bar chart.
# Requires gtr_publications (loaded in an earlier cell) with a
# 'publication_date' column that pd.to_datetime can parse.

# Build the working frame in a single chain. Each .assign returns a new
# DataFrame, so gtr_publications is left untouched and no column is ever
# written onto a filtered slice (which can raise SettingWithCopyWarning on
# pandas versions without copy-on-write).
pubs = (
    gtr_publications
    .assign(publication_date=lambda df: pd.to_datetime(df['publication_date']))
    # keep publications from 2008 onwards (rows with a missing date drop out too)
    .loc[lambda df: df['publication_date'] >= '2008-01-01']
    # 'YYYY-MM' strings sort lexicographically in chronological order
    .assign(year_month=lambda df: df['publication_date'].dt.strftime('%Y-%m'))
)

# Group by 'year_month' and count the publications
monthly_counts = pubs.groupby('year_month').size().reset_index(name='counts')

# Bar chart of the pre-aggregated counts: one bar per month.
# The x axis is ordinal, so every month present in the data gets its own tick;
# no tickCount is needed (it only applies to quantitative scales).
chart = alt.Chart(monthly_counts).mark_bar().encode(
    x=alt.X('year_month:O', title='Publication Month', axis=alt.Axis(grid=False, labelAngle=-45, labelFontSize=8)),
    y=alt.Y('counts:Q', title='Number of Publications')
).properties(
    title='Monthly Distribution of GTR Publications',
    width=1800,  # Adjusted for better readability
    height=400
)

chart

### Distribution over institutions

In [4]:
# Preview the institutions table (pandas renders a truncated head/tail view).
gtr_institutions

Unnamed: 0,paper_id,author_id,institution_id,institution_name,country_code
0,W3123896073,A5029121397,I157614274,University of Cape Town,ZA
1,W3123896073,A5029121397,I46305939,University of Zielona Góra,PL
2,W3123896073,A5029121397,I121797337,Leiden University,NL
3,W3123896073,A5045208440,I4210105583,Royal Observatory,GB
4,W3123896073,A5045208440,I98677209,University of Edinburgh,GB
...,...,...,...,...,...
1880343,W2528595972,A5085798363,I4210156067,Institute for Research on Combustion,IT
1880344,W2528595972,A5076148122,I4210156067,Institute for Research on Combustion,IT
1880345,W2528595972,A5085943471,I71267560,University of Naples Federico II,IT
1880346,W2528595972,A5084132907,I82284825,Cranfield University,GB


In [5]:
# Rank institutions by how many rows of gtr_institutions reference them and
# colour the bars by whether the institution is in GB or elsewhere.

# Every row of gtr_institutions adds one to the group identified by these columns.
institution_keys = ["institution_id", "institution_name", "country_code"]

institution_counts = (
    gtr_institutions.groupby(institution_keys)
    .size()
    .reset_index(name="count")
    .assign(
        # Two-way split used for colouring: "GB" versus everything else ("RoW").
        Region=lambda counts: counts["country_code"].apply(
            lambda code: "RoW" if code != "GB" else "GB"
        )
    )
    # keep only institutions with more than 4000 rows (papers/authors)
    .loc[lambda counts: counts["count"] > 4000]
)

# Horizontal bars, longest first (sort="-x" orders the y axis by descending x).
institution_encodings = dict(
    x=alt.X("count:Q", title="Number of Papers/Authors"),
    y=alt.Y("institution_name:N", title="Institution Name", sort="-x"),
    color=alt.Color("Region:N", title="Region"),
    tooltip=["institution_name", "count", "Region"],
)

chart = (
    alt.Chart(institution_counts)
    .mark_bar()
    .encode(**institution_encodings)
    .properties(title="Distribution of Institutions in GB vs RoW", height=1000)
)

chart

In [6]:
# Preview the publications table (pandas renders a truncated head/tail view).
gtr_publications

Unnamed: 0,work_id,title,abstract,doi,publication_date
0,W1996468795,Measurement of an excess of<mml:math xmlns:mml...,"Based on the full BaBar data sample, we report...",https://doi.org/10.1103/physrevd.88.072012,2013-10-31
1,W1532821088,Measurement of the Higgs boson mass from the<m...,An improved measurement of the mass of the Hig...,https://doi.org/10.1103/physrevd.90.052004,2014-09-09
2,W2025070190,Electron reconstruction and identification eff...,Many of the interesting physics processes to b...,https://doi.org/10.1140/epjc/s10052-014-2941-0,2014-07-01
3,W2202434937,Search for new phenomena in dijet mass and ang...,This Letter describes a model-agnostic search ...,https://doi.org/10.1016/j.physletb.2016.01.032,2016-03-01
4,W1811257610,Measurement of event-plane correlations in<mml...,A measurement of event-plane correlations invo...,https://doi.org/10.1103/physrevc.90.024905,2014-08-12
...,...,...,...,...,...
157794,W2792494745,Two-dimensional heat and mass transfer and the...,Transport of heat and mass and the thermodynam...,https://doi.org/10.1016/j.cep.2018.02.025,2018-04-01
157795,W4213414592,Mental health and life satisfaction among 10–1...,"In many countries, including in the United Kin...",https://doi.org/10.1186/s12889-022-12752-6,2022-02-23
157796,W3042807857,PhISCS-BnB: a fast branch and bound algorithm ...,Recent advances in single-cell sequencing (SCS...,https://doi.org/10.1093/bioinformatics/btaa464,2020-07-01
157797,W3207988091,Effect of CRTH2 antagonism on the response to ...,The chemoattractant receptor-homologous molecu...,https://doi.org/10.1136/thoraxjnl-2021-217429,2021-10-29


In [7]:
# Attach the publication columns to every citation row via the shared work_id.
# The left join keeps all rows of gtr_citations, including any without a
# matching publication; merge returns a new frame, so gtr_citations is unchanged.
citations = gtr_citations.merge(gtr_publications, how='left', on='work_id')

In [8]:
# Frequency table: number of gtr_citations rows for each af_indirect_citations value.
gtr_citations['af_indirect_citations'].value_counts()


af_indirect_citations
0     157506
1        196
2         48
3         18
4         13
5          8
8          5
13         2
9          1
11         1
6          1
Name: count, dtype: int64

In [9]:
# Bar chart of the papers that have at least one indirect AlphaFold citation.
# Requires `citations` (the citations/publications merge from an earlier cell)
# with 'title' and 'af_indirect_citations' columns.
indirect_citations = citations[citations.af_indirect_citations > 0]

# One bar per paper title; bar length is that paper's af_indirect_citations
# value, so the x axis is labelled as a citation count rather than a paper count.
# sort='-x' lists the titles by descending x value.
indirect_citations_chart = alt.Chart(indirect_citations).mark_bar().encode(
    x=alt.X('af_indirect_citations:Q', title='Number of Indirect Citations'),
    y=alt.Y('title:N', title='Paper Title', sort='-x'),
    tooltip=['title', 'af_indirect_citations']
).properties(
    title='Count of Papers that Indirectly Cite Alphafold',
    height=2400
)

indirect_citations_chart

In [11]:
# Compare direct and indirect AlphaFold citation counts per paper title.
# Requires `citations` (the citations/publications merge from an earlier cell)
# with 'title', 'af_indirect_citations' and 'af_direct_citation' columns.

# One long-format frame per citation type, sharing the columns title/count/type.
indirect_df = (
    citations[['title', 'af_indirect_citations']]
    .rename(columns={'af_indirect_citations': 'count'})
    .assign(type='Indirect')
)
direct_df = (
    citations[['title', 'af_direct_citation']]
    .rename(columns={'af_direct_citation': 'count'})
    .assign(type='Direct')
)

# Single source of truth for the colour of each citation type; it feeds both
# the 'color' column and the chart's colour scale so the two cannot drift apart.
citation_colors = {'Indirect': 'blue', 'Direct': 'red'}

combined_df = (
    # Concatenate the two DataFrames
    pd.concat([indirect_df, direct_df])
    # keep those that have at least one citation either directly or indirectly
    .loc[lambda df: df['count'] > 0]
    # .assign returns a new frame, so the colour column is never written onto a
    # filtered slice (which can raise SettingWithCopyWarning on pandas versions
    # without copy-on-write)
    .assign(color=lambda df: df['type'].map(citation_colors))
)

# Sum the counts per (title, type, color), largest first.
grouped_df = (
    combined_df.groupby(['title', 'type', 'color'], as_index=False)['count']
    .sum()
    .sort_values(by='count', ascending=False)
)

# Create the chart. grouped_df is already aggregated, so every channel reads
# the raw 'count' field; mixing in an aggregate such as sum(count) would make
# Vega-Lite re-aggregate the data and treat the other fields as group-by keys.
chart = alt.Chart(grouped_df).mark_bar().encode(
    x=alt.X('count:Q', title='Number of Citations'),
    y=alt.Y('title:N', title='Paper Title', sort='-x'),
    color=alt.Color(
        'type:N',
        title='Citation Type',
        scale=alt.Scale(
            domain=list(citation_colors),
            range=list(citation_colors.values()),
        ),
    ),
    tooltip=['title', 'count', 'type']
).properties(
    title='Count of Papers Citing Alphafold (Direct and Indirect)',
    height=3200
)

chart
