# Exploration of Gateway to Research (GtR) data

In [11]:
# Load Kedro's IPython extension. Later cells use a `catalog` object that is
# never defined in this notebook -- presumably it is injected by this
# extension (TODO confirm).
%load_ext kedro.ipython

import pandas as pd
import altair as alt
import yaml

# Turn off Altair's maximum-row check so the larger frames below can be charted.
alt.data_transformers.disable_max_rows()

The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


[1;35mDataTransformerRegistry.enable[0m[1m([0m[32m'default'[0m[1m)[0m

In [12]:
# Load the three GtR datasets used in this notebook from the Kedro catalog.
# The loads run left to right, in the same order as the dataset names below.
gtr_institutions, gtr_publications, gtr_citations = map(
    catalog.load,
    (
        "oa.data_processing.gtr.institutions",
        "oa.data_processing.gtr.publications",
        "oa.data_processing.gtr.citations.alphafold",
    ),
)

### Publications over time

In [13]:
# Monthly publication counts from 2008 onwards.
#
# The frame is built in a single chain of `.assign` / `.loc` calls. Each
# `.assign` returns a new frame, so `gtr_publications` is never mutated and no
# column is assigned onto a boolean-filtered slice (the pattern that can raise
# pandas' SettingWithCopyWarning).
pubs = (
    gtr_publications
    .assign(publication_date=lambda df: pd.to_datetime(df['publication_date']))
    # keep publications from 2008 onwards (rows with a missing date are dropped
    # too, because the comparison is False for them)
    .loc[lambda df: df['publication_date'] >= '2008-01-01']
    # 'YYYY-MM' strings sort lexicographically in chronological order
    .assign(year_month=lambda df: df['publication_date'].dt.strftime('%Y-%m'))
)

# One row per month with the number of publications in that month
monthly_counts = pubs.groupby('year_month').size().reset_index(name='counts')

# Bar chart of the pre-aggregated counts (one bar per month).
# tickCount is tied to the number of plotted months; it previously used the
# number of publication rows, which is unrelated to the axis.
chart = alt.Chart(monthly_counts).mark_bar().encode(
    x=alt.X(
        'year_month:O',
        title='Publication Month',
        axis=alt.Axis(
            tickCount=len(monthly_counts),
            grid=False,
            labelAngle=-45,
            labelFontSize=8,
        ),
    ),
    y=alt.Y('counts:Q', title='Number of Publications')
).properties(
    title='Monthly Distribution of GTR Publications',
    width=1800,  # wide canvas so the per-month labels stay legible
    height=400
)

chart

### Distribution over institutions

In [14]:
# Preview the institutions table (rendered below with columns work_id,
# author_id, institution_id, institution_name and country_code).
gtr_institutions

Unnamed: 0,work_id,author_id,institution_id,institution_name,country_code
0,W2107167859,A5075519270,I40120149,University of Oxford,GB
1,W2107167859,A5075519270,I4210146410,Science Oxford,GB
2,W2107167859,A5065572308,I40120149,University of Oxford,GB
3,W2107167859,A5065572308,I4210146410,Science Oxford,GB
4,W2107167859,A5078802488,I40120149,University of Oxford,GB
...,...,...,...,...,...
4233157,W3212985413,A5056240345,I45129253,University College London,GB
4233158,W3212985413,A5055823700,I4210122016,Wellcome / EPSRC Centre for Interventional and...,GB
4233159,W3212985413,A5055823700,I45129253,University College London,GB
4233160,W3212985413,A5077630267,I4210122016,Wellcome / EPSRC Centre for Interventional and...,GB


In [15]:
# Count rows of the institutions table per (id, name, country) combination.
grouping_keys = ["institution_id", "institution_name", "country_code"]
institution_counts = gtr_institutions.groupby(grouping_keys).size().reset_index(
    name="count"
)

# Two-way split: institutions whose country code is exactly "GB" vs. everything
# else ("RoW").
institution_counts["Region"] = [
    "GB" if code == "GB" else "RoW"
    for code in institution_counts["country_code"]
]

# Restrict the chart to institutions with more than 4000 rows (papers/authors).
is_large_institution = institution_counts["count"] > 4000
institution_counts = institution_counts[is_large_institution]

# Horizontal bars, longest first, coloured by region.
x_encoding = alt.X("count:Q", title="Number of Papers/Authors")
y_encoding = alt.Y("institution_name:N", title="Institution Name", sort="-x")
color_encoding = alt.Color("Region:N", title="Region")

chart = (
    alt.Chart(institution_counts)
    .mark_bar()
    .encode(
        x=x_encoding,
        y=y_encoding,
        color=color_encoding,
        tooltip=["institution_name", "count", "Region"],
    )
    .properties(title="Distribution of Institutions in GB vs RoW", height=1000)
)

chart

In [16]:
# Preview the publications table (rendered below with columns work_id, title,
# abstract, doi and publication_date).
gtr_publications

Unnamed: 0,work_id,title,abstract,doi,publication_date
0,W2107167859,Designing logical codon reassignment – Expandi...,"Over the last decade, the ability to genetical...",https://doi.org/10.1039/c4sc01534g,2015-01-01
1,W2027950306,<scp>RNA</scp>‐<scp>S</scp>eq bulked segregant...,The identification of genetic markers linked t...,https://doi.org/10.1111/pbi.12281,2014-11-08
2,W2625129035,Validation practices for satellite‐based Earth...,Abstract Assessing the inherent uncertainties ...,https://doi.org/10.1002/2017rg000562,2017-09-01
3,W2121853553,Iridium-catalysed amine alkylation with alcoho...,Amines have been directly alkylated with alcoh...,https://doi.org/10.1039/b923083a,2010-01-01
4,W2766693751,SDSS-IV MaNGA: evidence of the importance of A...,We present new evidence for AGN feedback in a ...,https://doi.org/10.1093/mnras/sty202,2018-01-29
...,...,...,...,...,...
457154,W2936701156,Ultrasonic Testing of Laboratory Samples Repre...,,https://doi.org/10.1784/insi.2019.61.4.187,2019-04-01
457155,W3081449123,Decoupled Spatial Distribution of PAHs Degrade...,Abstract Knowing the structure and distributio...,https://doi.org/10.1029/2020jg005659,2020-09-01
457156,W4294024768,SILAC-based quantitative proteomics to investi...,Abstract Background Macrophages play a central...,https://doi.org/10.1186/s12950-022-00309-8,2022-09-01
457157,W4252167073,Sensitive and Selective Detection of DNA Fragm...,A colorimetric assay for the detection of DNA ...,https://doi.org/10.26434/chemrxiv.9248825.v2,2020-07-06


In [17]:
# merge citations with publications using work_id
citations = gtr_citations.copy()

citations = citations.merge(gtr_publications, on='work_id', how='left')

In [21]:
# Frequency of each value in the direct-citation column.
gtr_citations["af_direct_citation"].value_counts()


af_direct_citation
[1;36m0[0m    [1;36m456773[0m
[1;36m1[0m       [1;36m386[0m
Name: count, dtype: int64

In [18]:
# Frequency of each value in the indirect-citation column.
gtr_citations["af_indirect_citations"].value_counts()


af_indirect_citations
[1;36m0[0m     [1;36m456437[0m
[1;36m1[0m        [1;36m492[0m
[1;36m2[0m        [1;36m119[0m
[1;36m3[0m         [1;36m38[0m
[1;36m4[0m         [1;36m30[0m
[1;36m5[0m         [1;36m13[0m
[1;36m8[0m         [1;36m10[0m
[1;36m7[0m          [1;36m7[0m
[1;36m13[0m         [1;36m3[0m
[1;36m10[0m         [1;36m3[0m
[1;36m9[0m          [1;36m2[0m
[1;36m6[0m          [1;36m2[0m
[1;36m12[0m         [1;36m1[0m
[1;36m11[0m         [1;36m1[0m
[1;36m16[0m         [1;36m1[0m
Name: count, dtype: int64

In [19]:
# Papers whose af_indirect_citations value is positive.
indirect_citations = citations[citations.af_indirect_citations > 0]

# One bar per paper title, with length equal to that paper's
# af_indirect_citations value. The x axis is therefore a per-paper citation
# count, not a number of papers, and is labelled accordingly.
# Only the two encoded columns are handed to Altair, so the other merged
# columns (e.g. abstract) are not carried into the chart data.
indirect_citations_chart = alt.Chart(
    indirect_citations[['title', 'af_indirect_citations']]
).mark_bar().encode(
    x=alt.X('af_indirect_citations:Q', title='Number of Indirect Citations'),
    y=alt.Y('title:N', title='Paper Title', sort='-x'),
    tooltip=['title', 'af_indirect_citations']
).properties(
    title='Count of Papers that Indirectly Cite Alphafold',
    height=6400
)

indirect_citations_chart

In [20]:
# Colour per citation type. Both the `color` column and the chart's colour
# scale are derived from this one mapping, so they cannot drift apart.
citation_colors = {'Indirect': 'blue', 'Direct': 'red'}

# Long-format frames: one row per paper with its count and a citation-type label
indirect_df = (
    citations[['title', 'af_indirect_citations']]
    .rename(columns={'af_indirect_citations': 'count'})
    .assign(type='Indirect')
)
direct_df = (
    citations[['title', 'af_direct_citation']]
    .rename(columns={'af_direct_citation': 'count'})
    .assign(type='Direct')
)

# Stack both frames and keep only rows with at least one citation of that type.
# `.assign` returns a new frame, so the colour column is never written onto a
# filtered slice (the pattern that can raise SettingWithCopyWarning).
combined_df = pd.concat([indirect_df, direct_df])
combined_df = (
    combined_df[combined_df['count'] > 0]
    .assign(color=lambda df: df['type'].map(citation_colors))
)

# Total count per (title, type), largest first. `color` is a function of
# `type`, so including it in the keys does not create extra groups.
grouped_df = (
    combined_df
    .groupby(['title', 'type', 'color'], as_index=False)['count']
    .sum()
    .sort_values(by='count', ascending=False)
)

# Bars per paper, coloured by citation type. The data is already summed above,
# so the tooltip shows `count` directly instead of re-aggregating it.
chart = alt.Chart(grouped_df).mark_bar().encode(
    x=alt.X('count:Q', title='Number of Citations'),
    y=alt.Y('title:N', title='Paper Title', sort='-x'),
    color=alt.Color(
        'type:N',
        title='Citation Type',
        scale=alt.Scale(
            domain=list(citation_colors),
            range=list(citation_colors.values()),
        ),
    ),
    tooltip=['title', 'count', 'type']
).properties(
    title='Count of Papers Citing Alphafold (Direct and Indirect)',
    height=6400
)

chart
