# Project


## hypotheses
The database contains mostly protein structures, and most of these proteins are enzymes.

The number of added structures rises exponentially.

Cryo-EM is becoming more and more popular.

In [None]:
# importing modules
import numpy as np
import pandas as pd # dataframes
import altair as alt # plotting
import biotite.database.rcsb as rcsb # api for rcsb database


In [None]:
# obtaining data from the RCSB database through its API
# number of all structures added before 1990

# match entries whose initial release date lies within 1960-01-01 .. 1989-12-31
query_old=rcsb.FieldQuery("rcsb_accession_info.initial_release_date", range_closed=('1960-01-01', '1989-12-31'))
# number of entries matching the query; used later as the offset of the cumulative curve
all_old=rcsb.count(query_old)

In [None]:
# number of added structures obtained by selected methods and in total from 1990
# runs for about a minute (one API request per year and per year/method pair)

years = list(range(1990, 2026))
methods = ["X-RAY DIFFRACTION", "SOLUTION NMR", "ELECTRON MICROSCOPY"]
counted_structures = []
counted_years = []
counted_methods = []

# The method queries do not depend on the year, so they are built once here
# instead of being re-created in every iteration of the year loop.
method_queries = {
    method: rcsb.FieldQuery("exptl.method", exact_match=method)
    for method in methods
}

for year in years:
    query_year = rcsb.FieldQuery(
        "rcsb_accession_info.initial_release_date",
        range_closed=(f'{year}-01-01', f'{year}-12-31'),
    )

    # yearly total over every method, stored under the pseudo-method "all"
    counted_structures.append(rcsb.count(query_year))
    counted_years.append(year)
    counted_methods.append("all")

    # yearly count for each selected method (method AND year)
    for method, query_method in method_queries.items():
        counted_structures.append(rcsb.count(query_method & query_year))
        counted_years.append(year)
        counted_methods.append(method)

# saving data to dataframe: one row per (year, method) pair, in long format

data_rcsb = pd.DataFrame(
    {
        'experimentalTechnique': counted_methods,
        'publicationYear': counted_years,
        'countedStructures': counted_structures,
    }
)

In [None]:
# quick sanity check of the downloaded counts
print(all_old)
# head must be called: printing the bare attribute shows the bound method
# instead of the first rows
print(data_rcsb.head())
print(data_rcsb.shape)

In [None]:
# Bar chart of the yearly number of new structures, coloured by method.
# Rows labelled 'all' hold the yearly totals and are excluded here, so only
# the per-method counts are drawn.
alt.Chart(
    data_rcsb,
    title='Number of newly published structures per year in the RCSB PDB',
).transform_filter(
    alt.datum.experimentalTechnique != 'all'
).mark_bar(
    cornerRadiusTopLeft=3, cornerRadiusTopRight=3
).encode(
    alt.X('publicationYear:O').title("Publication year"),
    alt.Y('countedStructures:Q').title('Number of structures'),
    alt.Color('experimentalTechnique:N').title('Method'),
)

In [None]:
# Two layers built from the 'all' rows (the yearly totals):
# green bars with the yearly counts and a red line with the summed counts.
yearly = alt.Chart(
    data_rcsb,
    title='Cumulative number of published structures in the RCSB PDB',
).mark_bar(
    color='green', cornerRadiusTopLeft=3, cornerRadiusTopRight=3
).encode(
    alt.X('publicationYear:O').title('Publication year'),
    alt.Y('countedStructures:Q').title('Number of structures'),
).transform_filter(
    alt.datum.experimentalTechnique == 'all'
)

cumulative = alt.Chart(data_rcsb).mark_line(color='red').encode(
    alt.X("publicationYear:O"),
    alt.Y("cumulative_count:Q").stack(False),
).transform_filter(
    alt.datum.experimentalTechnique == 'all'
).transform_window(
    summed='sum(countedStructures)',
    sort=[{"field": "publicationYear"}],
).transform_calculate(
    # windowed sum over the years from 1990 plus the structures counted
    # for the period before 1990
    cumulative_count=alt.datum.summed + all_old
)

alt.layer(yearly, cumulative)

In [None]:
# show which Vega-Altair version is installed
alt.__version__

In [None]:
# allows Vega-Altair to work with big datasets:
# switches off the check that limits how many rows a chart's data may have

alt.data_transformers.disable_max_rows()

In [None]:
# loading data from two CSV files; the paths are relative to the current
# working directory
data_seq = pd.read_csv("../data/pdb_data_seq.csv")
data = pd.read_csv("../data/pdb_data_no_dups.csv")

In [None]:
# preview the first 10 rows of data_seq
data_seq.head(10)

In [None]:
# preview the first 10 rows of data
data.head(10)

In [None]:
# (rows, columns) of both tables
print(data_seq.shape)
print(data.shape)

In [None]:
# making a dataframe of most common datapoints, e.g., 10 most common categories

def top_count(df: pd.DataFrame, column: str, cuttoff: int) -> pd.DataFrame:
    """Return the most frequent values of a column together with their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to analyse.
    column : str
        Name of the column whose values are counted.
    cuttoff : int
        Maximum number of distinct values to keep.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``column`` holding the value and ``count`` holding the
        number of rows with that value, ordered from the most to the least
        frequent value. Missing values are not counted.
    """
    counts = df[column].value_counts().head(cuttoff)
    # Name both output columns explicitly. pandas < 2.0 would otherwise
    # produce 'index' and `column` instead of `column` and 'count', and the
    # charts below rely on the 'count' column.
    return counts.rename_axis(column).reset_index(name='count')

# the 10 most frequent classifications and how often each occurs
top_class = top_count(df=data, column='classification', cuttoff=10)

# bar chart of these counts, bars ordered by descending count
alt.Chart(top_class).encode(
    alt.X('classification:N').sort('-y'),
    alt.Y('count:Q'),
).mark_bar()

In [None]:
# the 10 most frequent macromolecule types and how often each occurs
top_type = top_count(df=data, column="macromoleculeType", cuttoff=10)

# bar chart of these counts, bars ordered by descending count
alt.Chart(top_type).encode(
    alt.X('macromoleculeType:N').sort('-y'),
    alt.Y('count:Q'),
).mark_bar()

In [None]:
# plotting pie charts
# TODO: add percentage, join small arcs into one as others
# TODO: maybe only proteins -> see if most are enzymes
# clicking a legend entry highlights that classification;
# multiple categories can be chosen with the shift key

selection = alt.selection_point(fields=['classification'], bind='legend')

alt.Chart(data).mark_arc().encode(
    theta=alt.Theta("count()"),
    # legend/colour order follows the number of entries per classification
    color=alt.Color(
        "classification:N",
        sort=alt.EncodingSortField(field="classification", op='count', order='descending'),
    ),
    order=alt.Order('count()'),
    # fade the arcs that are not selected, so that the legend selection
    # actually has a visible effect
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
).add_params(
    selection
)

In [None]:
# pie chart: number of entries per macromolecule type
alt.Chart(data).encode(
    alt.Theta("count()"),
    alt.Color(
        "macromoleculeType:N",
        sort=alt.EncodingSortField(field="macromoleculeType", op='count', order='descending'),
    ).title('Macromolecule type'),
    alt.Order('count()'),
).mark_arc()

In [None]:
# pie chart: number of entries per experimental technique
alt.Chart(data).encode(
    alt.Theta("count()"),
    alt.Color(
        "experimentalTechnique:N",
        sort=alt.EncodingSortField(field="experimentalTechnique", op='count', order='descending'),
    ).title('Experimental technique'),
    alt.Order('count()'),
).mark_arc()

In [None]:
# number of structures over time, by type, type of methods
# includes only structures with known publication year

# NOTE(review): the publication year 201 is replaced by 2014 on the assumption
# that it is a typo in the source data - confirm against the affected entries.
# Rows without a publication year are dropped.
data_year_fix = data.replace({'publicationYear': 201}, 2014).dropna(subset=['publicationYear'])

# The chart shows a cumulative count, so the title says so (it previously
# repeated the title of the per-year bar chart).
alt.Chart(data_year_fix, title='Cumulative number of published structures').transform_window(
    cumulative_count="count()",
    sort=[{"field": "publicationYear"}],
).transform_filter(
    # applied after the window transform: only years after 1990 are drawn,
    # but the count has already been computed over all rows
    alt.datum.publicationYear > 1990
).mark_area().encode(
    x=alt.X("publicationYear:O").title('Publication year'),
    y=alt.Y("cumulative_count:Q").stack(False).title('Cumulative number'),
)

In [None]:
# bar chart of the number of structures per publication year (years after
# 1990 only), coloured by experimental technique
alt.Chart(
    data_year_fix,
    title='Published structures per year',
).transform_filter(
    alt.datum.publicationYear > 1990
).mark_bar(
    cornerRadiusTopLeft=3, cornerRadiusTopRight=3
).encode(
    alt.X('publicationYear:O'),
    alt.Y('count():Q'),
    alt.Color(
        'experimentalTechnique:N',
        sort=alt.EncodingSortField(field="experimentalTechnique", op='count', order='descending'),
    ),
)

In [None]:
# distinct publication years in ascending order (to spot implausible values)
print(data_year_fix['publicationYear'].sort_values().unique())
# number of rows per publication year
print(data_year_fix['publicationYear'].value_counts())
