# Protein structures

## Introduction
Protein structures are stored in the RCSB PDB (Research Collaboratory for Structural Bioinformatics Protein Data Bank).

The database also contains other kinds of structures, not just proteins.


### Hypotheses

1) The number of added structures rises exponentially.
2) Electron microscopy is getting more popular.
3) The database contains mostly protein structures.
4) Most of the protein structures are enzymes.

types of proteins, types of organisms

In [1]:
# importing modules
import pandas as pd # dataframes
import altair as alt # plotting, Vega-Altair version 5.5.0 was used
import biotite.database.rcsb as rcsb # api for rcsb database


## API - RCSB PDB

In [13]:
# obtaining data from rcsb pdb through api and saving it into dataframe
# searching by e.g, different experimental methods, structure types
# requires list of years, attribute and list of values to search for, name of column in the dataframe for attribute
# possible attributes and values can be found at https://search.rcsb.org/structure-search-attributes.html
# returns a dataset with numbers of structures per year for selected data type

def get_data(years: list, attribute: str, values: list, column_name: str):
    """Count RCSB PDB structures per year for each value of a search attribute.

    For every year in ``years`` and every entry of ``values`` one count
    request is sent that combines two conditions: the initial release date
    lies between January 1 and December 31 of that year, and ``attribute``
    exactly matches the value.

    Args:
        years: years to count structures for.
        attribute: name of the search attribute to match on.
        values: attribute values to count separately.
        column_name: name of the dataframe column that holds the values.

    Returns:
        A dataframe with one row per (year, value) pair and the columns
        ``column_name``, ``publicationYear`` and ``countedStructures``.
    """
    release_field = "rcsb_accession_info.initial_release_date"

    # each record is (value, year, number of matching structures)
    records = []
    for year in years:
        released_in_year = rcsb.FieldQuery(
            release_field,
            range_closed=(f'{year}-01-01', f'{year}-12-31'),
        )
        for value in values:
            has_value = rcsb.FieldQuery(attribute, exact_match=value)
            matches = rcsb.count(has_value & released_in_year)
            records.append((value, year, matches))

    return pd.DataFrame({
        column_name: [value for value, _, _ in records],
        'publicationYear': [year for _, year, _ in records],
        'countedStructures': [matches for _, _, matches in records],
    })

### 1) Number of all structures
The number of added structures rises exponentially.

In [3]:
# total number of structures released before 1990, fetched through the RCSB API
# (release dates from 1960-01-01 up to and including 1989-12-31)

query_old = rcsb.FieldQuery(
    "rcsb_accession_info.initial_release_date",
    range_closed=('1960-01-01', '1989-12-31'),
)
all_old = rcsb.count(query_old)

In [4]:
# yearly number of newly released structures, from 1990 onwards

years = list(range(1990, 2026))

# one count request per calendar year, sent to the RCSB API
counted_years_all = list(years)
counted_structures_all = [
    rcsb.count(
        rcsb.FieldQuery(
            "rcsb_accession_info.initial_release_date",
            range_closed=(f'{year}-01-01', f'{year}-12-31'),
        )
    )
    for year in years
]

# collect years and counts in a dataframe
data_rcsb_all = pd.DataFrame({
    'publicationYear': counted_years_all,
    'countedStructures': counted_structures_all,
})

In [5]:
# line chart of the cumulative number of released structures
# the window transform builds a running sum of the yearly counts (sorted by year);
# the pre-1990 total (all_old) is then added to that running sum

cumulative = (
    alt.Chart(data_rcsb_all)
    .properties(title='Cumulative number of published structures in the RCSB PDB')
    .transform_window(
        summed='sum(countedStructures)',
        sort=[{"field": "publicationYear"}],
    )
    .transform_calculate(cumulative_count=alt.datum.summed + all_old)
    .mark_line(color='black')
    .encode(
        x=alt.X("publicationYear:O", title='Publication year'),
        y=alt.Y("cumulative_count:Q", stack=False, title='Cumulative number of structures'),
    )
)

In [6]:
# display the cumulative chart as the output of this cell
cumulative

### 2) Experimental methods
Electron microscopy is getting more popular.

In [None]:
# number of added structures obtained by selected methods from 1990 per year
# runs for about a minute (one API request per year and method)
#
# "exptl.method" is the RCSB search attribute that is queried; the dataframe
# column holding the method names is called 'experimentalTechnique', which is
# the field the yearly bar chart uses for its colour encoding

methods=["X-RAY DIFFRACTION", "SOLUTION NMR", "ELECTRON MICROSCOPY"]
data_rcsb_method=get_data(years, "exptl.method", methods, 'experimentalTechnique')

In [9]:
# bar chart: structures released per year, coloured by experimental method
yearly_method = (
    alt.Chart(data_rcsb_method)
    .properties(title='Number of published structures per year by experimental method')
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        x=alt.X('publicationYear:O', title='Publication year'),
        y=alt.Y('countedStructures:Q', title='Number of structures'),
        color=alt.Color('experimentalTechnique:N', title='Method'),
    )
)

In [10]:
# display the per-method bar chart as the output of this cell
yearly_method

In [11]:
# overlay the cumulative line on the per-method bars in one layered chart
cumulative + yearly_method

### 3) Types of structures
there are mostly protein structures in the database

In [14]:
# yearly number of added structures from 1990, split by polymer type
# runs for about a minute

types = ["Protein", "DNA", "RNA", "NA-hybrid", "Other"]
data_rcsb_type = get_data(
    years=years,
    attribute="entity_poly.rcsb_entity_polymer_type",
    values=types,
    column_name='macromoleculeType',
)

In [15]:
# bar chart: structures released per year, coloured by structure type
yearly_type = (
    alt.Chart(data_rcsb_type)
    .properties(title='Number of published structures per year by structure type')
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        x=alt.X('publicationYear:O', title='Publication year'),
        y=alt.Y('countedStructures:Q', title='Number of structures'),
        color=alt.Color('macromoleculeType:N', title='Type'),
    )
)
# display the chart as the output of this cell
yearly_type

### 4) Classification of protein structures
most of the protein structures are enzymes

## Dataset from kaggle
obtained from https://www.kaggle.com/datasets/shahir/protein-data-set

In [None]:
# load the Kaggle CSV into a dataframe (path is relative to the working directory)
data = pd.read_csv("../data/pdb_data_no_dups.csv")

In [None]:
# lift Vega-Altair's row limit so charts can be built directly from the large dataframe

alt.data_transformers.disable_max_rows()

In [None]:
# making a dataframe of most common datapoints, e.g., 10 most common categories

def top_count (df, column, cuttoff):
    """Return the most frequent values of a column together with their counts.

    Args:
        df: dataframe to take the column from.
        column: name of the column whose values are counted.
        cuttoff: number of most frequent values to keep (spelling kept so
            existing keyword callers keep working).

    Returns:
        A dataframe with two columns, ``column`` (the value) and ``count``
        (number of occurrences), sorted from most to least frequent.
    """
    counts = df[column].value_counts().head(cuttoff)
    # name both output columns explicitly: the default names produced by
    # value_counts() + reset_index() differ between pandas versions, while the
    # charts built from this result rely on a column called 'count'
    return counts.rename_axis(column).reset_index(name='count')

# the 10 most common classifications and how often each occurs
top_class = top_count(data, 'classification', 10)

# bar chart of those counts, bars ordered by descending height
alt.Chart(top_class).mark_bar().encode(
    x=alt.X('classification:N', sort='-y'),
    y=alt.Y('count:Q'),
)

In [None]:
# the (up to) 10 most common macromolecule types and how often each occurs
top_type = top_count(data, "macromoleculeType", 10)

# bar chart of those counts, bars ordered by descending height
alt.Chart(top_type).mark_bar().encode(
    x=alt.X('macromoleculeType:N', sort='-y'),
    y=alt.Y('count:Q'),
)

In [None]:
# pie chart of all classifications
# TODO: add percentage, join small arcs into one as others
# TODO: maybe only proteins -> see if most are enzymes
# clicking a legend entry highlights that classification;
# multiple categories can be chosen with the shift key

selection = alt.selection_point(fields=['classification'], bind='legend')

alt.Chart(data).mark_arc().encode(
    theta=alt.Theta("count()"),
    # order the legend by the number of rows per classification (same field as the colour)
    color=alt.Color("classification:N", sort=alt.EncodingSortField(field="classification", op='count', order='descending')),
    order=alt.Order('count()'),
    # fade the arcs that are not part of the legend selection so the selection has a visible effect
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2)),
).add_params(
    selection
)

In [None]:
# pie chart: share of each macromolecule type among all rows
# legend entries are sorted by descending row count
alt.Chart(data).mark_arc().encode(
    theta=alt.Theta("count()"),
    color=alt.Color(
        "macromoleculeType:N",
        title='Macromolecule type',
        sort=alt.EncodingSortField(field="macromoleculeType", op='count', order='descending'),
    ),
    order=alt.Order('count()'),
)

In [None]:
# pie chart: share of each experimental technique among all rows
# legend entries are sorted by descending row count
alt.Chart(data).mark_arc().encode(
    theta=alt.Theta("count()"),
    color=alt.Color(
        "experimentalTechnique:N",
        title='Experimental technique',
        sort=alt.EncodingSortField(field="experimentalTechnique", op='count', order='descending'),
    ),
    order=alt.Order('count()'),
)

In [None]:
# number of structures over time (by type, by method) from the Kaggle table
# only structures with a known publication year are included

# the value 201 in publicationYear is replaced by 2014,
# then rows without a publication year are dropped
data_year_fix = (
    data
    .replace({'publicationYear':201}, 2014)
    .dropna(subset=['publicationYear'])
)

# area chart of the running row count over publication years;
# the count is accumulated over all years first, then years up to 1990 are hidden
(
    alt.Chart(data_year_fix)
    .properties(title='Published structures per year')
    .transform_window(
        cumulative_count="count()",
        sort=[{"field": "publicationYear"}],
    )
    .transform_filter(alt.datum.publicationYear > 1990)
    .mark_area()
    .encode(
        x=alt.X("publicationYear:O", title='Publication year'),
        y=alt.Y("cumulative_count:Q", stack=False, title='Cumulative number'),
    )
)

In [None]:
# bar chart: rows per publication year (after 1990), coloured by experimental technique
# legend entries are sorted by descending row count
# (an extra transform_filter on experimentalTechnique != 'X-RAY DIFFRACTION'
#  can be added to look at the remaining techniques only)
(
    alt.Chart(data_year_fix)
    .properties(title='Published structures per year')
    .transform_filter(alt.datum.publicationYear > 1990)
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        x='publicationYear:O',
        y='count():Q',
        color=alt.Color(
            'experimentalTechnique:N',
            sort=alt.EncodingSortField(field="experimentalTechnique", op='count', order='descending'),
        ),
    )
)

In [None]:
# sanity check of the cleaned year column: distinct years in ascending order,
# followed by the number of rows per year
publication_years = data_year_fix['publicationYear']
print(publication_years.sort_values().unique())
print(publication_years.value_counts())
