In [12]:
import pandas as pd
import plotly.express as px

Looking for the dataset needed to recreate the cohort of Stark et al. 2021

https://academic.oup.com/bioinformaticsadvances/article/1/1/vbab035/6432029#supplementary-data

# Stark GitHub

Data from the GitHub repository associated with the paper: https://github.com/HannesStark/protein-localization/tree/master

In [13]:
fp_stark_deeploc_complete = 'data/stark/deeploc_complete_dataset.fasta'
fp_stark_train = 'data/stark/deeploc_our_train_set.fasta'
fp_stark_val = 'data/stark/deeploc_our_val_set.fasta'
fp_stark_test = 'data/stark/deeploc_test_set.fasta'

In [14]:
def read_fasta(fp: str):
    """Parse a DeepLoc-style FASTA file into a dict keyed by protein name.

    Each header line is expected to look like ``>NAME LABEL [test]``:

    * ``NAME`` (first whitespace-separated token, without the leading ``>``)
      becomes the dict key.
    * ``LABEL`` (second token) is split on ``-``. The first part is stored as
      ``subcellular_location`` and the second part as ``cellular_location``;
      any further ``-``-separated parts are ignored.
    * A header with exactly three tokens is expected to end in ``test``. If the
      third token is anything else, an error line is printed and the record is
      marked ``"unknown"``. Headers with any other token count are assigned to
      ``"train"``.

    Lines that follow a header are stripped and concatenated into the sequence
    until the next header or the end of the file. Lines before the first header
    are ignored.

    Args:
        fp: Path to the FASTA file.

    Returns:
        dict mapping protein name to a dict with the keys
        ``subcellular_location``, ``cellular_location``, ``train_test_split``,
        ``sequence`` and ``sequence_length`` (in that order). If a protein name
        occurs more than once, the last record wins.

    Raises:
        IndexError: If a header has no label token, or the label contains no ``-``.
    """
    proteins = dict()
    fragments_by_name = dict()   # protein name -> list of stripped sequence lines
    current_fragments = None     # fragment list of the record being read

    with open(fp, 'r') as f:
        for line in f:
            if line.startswith('>'):
                fields = line.split()
                protein_name = fields[0][1:]
                label_parts = fields[1].split('-')
                subcellular_location = label_parts[0]
                cellular_location = label_parts[1]

                if len(fields) == 3:
                    if fields[2] == 'test':
                        train_test_split = 'test'
                    else:
                        print(f'Error: {line}')
                        # Explicit marker, so an unrecognised flag never silently
                        # inherits the split of the previous record.
                        train_test_split = 'unknown'
                else:
                    train_test_split = "train"

                # Insert at header time to keep the file's record order; the
                # sequence fields are filled in once the whole file is read.
                proteins[protein_name] = {
                    'subcellular_location': subcellular_location,
                    'cellular_location': cellular_location,
                    'train_test_split': train_test_split,
                    'sequence': '',
                    'sequence_length': 0,
                }
                current_fragments = []
                fragments_by_name[protein_name] = current_fragments
            elif current_fragments is not None:
                current_fragments.append(line.strip())

    for protein_name, record in proteins.items():
        sequence = ''.join(fragments_by_name[protein_name])
        record['sequence'] = sequence
        record['sequence_length'] = len(sequence)

    return proteins
    

In [15]:
def convert_dict_to_df(protein_dict: dict):
    """Flatten a ``{protein_name: {field: value}}`` mapping into a DataFrame.

    Every outer key becomes one row. The key itself is exposed as the first
    column, ``protein_name``, and the returned frame carries a default index.
    """
    records = pd.DataFrame.from_dict(protein_dict, orient='index')
    # Naming the index first makes reset_index() emit it as the 'protein_name' column.
    return records.rename_axis('protein_name').reset_index()

In [None]:
stark_deeploc = read_fasta(fp_stark_deeploc_complete)
df_stark_deeploc = convert_dict_to_df(stark_deeploc)
df_stark_deeploc.head()

In [None]:
df_stark_deeploc.shape

In [None]:
stark_train = read_fasta(fp_stark_train)
df_stark_train = convert_dict_to_df(stark_train)
print(df_stark_train.shape)
df_stark_train.head()

In [None]:
stark_val = read_fasta(fp_stark_val)
df_stark_val = convert_dict_to_df(stark_val)
print(df_stark_val.shape)
df_stark_val.head()

The sizes of the "our train" and "our val" sets match the numbers reported in the paper.

In [None]:
stark_test = read_fasta(fp_stark_test)
df_stark_test = convert_dict_to_df(stark_test)
print(df_stark_test.shape)
df_stark_test.head()

In [None]:
df_stark_test['sequence_length'].max()

In [None]:
# Histogram of the sequence lengths in the complete Stark DeepLoc dataset.
fig = px.histogram(df_stark_deeploc, x='sequence_length', title='Sequence Length Distribution of Stark DeepLoc Dataset',
                   # log scale for the y-axis
                     log_y=True)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.show()

In [None]:
# how many sequences longer than 1024 in test

df_stark_test[df_stark_test['sequence_length'] > 1024].shape

# DeepLoc 1.0

Paper: https://academic.oup.com/bioinformatics/article/33/21/3387/3931857#405413820

Data: https://services.healthtech.dtu.dk/services/DeepLoc-1.0/

It is a FASTA file in which each entry consists of a header and a sequence. The header contains the UniProt accession number, the annotated subcellular localization and, optionally, a description field indicating that the protein was part of the test set. The subcellular localization carries an additional label, where S indicates soluble, M membrane and U unknown.

In [16]:
fp_deeploc_1 = 'data/deeploc_1.0/deeploc_data.fasta'
fp_tissue_data = 'data/deeploc_1.0/tissue_wise_df.csv'

In [17]:
# length of file
with open(fp_deeploc_1) as f:
    n = sum(1 for _ in f)
n

28008

In [18]:
# parse fasta file
proteins = read_fasta(fp_deeploc_1)
len(proteins)

14004

In [19]:
df_proteins = convert_dict_to_df(proteins)
df_proteins.shape

(14004, 6)

In [None]:
# number of proteins in training and test set 
train_test_split = pd.DataFrame.from_dict(proteins, orient='index').groupby('train_test_split').size().reset_index(name='counts_train_test_split')
train_test_split

In [20]:
df_deeploc_test = df_proteins[df_proteins['train_test_split'] == 'test']
df_deeploc_train = df_proteins[df_proteins['train_test_split'] == 'train']
df_deeploc_train_subset = df_deeploc_train.sample(n=500, random_state=8)
df_deeploc_train.shape, df_deeploc_test.shape, df_proteins.shape, df_deeploc_train_subset.shape

((11231, 6), (2773, 6), (14004, 6), (500, 6))

In [None]:
df_deeploc_test[df_deeploc_test['sequence_length'] <= 1024].shape

In [None]:
# compare overlap of proteins in df_deeploc_test and df_stark_test

df_deeploc_test_protein_names = set(df_deeploc_test['protein_name'])
df_stark_test_protein_names = set(df_stark_test['protein_name'])

print(f'Number of proteins in DeepLoc test set: {len(df_deeploc_test_protein_names)}')
print(f'Number of proteins in Stark test set: {len(df_stark_test_protein_names)}')
print(f'Number of proteins in both test sets: {len(df_deeploc_test_protein_names.intersection(df_stark_test_protein_names))}')

In [None]:
# show lines of df_deep_loc_test that are not in df_stark_test

df_deeploc_test[~df_deeploc_test['protein_name'].isin(df_stark_test_protein_names)]

It is unclear why these proteins are not included in the Stark test dataset.

In [None]:
# Number of proteins in df_stark_test per (subcellular_location, cellular_location) combination

df_stark_test.groupby(['subcellular_location', 'cellular_location']).size().reset_index(name='counts')

In [None]:
# Test whether subcellular_location and cellular_location agree between
# df_deeploc_test and df_stark_test for the proteins present in both.

# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of df_deeploc_test (avoids SettingWithCopyWarning).
shared_proteins = df_deeploc_test[df_deeploc_test['protein_name'].isin(df_stark_test_protein_names)].copy()

# One lookup table keyed by protein name instead of filtering df_stark_test once
# per row. keep='first' mirrors the previous "first matching row" behaviour.
stark_labels = df_stark_test.drop_duplicates('protein_name', keep='first').set_index('protein_name')

shared_proteins['subcellular_location_stark'] = shared_proteins['protein_name'].map(stark_labels['subcellular_location'])
shared_proteins['cellular_location_stark'] = shared_proteins['protein_name'].map(stark_labels['cellular_location'])

shared_proteins['subcellular_location_same'] = shared_proteins['subcellular_location'] == shared_proteins['subcellular_location_stark']
shared_proteins['cellular_location_same'] = shared_proteins['cellular_location'] == shared_proteins['cellular_location_stark']

# Summing a boolean column counts its True entries.
print(f"Number of proteins in both test sets: {len(shared_proteins)}")
print(f"Number of proteins with same cellular location: {int(shared_proteins['cellular_location_same'].sum())}")
print(f"Number of proteins with same subcellular location: {int(shared_proteins['subcellular_location_same'].sum())}")


In [None]:
# number of proteins with sequence length < 1024 in test and training set

df_proteins[df_proteins['sequence_length'] < 1024].groupby('train_test_split').size().reset_index(name='counts_sequence_length_lt_1024')

In [None]:
# Plot the number of proteins per cellular_location value in df_proteins

fig = px.bar(df_proteins.groupby('cellular_location').size().reset_index(name='counts'), 
             x='cellular_location', y='counts', 
             title='Location distribution')
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.show()



In [None]:
# Plot the number of proteins per cellular_location value in the random training subset

fig = px.bar(df_deeploc_train_subset.groupby('cellular_location').size().reset_index(name='counts'), 
             x='cellular_location', y='counts', 
             title='Location distribution in random subset')
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.show()



In [None]:
# plot subcellular location distribution

fig = px.bar(df_proteins.groupby('subcellular_location').size().reset_index(name='counts'), 
             x='subcellular_location', y='counts', 
             title='Distribution of proteins over subcellular locations',
             color_discrete_sequence=['#303496'])
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
# descending order
fig.update_xaxes(categoryorder='total descending')
fig.show()



In [None]:
# plot subcellular location distribution

fig = px.bar(df_deeploc_train_subset.groupby('subcellular_location').size().reset_index(name='counts'), 
             x='subcellular_location', y='counts', 
             title='Distribution of proteins over subcellular locations in the random subset',
             color_discrete_sequence=['#303496'])
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
# descending order
fig.update_xaxes(categoryorder='total descending')
fig.show()



In [None]:
df_proteins[df_proteins['subcellular_location']=='Cytoplasm'].groupby('cellular_location').size().reset_index(name='counts')

In [None]:
# check if same protein_names as in df_stark_deeploc

protein_names_deeploc = df_proteins['protein_name'].to_list()
protein_names_stark_deeploc = df_stark_deeploc['protein_name'].to_list()

protein_names_deeploc.sort()
protein_names_stark_deeploc.sort()

protein_names_deeploc == protein_names_stark_deeploc

In [55]:
subset_df_proteins = df_proteins[~((df_proteins['subcellular_location']=='Cytoplasm') & (df_proteins['cellular_location']=="Nucleus"))]

In [None]:
subset_df_proteins.shape

In [None]:
# number of proteins with sequence length < 1024 in test and training set

subset_df_proteins[subset_df_proteins['sequence_length'] < 1024].groupby('train_test_split').size().reset_index(name='counts_sequence_length_lt_1024')

In [None]:
# plot subcellular location distribution

fig = px.bar(subset_df_proteins.groupby('subcellular_location').size().reset_index(name='counts'), 
             x='subcellular_location', y='counts', 
             title='Subcellular location distribution')
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.update_xaxes(categoryorder='total descending')
fig.show()



This matches the dataset distribution described in the DeepLoc paper (see Table 1):

Nucleus	4043

Cytoplasm	2542

Extracellular	1973

Mitochondrion	1510

Cell membrane	1340

Endoplasmic reticulum (ER)	862

Plastid	757

Golgi apparatus	356

Lysosome/Vacuole	321

Peroxisome	154

This does not yet match the breakdown reported in Stark et al. 2021.

In [None]:
# number of proteins with sequence length < 1024 in test and training set

subset_df_proteins[subset_df_proteins['sequence_length'] < 1024].groupby('train_test_split').size().reset_index(name='counts_sequence_length_lt_1024')

In [None]:
df_deeploc_train_subset['sequence_length'][:10]

### Tissue data

In [None]:
df_tissue = pd.read_csv(fp_tissue_data)
print(df_tissue.shape)
df_tissue.head()

In [None]:
df_tissue = df_tissue.dropna(how='all')
df_tissue.shape

In [None]:
df_tissue.columns

In [None]:
df_tissue['UniProtKB'].nunique()

In [None]:
df_tissue['UniProtKB'].isna().sum()

In [None]:
df_tissue['UniProt Isoform'].isna().sum()

In [None]:
proteins_tissue = set(df_tissue['UniProtKB'])
proteins_deeploc_train = set(df_deeploc_train['protein_name'])
proteins_deeploc_test = set(df_deeploc_test['protein_name'])
len(proteins_tissue), len(proteins_deeploc_train), len(proteins_deeploc_test)

In [None]:
# number of tissue data points in training set
len(proteins_deeploc_train) - len(proteins_deeploc_train - proteins_tissue)

In [None]:
# number of tissue data points in test set
len(proteins_deeploc_test) - len(proteins_deeploc_test - proteins_tissue)

In [None]:
df_tissue

In [None]:
# Identify duplicated rows based on a specific column
duplicated_rows = df_tissue[df_tissue['UniProt Isoform'].duplicated(keep=False)]

# Display the result
duplicated_rows

In [None]:
df_tissue_long = df_tissue.iloc[:, 0:31].melt(var_name="Column", value_name="Value")

fig = px.violin(df_tissue_long, x="Column", y="Value", title="Tissue expression per tissue")
fig.update_layout(
    template="plotly_white",
    font={'family': 'Arial', 'color': 'black'}
)

fig.show()

In [None]:
df_tissue_long = df_tissue.iloc[:, 0:31].melt(var_name="Column", value_name="Value")

fig = px.box(df_tissue_long, x="Column", y="Value", log_y=True, title="Tissue expression per tissue on log scale")
fig.update_layout(
    template="plotly_white",
    font={'family': 'Arial', 'color': 'black'}
)

fig.show()

### Export data

In [74]:
# Reformat the data for our task: one FASTA file per split, with the protein
# name as the header line and the full sequence on a single line.

out_dir = "data/"


def write_fasta(df, fp):
    """Write the 'protein_name' and 'sequence' columns of df to fp as FASTA records."""
    with open(fp, "w") as f:
        # Iterating the two columns together avoids two positional row lookups per record.
        for protein_name, sequence in zip(df['protein_name'], df['sequence']):
            f.write(f">{protein_name}\n{sequence}\n")


write_fasta(df_deeploc_test, out_dir + "deeploc_test.fasta")
write_fasta(df_deeploc_train, out_dir + "deeploc_train.fasta")
write_fasta(df_deeploc_train_subset, out_dir + "deeploc_train_subset.fasta")

In [None]:
df_deeploc_test[['protein_name', 'subcellular_location', 'cellular_location', 'sequence_length']].to_csv(out_dir + "deeploc_test_features.csv", index=False)
df_deeploc_train[['protein_name', 'subcellular_location', 'cellular_location', 'sequence_length']].to_csv(out_dir + "deeploc_train_features.csv", index=False)
df_deeploc_train_subset[['protein_name', 'subcellular_location', 'cellular_location', 'sequence_length']].to_csv(out_dir + "deeploc_train_subset_features.csv", index=False)

In [None]:
# plot the distribution of sequence length per categories

df_deeploc_train.head()

In [None]:
fig = px.histogram(
    data_frame = df_deeploc_train,
    color_discrete_sequence=['#303496'],
    x = 'sequence_length'
)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.show()

In [None]:
fig = px.histogram(
    data_frame = df_deeploc_train,
    color='cellular_location',
    x = 'sequence_length',
    barmode="overlay",
    histnorm="percent",
)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.show()

In [None]:
import plotly.express as px

fig = px.histogram(
    data_frame=df_deeploc_train,
    x='sequence_length',
    facet_row='cellular_location',
    histnorm="percent",
    color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title="Sequence Length Distribution by Cellular Location"
)
fig.show()

In [None]:
import plotly.express as px

# Sequence length histograms of the training set, one facet row per
# subcellular_location value, normalised to percent within each facet.
fig = px.histogram(
    data_frame=df_deeploc_train,
    x='sequence_length',
    facet_row='subcellular_location',
    histnorm="percent",
    color_discrete_sequence=px.colors.qualitative.Set1,
)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    # The facet variable is subcellular_location, so the title names it as such.
    title="Sequence Length Distribution by Subcellular Location"
)
fig.show()

In [34]:
from plotly.subplots import make_subplots
from math import ceil
import plotly.graph_objects as go

locations = df_deeploc_train['subcellular_location'].unique()

n_cols = 5  # Number of columns
n_rows = ceil(len(locations) / n_cols)  # Compute rows using ceiling

# Create subplots
fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=locations,  # Use unique locations as subplot titles
    horizontal_spacing=0.1,
    vertical_spacing=0.1
)

# Add histograms for each subcellular location
for i, location in enumerate(locations):
    row = i // n_cols + 1
    col = i % n_cols + 1
    subset = df_deeploc_train[df_deeploc_train['subcellular_location'] == location]
    
    fig.add_trace(
        go.Histogram(
            x=subset['sequence_length'],
            histnorm="percent",
            marker_color=px.colors.qualitative.Set2[i % len(px.colors.qualitative.Set2)],
            name=location
        ),
        row=row, col=col
    )

# Update layout
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title="Sequence Length Distribution by Subcellular Location",
    height=400 * n_rows,  # Adjust height dynamically based on rows
    showlegend=False  # Set to True if you want legends
)

# Update x-axis and y-axis labels with a cutoff at 6000 for x-axis range
fig.update_xaxes(title_text="Sequence Length", range=[0, 6000])
fig.update_yaxes(title_text="Percent")

fig.show()

In [38]:
n_cols = 5  # Number of columns
n_rows = ceil(len(locations) / n_cols)  # Compute rows using ceiling

# Median sequence length per subcellular location (used for the subplot titles
# and the median lines) and over the whole training set (used in the figure title).
medians = {
    location: df_deeploc_train[df_deeploc_train['subcellular_location'] == location]['sequence_length'].median()
    for location in locations
}
overall_median = df_deeploc_train['sequence_length'].median()

# Prepare subplot titles with median values
subplot_titles = [f"{location}<br>Median: {medians[location]:.2f}" for location in locations]

# Create subplots
fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=subplot_titles,  # Use median values in subplot titles
    horizontal_spacing=0.1,
    vertical_spacing=0.2
)

# Add histograms and median lines for each subcellular location
for i, location in enumerate(locations):
    row = i // n_cols + 1
    col = i % n_cols + 1
    subset = df_deeploc_train[df_deeploc_train['subcellular_location'] == location]

    # Add histogram
    fig.add_trace(
        go.Histogram(
            x=subset['sequence_length'],
            histnorm="percent",
            marker_color=px.colors.qualitative.Set2[i % len(px.colors.qualitative.Set2)],
            name=location
        ),
        row=row, col=col
    )

    # Dashed vertical line at this location's median; added after the trace so
    # the target subplot already contains data.
    fig.add_vline(x=medians[location], line_dash="dash", line_color="black", row=row, col=col)

# Update layout
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=12),
    template='plotly_white',
    # <br> is the line break Plotly renders in titles; the value is the median
    # over all training sequences, not that of the last location in the loop.
    title=f"Sequence Length Distribution by Subcellular Location<br>Overall median: {overall_median:.2f}",
    height=400 * n_rows,  # Adjust height dynamically based on rows
    showlegend=False  # Set to True if you want legends
)

# Update x-axis and y-axis labels with a cutoff at 6000 for x-axis range
fig.update_xaxes(title_text="Sequence Length", range=[0, 6000])
fig.update_yaxes(title_text="Percent")

fig.show()

In [None]:
fig = px.histogram(
    data_frame = df_deeploc_train,
    color='subcellular_location',
    x = 'sequence_length',
    barmode="overlay",
    histnorm="percent",
)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.show()

# DeepLoc 2.1

Paper: https://academic.oup.com/nar/article/52/W1/W215/7642068?login=false

Data: https://services.healthtech.dtu.dk/services/DeepLoc-2.1/

In [53]:

# import training data
fp_training = 'data/deeploc_2.1/Swissprot_Train_Validation_dataset.csv'
fp_test = 'data/deeploc_2.1/hpa_testset.csv'
fp_membrane = "data/deeploc_2.1/Swissprot_Membrane_Train_Validation_dataset.csv"

### Training data

In [37]:
df_training = pd.read_csv(fp_training, index_col=0)

In [None]:
df_training.shape

In [None]:
df_training.head()

In [None]:
fig = px.histogram(df_training, x='Kingdom', color_discrete_sequence=['#303496'])
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Kingdoms in the Training Dataset',
)
fig.show()

In [None]:
# Bar plot of the number of proteins in each subcellular location.
# Per the original note, the locations are binary-encoded in the columns:
# Partition, Membrane, Cytoplasm, Nucleus, Extracellular, Cell membrane, Mitochondrion,
# Plastid, Endoplasmic reticulum, Lysosome/Vacuole, Golgi apparatus, Peroxisome.

# Reformat the data: column-wise sums, one (location, count) row per selected column.
# NOTE(review): the positional slice 4:-1 assumes a fixed column layout — confirm against df_training.columns.
df_training_subcellular = df_training.iloc[:, 4:-1]
df_training_subcellular = df_training_subcellular.sum(axis=0).reset_index()
df_training_subcellular.columns = ['Subcellular Location', 'Count']

fig = px.bar(df_training_subcellular, x='Subcellular Location', y='Count', color='Subcellular Location', color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Subcellular Locations in the Training Dataset',
)
fig.show()

### Test data

In [42]:
df_test = pd.read_csv(fp_test, index_col=0)

In [None]:
df_test.shape

In [None]:
df_test.head()

In [None]:
# Bar plot of the number of proteins in each subcellular location.
# Per the original note, the locations are binary-encoded in the columns:
# Partition, Membrane, Cytoplasm, Nucleus, Extracellular, Cell membrane, Mitochondrion,
# Plastid, Endoplasmic reticulum, Lysosome/Vacuole, Golgi apparatus, Peroxisome.
# NOTE(review): this list matches the training-set cell; verify it also applies to the test file.

# Reformat the data: column-wise sums, one (location, count) row per selected column.
# NOTE(review): the positional slice 1:-2 assumes a fixed column layout — confirm against df_test.columns.
df_test_subcellular = df_test.iloc[:, 1:-2]
df_test_subcellular = df_test_subcellular.sum(axis=0).reset_index()
df_test_subcellular.columns = ['Subcellular Location', 'Count']

fig = px.bar(df_test_subcellular, x='Subcellular Location', y='Count', color='Subcellular Location', color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Subcellular Locations in the Test Dataset',
)
fig.show()

### Membrane data

In [46]:
df_membrane = pd.read_csv(fp_membrane, index_col=0)

In [None]:
df_membrane.shape

In [None]:
df_membrane.head()

In [None]:
fig = px.histogram(df_membrane, x='Kingdom', color_discrete_sequence=['#303496'])
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Kingdoms in the Membrane Dataset',
)
fig.show()

In [None]:
# Bar plot of the column sums of the membrane dataset's location columns.
# The original note lists the binary-encoded columns as:
# Partition, Membrane, Cytoplasm, Nucleus, Extracellular, Cell membrane, Mitochondrion,
# Plastid, Endoplasmic reticulum, Lysosome/Vacuole, Golgi apparatus, Peroxisome.
# NOTE(review): this list matches the training-set cell; verify it against df_membrane.columns.

# Reformat the data: column-wise sums, one (location, count) row per selected column.
# NOTE(review): the positional slice 3:-1 assumes a fixed column layout — confirm against df_membrane.columns.
df_membrane_transformed = df_membrane.iloc[:, 3:-1]
df_membrane_transformed = df_membrane_transformed.sum(axis=0).reset_index()
df_membrane_transformed.columns = ['Location', 'Count']

fig = px.bar(df_membrane_transformed, x='Location', y='Count', color='Location', color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Locations in the Membrane Dataset',
)
fig.show()