In [122]:
import pandas as pd
import plotly.express as px

Looking for dataset to recreate cohort of Stark et al. 2017

https://academic.oup.com/bioinformaticsadvances/article/1/1/vbab035/6432029#supplementary-data

# Stark GitHub

Data from the github repository associated to the paper: https://github.com/HannesStark/protein-localization/tree/master

In [123]:
fp_stark_deeploc_complete = 'data/stark/deeploc_complete_dataset.fasta'
fp_stark_train = 'data/stark/deeploc_our_train_set.fasta'
fp_stark_val = 'data/stark/deeploc_our_val_set.fasta'
fp_stark_test = 'data/stark/deeploc_test_set.fasta'

In [124]:
def read_fasta(fp: str):
    # parse fasta file
    proteins = dict()

    with open(fp, 'r') as f:
        lines = f.readlines()
        for i in range(0, len(lines)):
            if lines[i].startswith('>'):
                protein_name = lines[i].split()[0][1:]
                subcellular_location = lines[i].split()[1].split('-')[0]
                cellular_location = lines[i].split()[1].split('-')[1]
                if len(lines[i].split()) == 3:
                    if lines[i].split()[2] == 'test':
                        train_test_split = lines[i].split()[2]
                    else:
                        print(f'Error: {lines[i]}')
                else:
                    train_test_split = "train"
                # read sequence over all lines until next header
                sequence = ''
                for j in range(i+1, len(lines)):
                    if lines[j].startswith('>'):
                        break
                    else:
                        sequence += lines[j].strip()
                proteins[protein_name] = {
                        'subcellular_location': subcellular_location,
                        'cellular_location': cellular_location,
                        'train_test_split': train_test_split,
                        'sequence': sequence,
                        'sequence_length': len(sequence)
                    }
                    
    return proteins
    

In [125]:
def convert_dict_to_df(protein_dict: dict):
    df = pd.DataFrame.from_dict(protein_dict, orient='index')
    # reindex first column to protein_name
    df.index.name = 'protein_name'
    df.reset_index(inplace=True)
    return df

In [126]:
stark_deeploc = read_fasta(fp_stark_deeploc_complete)
df_stark_deeploc = convert_dict_to_df(stark_deeploc)
df_stark_deeploc.head()

Unnamed: 0,protein_name,subcellular_location,cellular_location,train_test_split,sequence,sequence_length
0,Q9H400,Cell.membrane,M,test,MGLPVSWAPPALWVLGCCALLLSLWALCTACRRPEDAVAPRKRARR...,295
1,Q5I0E9,Cell.membrane,M,train,MEVLEEPAPGPGGADAAERRGLRRLLLSGFQEELRALLVLAGPAFL...,566
2,P63033,Cell.membrane,M,train,MMKTLSSGNCTLNVPAKNSYRMVVLGASRVGKSSIVSRFLNGRFED...,266
3,Q9NR71,Cell.membrane,M,train,MAKRTFSNLETFLIFLLVMMSAITVALLSLLFITSGTIENHKDLGG...,780
4,Q86XT9,Cell.membrane,M,train,MGNCQAGHNLHLCLAHHPPLVCATLILLLLGLSGLGLGSFLLTHRT...,240


In [127]:
df_stark_deeploc.shape

(14004, 6)

In [128]:
stark_train = read_fasta(fp_stark_train)
df_stark_train = convert_dict_to_df(stark_train)
print(df_stark_train.shape)
df_stark_train.head()

(9503, 6)


Unnamed: 0,protein_name,subcellular_location,cellular_location,train_test_split,sequence,sequence_length
0,Q5I0E9,Cell.membrane,M,train,MEVLEEPAPGPGGADAAERRGLRRLLLSGFQEELRALLVLAGPAFL...,566
1,P63033,Cell.membrane,M,train,MMKTLSSGNCTLNVPAKNSYRMVVLGASRVGKSSIVSRFLNGRFED...,266
2,Q9NR71,Cell.membrane,M,train,MAKRTFSNLETFLIFLLVMMSAITVALLSLLFITSGTIENHKDLGG...,780
3,Q86XT9,Cell.membrane,M,train,MGNCQAGHNLHLCLAHHPPLVCATLILLLLGLSGLGLGSFLLTHRT...,240
4,A2CI98,Cell.membrane,M,train,MDPSKQGTLNRVENSVYRTAFKLRSVQTLCQLDLMDSFLIQQVLWR...,653


In [129]:
stark_val = read_fasta(fp_stark_val)
df_stark_val = convert_dict_to_df(stark_val)
print(df_stark_val.shape)
df_stark_val.head()

(1678, 6)


Unnamed: 0,protein_name,subcellular_location,cellular_location,train_test_split,sequence,sequence_length
0,Q9Y4C2,Cell.membrane,M,train,MATPSAAFEALMNGVTSWDVPEDAVPCELLLIGEASFPVMVNDMGQ...,921
1,Q9GKE8,Cell.membrane,M,train,MWPLVVVVLLGSAYCGSAQLIFNITKSVEFTVCNTTVTIPCFVNNM...,303
2,Q923X1,Cell.membrane,M,train,MRLLPLLVGFSTLLNCSYTQNCSKTTCLPNAKCEVHNGVEACFCSQ...,739
3,D2K6F1,Cell.membrane,M,train,MGFGWQGSVSIAFTALAFVVMAADWVGPDVTFTVLLAFLTAFDGQI...,883
4,Q9VWE0,Cell.membrane,M,train,MVAQEQLVLLLMLLAGCRGGANAILDPGWVIPSKVEQLIGGDFNLS...,1282


The number of our train and our val match the numbers in the paper.

In [130]:
stark_test = read_fasta(fp_stark_test)
df_stark_test = convert_dict_to_df(stark_test)
print(df_stark_test.shape)
df_stark_test.head()

(2768, 6)


Unnamed: 0,protein_name,subcellular_location,cellular_location,train_test_split,sequence,sequence_length
0,Q9H400,Cell.membrane,M,test,MGLPVSWAPPALWVLGCCALLLSLWALCTACRRPEDAVAPRKRARR...,295
1,P83456,Cell.membrane,M,test,AGFPEQEPEPKFWNDWAQKTLDKALSLQTLNKNKAQNLILFLGDGM...,477
2,Q9GL77,Cell.membrane,M,test,MEDEAVLDRGASFLKHVCDEEEVEGHHTIYIGVHVPKSYRRRRRHK...,1079
3,Q03445,Cell.membrane,M,test,MHSRLKFLAYLHFICASSIFWPEFSSAQQQQQTVSLTEKIPLGAIF...,991
4,P35525,Cell.membrane,M,test,MAAATAAAATVAGEGMEPRALQYEQTLMYGRYTQELGAFAKEEAAR...,907


In [165]:
df_stark_test['sequence_length'].max()

5654

In [172]:
fig = px.histogram(df_stark_deeploc, x='sequence_length', title='Sequence Length Distribution of Stark DeepLoc Dataset',
                   # log scale for y-acis
                     log_y=True)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.show()

In [173]:
# how many sequences longer than 1024 in test

df_stark_test[df_stark_test['sequence_length'] > 1024].shape

(311, 6)

# DeepLoc 1.0

Paper: https://academic.oup.com/bioinformatics/article/33/21/3387/3931857#405413820

Data: https://services.healthtech.dtu.dk/services/DeepLoc-1.0/

It is a fasta file composed by header and sequence. The header is composed by the accession number from Uniprot, the annotated subcellular localization and possibly a description field indicating if the protein was part of the test set. The subcellular localization includes an additional label, where S indicates soluble, M membrane and U unknown.

In [131]:
fp_deeploc_1 = 'data/deeploc_1.0/deeploc_data.fasta'

In [132]:
# length of file
with open(fp_deeploc_1) as f:
    n = sum(1 for _ in f)
n

28008

In [133]:
# parse fasta file
proteins = read_fasta(fp_deeploc_1)
len(proteins)

14004

In [134]:
df_proteins = convert_dict_to_df(proteins)
df_proteins.shape

(14004, 6)

In [135]:
# number of proteins in training and test set 
train_test_split = pd.DataFrame.from_dict(proteins, orient='index').groupby('train_test_split').size().reset_index(name='counts_train_test_split')
train_test_split

Unnamed: 0,train_test_split,counts_train_test_split
0,test,2773
1,train,11231


In [136]:
df_deeploc_test = df_proteins[df_proteins['train_test_split'] == 'test']
df_deeploc_test.shape

(2773, 6)

In [137]:
df_deeploc_test[df_deeploc_test['sequence_length'] <= 1024].shape

(2462, 6)

In [175]:
# compare overlap of proteins in df_deeploc_test and df_stark_test

df_deeploc_test_protein_names = set(df_deeploc_test['protein_name'])
df_stark_test_protein_names = set(df_stark_test['protein_name'])

print(f'Number of proteins in DeepLoc test set: {len(df_deeploc_test_protein_names)}')
print(f'Number of proteins in Stark test set: {len(df_stark_test_protein_names)}')
print(f'Number of proteins in both test sets: {len(df_deeploc_test_protein_names.intersection(df_stark_test_protein_names))}')

Number of proteins in DeepLoc test set: 2773
Number of proteins in Stark test set: 2768
Number of proteins in both test sets: 2768


In [176]:
# show lines of df_deep_loc_test that are not in df_stark_test

df_deeploc_test[~df_deeploc_test['protein_name'].isin(df_stark_test_protein_names)]

Unnamed: 0,protein_name,subcellular_location,cellular_location,train_test_split,sequence,sequence_length
2801,P0CX82,Cytoplasm,S,test,MANLRTQKRLAASVVGVGKRKVWLDPNETSEIAQANSRNAIRKLVK...,189
3413,P0CX84,Cytoplasm,S,test,MAGVKAYELRTKSKEQLASQLVDLKKELAELKVQKLSRPSLPKIKT...,120
3973,P0CX31,Cytoplasm,S,test,MSDAVTIRTRKVISNPLLARKQFVVDVLHPNRANVSKDELREKLAE...,135
7389,Q9Y294,Nucleus,U,test,MAKVQVNNVVVLDNPSPFYNPFQFEITFECIEDLSEDLEWKIIYVG...,204
7814,P61964,Nucleus,U,test,MATEEKKPETEAARAQPTPSSSATQSKPTPVKPNYALKFTLAGHTK...,334


Unclear as to why they are not included in the Stark test dataset.

In [177]:
# df_stark_test_protein_names show number of proteinsn in subcellular_location and cellular_location

df_stark_test.groupby(['subcellular_location', 'cellular_location']).size().reset_index(name='counts')

Unnamed: 0,subcellular_location,cellular_location,counts
0,Cell.membrane,M,273
1,Cytoplasm,S,505
2,Endoplasmic.reticulum,M,159
3,Endoplasmic.reticulum,S,6
4,Endoplasmic.reticulum,U,8
5,Extracellular,S,393
6,Golgi.apparatus,M,58
7,Golgi.apparatus,S,1
8,Golgi.apparatus,U,11
9,Lysosome/Vacuole,M,49


In [184]:
# test whether subcelular_location and cellular_location are the same in df_deeploc_test and df_stark_test

shared_proteins = df_deeploc_test[df_deeploc_test['protein_name'].isin(df_stark_test_protein_names)]

shared_proteins['subcellular_location_stark'] = shared_proteins['protein_name'].apply(lambda x: df_stark_test[df_stark_test['protein_name'] == x]['subcellular_location'].values[0])
shared_proteins['cellular_location_stark'] = shared_proteins['protein_name'].apply(lambda x: df_stark_test[df_stark_test['protein_name'] == x]['cellular_location'].values[0])

shared_proteins['subcellular_location_same'] = shared_proteins['subcellular_location'] == shared_proteins['subcellular_location_stark']
shared_proteins['cellular_location_same'] = shared_proteins['cellular_location'] == shared_proteins['cellular_location_stark']

print(f"Number of proteins in both test sets: {len(shared_proteins)}")
print(f"Number of proteins with same cellular location: {len(shared_proteins[shared_proteins['cellular_location_same'] == True])}")
print(f"Number of proteins with same subcellular location: {len(shared_proteins[shared_proteins['subcellular_location_same'] == True])}")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Number of proteins in both test sets: 2768
Number of proteins with same cellular location: 2768
Number of proteins with same subcellular location: 2768




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [180]:
# number of proteins with sequence length < 1024 in test and training set

df_proteins[df_proteins['sequence_length'] < 1024].groupby('train_test_split').size().reset_index(name='counts_sequence_length_lt_1024')

Unnamed: 0,train_test_split,counts_sequence_length_lt_1024
0,test,2462
1,train,10238


In [140]:
# plot subcellular location distribution

fig = px.bar(df_proteins.groupby('cellular_location').size().reset_index(name='counts'), 
             x='cellular_location', y='counts', 
             title='Location distribution')
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.show()



In [190]:
# plot subcellular location distribution

fig = px.bar(df_proteins.groupby('subcellular_location').size().reset_index(name='counts'), 
             x='subcellular_location', y='counts', 
             title='Distribution of proteins over subcellular locations',
             color_discrete_sequence=['#303496'])
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
# descending order
fig.update_xaxes(categoryorder='total descending')
fig.show()



In [142]:
df_proteins[df_proteins['subcellular_location']=='Cytoplasm'].groupby('cellular_location').size().reset_index(name='counts')

Unnamed: 0,cellular_location,counts
0,Nucleus,146
1,S,2542


In [143]:
# check if same protein_names as in df_stark_deeploc

protein_names_deeploc = df_proteins['protein_name'].to_list()
protein_names_stark_deeploc = df_stark_deeploc['protein_name'].to_list()

protein_names_deeploc.sort()
protein_names_stark_deeploc.sort()

protein_names_deeploc == protein_names_stark_deeploc

True

In [144]:
subset_df_proteins = df_proteins[~((df_proteins['subcellular_location']=='Cytoplasm') & (df_proteins['cellular_location']=="Nucleus"))]

In [145]:
subset_df_proteins.shape

(13858, 6)

In [164]:
# number of proteins with sequence length < 1024 in test and training set

subset_df_proteins[subset_df_proteins['sequence_length'] < 1024].groupby('train_test_split').size().reset_index(name='counts_sequence_length_lt_1024')

Unnamed: 0,train_test_split,counts_sequence_length_lt_1024
0,test,2462
1,train,10104


In [147]:
# plot subcellular location distribution

fig = px.bar(subset_df_proteins.groupby('subcellular_location').size().reset_index(name='counts'), 
             x='subcellular_location', y='counts', 
             title='Subcellular location distribution')
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
)
fig.show()



Matches described dataset distribution in DeepLoc paper. See Table 1:

Nucleus	4043

Cytoplasm	2542

Extracellular	1973

Mitochondrion	1510

Cell membrane	1340

Endoplasmic reticulum (ER)	862

Plastid	757

Golgi apparatus	356

Lysosome/Vacuole	321

Peroxisome	154

Does not yet match break down of Stark et al. 2021

In [148]:
# number of proteins with sequence length < 1024 in test and training set

subset_df_proteins[subset_df_proteins['sequence_length'] < 1024].groupby('train_test_split').size().reset_index(name='counts_sequence_length_lt_1024')

Unnamed: 0,train_test_split,counts_sequence_length_lt_1024
0,test,2462
1,train,10104


In [194]:
# reformat data for our task

out_dir = "data/"

# reformat to fasta file

with open(out_dir + "deeploc_test.fasta", "w") as f:
    for row_idx in range(0, len(df_deeploc_test)):
        f.write(f">{df_deeploc_test.iloc[row_idx]['protein_name']}\n{df_deeploc_test.iloc[row_idx]['sequence']}\n")

In [199]:
df_deeploc_test[['protein_name', 'subcellular_location', 'cellular_location', 'sequence_length']].to_csv(out_dir + "deeploc_test_features.csv", index=False)

# DeepLoc 2.1

Paper: https://academic.oup.com/nar/article/52/W1/W215/7642068?login=false

Data: https://services.healthtech.dtu.dk/services/DeepLoc-2.1/

In [149]:

# import training data
fp_training = 'data/deeploc_2.1/Swissprot_Train_Validation_dataset.csv'
fp_test = 'data/deeploc_2.1/hpa_testset.csv'
fp_membrane = "data/deeploc_2.1/Swissprot_Membrane_Train_Validation_dataset.csv"

### Training data

In [150]:
df_training = pd.read_csv(fp_training, index_col=0)

In [151]:
df_training.shape

(28303, 15)

In [152]:
df_training.head()

Unnamed: 0,ACC,Kingdom,Partition,Membrane,Cytoplasm,Nucleus,Extracellular,Cell membrane,Mitochondrion,Plastid,Endoplasmic reticulum,Lysosome/Vacuole,Golgi apparatus,Peroxisome,Sequence
0,Q28165,Metazoa,4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MAAAAAAAAAAGAAGGRGSGPGRRRHLVPGAGGEAGEGAPGGAGDY...
1,Q86U42,Metazoa,4,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MAAAAAAAAAAGAAGGRGSGPGRRRHLVPGAGGEAGEGAPGGAGDY...
2,Q0GA42,Metazoa,3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,MAAAAAAAAALGVRLRDCCSRGAVLLLFFSLSPRPPAAAAWLLGLR...
3,P82349,Metazoa,1,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,MAAAAAAAAATEQQGSNGPVKKSMREKAVERRNVNKEHNSNFKAGY...
4,Q7L5N1,Metazoa,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MAAAAAAAAATNGTGGSSGMEVDAAVVPSVMACGVTGSVSVALHPL...


In [153]:
fig = px.histogram(df_training, x='Kingdom', color_discrete_sequence=['#303496'])
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Kingdoms in the Training Dataset',
)
fig.show()

In [154]:
# make a bar plot of the number of proteins in each subcellular location (binary encoding in columns Partition	Membrane	Cytoplasm	Nucleus	Extracellular	Cell membrane	Mitochondrion	Plastid	Endoplasmic reticulum	Lysosome/Vacuole	Golgi apparatus	Peroxisome)

# reforamt the data
df_training_subcellular = df_training.iloc[:, 4:-1]
df_training_subcellular = df_training_subcellular.sum(axis=0).reset_index()
df_training_subcellular.columns = ['Subcellular Location', 'Count']

fig = px.bar(df_training_subcellular, x='Subcellular Location', y='Count', color='Subcellular Location', color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Subcellular Locations in the Training Dataset',
)
fig.show()

### Test data

In [155]:
df_test = pd.read_csv(fp_test, index_col=0)

In [156]:
df_test.shape

(1717, 10)

In [157]:
df_test.head()

Unnamed: 0_level_0,Cell membrane,Cytoplasm,Endoplasmic reticulum,Golgi apparatus,Lysosome/Vacuole,Mitochondrion,Nucleus,Peroxisome,Lengths,fasta
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSP00000355743,0,0,0,0,0,0,1,0,496,MAAAAGPGAALSPRPCDSDPATPGAQSPKDDNEDNSNDGTQPSKRR...
ENSP00000429628,0,0,0,0,0,0,1,0,173,MAAAALRDPAQVPVAADLLTDHEEGYVTFEDVAVYFSQEEWRLLDD...
ENSP00000266544,0,0,0,0,0,1,0,0,377,MAAAAQSRVVRVLSMSRSAITAIATSVCHGPPCRQLHHALMPHGKG...
ENSP00000470652,0,0,0,1,0,0,0,0,410,MAAAAVGAGHGAGGPGAASSSGGAREGARVAALCLLWYALSAGGNV...
ENSP00000460751,0,0,0,0,0,0,1,0,163,MAAAAVTRGTPGENSHHLKIFLPKKLLECLPRCPLLPPERLRWNTN...


In [158]:
# make a bar plot of the number of proteins in each subcellular location (binary encoding in columns Partition	Membrane	Cytoplasm	Nucleus	Extracellular	Cell membrane	Mitochondrion	Plastid	Endoplasmic reticulum	Lysosome/Vacuole	Golgi apparatus	Peroxisome)

# reforamt the data
df_test_subcellular = df_test.iloc[:, 1:-2]
df_test_subcellular = df_test_subcellular.sum(axis=0).reset_index()
df_test_subcellular.columns = ['Subcellular Location', 'Count']

fig = px.bar(df_test_subcellular, x='Subcellular Location', y='Count', color='Subcellular Location', color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Subcellular Locations in the Test Dataset',
)
fig.show()

### Membrane data

In [159]:
df_membrane = pd.read_csv(fp_membrane, index_col=0)

In [160]:
df_membrane.shape

(28026, 8)

In [161]:
df_membrane.head()

Unnamed: 0,ACC,Kingdom,Partition,Peripheral,Transmembrane,LipidAnchor,Soluble,Sequence
0,I3R9M8,Archaea,0,1,0,0,0,MSTDSDAETVDLADGVDHQVAMVMDLNKCIGCQTCTVACKSLWTEG...
1,I3R9M9,Archaea,1,1,0,0,0,MSRNDASQLDDGETTAESPPDDQANDAPEVGDPPGDPVDADSGVSR...
2,Q7ZAG8,Archaea,2,1,0,0,0,MTKVLVLGGRFGALTAAYTLKRLVGSKADVKVINKSRFSYFRPALP...
3,Q8PZ67,Archaea,0,1,0,0,1,MPPKIAEVIQHDVCAACGACEAVCPIGAVTVKKAAEIRDPNDLSLY...
4,Q9YGA6,Archaea,0,1,0,0,0,MAGVRLVDVWKVFGEVTAVREMSLEVKDGEFMILLGPSGCGKTTTL...


In [162]:
fig = px.histogram(df_membrane, x='Kingdom', color_discrete_sequence=['#303496'])
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Kingdoms in the Membrane Dataset',
)
fig.show()

In [163]:
# make a bar plot of the number of proteins in each subcellular location (binary encoding in columns Partition	Membrane	Cytoplasm	Nucleus	Extracellular	Cell membrane	Mitochondrion	Plastid	Endoplasmic reticulum	Lysosome/Vacuole	Golgi apparatus	Peroxisome)

# reforamt the data
df_membrane_transformed = df_membrane.iloc[:, 3:-1]
df_membrane_transformed = df_membrane_transformed.sum(axis=0).reset_index()
df_membrane_transformed.columns = ['Location', 'Count']

fig = px.bar(df_membrane_transformed, x='Location', y='Count', color='Location', color_discrete_sequence=px.colors.qualitative.Plotly)
fig.update_layout(
    font=dict(color="#303496", family="Arial, sans-serif", size=14),
    template='plotly_white',
    title = 'Distribution of Locations in the Membrane Dataset',
)
fig.show()