In [28]:
import pandas as pd


In [15]:
# Load the taxonomy table indexed by its `protein` column and replace every
# missing cell with the literal label "MISSING VALUE".
# Presumably this is done so that the groupby() cells below keep those rows:
# groupby() drops NaN keys by default.
taxa = (
    pd.read_csv("./taxa.csv", sep=',', index_col='protein')
    .fillna("MISSING VALUE")
)

In [16]:
# Per-superkingdom tally: count() reports the non-null cells of every other
# column within each `superkingdom_name` group.
taxa.groupby(by='superkingdom_name').count()

Unnamed: 0_level_0,organism_name,kingdom_name
superkingdom_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Archaea,1154,1154
Bacteria,118743,118743
Eukaryota,331073,331073
MISSING VALUE,1334,1334
Viruses,102,102


In [22]:
# Per-kingdom tally: count() reports the non-null cells of every other column
# within each `kingdom_name` group.
taxa.groupby(by='kingdom_name').count()

Unnamed: 0_level_0,organism_name,superkingdom_name
kingdom_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bamfordvirae,74,74
Fungi,122268,122268
Heunggongvirae,4,4
MISSING VALUE,126266,126266
Metazoa,107793,107793
Viridiplantae,96001,96001


In [5]:
# Cross-tabulate the two taxonomy levels: one row per
# (superkingdom_name, kingdom_name) combination with its non-null cell count.
taxa.groupby(by=['superkingdom_name', 'kingdom_name']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,organism_name
superkingdom_name,kingdom_name,Unnamed: 2_level_1
Archaea,MISSING VALUE,1154
Bacteria,MISSING VALUE,118743
Eukaryota,Fungi,122268
Eukaryota,MISSING VALUE,5011
Eukaryota,Metazoa,107793
Eukaryota,Viridiplantae,96001
MISSING VALUE,MISSING VALUE,1334
Viruses,Bamfordvirae,74
Viruses,Heunggongvirae,4
Viruses,MISSING VALUE,24


In [6]:
# Inspect the rows whose superkingdom was filled in as "MISSING VALUE".
taxa.loc[taxa['superkingdom_name'].eq('MISSING VALUE')]

Unnamed: 0_level_0,organism_name,kingdom_name,superkingdom_name
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A0A2S2N5M3,bird metagenome,MISSING VALUE,MISSING VALUE
A0A2S2N5P5,bird metagenome,MISSING VALUE,MISSING VALUE
A0A2H9T8R4,invertebrate metagenome,MISSING VALUE,MISSING VALUE
A0A2H9T8P1,invertebrate metagenome,MISSING VALUE,MISSING VALUE
A0A2H9T2H2,invertebrate metagenome,MISSING VALUE,MISSING VALUE
...,...,...,...
E9L1I1,uncultured organism CA37,MISSING VALUE,MISSING VALUE
E9L1L3,uncultured organism CA878,MISSING VALUE,MISSING VALUE
E9L1L5,uncultured organism CA878,MISSING VALUE,MISSING VALUE
E9L1I0,uncultured organism CA37,MISSING VALUE,MISSING VALUE


In [7]:
# Load the reaction table; the CSV's `protein` column becomes the DataFrame
# index, so the same protein label can appear on several rows.
reactions = pd.read_csv("reaction_data.csv", index_col='protein')

In [8]:
# Distinct protein labels present in the index of `reactions`.
reactions.index.unique()

Index(['Q7Z1V1', 'Q9LJK2', 'Q949P1', 'O81077', 'A2Z212', 'Q05JG2', 'Q09J79',
       'K4CI52', 'Q9FH76', 'Q09J78',
       ...
       'A0A5N6KS64', 'M8CM41', 'A0A2G2XGU4', 'A0A8J6DA16', 'A0A2G5EH64',
       'A0A835LII0', 'A0A835IF43', 'A0A445AAS4', 'A0A445AAN0', 'A0A835HWA8'],
      dtype='object', name='protein', length=12543)

In [9]:
# Table of distinct (protein, reaction) pairs: move the `protein` index into a
# regular column, keep only it and `reaction`, then drop repeated rows.
# Note that uniqueness is per pair, not per reaction: a reaction shows up once
# for every protein it is paired with.
protein_reaction_pairs = (
    reactions
    .reset_index()
    .loc[:, ['protein', 'reaction']]
    .drop_duplicates()
)

In [10]:
# (rows, columns) of the pair table, i.e. how many distinct
# (protein, reaction) pairs there are.
protein_reaction_pairs.shape

(29568, 2)

In [11]:
# Count the rows of the pair table per protein (one output row per protein)
# and order the result ascending by that `reaction` count.
(
    protein_reaction_pairs
    .groupby('protein')
    .count()
    .sort_values(by='reaction')
)

Unnamed: 0_level_0,reaction
protein,Unnamed: 1_level_1
A0A010QIR3,1
A0A7N5P6C8,1
A0A7N5K661,1
A0A7N5J931,1
A0A7N4PNH4,1
...,...
Q08477,33
A0A8J6HC20,40
A0A0K3BQV0,40
A0A0N9I5N1,40


In [12]:
# The reverse view of the per-protein count: group the pair table by reaction,
# count the rows (proteins) in each group, and order the result ascending by
# that `protein` count.
(
    protein_reaction_pairs
    .groupby('reaction')
    .count()
    .sort_values(by='protein')
)

Unnamed: 0_level_0,protein
reaction,Unnamed: 1_level_1
73935,1
47336,1
65332,1
65336,1
65520,1
...,...
65760,810
50244,810
35739,1289
24040,3169


In [13]:
# Load the Pfam table with the default integer index; later cells group it by
# its `pfam` and `protein` columns.
pfams = pd.read_csv("pfams.csv")

In [14]:
# Number of `pfams` rows per Pfam identifier, largest count first.
pfam_counts = (
    pfams
    .groupby('pfam')
    .count()
    .sort_values(by='protein', ascending=False)
)

In [15]:
# First 30 rows of `pfam_counts`, i.e. the 30 most frequent Pfam identifiers
# (the frame is already sorted in descending order).
pfam_counts.iloc[:30]

Unnamed: 0_level_0,protein
pfam,Unnamed: 1_level_1
PF00067,452406
PF00175,4245
PF00258,4133
PF00667,3967
PF03098,1486
PF00111,944
PF07690,251
PF05719,232
PF00970,220
PF04082,213


In [16]:
# (rows, columns) of `pfam_counts`; the row count is the number of distinct
# Pfam identifiers.
pfam_counts.shape

(2205, 1)

In [17]:
# Count the `pfams` rows per protein and show the 20 proteins with the highest
# counts.
(
    pfams
    .groupby('protein')
    .count()
    .sort_values(by='pfam', ascending=False)
    .head(20)
)


Unnamed: 0_level_0,pfam
protein,Unnamed: 1_level_1
A0A8H4FNC2,12
A0A8H7J8G9,11
A0A812PC03,11
A0A0N9I5N1,11
A0A0K3BQV0,11
A0A094C837,11
A0A094G8Q6,11
A0A091CS17,10
A0A8J6HC20,10
A0A8T0EY54,10


In [18]:
# Load the protein-name table; the CSV's `protein` column becomes the index of
# the `names` DataFrame.
names = pd.read_csv("names.csv", index_col='protein')

In [38]:
# Rows of `names` in which every column is NaN (no name of any kind recorded).
# Left as the cell's last expression instead of being wrapped in print(), so
# the notebook renders it as a rich table and truncates long output itself
# instead of dumping plain text.
names[names.isna().all(axis=1)]

           full_recommended_name full_submitted_name
protein                                             
A0A5K1JUS4                   NaN                 NaN
A0A5K1JVN5                   NaN                 NaN
A0A5K1K1S8                   NaN                 NaN
A0A5K1K3Q3                   NaN                 NaN
A0A5K1K115                   NaN                 NaN
A0A5K1K5I7                   NaN                 NaN
A0A5K1JWN2                   NaN                 NaN
A0A5K1JYA0                   NaN                 NaN
A0A5K1K267                   NaN                 NaN
A0A5K1JUI9                   NaN                 NaN
A0A5K1JXP8                   NaN                 NaN
A0A5K1JWW2                   NaN                 NaN
A0A5K1JWN4                   NaN                 NaN
A0A5K1JWA5                   NaN                 NaN
A0A5K1K795                   NaN                 NaN
A0A5K1JT80                   NaN                 NaN
A0A5K1JTM0                   NaN              

In [20]:
# Shape of the sub-frame that keeps only rows with no missing value in any
# column (rows where every name column is filled in).
names.dropna(how='any').shape

(0, 2)

In [41]:
# Number of distinct proteins that have at least one non-missing name column:
# drop the rows where everything is NaN, then take the shape of the unique index.
# NOTE(review): the saved output under this cell looks like a single protein ID,
# which matches the commented-out print below rather than `.shape` -- re-run the
# cell to refresh the output.
names.dropna(how='all').index.unique().shape
#print(names[~names.isna().all(axis=1)].index.unique()[0])

Q023S4


In [22]:
# For every protein label in the index, count the non-null entries in each
# name column.
name_counts = names.groupby(level=0).count()

In [23]:
# Proteins that have two or more entries in at least one name column, ordered
# ascending by their `full_submitted_name` count.
name_counts.loc[name_counts.ge(2).any(axis=1)].sort_values('full_submitted_name')

Unnamed: 0_level_0,full_recommended_name,full_submitted_name
protein,Unnamed: 1_level_1,Unnamed: 2_level_1
A0A031JCP3,0,2
B0XJT6,0,2
B0XJW5,0,2
B0XJW6,0,2
B0XJY3,0,2
...,...,...
Q9FDZ1,0,5
C7EXA5,0,5
A0A2I0MCB5,0,6
C6KDT8,0,6


In [24]:
# Reaction-level view of `reactions`: turn the `protein` index into a column,
# discard that column, and keep each remaining row only once.
unique_reactions = (
    reactions
    .reset_index()
    .drop(columns='protein')
    .drop_duplicates()
)

# One row per reaction; its `chebi` cell is the list of all `chebi` values
# found on that reaction's rows.
reaction_contents = unique_reactions.groupby(['reaction']).agg({'chebi': list})

# Same aggregation, split further by `reaction_side_order`: one row per
# (reaction, reaction_side_order) pair with the list of `chebi` values on it.
reaction_side_contents = (
    unique_reactions
    .groupby(['reaction', 'reaction_side_order'])
    .agg({'chebi': list})
)

In [42]:
# Ubiquitous small molecules that are ignored when looking for the
# characteristic compounds of a reaction.
not_interesting_compounds = [
    'CHEBI_15377',  # H2O
    'CHEBI_15379',  # O2
    'CHEBI_15378',  # H+
    'CHEBI_16526',  # CO2
    # add more non-interesting compounds here
]

# Compounds treated as cofactors. Each note reads
# "cofactor -> resulting form (ID listed in `cofactor_results`)".
cofactors = [
    'CHEBI_57618',  # FMNH2 -> FMN (CHEBI_58210)
    # The original note repeated "1+" on both sides; the result is presumably
    # the oxidized 2+ cluster -- confirm against ChEBI.
    'CHEBI_33738',  # [2Fe-2S]1+ -> [2Fe-2S]2+ (CHEBI_33737)
    'CHEBI_57783',  # NADPH -> NADP (CHEBI_58349)
    'CHEBI_30616',  # ATP -> AMP/ADP (CHEBI_456215, CHEBI_456216)
    # NOTE(review): CHEBI_33019 may actually be diphosphate (released together
    # with AMP from ATP) rather than S-methyl-5'-thioadenosine -- confirm
    # against ChEBI before relying on this pairing.
    'CHEBI_59789',  # S-adenosyl-L-methionine -> S-methyl-5'-thioadenosine (CHEBI_33019)
    # CHEBI_33738  # adrenodoxin (?) -- original note read "andredoxin"
    # TODO: also add e.g. NADP+ here
]

# Forms the cofactors above are converted into; excluded from the listing below
# in the same way as the cofactors themselves.
cofactor_results = [
    'CHEBI_58210',
    'CHEBI_33737',
    'CHEBI_58349',
    'CHEBI_456216',
    'CHEBI_456215',
    'CHEBI_33019',
]

# Top 50 (chebi, reaction_side_order) combinations by row count, after removing
# every compound that is a non-interesting compound, a cofactor or a cofactor
# result. Presumably the two `reaction_side_order` values separate substrates
# from products -- TODO confirm.
#
# Open questions:
# - We need some way to classify which products and substrates matter for a
#   reaction and which of the remaining compounds are of no interest.
# - We also need a way to classify cofactors: e.g. do we treat heme as a
#   cofactor of interest? If a reaction has NADPH among its substrates and
#   NADP+ among its products, do we take that as the cofactor of the reaction?
# - Heme is not annotated in Rhea as a cofactor or as anything taking part in
#   the reaction, so that information would have to be supplemented somehow.
(
    unique_reactions
    .loc[~unique_reactions['chebi'].isin(not_interesting_compounds + cofactors + cofactor_results)]
    .groupby(['chebi', 'reaction_side_order'])
    .count()
    .sort_values(by='reaction', ascending=False)
    .head(50)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,reaction,smiles
chebi,reaction_side_order,Unnamed: 2_level_1,Unnamed: 3_level_1
CHEBI_32395,1,25,25
CHEBI_43474,2,19,19
CHEBI_16113,1,15,15
CHEBI_57540,1,13,13
CHEBI_57945,2,12,12
CHEBI_57287,2,12,12
CHEBI_57856,2,11,11
CHEBI_64479,2,11,11
CHEBI_16240,1,10,10
CHEBI_15740,2,10,10


In [26]:
# Reactions that contain at least one compound listed in `cofactors`.
classified_reactions = unique_reactions.loc[
    unique_reactions['chebi'].isin(cofactors), 'reaction'
].unique()

# All rows that belong to a reaction NOT found above, i.e. reactions without
# any of the listed cofactors.
non_classified_reactions = unique_reactions.loc[
    ~unique_reactions['reaction'].isin(classified_reactions)
]

In [27]:
# Within the reactions that have no listed cofactor, count rows per
# (chebi, reaction_side_order) combination after removing the non-interesting
# compounds, cofactors and cofactor results; most frequent first.
(
    non_classified_reactions
    .loc[~non_classified_reactions['chebi'].isin(not_interesting_compounds + cofactors + cofactor_results)]
    .groupby(['chebi', 'reaction_side_order'])
    .count()
    .sort_values(by='reaction', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,reaction,smiles
chebi,reaction_side_order,Unnamed: 2_level_1,Unnamed: 3_level_1
CHEBI_57540,1,12,12
CHEBI_64479,2,11,11
CHEBI_57945,2,10,10
CHEBI_16240,1,10,10
CHEBI_78449,1,9,9
...,...,...,...
CHEBI_29806,2,1,1
CHEBI_29748,1,1,1
CHEBI_295975,2,1,1
CHEBI_29034,2,1,1


In [37]:
# Sanity check: CHEBI_30616 is one of the `cofactors`, so no row of the
# non-classified reactions should carry it (an empty frame is expected).
non_classified_reactions.loc[non_classified_reactions['chebi'].eq('CHEBI_30616')]

Unnamed: 0,reaction,reaction_side_order,chebi,smiles


In [36]:
# Shape of the array of distinct reactions that contain none of the listed
# cofactors.
non_classified_reactions['reaction'].unique().shape

(178,)

In [31]:
# Set of reactions with a row for CHEBI_30616 (noted as ATP in the cofactor list).
atp_rs = set(
    unique_reactions.loc[unique_reactions['chebi'].eq('CHEBI_30616'), 'reaction']
)
# Set of reactions with a row for CHEBI_456216 or CHEBI_456215 (noted as the
# ADP/AMP forms in the cofactor list).
adpamp_rs = set(
    unique_reactions.loc[
        unique_reactions['chebi'].isin(['CHEBI_456216', 'CHEBI_456215']),
        'reaction',
    ]
)

In [32]:
# Reactions that list ADP/AMP but have no ATP row at all.
adpamp_rs.difference(atp_rs)

{10040, 10412, 37059, 48612}