## Read in UniProt Sequence Data - GitEnterprise

In [1]:
#imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#read in the data for cleaning
#this data is saved in a compressed pickle file
protein = pd.read_pickle('data/gzip-reviewed-proteins-data.pkl', compression = 'gzip')

In [3]:
#view the read in data
protein.head(2)

Unnamed: 0,Protein names,Organism,Length,Sequence,Gene ontology (molecular function)
0,Replicase polyprotein 1ab (pp1ab) (ORF1ab poly...,Avian infectious bronchitis virus (strain M41)...,6631,MASSLKQGVSPKLRDVILVSKDIPEQLCDALFFYTSHNPKDYADAF...,ATP binding [GO:0005524]; cysteine-type endope...
1,CAD protein [Includes: Glutamine-dependent car...,Mus musculus (Mouse),2225,MAALVLEDGSVLQGRPFGAAVSTAGEVVFQTGMVGYPEALTDPSYK...,amino acid binding [GO:0016597]; aspartate car...


In [4]:
#look at the info about the data and the columns
protein.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404840 entries, 0 to 414320
Data columns (total 5 columns):
 #   Column                              Non-Null Count   Dtype   
---  ------                              --------------   -----   
 0   Protein names                       404840 non-null  category
 1   Organism                            404840 non-null  category
 2   Length                              404840 non-null  int32   
 3   Sequence                            404840 non-null  category
 4   Gene ontology (molecular function)  404840 non-null  category
dtypes: category(4), int32(1)
memory usage: 26.9 MB


In [5]:
#create dataframe of the classes
GO_function = protein[['Gene ontology (molecular function)','Sequence', 'Length', 'Organism', 'Protein names']]

In [6]:
#drop the null values in the new function dataframe
GO_function = GO_function.dropna(axis=0, how = 'any')

In [7]:
##contains is reading the brackets as regex -- make sure regex = False
GO_function['Gene ontology (molecular function)'] = GO_function['Gene ontology (molecular function)'].map(str)

In [8]:
#check shape
GO_function.shape

(404840, 5)

In [9]:
#start searching for groupings of keywords
GO_function.nunique()

Gene ontology (molecular function)     28473
Sequence                              338697
Length                                  3112
Organism                               11223
Protein names                          86637
dtype: int64

## Class 1 - rRNA Binding & Structural Constituent of the Ribosome

In [10]:
#narrowing down classes -- rRNA Binding & structual consituent of the ribosome CLASS 1
rRNA_binding = GO_function[GO_function['Gene ontology (molecular function)'].str.startswith(
    'rRNA binding [GO:0019843]; structural constituent of ribosome [GO:0003735]')]
rRNA_binding.shape

(26348, 5)

In [11]:
#contains what we want
rRNA_binding_contains = GO_function[GO_function['Gene ontology (molecular function)'].str.contains('rRNA binding [GO:0019843]; structural constituent of ribosome [GO:0003735]', regex=False)]
rRNA_binding_contains.shape

(27985, 5)

In [12]:
#create column for class
#rRNA binding = class 1
rRNA_binding_contains['class'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rRNA_binding_contains['class'] = 1


In [13]:
#the unique values
rRNA_binding_contains.nunique()

Gene ontology (molecular function)       41
Sequence                              20054
Length                                  303
Organism                               1410
Protein names                           571
class                                     1
dtype: int64

In [14]:
#searching for overlap in the classes
rRNA_binding_contains[rRNA_binding_contains['Gene ontology (molecular function)'].str.contains('DNA binding [GO:0003677]',regex=False)]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
70846,DNA binding [GO:0003677]; rRNA binding [GO:001...,MHVKKGDKVMVISGKDKGKQGTILAAFPKKDRVLVEGVNMVKKHSK...,103,Bacillus subtilis (strain 168),50S ribosomal protein L24 (12 kDa DNA-binding ...,1
232889,DNA binding [GO:0003677]; rRNA binding [GO:001...,MHVKKGDKVMVISGKDKGKQGTILAAFPKKDRVLVEGVNMVKKHSK...,103,Bacillus subtilis subsp. spizizenii (strain AT...,50S ribosomal protein L24,1
413798,DNA binding [GO:0003677]; nucleotide binding [...,MADNEEDVEAEEEYTELTDISGVGPSKAESLREAGFESVEDVRGAD...,241,Haloarcula marismortui (strain ATCC 43049 / DS...,50S ribosomal protein L32e (Hl5),1


In [15]:
#searching for overlap in the classes -- none with ATP
rRNA_binding_contains[rRNA_binding_contains['Gene ontology (molecular function)'].str.contains('ATP binding [GO:0005524]',regex=False)]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class


## Class 2 - DNA Binding

In [16]:
#narrowing down classes -- DNA binding CLASS 2
DNA_binding = GO_function[GO_function['Gene ontology (molecular function)'].str.startswith("DNA binding [GO:0003677]")]
DNA_binding.shape

(21886, 5)

In [17]:
#the dataframe of DNA binding that is 'contained' within the GO function columns
DNA_binding_contains = GO_function[GO_function['Gene ontology (molecular function)'].str.contains("DNA binding [GO:0003677]",regex=False)]
DNA_binding_contains.shape

(30043, 5)

In [18]:
#create class 2
DNA_binding_contains['class'] = 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DNA_binding_contains['class'] = 2


In [19]:
#the unique values
DNA_binding_contains.nunique()

Gene ontology (molecular function)     2553
Sequence                              24785
Length                                 1816
Organism                               3060
Protein names                          9878
class                                     1
dtype: int64

In [20]:
#check the class is there
DNA_binding_contains.head()

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
3,bacterial-type RNA polymerase core enzyme bind...,MSQNTLKVHDLNEDAEFDENGVEVFDEKALVEQEPSDNDLAEEELL...,330,Escherichia coli (strain K12),RNA polymerase sigma factor RpoS (Sigma S) (Si...,2
22,DNA binding [GO:0003677]; DNA polymerase bindi...,MTTETFVKDIKPGLKNLNLIFIVLETGRVTKTKDGHEVRTCKVADK...,211,Homo sapiens (Human),SOSS complex subunit B1 (Nucleic acid-binding ...,2
25,5'-3' DNA helicase activity [GO:0043139]; 5'-3...,MASSTVESFVAQQLQLLELERDAEVEERRSWQEHSSLRELQSRGVC...,993,Mus musculus (Mouse),DNA-binding protein SMUBP-2 (EC 3.6.4.12) (EC ...,2
179,DNA binding [GO:0003677]; DNA-directed 5'-3' R...,MLRNGNEGMSTIPGFSQIQFEGFCRFINQGLAEELEKFPTIKDPDH...,1075,Saccharum officinarum (Sugarcane),DNA-directed RNA polymerase subunit beta (EC 2...,2
186,DNA binding [GO:0003677]; DNA-directed 5'-3' R...,MNIFFQRKLVKYFGNIKKKKHYLTVSEECNHYVFVKKTYVSSVFLK...,3120,Chlamydomonas reinhardtii (Chlamydomonas smithii),DNA-directed RNA polymerase subunit beta'' (EC...,2


In [21]:
DNA_binding['Gene ontology (molecular function)']

22        DNA binding [GO:0003677]; DNA polymerase bindi...
179       DNA binding [GO:0003677]; DNA-directed 5'-3' R...
186       DNA binding [GO:0003677]; DNA-directed 5'-3' R...
188       DNA binding [GO:0003677]; DNA-binding transcri...
190       DNA binding [GO:0003677]; DNA-directed 5'-3' R...
                                ...                        
414261                             DNA binding [GO:0003677]
414285                             DNA binding [GO:0003677]
414302                             DNA binding [GO:0003677]
414309                             DNA binding [GO:0003677]
414311    DNA binding [GO:0003677]; DNA-binding transcri...
Name: Gene ontology (molecular function), Length: 21886, dtype: category
Categories (28473, object): ['(+)-abscisic acid 8'-hydroxylase activity [GO..., '(+)-abscisic acid 8'-hydroxylase activity [GO..., '(+)-abscisic acid 8'-hydroxylase activity [GO..., '(+)-abscisic acid D-glucopyranosyl ester tran..., ..., 'zeatin O-beta-D-xylosyltrans

In [22]:
#the overlap between DNA binding and ATP binding proteins
DNA_binding_contains[DNA_binding_contains['Gene ontology (molecular function)'].str.contains('ATP binding [GO:0005524]',regex=False)]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
25,5'-3' DNA helicase activity [GO:0043139]; 5'-3...,MASSTVESFVAQQLQLLELERDAEVEERRSWQEHSSLRELQSRGVC...,993,Mus musculus (Mouse),DNA-binding protein SMUBP-2 (EC 3.6.4.12) (EC ...,2
527,"4 iron, 4 sulfur cluster binding [GO:0051539];...",MPESLIAGIPVHFPFEPYPVQRAYMEKVIQCLRDGTNGVLESPTGT...,985,Drosophila yakuba (Fruit fly),Regulator of telomere elongation helicase 1 ho...,2
528,ATP binding [GO:0005524]; DNA binding [GO:0003...,MIASVRGEVLEIALDHAVIESAGVGYRVNATPATLGGLQRGTEARL...,200,Rhodococcus opacus (strain B4),Holliday junction ATP-dependent DNA helicase R...,2
529,ATP binding [GO:0005524]; DNA binding [GO:0003...,MHKDEDQRLLGAVPLPNDPDRSLRPQVLDDFIGQEAARANLKIFIE...,361,Bartonella henselae (strain ATCC 49882 / DSM 2...,Holliday junction ATP-dependent DNA helicase R...,2
532,ATP binding [GO:0005524]; DNA binding [GO:0003...,MYEYINGLITNIYPAYLVIADRSGVGYKLFVANPYRFEQNVESHVY...,196,Leuconostoc citreum (strain KM20),Holliday junction ATP-dependent DNA helicase R...,2
...,...,...,...,...,...,...
414122,ATP binding [GO:0005524]; DNA binding [GO:0003...,MSNDTLHKYEALPEDHRNVALRPCLIEEFVGQTEVIKNLKVFIQSA...,331,Anaplasma marginale (strain Florida),Holliday junction ATP-dependent DNA helicase R...,2
414124,ATP binding [GO:0005524]; DNA binding [GO:0003...,MIEADRLISAAVINDEESIDRAIRPKLLTEYVGQPHVREQMEIFIQ...,334,Yersinia pseudotuberculosis serotype IB (strai...,Holliday junction ATP-dependent DNA helicase R...,2
414129,ATP binding [GO:0005524]; DNA binding [GO:0003...,MYEYFEGIISEVTPSYVVVDVNGIGYKVFSPTPFAYKQGQKAKVYI...,195,Lactobacillus acidophilus (strain ATCC 700396 ...,Holliday junction ATP-dependent DNA helicase R...,2
414140,ATP binding [GO:0005524]; DNA binding [GO:0003...,MIGRLRGIILEKQPPLVLLETNGVGYEVQLPMTCFYELPELGQEAI...,204,Yersinia pseudotuberculosis serotype IB (strai...,Holliday junction ATP-dependent DNA helicase R...,2


## Class 3 - ATP Binding

In [23]:
#narrowing down classes -- ATP Binding CLASS 3
#this dataframe contains the proteins that startwith ATP binding [GO:0005524]
ATP_binding = GO_function[GO_function['Gene ontology (molecular function)'].str.startswith("ATP binding [GO:0005524]")]
ATP_binding.shape

(59297, 5)

In [24]:
#this dataframe is the sequences which 'contain' ATP binding [GO:0005524] function
ATP_binding_contains = GO_function[GO_function['Gene ontology (molecular function)'].str.contains("ATP binding [GO:0005524]", regex=False)]
ATP_binding_contains.shape

(81057, 5)

In [25]:
#create the class for ATP binding
ATP_binding_contains['class'] = 3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ATP_binding_contains['class'] = 3


In [26]:
#check to make sure that the class added correctly
ATP_binding_contains.nunique()

Gene ontology (molecular function)     4745
Sequence                              68894
Length                                 2262
Organism                               3454
Protein names                         10675
class                                     1
dtype: int64

In [27]:
#check to make sure it added
ATP_binding_contains.head()

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
0,ATP binding [GO:0005524]; cysteine-type endope...,MASSLKQGVSPKLRDVILVSKDIPEQLCDALFFYTSHNPKDYADAF...,6631,Avian infectious bronchitis virus (strain M41)...,Replicase polyprotein 1ab (pp1ab) (ORF1ab poly...,3
1,amino acid binding [GO:0016597]; aspartate car...,MAALVLEDGSVLQGRPFGAAVSTAGEVVFQTGMVGYPEALTDPSYK...,2225,Mus musculus (Mouse),CAD protein [Includes: Glutamine-dependent car...,3
12,ATP binding [GO:0005524]; identical protein bi...,MEHGSSRGFIWLILFLDFVSRVTGKTQVDALIALRSSLSSGDHTNN...,601,Arabidopsis thaliana (Mouse-ear cress),Somatic embryogenesis receptor kinase 5 (AtSER...,3
16,ATP binding [GO:0005524]; MAP kinase activity ...,MASATSTPTIADGNSNKESVATSRSPHTHDLNFELPEEYEMINLIG...,372,Schizosaccharomyces pombe (strain 972 / ATCC 2...,Mitogen-activated protein kinase spk1 (MAP kin...,3
25,5'-3' DNA helicase activity [GO:0043139]; 5'-3...,MASSTVESFVAQQLQLLELERDAEVEERRSWQEHSSLRELQSRGVC...,993,Mus musculus (Mouse),DNA-binding protein SMUBP-2 (EC 3.6.4.12) (EC ...,3


In [28]:
#the number of proteins which are atp and dna binders -- class overlap
ATP_binding[ATP_binding['Gene ontology (molecular function)'].str.contains('DNA binding [GO:0003677]',regex=False)]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names
528,ATP binding [GO:0005524]; DNA binding [GO:0003...,MIASVRGEVLEIALDHAVIESAGVGYRVNATPATLGGLQRGTEARL...,200,Rhodococcus opacus (strain B4),Holliday junction ATP-dependent DNA helicase R...
529,ATP binding [GO:0005524]; DNA binding [GO:0003...,MHKDEDQRLLGAVPLPNDPDRSLRPQVLDDFIGQEAARANLKIFIE...,361,Bartonella henselae (strain ATCC 49882 / DSM 2...,Holliday junction ATP-dependent DNA helicase R...
532,ATP binding [GO:0005524]; DNA binding [GO:0003...,MYEYINGLITNIYPAYLVIADRSGVGYKLFVANPYRFEQNVESHVY...,196,Leuconostoc citreum (strain KM20),Holliday junction ATP-dependent DNA helicase R...
534,ATP binding [GO:0005524]; DNA binding [GO:0003...,MERIISELEMPNEIEIQKSLRPKSFDEYIGQENLKEKMSISIKAAQ...,332,Fusobacterium nucleatum subsp. nucleatum (stra...,Holliday junction ATP-dependent DNA helicase R...
538,ATP binding [GO:0005524]; DNA binding [GO:0003...,MIGKISGILDFRGPDHVLIDVRGVGYIVHVSDRTLAAMPAPGEGVA...,224,Rhodobacter sphaeroides (strain ATCC 17025 / A...,Holliday junction ATP-dependent DNA helicase R...
...,...,...,...,...,...
414122,ATP binding [GO:0005524]; DNA binding [GO:0003...,MSNDTLHKYEALPEDHRNVALRPCLIEEFVGQTEVIKNLKVFIQSA...,331,Anaplasma marginale (strain Florida),Holliday junction ATP-dependent DNA helicase R...
414124,ATP binding [GO:0005524]; DNA binding [GO:0003...,MIEADRLISAAVINDEESIDRAIRPKLLTEYVGQPHVREQMEIFIQ...,334,Yersinia pseudotuberculosis serotype IB (strai...,Holliday junction ATP-dependent DNA helicase R...
414129,ATP binding [GO:0005524]; DNA binding [GO:0003...,MYEYFEGIISEVTPSYVVVDVNGIGYKVFSPTPFAYKQGQKAKVYI...,195,Lactobacillus acidophilus (strain ATCC 700396 ...,Holliday junction ATP-dependent DNA helicase R...
414140,ATP binding [GO:0005524]; DNA binding [GO:0003...,MIGRLRGIILEKQPPLVLLETNGVGYEVQLPMTCFYELPELGQEAI...,204,Yersinia pseudotuberculosis serotype IB (strai...,Holliday junction ATP-dependent DNA helicase R...


## Class 4 - Hormone Activity

In [29]:
#cotnains the hormone activity [GO:0005179]
HORMONE_activity = GO_function[GO_function['Gene ontology (molecular function)'].str.contains('hormone activity [GO:0005179]', regex = False)]
HORMONE_activity.shape

(1715, 5)

In [30]:
#create the class label for class 4
HORMONE_activity['class'] = 4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HORMONE_activity['class'] = 4


In [31]:
#check to make sure it added
HORMONE_activity.nunique()

Gene ontology (molecular function)     170
Sequence                              1522
Length                                 291
Organism                               440
Protein names                          904
class                                    1
dtype: int64

In [32]:
#check to make sure it added correctly
HORMONE_activity.head()

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
8,digestive hormone activity [GO:0046659]; hormo...,MEPPLPTPMLLLLLLLLSSSAALPAPPRTPRHSDGMFTSELSRLQD...,133,Mus musculus (Mouse),Secretin,4
659,hormone activity [GO:0005179],AGCKNFFWKTFTSC,14,Anguilla anguilla (European freshwater eel) (M...,Somatostatin-1 (Somatostatin I),4
670,hormone activity [GO:0005179]; metal ion bindi...,MATGSRTSLLLAFTLLCLPQLKEAGAFPTIPLSRLLDNAMLRAHRL...,217,Saimiri boliviensis boliviensis (Bolivian squi...,Somatotropin (Growth hormone),4
1208,hormone activity [GO:0005179]; identical prote...,MAFHSLLLLGLASLLFVSDAAPVIHGAEDSKCPLMVKVLDAVRGSP...,149,Monodelphis domestica (Gray short-tailed opossum),Transthyretin (Prealbumin),4
1210,hormone activity [GO:0005179]; thyroid hormone...,MAFHSLLLLCLAGLVFLSEAGPVAHGAEDSKCPLMVKVLDSVRGSP...,149,Sminthopsis macroura (Stripe-faced dunnart),Transthyretin (Prealbumin),4


In [33]:
#hormone activity
general_hormone = GO_function[GO_function['Gene ontology (molecular function)'].str.contains('hormone activity')]
general_hormone.shape

(2195, 5)

## Class 5 - GTPase Activity

In [34]:
#the dataframe for protein functions with GTPase activity [GO:0003924]
GTPase_activity = GO_function[GO_function['Gene ontology (molecular function)'].str.contains('GTPase activity [GO:0003924]', regex=False)]
GTPase_activity.shape

(7063, 5)

In [35]:
#add the label for class 5
GTPase_activity['class'] = 5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GTPase_activity['class'] = 5


In [36]:
#check to make sure that the class added correctly
GTPase_activity.nunique()

Gene ontology (molecular function)     296
Sequence                              5892
Length                                 808
Organism                              1261
Protein names                          645
class                                    1
dtype: int64

In [37]:
#check to make sure it added correctly
GTPase_activity.head()

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
429,GTPase activity [GO:0003924]; GTP binding [GO:...,MSKLSHQQQRRIHNHRQNKLAAGDRADAALVVAHLGYQLIVDDHGE...,316,Dichelobacter nodosus (strain VCS1703A),Small ribosomal subunit biogenesis GTPase RsgA...,5
440,GTPase activity [GO:0003924]; GTP binding [GO:...,MAKRQLNRRQNWRIEKIQNERAARAAKREQHALQELEGGDLGPEQL...,343,Pseudomonas putida (strain W619),Small ribosomal subunit biogenesis GTPase RsgA...,5
618,GTPase activity [GO:0003924]; GTP binding [GO:...,MENLISLVNKIQRACTALGDHGENSALPTLWDSLPAIAVVGGQSSG...,610,Glycine max (Soybean) (Glycine hispida),Dynamin-related protein 5A (Soybean dynamin-li...,5
658,7S RNA binding [GO:0008312]; GTPase activity [...,MVLAQLGGSISRALAQMSNATVIDEKVLGECLNEISRALLQSDVQF...,497,Hordeum vulgare (Barley),Signal recognition particle 54 kDa protein 2 (...,5
733,7S RNA binding [GO:0008312]; GTPase activity [...,MIGFADRLAEITKKIKGASIIDEDFVKEVVRDVQRALLEADVDVKL...,447,Methanopyrus kandleri (strain AV19 / DSM 6324 ...,Signal recognition particle 54 kDa protein (SR...,5


## Class 6 - NADH Dehydrogenase

In [38]:
#dataframe for quinone activity 'GO:0050136'
quinone_activity = GO_function[GO_function['Gene ontology (molecular function)'].str.contains('NADH dehydrogenase (quinone) activity [GO:0050136]', regex=False)]
quinone_activity.shape

(1061, 5)

In [39]:
#dataframe for ubiquinone activity [GO:0008137]
ubiquinone_activity = GO_function[GO_function['Gene ontology (molecular function)'].str.contains('NADH dehydrogenase (ubiquinone) activity [GO:0008137]', regex=False)]
ubiquinone_activity.shape

(4115, 5)

In [40]:
#concat the NADH dehydrogenase quinone and ubiquinone dataframes into one
NADH_activity = pd.concat([quinone_activity, ubiquinone_activity])
NADH_activity.shape

(5176, 5)

In [41]:
#add the class 6 to the dataframe
NADH_activity['class'] = 6 

In [42]:
#check to make sure it was added correctly
NADH_activity.nunique()

Gene ontology (molecular function)      57
Sequence                              4241
Length                                 563
Organism                              1408
Protein names                          240
class                                    1
dtype: int64

In [43]:
#check to make sure
NADH_activity.head()

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
14857,NAD binding [GO:0051287]; NADH dehydrogenase (...,MSESEIPGQVVVDLEADQSAEGGRRMVLNMGPQHPSTHGVLRLLME...,409,Solibacter usitatus (strain Ellin6076),NADH-quinone oxidoreductase subunit D 1 (EC 7....,6
14869,"4 iron, 4 sulfur cluster binding [GO:0051539];...",MANPIEEILGTAAAIAKGMGITFKEMMGPTVTDDYPDAPPKFEERF...,167,Solibacter usitatus (strain Ellin6076),NADH-quinone oxidoreductase subunit I 1 (EC 7....,6
14871,NAD binding [GO:0051287]; NADH dehydrogenase (...,MTEHNVRNFNINFGPQHPAAHGVLRLVLELDGEIVERVDPHIGLLH...,396,Rhizobium etli (strain CFN 42 / ATCC 51251),NADH-quinone oxidoreductase subunit D 1 (EC 7....,6
14878,"4 iron, 4 sulfur cluster binding [GO:0051539];...",MIGWLEAMLRVGRKLFVKAETQLYPEEKPKLFPRSRGRIVLTRDPD...,171,Rhodopseudomonas palustris (strain HaA2),NADH-quinone oxidoreductase subunit I 1 (EC 7....,6
14886,NAD binding [GO:0051287]; NADH dehydrogenase (...,MPEGALRNFTINFGPQHPAAHGVLRLVLELDGEIVERVDPHIGLLH...,396,Rhodopseudomonas palustris (strain BisB18),NADH-quinone oxidoreductase subunit D (EC 7.1....,6


In [44]:
NADH_activity[NADH_activity['Gene ontology (molecular function)'].str.contains("oxidoreductase activity [GO:0016491]")]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class


## Class 7 - Oxidoreductase Activity

In [45]:
#oxidioreductase activity
general_oxido = GO_function[GO_function['Gene ontology (molecular function)'].str.contains("oxidoreductase activity")]
general_oxido.shape

(10846, 5)

In [46]:
#dataframe of the function for oxidoreductase activity [GO:0016491]
#contains that specific GO function
oxidoreductase_activity = GO_function[GO_function['Gene ontology (molecular function)'].str.contains("oxidoreductase activity [GO:0016491]", regex=False)]
oxidoreductase_activity.shape

(3538, 5)

In [47]:
#add the class 7 to the dataframe
oxidoreductase_activity['class'] = 7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oxidoreductase_activity['class'] = 7


In [48]:
#check to make sure the class was added correctly
oxidoreductase_activity.nunique()

Gene ontology (molecular function)     571
Sequence                              3314
Length                                 867
Organism                               937
Protein names                         2362
class                                    1
dtype: int64

In [49]:
#check to make sure 7 was added
oxidoreductase_activity.head()

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
97,oxidoreductase activity [GO:0016491],MLVISANEQRNLVNMNEVIAYAALALKEFSAERTITPIRGSLPFAN...,325,Bacillus thuringiensis subsp. konkukian (strai...,Delta(1)-pyrroline-2-carboxylate reductase (Py...,7
626,electron transfer activity [GO:0009055]; flavi...,MLSSALKLTKKVCSTKSNGLIRSFSTQTQSRDYAVVDHTYDAIVVG...,626,Dictyostelium discoideum (Slime mold),Succinate dehydrogenase [ubiquinone] flavoprot...,7
735,oxidoreductase activity [GO:0016491],MVVVAVAGGTGGVGRTVLDAIAKSGQHQAIVLSRTTSVPTAVDEPK...,341,Metarhizium robertsii (strain ARSEF 23 / ATCC ...,Oxidoreductase swnN (EC 1.3.1.-) (Swainsonine ...,7
972,oxidoreductase activity [GO:0016491],MGEDFMHPPFQTYPSKNSEGKKHIVIVGGGIIGCCTAYYLTQHPSF...,523,Saccharomyces cerevisiae (strain ATCC 204508 /...,Putative oxidoreductase TDA3 (EC 1.-.-.-) (Bat...,7
1076,oxidoreductase activity [GO:0016491],MTVSSSIVPPGGLVLVTGVTGFIGSYIANGLLELGYRVRGTVRSSE...,340,Aspergillus terreus (strain NIH 2624 / FGSC A1...,NAD-dependent epimerase/dehydratase terH (EC 1...,7


In [50]:
#overlap with NADH or NAD
oxidoreductase_activity[oxidoreductase_activity['Gene ontology (molecular function)'].str.contains('NAD+')]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
3080,NAD(P)H nitroreductase activity [GO:0018545]; ...,MTQLTREQVLELFHQRSSTRYYDPTKKISDEDFECILECGRLSPSS...,220,Haemophilus influenzae (strain ATCC 51907 / DS...,Putative NAD(P)H nitroreductase (EC 1.-.-.-),7
8482,2-octaprenyl-6-methoxyphenol hydroxylase activ...,MSVIIVGGGMAGATLALAISRLSHGALPVHLIEATAPESHAHPGFD...,392,Escherichia coli (strain K12),2-octaprenyl-6-methoxyphenol hydroxylase (EC 1...,7
13262,alcohol dehydrogenase (NADP+) activity [GO:000...,MVPKFYKLSNGFKIPSIALGTYDIPRSQTAEIVYEGVKCGYRHFDT...,282,Saccharomyces cerevisiae (strain ATCC 204508 /...,Uncharacterized oxidoreductase YJR096W (EC 1.-...,7
14875,actin monomer binding [GO:0003785]; arginine b...,MGNLKSVAQEPGPPCGLGLGLGLGLCGKQGPATPAPEPSRAPASLL...,1203,Homo sapiens (Human),"Nitric oxide synthase, endothelial (EC 1.14.13...",7
15272,metal ion binding [GO:0046872]; NAD binding [G...,MINNKPIIGIPIGDPAGVGPEIVVKSLTEAEVYEKCNPILIGDAKV...,334,Clostridium botulinum (strain Hall / ATCC 3502...,Putative D-threonate 4-phosphate dehydrogenase...,7
...,...,...,...,...,...,...
405155,calmodulin binding [GO:0005516]; flavin adenin...,MLCPWQFAFKPHAVKNQSSEEKDINNNVEKDVKVHSFVKDDAKLHS...,1136,Gallus gallus (Chicken),"Nitric oxide synthase, inducible (EC 1.14.13.3...",7
410428,NAD binding [GO:0051287]; NADP binding [GO:005...,MKTGSEFHVGIVGLGSMGMGAALSCVRAGLSTWGADLNSNACATLK...,302,Escherichia coli O6:H1 (strain CFT073 / ATCC 7...,L-threonate dehydrogenase (EC 1.1.1.411),7
410956,electron transfer activity [GO:0009055]; flavi...,MAQLDTLDLVVLAVLLVGSVAYFTKGTYWAVAKDPYASTGPAMNGA...,694,Aspergillus niger,NADPH--cytochrome P450 reductase (CPR) (P450R)...,7
411664,metal ion binding [GO:0046872]; NAD binding [G...,MQRPIIAIPMGDPAGVGPEIVVKALANEEMYRIARPLVIGDAGVLR...,333,Heliobacterium modesticaldum (strain ATCC 5154...,D-erythronate 4-phosphate dehydrogenase (EC 1....,7


## Class 8 - Toxin Acitivity

In [51]:
GO_function[GO_function['Gene ontology (molecular function)'].str.contains("toxin activity")].value_counts().sum()

7051

In [52]:
#create the toxin activity [GO:0090729] dataframe
toxin_activity = GO_function[GO_function['Gene ontology (molecular function)'].str.contains("toxin activity [GO:0090729]", regex=False)]
toxin_activity.shape

(7051, 5)

In [53]:
#check to make sure that the 
toxin_activity['class'] = 8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  toxin_activity['class'] = 8


In [54]:
#check to make sure it was added correctly
toxin_activity.nunique()

Gene ontology (molecular function)     196
Sequence                              6818
Length                                 580
Organism                              1029
Protein names                         6686
class                                    1
dtype: int64

In [55]:
#check to make sure class 8 was added correctly
toxin_activity.head()

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
588,sodium channel inhibitor activity [GO:0019871]...,KEGYIVNYYTGCKFACAKLGDNDYCLRECKARYGKGAGGYCYAFGC...,67,Centruroides noxius (Mexican scorpion),Beta-toxin Cn8 (Toxin 8) (Toxin II-13.4),8
598,sodium channel inhibitor activity [GO:0019871]...,KEGYLVNSYTGCKFECFKLGDNDYCKRECKQQYGKGSGGYCYAFGC...,66,Centruroides suffusus (Durango bark scorpion),Toxin Css8 (Beta-neurotoxin CssVIII) (Css VIII),8
610,sodium channel inhibitor activity [GO:0019871]...,VRDGYIMIKDTNCKFSCNIFKKWEYCSPLCQSKGAETGYCYNFGCW...,67,Androctonus crassicauda (Arabian fat-tailed sc...,Putative sodium channel alpha-toxin Acra5,8
611,sodium channel inhibitor activity [GO:0019871]...,MTRFVLFLSCFFLIGMVVECKDGYLVGDDGCKMHCFTRPGHYCASE...,86,Tityus discrepans (Venezuelan scorpion),Toxin Td8 (PT-beta* NaTx14.6),8
614,sodium channel inhibitor activity [GO:0019871]...,VRDGYIALPHNCAYGCLNNEYCNNLCTKDGAKIGYCNIVGKYGNAC...,66,Mesobuthus martensii (Manchurian scorpion) (Bu...,Alpha-like toxin BmK-M7 (BmKM7) (Bmk M7),8


In [56]:
#direct overlap
toxin_activity[toxin_activity['Gene ontology (molecular function)'].str.contains('ATP binding [GO:0005524]',regex=False)]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
16293,acid-amino acid ligase activity [GO:0016881]; ...,MATLAYSIEVEGLEDETLVVRGFHGQESLSNSVFLGQACYGFRYEV...,1163,Vibrio cholerae serotype O1 (strain ATCC 39315...,Actin cross-linking toxin VgrG1 (EC 6.3.2.-),8
117100,acid-amino acid ligase activity [GO:0016881]; ...,MATLAYSIEVEGLEDETLVVRGFHGQESLSNSVFLGQACYGFRYEV...,1095,Vibrio cholerae serotype O1 (strain ATCC 39541...,Actin cross-linking toxin VgrG1 (EC 6.3.2.-),8
154509,acid-amino acid ligase activity [GO:0016881]; ...,MVFYLIPKRRVWLMGKPFWRSVEYFFTGNYSADDGNNNIVAIGFGG...,4558,Vibrio cholerae serotype O1 (strain ATCC 39315...,Multifunctional-autoprocessing repeats-in-toxi...,8
190083,ATPase activity [GO:0016887]; ATP binding [GO:...,MFQFHLTSKAKKVIELYAQEEAKRLNHDMVTPEHILLGLLYESEAL...,828,Brachyspira hyodysenteriae (Treponema hyodysen...,Hemolysin B,8
298112,ATP binding [GO:0005524]; calcium- and calmodu...,MQQSHQAGYANAADRESGIPAAVLDGIKAVAKEKNATLMFRLVNPH...,1706,Bordetella bronchiseptica (strain ATCC BAA-588...,Bifunctional hemolysin/adenylate cyclase (AC-H...,8
368832,adenylate cyclase activity [GO:0004016]; ATP b...,MTRNKFIPNKFSIISFSVLLFAISSSQAIEVNAMNEHYTESDIKRN...,800,Bacillus anthracis,Calmodulin-sensitive adenylate cyclase (EC 4.6...,8
377124,ATP binding [GO:0005524]; calcium- and calmodu...,MQQSHQAGYANAADRESGIPAAVLDGIKAVAKEKNATLMFRLVNPH...,1706,Bordetella pertussis (strain ATCC 9797 / DSM 5...,Bifunctional hemolysin/adenylate cyclase (AC-H...,8
396434,adenylate cyclase activity [GO:0004016]; ATP b...,MQQSHQAGYANAADRESGIPAAVLDGIKAVAKEKNATLMFRLVNPH...,1706,Bordetella pertussis (strain Tohama I / ATCC B...,Bifunctional hemolysin/adenylate cyclase (AC-H...,8


In [57]:
toxin_activity[toxin_activity['Gene ontology (molecular function)'].str.contains('DNA binding [GO:0003677]',regex=False)]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
35447,DNA binding [GO:0003677]; protein serine/threo...,MSELTDLLLQGPRSAPELRQRLAISQATFSRLVAREDRVIRFGKAR...,443,Escherichia coli (strain K12),Toxin YjjJ (Putative DNA-binding transcription...,8
241877,DNA binding [GO:0003677]; flavin adenine dinuc...,MNDFLLLLLVLFLGVPRSENHVINLEECFQEPEYENWLATASHGLT...,491,Ophiophagus hannah (King cobra) (Naja hannah),L-amino-acid oxidase (LAO) (Oh-LAAO) (EC 1.4.3.2),8


In [58]:
toxin_activity[toxin_activity['Gene ontology (molecular function)'].str.contains('NADH dehydrogenase (quinone) activity [GO:0050136]',regex=False)]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class


## Create Final DataFrame for the Chosen Classes

In [59]:
#use the identified classes to parse the protein dataframe and grab the rows desired
#concat the dataframes together
final_df = pd.concat([rRNA_binding_contains, DNA_binding_contains, ATP_binding_contains, 
                     HORMONE_activity, GTPase_activity, NADH_activity, oxidoreductase_activity,
                     toxin_activity], ignore_index=True)

In [60]:
#has the correct number of rows based on number of class observations
#highly imbalanced classes -- deal with in modeling
final_df.shape

(163628, 6)

In [61]:
#the first rows of the dataframe
final_df.head()

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
0,rRNA binding [GO:0019843]; structural constitu...,MNVILLDKIANLGSLGDQVSVKSGYARNFLFPQGKAVPATKSNVDL...,150,Idiomarina loihiensis (strain ATCC BAA-735 / D...,50S ribosomal protein L9,1
1,rRNA binding [GO:0019843]; structural constitu...,MQLYDFYKKNVLIKLKNKFNYKSIMQVPKIEKITLNMGVGKASFDK...,178,Wigglesworthia glossinidia brevipalpis,50S ribosomal protein L5,1
2,rRNA binding [GO:0019843]; structural constitu...,MELKLLNDQGQAASNVAAPDTIFGRDYNEALIHQVVVAYQANARSG...,206,Janthinobacterium sp. (strain Marseille) (Mini...,50S ribosomal protein L4,1
3,rRNA binding [GO:0019843]; structural constitu...,MNVILLDKIANLGNLGDQVAVKAGYARNYLLPQGKAVVANESNVKV...,150,Shewanella sp. (strain MR-7),50S ribosomal protein L9,1
4,rRNA binding [GO:0019843]; structural constitu...,MSRVAKAPVNIPAGVEVKLDGQLLTVKGKNGELSRKIHESVEVKQD...,177,Haemophilus influenzae (strain PittGG),50S ribosomal protein L6,1


## Cleaning Final Dataframe

In [62]:
#the number of unique organisms in the dataframe
#range from prokaryotes to eukarotes
final_df['Organism'].nunique()

6638

In [63]:
#as we have closely related organisms we have duplicate names
final_df['Protein names'].nunique()

30938

In [64]:
#some of the value counts for the protein names
final_df['Protein names'].value_counts().head()

Holliday junction ATP-dependent DNA helicase RuvA (EC 3.6.4.12)    1459
Holliday junction ATP-dependent DNA helicase RuvB (EC 3.6.4.12)    1408
Transcriptional repressor NrdR                                     1255
50S ribosomal protein L2                                            862
30S ribosomal protein S19                                           859
Name: Protein names, dtype: int64

In [65]:
#as we have thousands of organisms that are closely related there are duplicate sequences for a protein within the dataframe
#to avoid overfitting to a frequently occurring dataframe we are going to drop the duplicate values present in the data
final_df['Sequence'].nunique()

130982

In [66]:
#an exaple of a duplicate sequence 
#as we can see they are from closely related organisms -- drop these duplicates from the data
final_df[final_df['Sequence'] == 'MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKTTVIALREIEEGLINNQILDVRERQEQQEQEAAELQAVTAIAEGRR'].head()

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
28547,DNA binding [GO:0003677]; DNA-directed 5'-3' R...,MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKT...,91,Escherichia coli (strain K12),DNA-directed RNA polymerase subunit omega (RNA...,2
29446,DNA binding [GO:0003677]; DNA-directed 5'-3' R...,MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKT...,91,Citrobacter koseri (strain ATCC BAA-895 / CDC ...,DNA-directed RNA polymerase subunit omega (RNA...,2
29519,DNA binding [GO:0003677]; DNA-directed 5'-3' R...,MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKT...,91,Escherichia coli O9:H4 (strain HS),DNA-directed RNA polymerase subunit omega (RNA...,2
29920,DNA binding [GO:0003677]; DNA-directed 5'-3' R...,MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKT...,91,Escherichia coli (strain ATCC 8739 / DSM 1576 ...,DNA-directed RNA polymerase subunit omega (RNA...,2
30084,DNA binding [GO:0003677]; DNA-directed 5'-3' R...,MARVTVQDAVEKIGNRFDLVLVAARRARQMQVGGKDPLVPEENDKT...,91,Shigella dysenteriae serotype 1 (strain Sd197),DNA-directed RNA polymerase subunit omega (RNA...,2


In [67]:
#dropping the duplicates from the sequence column
final_df = final_df.drop_duplicates(['Sequence'])
final_df.shape

(130982, 6)

In [68]:
#now we don't have any duplicates
final_df['Sequence'].value_counts().sum()

130982

In [69]:
#check the value counts for the classes -- there should be 8 classes
final_df['class'].nunique()

8

In [70]:
#drop this row from the data frame
final_df[final_df['Length']==35213]

Unnamed: 0,Gene ontology (molecular function),Sequence,Length,Organism,Protein names,class
63055,actin filament binding [GO:0051015]; actinin b...,"!!! TO LONG IT DOES NOT FIT IN EXCELL, NOT REP...",35213,Mus musculus (Mouse),Titin (EC 2.7.11.1) (Connectin),3


In [71]:
#drop row 63055
final_df.drop(axis=0, index = 63055, inplace=True)

In [72]:
#the max length is 18,562 -- case int32 to an int16 dtype to save memory (Int16 = -32768 to 32767)
final_df['Length'].max()

18562

In [73]:
#cast both class and length numeric columns to int16
final_df['Length'] = final_df['Length'].astype('int16')
final_df['class'] = final_df['class'].astype('int16')

In [74]:
#get the memory of the final dataframe in MBs
final_df.memory_usage(deep=True).sum()/1_000_000

190.184898

In [78]:
#the object types of the columns in the final dataframe
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130981 entries, 0 to 163627
Data columns (total 6 columns):
 #   Column                              Non-Null Count   Dtype   
---  ------                              --------------   -----   
 0   Gene ontology (molecular function)  130981 non-null  category
 1   Sequence                            130981 non-null  category
 2   Length                              130981 non-null  int16   
 3   Organism                            130981 non-null  category
 4   Protein names                       130981 non-null  category
 5   class                               130981 non-null  int16   
dtypes: category(4), int16(2)
memory usage: 20.6 MB


In [79]:
#cast the columns from categories to objects to save space in memory
columns = ['Gene ontology (molecular function)','Sequence', 'Organism', 'Protein names']
for col in columns:
    final_df[col] = final_df[col].astype('object')

In [85]:
#get the final memory of the dataframe in MBs
final_df.memory_usage(deep=True).sum()/1_000_000

118.351587

In [86]:
#save the file as a compressed pickle
final_df.to_pickle('data/compressed-class-separated.pkl', compression = 'gzip', protocol = 4)