# Extract only gene or gene product entities

Begin by cleaning the data and extracting only the entities considered genes or gene products. PubTator labels these entities as "Gene", whereas scispacy labels them as "GENE_OR_GENE_PRODUCT".

In [67]:
import pandas as pd

In [100]:
# Input files: one table of tagged abstract entities per tool
scispacy_entities_path = "../results/scispacy_abstract_entities.csv"
pubtator_entities_path = "../results/pubtator_abstract_entities.csv"

In [101]:
# Load the entity tables produced by each tool.
# The PubTator file is parsed as tab-delimited (despite its .csv extension).
pubtator_entities = pd.read_csv(pubtator_entities_path, sep="\t")
# The scispacy file carries the saved DataFrame index as an unnamed first
# column; read it back as the index instead of loading it as a junk
# "Unnamed: 0" column and dropping it afterwards.
scispacy_entities = pd.read_csv(scispacy_entities_path, index_col=0)

In [102]:
# Align scispacy's label column name with PubTator's ("entity-type")
scispacy_entities = scispacy_entities.rename(columns={"entity-labels": "entity-type"})

In [103]:
# Display the PubTator table to check that its columns (entity, entity-type, pmid) loaded as expected
pubtator_entities

Unnamed: 0,entity,entity-type,pmid
0,hallmark of heart failure,Disease,34806902
1,murine,Species,34806902
2,ADAMTS5,Gene,34806902
3,a disintegrin and metalloproteinase with throm...,Gene,34806902
4,ADAMTS5,Gene,34806902
...,...,...,...
141,transgelin,Gene,29484645
142,human,Species,29133944
143,death,Disease,29133944
144,human,Species,29133944


In [104]:
# Display the scispacy table to check that its columns (entity, entity-type, pmid) match the PubTator table
scispacy_entities

Unnamed: 0,entity,entity-type,pmid
0,extracellular matrix,CELLULAR_COMPONENT,34806902
1,ECM,CELLULAR_COMPONENT,34806902
2,heart,ORGAN,34806902
3,murine cardiac fibroblasts,ORGANISM,34806902
4,ADAMTS5,GENE_OR_GENE_PRODUCT,34806902
...,...,...,...
195,cell surface,CELLULAR_COMPONENT,29133944
196,heart,ORGAN,29133944
197,mitochondrial,CELLULAR_COMPONENT,29133944
198,heart,ORGAN,29133944


In [105]:
# Keep only the rows each tool tagged as a gene / gene product.
# PubTator labels these "Gene"; scispacy labels them "GENE_OR_GENE_PRODUCT".
is_pubtator_gene = pubtator_entities["entity-type"].eq("Gene")
pubtator_entities_proteins = pubtator_entities.loc[is_pubtator_gene]

# NOTE: the variable name is spelled "entites" (sic); later cells refer to it
# by this exact name, so it is kept as-is here.
is_scispacy_gene = scispacy_entities["entity-type"].eq("GENE_OR_GENE_PRODUCT")
scispacy_entites_proteins = scispacy_entities.loc[is_scispacy_gene]

In [106]:
# Inspect the PubTator rows kept by the "Gene" filter
pubtator_entities_proteins

Unnamed: 0,entity,entity-type,pmid
2,ADAMTS5,Gene,34806902
3,a disintegrin and metalloproteinase with throm...,Gene,34806902
4,ADAMTS5,Gene,34806902
6,ADAMTS5,Gene,34806902
8,ADAMTS5,Gene,34806902
9,Adamts5,Gene,34806902
15,Ang II,Gene,34806902
16,Adamts5,Gene,34806902
18,Ang II,Gene,34806902
20,integrin beta 1,Gene,34806902


In [107]:
# Inspect the scispacy rows kept by the "GENE_OR_GENE_PRODUCT" filter
scispacy_entites_proteins

Unnamed: 0,entity,entity-type,pmid
4,ADAMTS5,GENE_OR_GENE_PRODUCT,34806902
5,disintegrin,GENE_OR_GENE_PRODUCT,34806902
6,thrombospondin motifs 5,GENE_OR_GENE_PRODUCT,34806902
7,ADAMTS5,GENE_OR_GENE_PRODUCT,34806902
8,chondroitin sulphate proteoglycans,GENE_OR_GENE_PRODUCT,34806902
...,...,...,...
168,myosin light chain 3,GENE_OR_GENE_PRODUCT,29484645
169,myosin regulatory light chain 2-ventricular/ca...,GENE_OR_GENE_PRODUCT,29484645
174,tricarboxylic acid,GENE_OR_GENE_PRODUCT,29484645
176,fatty acid omega,GENE_OR_GENE_PRODUCT,29484645


# Visualize counts

Tabulate value counts

In [108]:
# Count the entity-type values within each PMID for the PubTator gene rows
(
    pubtator_entities_proteins
    .groupby("pmid")["entity-type"]
    .value_counts()
)

pmid      entity-type
29484645  Gene            5
31670476  Gene            4
33998164  Gene            4
34098726  Gene           31
34806902  Gene           15
Name: entity-type, dtype: int64

In [109]:
# Count the entity-type values within each PMID for the scispacy gene rows
(
    scispacy_entites_proteins
    .groupby("pmid")["entity-type"]
    .value_counts()
)

pmid      entity-type         
29484645  GENE_OR_GENE_PRODUCT     8
31670476  GENE_OR_GENE_PRODUCT     3
33998164  GENE_OR_GENE_PRODUCT     8
34098726  GENE_OR_GENE_PRODUCT    33
34806902  GENE_OR_GENE_PRODUCT    32
Name: entity-type, dtype: int64

Explore the individual proteins found in each abstract, including PMID: 34098726 (the abstract with the most gene mentions in both tools)

In [43]:
# Within each PMID, count how many times each distinct (entity, entity-type) row occurs in the PubTator gene rows
(
    pubtator_entities_proteins
    .groupby("pmid")
    .value_counts()
)

pmid      entity                                                              entity-type
29484645  myosin light chain 3                                                Gene            1
          myosin light chain 4                                                Gene            1
          transgelin                                                          Gene            1
          myosin regulatory light chain 2-ventricular/cardiac muscle isoform  Gene            1
          myosin regulatory light chain 2-atrial isoform                      Gene            1
31670476  Serpina3                                                            Gene            3
          serine proteinase inhibitor A3                                      Gene            1
33998164  PLN                                                                 Gene            4
34098726  HINT1                                                               Gene           13
          HOXA5                               

In [42]:
# Within each PMID, count how many times each distinct (entity, entity-type) row occurs in the scispacy gene rows
(
    scispacy_entites_proteins
    .groupby("pmid")
    .value_counts()
)

pmid      entity                                                                                    entity-type         
29484645  fatty acid omega                                                                          GENE_OR_GENE_PRODUCT     1
          myosin light chain 3                                                                      GENE_OR_GENE_PRODUCT     1
          myosin light chain 4                                                                      GENE_OR_GENE_PRODUCT     1
          myosin regulatory light chain 2-atrial                                                    GENE_OR_GENE_PRODUCT     1
          myosin regulatory light chain 2-ventricular/cardiac                                       GENE_OR_GENE_PRODUCT     1
          transgelin                                                                                GENE_OR_GENE_PRODUCT     1
          tricarboxylic acid                                                                        GENE_OR_GENE_PROD