### Week 5 - Biological Databases - PubMed
- October 2023
- [https://github.com/tisimpson/bioinformatics1](https://github.com/tisimpson/bioinformatics1)
- [ian.simpson@ed.ac.uk](mailto:ian.simpson@ed.ac.uk)

In [None]:
import pandas as pd
import urllib as ul
import numpy as np

In [None]:
%pip install biopython

In [None]:
from Bio import Entrez

# Contact address sent with every Entrez request made through Biopython.
Entrez.email = "A.N.Other@example.com" # You should replace this with your e-mail address

# Maps publication year -> number of PubMed records with that publication date.
year_counts = {}

# Ask PubMed directly for the hit count of each year. esearch is scoped to a
# single database, so there is no need to run a global (all-database) query
# and then pick the "pubmed" row out of the per-database results.
# NOTE(review): egquery is reported as deprecated in recent Biopython
# releases - confirm against the installed version.
for year in range(2010, 2023):
    # retmax=0: only the total count is needed, not the matching record IDs.
    handle = Entrez.esearch(db="pubmed", term=f"{year}[dp]", retmax=0)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if the response cannot be parsed.
        handle.close()
    year_counts[year] = int(record["Count"])

print(year_counts)

In [None]:
# One row per year (the dict keys become the index) with a single 'counts' column.
papers_by_year = pd.DataFrame(
    list(year_counts.values()),
    index=list(year_counts.keys()),
    columns=['counts'],
)

# Bar chart of the yearly totals; the legend is hidden because there is only one series.
papers_by_year.plot(kind='bar', xlabel='year', ylabel='paper count', legend=False)

In [None]:
#how many Autism papers?

def count_pubmed_hits(term):
    """Return the number of PubMed records matching an Entrez search term.

    Parameters
    ----------
    term : str
        Query string passed to Entrez as-is, including any field tags
        such as ``[MH]``.

    Returns
    -------
    int
        The ``Count`` value reported by esearch for the ``pubmed`` database.
    """
    # retmax=0: only the total count is needed, not the matching record IDs.
    handle = Entrez.esearch(db="pubmed", term=term, retmax=0)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if the response cannot be parsed.
        handle.close()
    return int(record["Count"])


# The three ways of phrasing the query being compared: (label to print, search term).
autism_searches = [
    ('Using phrase but no tag', '"Autism Spectrum Disorder"'),   #search 1
    ('Using just Autism', 'Autism'),                             #search 2
    ('Using MeSH', '"Autism Spectrum Disorder"[MH]'),            #search 3
]

for label, term in autism_searches:
    print(label)
    print(f'{count_pubmed_hits(term)} papers\n')

#which is best and why?

In [None]:
#RNA sequencing
# Maps publication year -> number of PubMed records that year matching "RNA sequencing".
rna_sequencing_counts = {}

# Ask PubMed directly for each year's hit count. esearch is scoped to a single
# database, so there is no need to run a global query and filter for the
# "pubmed" row.
for year in range(2010, 2022):
    # retmax=0: only the total count is needed, not the matching record IDs.
    handle = Entrez.esearch(
        db="pubmed",
        term=f'{year}[dp] AND "RNA sequencing"',
        retmax=0,
    )
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if the response cannot be parsed.
        handle.close()
    rna_sequencing_counts[year] = int(record["Count"])

# One row per year with a single 'counts' column.
rnaseq_papers_by_year = pd.DataFrame.from_dict(rna_sequencing_counts,orient='index',columns=['counts'])

rnaseq_papers_by_year.plot.line(xlabel='year',ylabel='paper count',legend=False)

In [None]:
#microarray
# Maps publication year -> number of PubMed records that year matching "microarray".
microarray_counts = {}

# Ask PubMed directly for each year's hit count. esearch is scoped to a single
# database, so there is no need to run a global query and filter for the
# "pubmed" row.
for year in range(2010, 2022):
    # retmax=0: only the total count is needed, not the matching record IDs.
    handle = Entrez.esearch(
        db="pubmed",
        term=f'{year}[dp] AND microarray',
        retmax=0,
    )
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even if the response cannot be parsed.
        handle.close()
    microarray_counts[year] = int(record["Count"])

# One row per year with a single 'counts' column.
microarray_papers_by_year = pd.DataFrame.from_dict(microarray_counts,orient='index',columns=['counts'])

microarray_papers_by_year.plot.line(xlabel='year',ylabel='paper count',legend=False)

In [None]:
#plot both data together

# Align the two yearly series on their shared index (the year), then relabel
# the two resulting columns by position so each line gets a meaningful legend entry.
combined = (
    rnaseq_papers_by_year
    .merge(microarray_papers_by_year, left_index=True, right_index=True)
    .set_axis(['RNA-seq', 'microarray'], axis=1)
)

combined.plot(kind='line', xlabel='year', ylabel='paper count')

#what is this graph telling us?