In [1]:
import pandas as pd
from Bio import Entrez
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

Entrez.email = "l.singh@intbio.org"

import re

from Bio import SeqIO

In [2]:
# Read the SSH / database connection settings from a local KEY=VALUE text
# file so they do not have to be written into the notebook.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    # Ignore blank lines, "#" comments and anything that is not KEY=VALUE.
    if not line or line.startswith("#") or "=" not in line:
        continue
    key, value = line.split("=", 1)
    # Strip both sides so that "key = value" and "key=value" are equivalent.
    config[key.strip()] = value.strip()

# Keys are looked up exactly as spelled below; they must match the file.
server_name = config.get("server_name")
# Ports are required: indexing raises a KeyError naming the missing key
# instead of the opaque TypeError that int(None) would produce.
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel to (server_name, srever_port) whose remote end is bound
# to the database address (db_adress, db_port) read from the config file.
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
# The local port of the tunnel; the MySQL connection below connects to it.
print(tunnel.local_bind_port)

41671


In [4]:
# Connect to MySQL through the local end of the SSH tunnel and create the
# cursor that every query cell in this notebook reuses.
# NOTE(review): user, password and database are string literals here; they
# look like placeholders - confirm the real values are not committed and
# consider reading them from the same config file as the SSH settings.
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [5]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [22]:
# List the column names of the `sequence` table. Only cursor.description is
# needed, so LIMIT 0 avoids transferring every row of the table.
query = "SELECT * FROM sequence LIMIT 0"
cursor.execute(query)
cursor.fetchall()  # drain the (empty) result set so the cursor can be reused
", ".join([i[0] for i in cursor.description])

'accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration'

In [7]:
# Parameterised INSERT templates passed to cursor.execute() further down.
#
# Template for the `publication` table, currently not used:
# add_publication = (
#     "INSERT INTO publication "
#     "(id, title, doi, author, year) "
#     "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
# )

# Positional placeholders: execute(add_sequence_has_publication, (accession, pid))
add_sequence_has_publication = " ".join(
    [
        "INSERT INTO sequence_has_publication",
        "(sequence_accession, publication_id)",
        "VALUES (%s, %s)",
    ]
)

# Named placeholders, one per column: execute(add_sequence, {column: value, ...})
_sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
add_sequence = "INSERT INTO sequence ({}) VALUES ({})".format(
    ", ".join(_sequence_columns),
    ", ".join("%({})s".format(column) for column in _sequence_columns),
)

In [33]:
def get_taxonomy_data(record, max_attempts=10, retry_delay=1.0):
    """Collect organism, taxonomy ID, phylum and class for a sequence record.

    The organism name is read from ``record.annotations["organism"]`` and the
    taxonomy ID from a ``taxon:<id>`` entry in the ``db_xref`` qualifiers of
    the record's first feature. Phylum and class are then looked up in the
    NCBI taxonomy database through ``Entrez.efetch``.

    Args:
        record: Object exposing ``annotations`` and ``features`` the way the
            records returned by ``SeqIO.read(..., "genbank")`` in this
            notebook do.
        max_attempts: How many times the Entrez lookup is tried before
            giving up.
        retry_delay: Seconds to wait between two failed lookup attempts.

    Returns:
        dict with the keys ``organism``, ``taxonomy_id``, ``phylum`` and
        ``class``. If no taxonomy ID can be read from the record,
        ``taxonomy_id`` is set to the placeholder ``1`` and no lookup is
        made. ``phylum`` and ``class`` are ``None`` whenever they could not
        be determined.

    Raises:
        KeyError: if the record has no ``organism`` annotation.
    """
    import time  # local import: only needed for the pause between retries

    taxonomy_data = {"organism": record.annotations["organism"]}

    # Look for a "taxon:<id>" cross-reference on the first feature.
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = re.search(r"(\S+):(\S+)", xref)
            if match is None or match.group(1) != "taxon":
                continue
            taxonomy_data["taxonomy_id"] = int(match.group(2))
            print("Fetched taxid from NCBI {}".format(taxonomy_data["taxonomy_id"]))
    except (IndexError, KeyError, TypeError, ValueError):
        # No features, no db_xref qualifier, or a malformed taxon entry:
        # handled by the fallback below.
        taxonomy_data.pop("taxonomy_id", None)

    if "taxonomy_id" not in taxonomy_data:
        print(
            "!!!!!!Unable to get TAXID for \n {} setting it to 1".format(
                getattr(record, "id", record)
            )
        )
        taxonomy_data["taxonomy_id"] = 1  # unable to identify
        # Without a real taxonomy ID there is nothing to look up.
        taxonomy_data["phylum"] = None
        taxonomy_data["class"] = None
        return taxonomy_data

    lineage = {}
    for attempt in range(max_attempts):
        try:
            with Entrez.efetch(
                id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
            ) as handle:
                tax_data = Entrez.read(handle)
            lineage = {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0]["LineageEx"]
                if d["Rank"] in ("class", "phylum")
            }
            break
        except Exception as exc:  # any lookup failure: report it and retry
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(type(exc), attempt)
            )
            if attempt == max_attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
                )
            else:
                time.sleep(retry_delay)

    # Convert the values parsed by Entrez to plain str; keep None when absent.
    for rank in ("phylum", "class"):
        value = lineage.get(rank)
        taxonomy_data[rank] = None if value is None else str(value)
    return taxonomy_data

# Update sequence XP_012928609.2 to previous version XP_012928609.1

Новая версия записи низкого качества (по данным из NCBI). Последовательность имеет длинный лишний фрагмент с N-конца.

In [9]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_012928609.2'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_012928609.2,cH4,861442511,,,10181,Heterocephalus glaber,Chordata,Mammalia,,,MFDVFGRDKGGKVLDKGRAKRHPKVLRGNIQGITKPAISRLARRSG...,,,


In [10]:
df.groupby(["accession"])["publication_id"].apply(
    lambda x: [] if x.dropna().empty else list(x.unique())
).reset_index()

Unnamed: 0,accession,publication_id
0,XP_012928609.2,[]


## Get sequence XP_012928609.1 from NCBI

In [12]:
with Entrez.efetch(
    db="protein", id="XP_012928609.1", rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_012928609.1
Name: XP_012928609
Description: PREDICTED: histone H4 [Heterocephalus glaber]
Database cross-references: BioProject:PRJNA197330
Number of features: 4
/topology=linear
/data_file_division=ROD
/date=02-JUL-2015
/accessions=['XP_012928609']
/sequence_version=1
/db_source=REFSEQ: accession XM_013073155.1
/keywords=['RefSeq', 'includes ab initio']
/source=Heterocephalus glaber (naked mole-rat)
/organism=Heterocephalus glaber
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Glires', 'Rodentia', 'Hystricomorpha', 'Bathyergidae', 'Heterocephalus']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NW_004624756.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
XP_012928609.2.
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation

In [14]:
str(record.seq)

'MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG'

## Updating

In [15]:
query = (
    "UPDATE sequence SET accession='XP_012928609.1', "
    f"sequence='{str(record.seq)}' "
    "WHERE accession='XP_012928609.2'"
)
print(query)

UPDATE sequence SET accession='XP_012928609.1' sequence='MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG' WHERE accession='XP_012928609.2'


In [17]:
# `record` is a global that is reassigned by every NCBI fetch cell; make sure
# it still holds the entry this update is meant to store.
assert record.id == "XP_012928609.1", record.id

# Re-key the row to the previous accession version and store its sequence.
# Values are bound as parameters instead of being formatted into the SQL text.
query = "UPDATE sequence SET accession=%s, sequence=%s WHERE accession=%s"
cursor.execute(query, ("XP_012928609.1", str(record.seq), "XP_012928609.2"))
# Make sure data is committed to the database
conn.commit()

In [18]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_012928609.2'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id


In [19]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_012928609.1'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_012928609.1,cH4,861442511,,,10181,Heterocephalus glaber,Chordata,Mammalia,,,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,,,


# Update sequence XP_002365268.2 to previous version XP_002365268.1

Новая версия записи низкого качества (по данным из NCBI). Последовательность имеет длинный лишний фрагмент с N-конца.

In [20]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_002365268.2'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_002365268.2,cH2A_(Protists),237831941,,,508771,Toxoplasma gondii ME49,Apicomplexa,Conoidasida,,,MWLCDWAKEPEVSFFDSYKISAGNTHTCFPTRGRLLPVFLGSFLHS...,,XP_002365268.2,22650316
1,XP_002365268.2,cH2A_(Protists),237831941,,,508771,Toxoplasma gondii ME49,Apicomplexa,Conoidasida,,,MWLCDWAKEPEVSFFDSYKISAGNTHTCFPTRGRLLPVFLGSFLHS...,,XP_002365268.2,26989147


In [21]:
df.groupby(["accession"])["publication_id"].apply(
    lambda x: [] if x.dropna().empty else list(x.unique())
).reset_index()

Unnamed: 0,accession,publication_id
0,XP_002365268.2,"[22650316, 26989147]"


## Delete relations with publications before updating sequence record

In [23]:
publications = ["22650316", "26989147"]

In [28]:
# Remove the rows of sequence_has_publication that point at XP_002365268.2.
# Nothing is committed in this cell.
query = "DELETE FROM sequence_has_publication WHERE sequence_accession='XP_002365268.2'"
cursor.execute(query)

In [29]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_002365268.2'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df.groupby(["accession"])["publication_id"].apply(
    lambda x: [] if x.dropna().empty else list(x.unique())
).reset_index()

Unnamed: 0,accession,publication_id
0,XP_002365268.2,[]


## Get sequence XP_002365268.1 from NCBI

In [30]:
with Entrez.efetch(
    db="protein", id="XP_002365268.1", rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_002365268.1
Name: XP_002365268
Description: histone H2A, putative [Toxoplasma gondii ME49]
Number of features: 4
/topology=linear
/data_file_division=INV
/date=21-NOV-2012
/accessions=['XP_002365268']
/sequence_version=1
/db_source=REFSEQ: accession XM_002365227.1
/keywords=['RefSeq']
/source=Toxoplasma gondii ME49
/organism=Toxoplasma gondii ME49
/taxonomy=['Eukaryota', 'Alveolata', 'Apicomplexa', 'Conoidasida', 'Coccidia', 'Eucoccidiorida', 'Eimeriorina', 'Sarcocystidae', 'Toxoplasma']
/references=[Reference(title='Annotation of Toxoplasma gondii ME49', ...), Reference(title='Direct Submission', ...)]
/comment=PROVISIONAL REFSEQ: This record has not yet been subject to final
NCBI review. The reference sequence was derived from EEA98127.
XP_002365268.2.
Method: conceptual translation.
/molecule_type=protein
Seq('MSAKGKGGRAKKSGKSSSKSAKAGLQFPVGRIGRYLKKGRYAKRVGAGAPVYMA...KSQ')


In [31]:
str(record.seq)

'MSAKGKGGRAKKSGKSSSKSAKAGLQFPVGRIGRYLKKGRYAKRVGAGAPVYMAAVLEYLCAEILELAGNAARDHKKTRIIPRHIQLAVRNDEELSKFLGGVTIASGGVMPNVHSVLLPKKSKGKKSQ'

## Updating

In [32]:
# Build and print the UPDATE statement that re-keys XP_002365268.2 to
# XP_002365268.1 and stores the sequence of the currently loaded `record`.
# NOTE(review): the sequence is interpolated directly into the SQL text;
# consider binding it as a query parameter when the statement is executed.
query = (
    "UPDATE sequence SET accession='XP_002365268.1', "
    f"sequence='{str(record.seq)}' "
    "WHERE accession='XP_002365268.2'"
)
print(query)

UPDATE sequence SET accession='XP_002365268.1', sequence='MSAKGKGGRAKKSGKSSSKSAKAGLQFPVGRIGRYLKKGRYAKRVGAGAPVYMAAVLEYLCAEILELAGNAARDHKKTRIIPRHIQLAVRNDEELSKFLGGVTIASGGVMPNVHSVLLPKKSKGKKSQ' WHERE accession='XP_002365268.2'


In [33]:
cursor.execute(query)

In [34]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_002365268.2'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id


In [35]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_002365268.1'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_002365268.1,cH2A_(Protists),237831941,,,508771,Toxoplasma gondii ME49,Apicomplexa,Conoidasida,,,MSAKGKGGRAKKSGKSSSKSAKAGLQFPVGRIGRYLKKGRYAKRVG...,,,


## Return relations to publications

In [36]:
publications

['22650316', '26989147']

In [38]:
for pid in publications:
    cursor.execute(add_sequence_has_publication, ("XP_002365268.1", pid))

In [41]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_002365268.1'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_002365268.1,cH2A_(Protists),237831941,,,508771,Toxoplasma gondii ME49,Apicomplexa,Conoidasida,,,MSAKGKGGRAKKSGKSSSKSAKAGLQFPVGRIGRYLKKGRYAKRVG...,,XP_002365268.1,22650316
1,XP_002365268.1,cH2A_(Protists),237831941,,,508771,Toxoplasma gondii ME49,Apicomplexa,Conoidasida,,,MSAKGKGGRAKKSGKSSSKSAKAGLQFPVGRIGRYLKKGRYAKRVG...,,XP_002365268.1,26989147


In [40]:
# Make sure data is committed to the database
conn.commit()

# Add rabbit H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** oryCun2, chr13, +-, 12617595-12619869

**Protein accession:** XP_002715119.1

>[Genome assembly OryCun2.0](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000003625.3/)
>
>Status: RefSeq GCF_000003625.3 is suppressed
>
>This record was removed as a result of standard genome annotation processing. Please see www.ncbi.nlm.nih.gov/genome/annotation_euk/process/ for more information.

The current version is [Genome assembly UM_NZW_1.0](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_009806435.1/)

**Protein accession after updating the genome assembly:** XP_002715119.2

## Get sequence XP_002715119.2 from NCBI

In [8]:
with Entrez.efetch(
    db="protein", id="XP_002715119.2", rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_002715119.2
Name: XP_002715119
Description: histone H2B type 2-K1 [Oryctolagus cuniculus]
Database cross-references: BioProject:PRJNA896980
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=09-MAR-2023
/accessions=['XP_002715119']
/sequence_version=2
/db_source=REFSEQ: accession XM_002715073.3
/keywords=['RefSeq']
/source=Oryctolagus cuniculus (rabbit)
/organism=Oryctolagus cuniculus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Euarchontoglires', 'Glires', 'Lagomorpha', 'Leporidae', 'Oryctolagus']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_067386) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
On Nov 8, 2022 this sequence version replaced XP_002715119.1.
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', Ord

In [9]:
str(record.seq)

'MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQVHPDIGISARAMSIMNSFVNDVFERLAGEAAQLAQYSGRSTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [10]:
record.annotations["organism"]

'Oryctolagus cuniculus'

In [24]:
# record.features[0].qualifiers['db_xref']
taxonomy_data = {}
taxonomy_data["organism"] = record.annotations["organism"]
try:
    for a in record.features[0].qualifiers["db_xref"]:
        text = re.search("(\S+):(\S+)", a).group(1)
        taxid = re.search("(\S+):(\S+)", a).group(2)
        if text == "taxon":
            print("Fetched taxid from NCBI {}".format(id))
            taxonomy_data["taxonomy_id"] = int(taxid)
        else:
            continue
except:
    print("!!!!!!Unable to get TAXID for \n {} setting it to 1".format(s))
    taxids.append(1)  # unable to identify

lineage = dict()
for i in range(10):
    try:
        handle = Entrez.efetch(
            id=taxonomy_data["taxonomy_id"], db="taxonomy", retmode="xml"
        )
        tax_data = Entrez.read(handle)
        lineage = {
            d["Rank"]: d["ScientificName"]
            for d in tax_data[0]["LineageEx"]
            if d["Rank"] in ["class", "phylum"]
        }
        break
    except:
        print("Unexpected error: {}, Retrying, attempt {}".format(sys.exc_info()[0], i))
        if i == 9:
            print(
                f"FATAL ERROR could not get class and phylum from NCBI after 10 attempts for taxid:{taxonomy_data['taxonomy_id']}. Will add None for class and phylum!"
            )
        else:
            continue
taxonomy_data["phylum"] = lineage.get("phylum", None)
taxonomy_data["class"] = lineage.get("class", None)
if taxonomy_data["phylum"] is not None: taxonomy_data["phylum"] = str(taxonomy_data["phylum"])
if taxonomy_data["class"] is not None: taxonomy_data["class"] = str(taxonomy_data["class"])
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Oryctolagus cuniculus',
 'taxonomy_id': 9986,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding

In [25]:
data_sequence = {
    "accession": "XP_002715119.2",
    "variant": "H2B.K",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_002715119.2',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9986,
 'organism': 'Oryctolagus cuniculus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQVHPDIGISARAMSIMNSFVNDVFERLAGEAAQLAQYSGRSTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [26]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [27]:
cursor.execute(add_sequence, data_sequence)

In [28]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_002715119.2'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,,


## Add publication

In [29]:
# Check that the publication is already present in the `publication` table.
# The ID is bound from `pid` so the lookup and the link inserted afterwards
# always refer to the same publication.
pid = "35099534"
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [30]:
cursor.execute(add_sequence_has_publication, ("XP_002715119.2", pid))

In [31]:
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession='XP_002715119.2'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534


In [32]:
# Make sure data is committed to the database
conn.commit()

# Add pig H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** susScr11, chr18, ++, 6019516-6022010

**Protein accession:** XP_013846203.1

In [38]:
ACCESSION = "XP_013846203.1"

## Get sequence XP_013846203.1 from NCBI

In [39]:
with Entrez.efetch(
    db="protein", id=ACCESSION, rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_013846203.1
Name: XP_013846203
Description: late histone H2B.L4-like [Sus scrofa]
Database cross-references: BioProject:PRJNA28993
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=13-MAY-2017
/accessions=['XP_013846203']
/sequence_version=1
/db_source=REFSEQ: accession XM_013990749.2
/keywords=['RefSeq']
/source=Sus scrofa (pig)
/organism=Sus scrofa
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Artiodactyla', 'Suina', 'Suidae', 'Sus']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_010460.4) annotated using gene prediction method: Gnomon,
supported by EST evidence.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI'), ('Annotation Status', 'Full annotati

In [40]:
str(record.seq)

'MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [41]:
record.annotations["organism"]

'Sus scrofa'

In [42]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Sus scrofa',
 'taxonomy_id': 9823,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding

In [43]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.K",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_013846203.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9823,
 'organism': 'Sus scrofa',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [44]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [45]:
cursor.execute(add_sequence, data_sequence)

In [46]:
# Show the freshly inserted row together with any linked publications.
# ACCESSION is bound as a parameter rather than formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,,


## Add publication

In [47]:
# Check that the publication is already present in the `publication` table.
# The ID is bound from `pid` so the lookup and the link inserted afterwards
# always refer to the same publication.
pid = "35099534"
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [48]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [49]:
# Re-read the row to confirm that the publication link is now attached.
# ACCESSION is bound as a parameter rather than formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534


In [50]:
# Make sure data is committed to the database
conn.commit()

# Add sheep H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** oviAri4, chr4, +-, 113150440-113152940

**Protein accession:** XP_014950940.1

>[Genome assembly Oar_v4.0](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000298735.2/)
>
>Status: RefSeq GCF_000298735.2 is suppressed
>
>This record was removed as a result of standard genome annotation processing. Please see www.ncbi.nlm.nih.gov/genome/annotation_euk/process/ for more information.

Actual version is [Genome assembly ]()

**Protein accession after updating the genome assembly:** 

# Close connections

In [51]:
# Release resources in the reverse order of their creation: the cursor,
# then the MySQL connection, then the SSH tunnel the connection runs through.
cursor.close()
conn.close()
tunnel.stop()