In [1]:
import pandas as pd
from Bio import Entrez
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

Entrez.email = "l.singh@intbio.org"

import re

from Bio import SeqIO

In [2]:
# Connection settings are kept out of the notebook in a plain KEY=VALUE text
# file; blank lines and lines starting with "#" are ignored.
CONFIG_PATH = "db_curated_server_info.txt"

with open(CONFIG_PATH, "r") as file:
    lines = file.readlines()

config = {}

for line_number, line in enumerate(lines, start=1):
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        raise ValueError(
            f"{CONFIG_PATH}:{line_number}: expected KEY=VALUE, got {line!r}"
        )
    # Strip the key as well as the value so "db_port = 3306" is accepted.
    config[key.strip()] = value.strip()

# Fail here with a readable message instead of a later int(None) TypeError or
# an obscure tunnel error.
missing_keys = [
    name
    for name in ("server_name", "srever_port", "db_adress", "db_port")
    if not config.get(name)
]
if missing_keys:
    raise KeyError(
        f"{CONFIG_PATH} is missing required setting(s): {', '.join(missing_keys)}"
    )

# NOTE(review): "srever_port" and "db_adress" (sic) are the key names this
# notebook has always read; the config file presumably uses the same spelling,
# so it is kept unchanged here.
server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
# Open an SSH tunnel to the server and forward a local port to the database
# address behind it; the chosen local port is printed for reference.
ssh_endpoint = (server_name, srever_port)
database_endpoint = (db_adress, db_port)

tunnel = SSHTunnelForwarder(
    ssh_endpoint,
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=database_endpoint,
)
tunnel.start()
print(tunnel.local_bind_port)

39395


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# Database credentials are looked up in the same external config as the SSH
# settings (keys "db_user", "db_password", "db_name"); the literals are only
# fallbacks used when the config file does not define those keys.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [5]:
# List the tables of the connected schema as a quick connectivity check.
query = "SHOW TABLES;"
cursor.execute(query)
table_names = cursor.fetchall()
table_names

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
# Only the column names of `sequence` are needed here, so request zero rows
# instead of transferring the whole table just to read cursor.description.
query = "SELECT * FROM sequence LIMIT 0"
cursor.execute(query)
cursor.fetchall()  # consume the (empty) result so the cursor is fully read
", ".join([i[0] for i in cursor.description])

'accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration'

In [7]:
# add_publication = (
#     "INSERT INTO publication "
#     "(id, title, doi, author, year) "
#     "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)"
# )

# Positional INSERT that links one sequence accession to one publication id.
add_sequence_has_publication = " ".join(
    [
        "INSERT INTO sequence_has_publication",
        "(sequence_accession, publication_id)",
        "VALUES (%s, %s)",
    ]
)

# Columns of the `sequence` table, in insert order. The column list and the
# named placeholders are both generated from this tuple so they cannot drift.
_sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)

# Named-parameter INSERT; executed with a dict keyed by the column names.
add_sequence = "INSERT INTO sequence ({}) VALUES ({})".format(
    ", ".join(_sequence_columns),
    ", ".join(f"%({column})s" for column in _sequence_columns),
)

In [8]:
# Taxonomy id stored when none can be read from the record; this is the value
# announced by the "setting it to 1" message below.
_UNKNOWN_TAXID = 1


def _extract_taxid(record):
    """Return the integer from a ``taxon:<id>`` db_xref of the first feature, or None."""
    try:
        xrefs = record.features[0].qualifiers["db_xref"]
    except (AttributeError, IndexError, KeyError):
        return None

    taxid = None
    for xref in xrefs:
        match = re.search(r"(\S+):(\S+)", xref)
        if match is None or match.group(1) != "taxon":
            continue
        try:
            # No break: if several taxon entries exist the last one wins,
            # which is what the previous implementation did.
            taxid = int(match.group(2))
        except ValueError:
            continue
    return taxid


def _fetch_lineage(taxid, max_attempts):
    """Query NCBI taxonomy for `taxid`; return a dict with any of "phylum"/"class" found."""
    for attempt in range(max_attempts):
        try:
            # Context manager so the network handle is closed on every attempt.
            with Entrez.efetch(id=taxid, db="taxonomy", retmode="xml") as handle:
                tax_data = Entrez.read(handle)
            return {
                d["Rank"]: d["ScientificName"]
                for d in tax_data[0]["LineageEx"]
                if d["Rank"] in ("class", "phylum")
            }
        except Exception as error:  # any lookup failure: report it and retry
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    type(error), attempt
                )
            )
    print(
        f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxid}. Will add None for class and phylum!"
    )
    return {}


def get_taxonomy_data(record, max_attempts=10):
    """Collect organism, taxonomy id, phylum and class for a parsed GenBank record.

    Parameters
    ----------
    record
        Record object exposing ``annotations["organism"]`` and, for the
        taxonomy id, a ``taxon:<id>`` entry in
        ``features[0].qualifiers["db_xref"]``.
    max_attempts : int, default 10
        Number of times the NCBI taxonomy lookup is tried before giving up.

    Returns
    -------
    dict
        Keys ``organism``, ``taxonomy_id``, ``phylum`` and ``class``.
        ``taxonomy_id`` falls back to 1 when the record carries no usable
        taxon cross-reference; ``phylum``/``class`` are ``None`` when they
        could not be determined, otherwise plain ``str``.

    Raises
    ------
    KeyError
        If the record has no ``organism`` annotation.

    Notes
    -----
    Progress and failures are reported with ``print``. Lookup errors are
    retried and never propagated.
    """
    taxonomy_data = {"organism": record.annotations["organism"]}

    taxid = _extract_taxid(record)
    if taxid is None:
        print(
            "!!!!!!Unable to get TAXID for \n {} setting it to 1".format(
                getattr(record, "id", record)
            )
        )
        # Without a real taxid there is nothing meaningful to look up.
        taxonomy_data["taxonomy_id"] = _UNKNOWN_TAXID
        taxonomy_data["phylum"] = None
        taxonomy_data["class"] = None
        return taxonomy_data

    print("Fetched taxid from NCBI {}".format(taxid))
    taxonomy_data["taxonomy_id"] = taxid

    lineage = _fetch_lineage(taxid, max_attempts)
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        # Convert parser string subclasses to plain str before they reach the DB driver.
        taxonomy_data[rank] = str(name) if name is not None else None
    return taxonomy_data

# Add sheep H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** oviAri4, chr4, +-, 113150440-113152940

**Sequence from article:**
```fasta
>Sheep_H2B.K
MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
```

BLASTP has one result with 100% coverage and 99% identity (1 substitution).

**Protein accession:** XP_027824938.1

In [9]:
ACCESSION = "HISTDB_H2B_K_0"

## Adding HISTDB_H2B_K_0

In [10]:
# Manually curated row: the sequence is typed in from the article rather than
# fetched from NCBI, so the taxonomy fields are filled in by hand as well.
# Every column starts as None (NULL) and only the known fields are set.
data_sequence = dict.fromkeys(
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ]
)
data_sequence.update(
    {
        "accession": ACCESSION,
        "variant": "H2B.K",
        "taxonomy_id": 9940,
        "organism": "Ovis aries",
        "phylum": "Chordata",
        "class": "Mammalia",
        "sequence": "MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK",
    }
)
data_sequence

{'accession': 'HISTDB_H2B_K_0',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9940,
 'organism': 'Ovis aries',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [11]:
# Print the Python type of every field as a check before the row is inserted.
for column, value in data_sequence.items():
    print(column, type(value))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [12]:
cursor.execute(add_sequence, data_sequence)

In [13]:
# Read the row back, joined to its publication links, to confirm what is stored.
# The accession is bound as a parameter (as the INSERT templates do) so the
# connector handles quoting instead of an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [14]:
# Id of the publication these sequences are attributed to.
pid = "35099534"
# Look the publication up through `pid` itself, bound as a parameter, so the
# id checked here cannot drift from the id used when linking.
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [15]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [16]:
# Read the row back, joined to its publication links, to confirm what is stored.
# The accession is bound as a parameter (as the INSERT templates do) so the
# connector handles quoting instead of an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534


## Get sequence XP_027824938.1 from NCBI

In [18]:
ACCESSION = "XP_027824938.1"

In [19]:
# Download the GenBank-format protein record for ACCESSION and parse it into
# `record`; the parsed record is printed for inspection.
fetch_options = {
    "db": "protein",
    "id": ACCESSION,
    "rettype": "gb",
    "retmode": "text",
}
with Entrez.efetch(**fetch_options) as handle:
    record = SeqIO.read(handle, "genbank")
print(record)

ID: XP_027824938.1
Name: XP_027824938
Description: histone H2B type 2-K1 [Ovis aries]
Database cross-references: BioProject:PRJNA739192
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=30-OCT-2023
/accessions=['XP_027824938']
/sequence_version=1
/db_source=REFSEQ: accession XM_027969137.3
/keywords=['RefSeq']
/source=Ovis aries (sheep)
/organism=Ovis aries
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Artiodactyla', 'Ruminantia', 'Pecora', 'Bovidae', 'Caprinae', 'Ovis']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_056057.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotation Status', 'Full

In [20]:
str(record.seq)

'MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [21]:
record.annotations["organism"]

'Ovis aries'

In [22]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Ovis aries',
 'taxonomy_id': 9940,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_027824938.1

In [23]:
# Build the row for the `sequence` table: start with every column set to None
# (NULL), fill in what is known from the fetched record, then overlay the
# organism / taxonomy fields returned by get_taxonomy_data.
data_sequence = dict.fromkeys(
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ]
)
data_sequence["accession"] = ACCESSION
data_sequence["variant"] = "H2B.K"
data_sequence["sequence"] = str(record.seq)
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_027824938.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9940,
 'organism': 'Ovis aries',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [24]:
# Print the Python type of every field as a check before the row is inserted.
for column, value in data_sequence.items():
    print(column, type(value))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [25]:
cursor.execute(add_sequence, data_sequence)

In [26]:
# Read the row back, joined to its publication links, to confirm what is stored.
# The accession is bound as a parameter (as the INSERT templates do) so the
# connector handles quoting instead of an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_027824938.1,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [27]:
# Id of the publication these sequences are attributed to.
pid = "35099534"
# Look the publication up through `pid` itself, bound as a parameter, so the
# id checked here cannot drift from the id used when linking.
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [28]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [29]:
# Read the row back, joined to its publication links, to confirm what is stored.
# The accession is bound as a parameter (as the INSERT templates do) so the
# connector handles quoting instead of an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_027824938.1,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_027824938.1,35099534


In [30]:
# Overview of every H2B.K sequence with its linked publication; sequences
# without a link still appear because of the LEFT JOIN.
# (The last fragment was an f-string with no placeholders; it is a plain literal now.)
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant='H2B.K'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
2,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
3,XP_027824938.1,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_027824938.1,35099534


In [31]:
# Make sure data is committed to the database
conn.commit()

# Add cow H2B.Ks

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Genes:**
- bosTau9, chr4, +-, 113782777-113785296, H2B.K.1 (ancestral)
- bosTau9, chr11, ++, 102459398-102459763, H2B.K.3
- bosTau9, chr1, ++, 7246094-7246459, H2B.K.2

**Sequences from article:**
```fasta
>Cow_H2B.K.1
MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
>Cow_H2B.K.2
MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIVNLFVNDLFERLAGKAAWLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
>Cow_H2B.K.3
MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
```

BLASTP results:
- one H2B.K.1 with 100% coverage and 100% identity;
- one H2B.K.2 with 100% coverage and 100% identity;
- one H2B.K.3 with 100% coverage and 100% identity (maybe shifted reading frame, 19aa extra from N-tail).

**Protein accessions:** XP_024846715.1, XP_010799227.1, XP_059747847.1

## Accession XP_024846715.1

In [32]:
ACCESSION = "XP_024846715.1"

## Get sequence XP_024846715.1 from NCBI

In [33]:
with Entrez.efetch(
    db="protein", id=ACCESSION, rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_024846715.1
Name: XP_024846715
Description: histone H2B type 2-K1 [Bos taurus]
Database cross-references: BioProject:PRJNA450837
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=03-OCT-2023
/accessions=['XP_024846715']
/sequence_version=1
/db_source=REFSEQ: accession XM_024990947.2
/keywords=['RefSeq', 'includes ab initio']
/source=Bos taurus (domestic cattle)
/organism=Bos taurus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Artiodactyla', 'Ruminantia', 'Pecora', 'Bovidae', 'Bovinae', 'Bos']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_037331.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'

In [34]:
str(record.seq)

'MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [35]:
record.annotations["organism"]

'Bos taurus'

In [36]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Bos taurus',
 'taxonomy_id': 9913,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_024846715.1

In [37]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.K",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_024846715.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9913,
 'organism': 'Bos taurus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [38]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [39]:
cursor.execute(add_sequence, data_sequence)

In [40]:
# Read the row back, joined to its publication links, to confirm what is stored.
# The accession is bound as a parameter (as the INSERT templates do) so the
# connector handles quoting instead of an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_024846715.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [41]:
# Id of the publication these sequences are attributed to.
pid = "35099534"
# Look the publication up through `pid` itself, bound as a parameter, so the
# id checked here cannot drift from the id used when linking.
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [42]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [43]:
# Read the row back, joined to its publication links, to confirm what is stored.
# The accession is bound as a parameter (as the INSERT templates do) so the
# connector handles quoting instead of an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_024846715.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_024846715.1,35099534


In [44]:
# Overview of every H2B.K sequence with its linked publication; sequences
# without a link still appear because of the LEFT JOIN.
# (The last fragment was an f-string with no placeholders; it is a plain literal now.)
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant='H2B.K'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
2,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
3,XP_024846715.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_024846715.1,35099534
4,XP_027824938.1,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_027824938.1,35099534


## Accession XP_010799227.1

In [45]:
ACCESSION = "XP_010799227.1"

## Get sequence XP_010799227.1 from NCBI

In [46]:
with Entrez.efetch(
    db="protein", id=ACCESSION, rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_010799227.1
Name: XP_010799227
Description: histone H2B type 2-K1 [Bos taurus]
Database cross-references: BioProject:PRJNA450837
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=03-OCT-2023
/accessions=['XP_010799227']
/sequence_version=1
/db_source=REFSEQ: accession XM_010800925.4
/keywords=['RefSeq']
/source=Bos taurus (domestic cattle)
/organism=Bos taurus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Artiodactyla', 'Ruminantia', 'Pecora', 'Bovidae', 'Bovinae', 'Bos']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_037328.1) annotated using gene prediction method: Gnomon,
supported by EST evidence.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI Re

In [47]:
str(record.seq)

'MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIVNLFVNDLFERLAGKAAWLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [48]:
record.annotations["organism"]

'Bos taurus'

In [49]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Bos taurus',
 'taxonomy_id': 9913,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_010799227.1

In [50]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.K",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_010799227.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9913,
 'organism': 'Bos taurus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIVNLFVNDLFERLAGKAAWLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [51]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [52]:
cursor.execute(add_sequence, data_sequence)

In [53]:
# Read the row back, joined to its publication links, to confirm what is stored.
# The accession is bound as a parameter (as the INSERT templates do) so the
# connector handles quoting instead of an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [54]:
# Id of the publication these sequences are attributed to.
pid = "35099534"
# Look the publication up through `pid` itself, bound as a parameter, so the
# id checked here cannot drift from the id used when linking.
query = "SELECT * FROM publication WHERE id=%s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [55]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [56]:
# Read the row back, joined to its publication links, to confirm what is stored.
# The accession is bound as a parameter (as the INSERT templates do) so the
# connector handles quoting instead of an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession=%s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534


In [57]:
# Overview of every H2B.K sequence with its linked publication; sequences
# without a link still appear because of the LEFT JOIN.
# (The last fragment was an f-string with no placeholders; it is a plain literal now.)
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant='H2B.K'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
2,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
3,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
4,XP_024846715.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_024846715.1,35099534
5,XP_027824938.1,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_027824938.1,35099534


## Accession XP_059747847.1

In [58]:
ACCESSION = "XP_059747847.1"

## Get sequence XP_059747847.1 from NCBI

In [59]:
with Entrez.efetch(
    db="protein", id=ACCESSION, rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_059747847.1
Name: XP_059747847
Description: histone H2B type 2-K1-like [Bos taurus]
Database cross-references: BioProject:PRJNA450837
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=03-OCT-2023
/accessions=['XP_059747847']
/sequence_version=1
/db_source=REFSEQ: accession XM_059891864.1
/keywords=['RefSeq']
/source=Bos taurus (domestic cattle)
/organism=Bos taurus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Artiodactyla', 'Ruminantia', 'Pecora', 'Bovidae', 'Bovinae', 'Bos']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_037338.1) annotated using gene prediction method: Gnomon,
supported by EST evidence.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NC

In [60]:
str(record.seq)

'MMDTRSSKLVCHNILRTRRMSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [61]:
record.annotations["organism"]

'Bos taurus'

In [62]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Bos taurus',
 'taxonomy_id': 9913,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_059747847.1

In [63]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.K",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_059747847.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9913,
 'organism': 'Bos taurus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MMDTRSSKLVCHNILRTRRMSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDLFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [64]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [65]:
cursor.execute(add_sequence, data_sequence)

In [66]:
# Read back the row for ACCESSION joined with its publication links (NULL
# publication columns mean no link exists yet). The accession is bound as a
# query parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_059747847.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MMDTRSSKLVCHNILRTRRMSAEHGQLQQSGGRRGRSPGDKKSRRR...,,,


## Add publication

In [67]:
# Confirm the publication row exists before linking it to the sequence.
# The lookup binds `pid` as a parameter so the id is written in one place only.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [68]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [69]:
# Read back the row for ACCESSION joined with its publication links to verify
# the link that was just inserted. The accession is bound as a query parameter
# instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_059747847.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MMDTRSSKLVCHNILRTRRMSAEHGQLQQSGGRRGRSPGDKKSRRR...,,XP_059747847.1,35099534


In [70]:
# List every sequence stored for the H2B.K variant together with its
# publication link. The variant name is passed as a bound parameter.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
2,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
3,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
4,XP_024846715.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_024846715.1,35099534
5,XP_027824938.1,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_027824938.1,35099534
6,XP_059747847.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MMDTRSSKLVCHNILRTRRMSAEHGQLQQSGGRRGRSPGDKKSRRR...,,XP_059747847.1,35099534


In [71]:
# Make sure data is committed to the database
conn.commit()

# Add horse H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Equus caballus, equCab3, chr4, +-, 102919882-102922597

**Sequence from article:**
```fasta
>Horse_H2B.K
MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAAQLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
```

BLASTP has one result with 100% coverage and 100% identity.

**Protein accession:** XP_005609614.1

In [72]:
ACCESSION = "XP_005609614.1"

## Get sequence XP_005609614.1 from NCBI

In [73]:
# Download the GenBank entry for ACCESSION from the NCBI protein database and
# parse it into a single record; the handle is closed once parsing finishes.
handle = Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text")
with handle:
    record = SeqIO.read(handle, "genbank")
print(record)

ID: XP_005609614.1
Name: XP_005609614
Description: histone H2B type 2-K1 [Equus caballus]
Database cross-references: BioProject:PRJNA19129
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=07-AUG-2024
/accessions=['XP_005609614']
/sequence_version=1
/db_source=REFSEQ: accession XM_005609557.3
/keywords=['RefSeq']
/source=Equus caballus (horse)
/organism=Equus caballus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Perissodactyla', 'Equidae', 'Equus']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_009147.3) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotation Status', 'Updated annotation'), ('An

In [74]:
str(record.seq)

'MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAAQLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [75]:
record.annotations["organism"]

'Equus caballus'

In [76]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Equus caballus',
 'taxonomy_id': 9796,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_005609614.1

In [77]:
# Start from a row with every `sequence` column set to None, then fill in the
# fields known for this record; the taxonomy lookup result is merged last.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(accession=ACCESSION, variant="H2B.K", sequence=str(record.seq))
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_005609614.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9796,
 'organism': 'Equus caballus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAAQLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [78]:
# Show the Python type of each field so unexpected types are spotted before the INSERT.
for column in data_sequence:
    print(column, type(data_sequence[column]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [79]:
cursor.execute(add_sequence, data_sequence)

In [80]:
# Read back the row for ACCESSION joined with its publication links (NULL
# publication columns mean no link exists yet). The accession is bound as a
# query parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [81]:
# Confirm the publication row exists before linking it to the sequence.
# The lookup binds `pid` as a parameter so the id is written in one place only.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [82]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [83]:
# Read back the row for ACCESSION joined with its publication links to verify
# the link that was just inserted. The accession is bound as a query parameter
# instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534


In [84]:
# List every sequence stored for the H2B.K variant together with its
# publication link. The variant name is passed as a bound parameter.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
2,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
3,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
4,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
5,XP_024846715.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_024846715.1,35099534
6,XP_027824938.1,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_027824938.1,35099534
7,XP_059747847.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MMDTRSSKLVCHNILRTRRMSAEHGQLQQSGGRRGRSPGDKKSRRR...,,XP_059747847.1,35099534


In [85]:
# Make sure data is committed to the database
conn.commit()

# Add white rhinoceros H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Ceratotherium simum, cerSim1, JH767754, ++, 6739207-6741678

**Sequence from article:**
```fasta
>Rhino_H2B.K
MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
```

BLASTP has one result with 100% coverage and 100% identity.

**Protein accession:** XP_014643104.1

In [86]:
ACCESSION = "XP_014643104.1"

## Get sequence XP_014643104.1 from NCBI

In [87]:
# Download the GenBank entry for ACCESSION from the NCBI protein database and
# parse it into a single record; the handle is closed once parsing finishes.
handle = Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text")
with handle:
    record = SeqIO.read(handle, "genbank")
print(record)

ID: XP_014643104.1
Name: XP_014643104
Description: PREDICTED: late histone H2B.L4-like [Ceratotherium simum simum]
Database cross-references: BioProject:PRJNA191537
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=27-NOV-2015
/accessions=['XP_014643104']
/sequence_version=1
/db_source=REFSEQ: accession XM_014787618.1
/keywords=['RefSeq']
/source=Ceratotherium simum simum (southern white rhinoceros)
/organism=Ceratotherium simum simum
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Perissodactyla', 'Rhinocerotidae', 'Ceratotherium']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NW_004454187.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('A

In [88]:
str(record.seq)

'MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [89]:
record.annotations["organism"]

'Ceratotherium simum simum'

In [90]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Ceratotherium simum simum',
 'taxonomy_id': 73337,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_014643104.1

In [91]:
# Start from a row with every `sequence` column set to None, then fill in the
# fields known for this record; the taxonomy lookup result is merged last.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(accession=ACCESSION, variant="H2B.K", sequence=str(record.seq))
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_014643104.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 73337,
 'organism': 'Ceratotherium simum simum',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [92]:
# Show the Python type of each field so unexpected types are spotted before the INSERT.
for column in data_sequence:
    print(column, type(data_sequence[column]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [93]:
cursor.execute(add_sequence, data_sequence)

In [94]:
# Read back the row for ACCESSION joined with its publication links (NULL
# publication columns mean no link exists yet). The accession is bound as a
# query parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [95]:
# Confirm the publication row exists before linking it to the sequence.
# The lookup binds `pid` as a parameter so the id is written in one place only.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [96]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [97]:
# Read back the row for ACCESSION joined with its publication links to verify
# the link that was just inserted. The accession is bound as a query parameter
# instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534


In [98]:
# List every sequence stored for the H2B.K variant together with its
# publication link. The variant name is passed as a bound parameter.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
2,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
3,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
4,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
5,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534
6,XP_024846715.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_024846715.1,35099534
7,XP_027824938.1,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_027824938.1,35099534
8,XP_059747847.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MMDTRSSKLVCHNILRTRRMSAEHGQLQQSGGRRGRSPGDKKSRRR...,,XP_059747847.1,35099534


In [99]:
# Make sure data is committed to the database
conn.commit()

# Add cat H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Felis catus, felCat9, chrA2, +-, 165304073-165306381

**Sequence from article:**
```fasta
>Cat_H2B.K
MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
```

BLASTP has one result with 100% coverage and 100% identity.

**Protein accession:** XP_019681595.1

In [100]:
ACCESSION = "XP_019681595.1"

## Get sequence XP_019681595.1 from NCBI

In [101]:
# Download the GenBank entry for ACCESSION from the NCBI protein database and
# parse it into a single record; the handle is closed once parsing finishes.
handle = Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text")
with handle:
    record = SeqIO.read(handle, "genbank")
print(record)

ID: XP_019681595.1
Name: XP_019681595
Description: histone H2B type 2-E1 isoform X1 [Felis catus]
Database cross-references: BioProject:PRJNA773801
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=09-NOV-2021
/accessions=['XP_019681595']
/sequence_version=1
/db_source=REFSEQ: accession XM_019826036.3
/keywords=['RefSeq']
/source=Felis catus (domestic cat)
/organism=Felis catus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Carnivora', 'Feliformia', 'Felidae', 'Felinae', 'Felis']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_058369.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI'), ('Annotation Status', 'Ful

In [102]:
str(record.seq)

'MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [103]:
record.annotations["organism"]

'Felis catus'

In [104]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Felis catus',
 'taxonomy_id': 9685,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_019681595.1

In [105]:
# Start from a row with every `sequence` column set to None, then fill in the
# fields known for this record; the taxonomy lookup result is merged last.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(accession=ACCESSION, variant="H2B.K", sequence=str(record.seq))
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_019681595.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9685,
 'organism': 'Felis catus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [106]:
# Show the Python type of each field so unexpected types are spotted before the INSERT.
for column in data_sequence:
    print(column, type(data_sequence[column]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [107]:
cursor.execute(add_sequence, data_sequence)

In [108]:
# Read back the row for ACCESSION joined with its publication links (NULL
# publication columns mean no link exists yet). The accession is bound as a
# query parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_019681595.1,H2B.K,,,,9685,Felis catus,Chordata,Mammalia,,,MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [109]:
# Confirm the publication row exists before linking it to the sequence.
# The lookup binds `pid` as a parameter so the id is written in one place only.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [110]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [111]:
# Read back the row for ACCESSION joined with its publication links to verify
# the link that was just inserted. The accession is bound as a query parameter
# instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_019681595.1,H2B.K,,,,9685,Felis catus,Chordata,Mammalia,,,MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_019681595.1,35099534


In [112]:
# List every sequence stored for the H2B.K variant together with its
# publication link. The variant name is passed as a bound parameter.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
2,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
3,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
4,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
5,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534
6,XP_019681595.1,H2B.K,,,,9685,Felis catus,Chordata,Mammalia,,,MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_019681595.1,35099534
7,XP_024846715.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_024846715.1,35099534
8,XP_027824938.1,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_027824938.1,35099534
9,XP_059747847.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MMDTRSSKLVCHNILRTRRMSAEHGQLQQSGGRRGRSPGDKKSRRR...,,XP_059747847.1,35099534


In [113]:
# Make sure data is committed to the database
conn.commit()

# Add dog H2B.Ks

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Genes:**
- Canis lupus familiaris, canFam3, chr16, +-, 15248179-15250469, H2B.K.1 (ancestral)
- Canis lupus familiaris, canFam3, chr18, +-, 49216892-49217257, H2B.K.2

**Sequences from article:**
```fasta
>Dog_H2B.K.1
MGAEHGQQPQSGGRRGRGSGDKKSKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
>Dog_H2B.K.2
MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVHPDIGIFSKAMSIMNSFVNDVFERLAGKAAQLAQYLGQTTLTSWEVQTAVRWLLPGELAKHAISEGTKAITKYTGSK
```

BLASTP results:
- one H2B.K.1 with 100% coverage and 100% identity (gene also known as H2BE);
- one H2B.K.2 with 100% coverage and 100% identity (is that the same gene as in the article?).

**Protein accessions:** XP_022259586.1, HISTDB_H2B_K_1

## Add XP_022259586.1

In [114]:
ACCESSION = "XP_022259586.1"

## Get sequence XP_022259586.1 from NCBI

In [115]:
# Download the GenBank entry for ACCESSION from the NCBI protein database and
# parse it into a single record; the handle is closed once parsing finishes.
handle = Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text")
with handle:
    record = SeqIO.read(handle, "genbank")
print(record)

ID: XP_022259586.1
Name: XP_022259586
Description: histone H2B type 2-E1 [Canis lupus familiaris]
Database cross-references: BioProject:PRJNA12384
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=06-JAN-2021
/accessions=['XP_022259586']
/sequence_version=1
/db_source=REFSEQ: accession XM_022403878.2
/keywords=['RefSeq']
/source=Canis lupus familiaris (dog)
/organism=Canis lupus familiaris
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Carnivora', 'Caniformia', 'Canidae', 'Canis']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_006598.4) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI'), ('Annotation Status', 'Fu

In [116]:
str(record.seq)

'MGAEHGQQPQSGGRRGRGSGDKKSKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [117]:
record.annotations["organism"]

'Canis lupus familiaris'

In [118]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Canis lupus familiaris',
 'taxonomy_id': 9615,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_022259586.1

In [119]:
# Start from a row with every `sequence` column set to None, then fill in the
# fields known for this record; the taxonomy lookup result is merged last.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(accession=ACCESSION, variant="H2B.K", sequence=str(record.seq))
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_022259586.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9615,
 'organism': 'Canis lupus familiaris',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MGAEHGQQPQSGGRRGRGSGDKKSKKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [120]:
# Show the Python type of each field so unexpected types are spotted before the INSERT.
for column in data_sequence:
    print(column, type(data_sequence[column]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [121]:
cursor.execute(add_sequence, data_sequence)

In [122]:
# Read back the row for ACCESSION joined with its publication links (NULL
# publication columns mean no link exists yet). The accession is bound as a
# query parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_022259586.1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGAEHGQQPQSGGRRGRGSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [123]:
# Confirm the publication row exists before linking it to the sequence.
# The lookup binds `pid` as a parameter so the id is written in one place only.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [124]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [125]:
# Read back the row for ACCESSION joined with its publication links to verify
# the link that was just inserted. The accession is bound as a query parameter
# instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_022259586.1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGAEHGQQPQSGGRRGRGSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_022259586.1,35099534


## Add HISTDB_H2B_K_1

In [127]:
ACCESSION = "HISTDB_H2B_K_1"

In [129]:
# Manually curated row: this sequence is entered by hand under an internal
# HISTDB accession, so taxonomy fields and the sequence are given literally.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(
    accession=ACCESSION,
    variant="H2B.K",
    taxonomy_id=9615,
    organism="Canis lupus familiaris",
    phylum="Chordata",
    sequence="MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVHPDIGIFSKAMSIMNSFVNDVFERLAGKAAQLAQYLGQTTLTSWEVQTAVRWLLPGELAKHAISEGTKAITKYTGSK",
)
# `class` is a Python keyword, so it cannot be passed as a keyword argument above.
data_sequence["class"] = "Mammalia"
data_sequence

{'accession': 'HISTDB_H2B_K_1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9615,
 'organism': 'Canis lupus familiaris',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVHPDIGIFSKAMSIMNSFVNDVFERLAGKAAQLAQYLGQTTLTSWEVQTAVRWLLPGELAKHAISEGTKAITKYTGSK',
 'variant_under_consideration': None}

In [130]:
# Show the Python type of each field so unexpected types are spotted before the INSERT.
for column in data_sequence:
    print(column, type(data_sequence[column]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [131]:
# Insert the prepared `data_sequence` row using the `add_sequence` INSERT statement.
cursor.execute(add_sequence, data_sequence)

In [132]:
# Read back the row just inserted for the current accession; the LEFT JOIN keeps
# the row even when no publication is linked yet.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [133]:
# Confirm the publication row exists before linking the sequence to it.
# `pid` is the single source of the id: it is bound as a query parameter here
# and reused by the linking INSERT in the next cell.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [134]:
# Link the current accession to publication `pid` via the sequence_has_publication INSERT.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [135]:
# Re-read the row for the current accession together with its publication link
# to confirm the link was stored.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_1,35099534


In [136]:
# List every H2B.K sequence with its linked publication to review the variant as a whole.
# The variant name is bound as a query parameter; the original string carried an
# `f` prefix without containing any placeholder.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_1,35099534
2,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
3,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
4,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
5,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
6,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534
7,XP_019681595.1,H2B.K,,,,9685,Felis catus,Chordata,Mammalia,,,MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_019681595.1,35099534
8,XP_022259586.1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGAEHGQQPQSGGRRGRGSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_022259586.1,35099534
9,XP_024846715.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_024846715.1,35099534


In [137]:
# Make sure data is committed to the database
# (persists the INSERTs executed on this connection since the previous commit).
conn.commit()

# Add panda H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Ailuropoda melanoleuca, ailMel1, GL192435.1, ++, 2497397-2499608

**Sequence from article:**
```fasta
>Panda_H2B.K
MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
```

BLASTP has one result with 100% coverage and 100% identity.

**Protein accession:** XP_019651116.1

In [153]:
# Protein accession of the giant panda H2B.K BLASTP hit; reused by the cells of this section.
ACCESSION = "XP_019651116.1"

## Get sequence XP_019651116.1 from NCBI

In [154]:
# Download the GenBank-format text for ACCESSION from the NCBI protein database
# and parse it as a single record.
with Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")

# Print the parsed record for a visual check.
print(record)

ID: XP_019651116.1
Name: XP_019651116
Description: late histone H2B.L4 [Ailuropoda melanoleuca]
Database cross-references: BioProject:PRJNA633539
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=26-MAY-2020
/accessions=['XP_019651116']
/sequence_version=1
/db_source=REFSEQ: accession XM_019795557.2
/keywords=['RefSeq']
/source=Ailuropoda melanoleuca (giant panda)
/organism=Ailuropoda melanoleuca
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Carnivora', 'Caniformia', 'Ursidae', 'Ailuropoda']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_048218.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI'), ('Annotation 

In [155]:
# Show the fetched sequence as a plain string (compare with the article sequence quoted above).
str(record.seq)

'MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [156]:
# Organism name taken from the annotations of the fetched GenBank record.
record.annotations["organism"]

'Ailuropoda melanoleuca'

In [157]:
# Look up the taxonomy fields (organism, taxonomy_id, phylum, class) for the record.
# NOTE(review): the helper's log line prints "<built-in function id>" instead of a
# taxid — get_taxonomy_data (defined outside this section) seems to format the
# builtin `id`; verify there.
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Ailuropoda melanoleuca',
 'taxonomy_id': 9646,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_019651116.1

In [158]:
# Row for the `sequence` table, keyed by column name.
# Every column starts as None (NULL); the known values are filled in afterwards.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence["accession"] = ACCESSION
data_sequence["variant"] = "H2B.K"
data_sequence["sequence"] = str(record.seq)
# Overwrite the taxonomy placeholders with the values returned by the taxonomy lookup.
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_019651116.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9646,
 'organism': 'Ailuropoda melanoleuca',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVHPDIGISSKAMSIMNSFVNDVFERLAGEAARLAQYSGRTTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [159]:
# Show the Python type of every field before the row is handed to the INSERT.
for field_name, field_value in data_sequence.items():
    print(f"{field_name} {type(field_value)}")

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [160]:
# Insert the prepared `data_sequence` row using the `add_sequence` INSERT statement.
cursor.execute(add_sequence, data_sequence)

In [161]:
# Read back the row just inserted for the current accession; the LEFT JOIN keeps
# the row even when no publication is linked yet.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_019651116.1,H2B.K,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVH...,,,


## Add publication

In [162]:
# Confirm the publication row exists before linking the sequence to it.
# `pid` is the single source of the id: it is bound as a query parameter here
# and reused by the linking INSERT in the next cell.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [163]:
# Link the current accession to publication `pid` via the sequence_has_publication INSERT.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [164]:
# Re-read the row for the current accession together with its publication link
# to confirm the link was stored.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_019651116.1,H2B.K,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVH...,,XP_019651116.1,35099534


In [165]:
# List every H2B.K sequence with its linked publication to review the variant as a whole.
# The variant name is bound as a query parameter; the original string carried an
# `f` prefix without containing any placeholder.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_1,35099534
2,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
3,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
4,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
5,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
6,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534
7,XP_019651116.1,H2B.K,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVH...,,XP_019651116.1,35099534
8,XP_019681595.1,H2B.K,,,,9685,Felis catus,Chordata,Mammalia,,,MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_019681595.1,35099534
9,XP_022259586.1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGAEHGQQPQSGGRRGRGSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_022259586.1,35099534


In [166]:
# Make sure data is committed to the database
# (persists the INSERTs executed on this connection since the previous commit).
conn.commit()

# Add elephant H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Loxodonta africana, loxAfr3, scaffold_12, +-, 59810077-59812441

**Sequence from article:**
```fasta
>Elephant_H2B.K
MSAELGQQQQQQQSGGQRGRSSGDKKPKKRSRRKENYSVYIYKVLKQVHPDISISSKAMSIMNSFVNDVFERLAGEATRLAQYSGRTTLTSREVQTAARLLLPGELAKHAVSEGTKAVTKYISSK
```

BLASTP has one result with 100% coverage and 100% identity.

**Protein accession:** XP_023403847.1

In [167]:
# Protein accession of the elephant H2B.K BLASTP hit; reused by the cells of this section.
ACCESSION = "XP_023403847.1"

## Get sequence XP_023403847.1 from NCBI

In [168]:
# Download the GenBank-format text for ACCESSION from the NCBI protein database
# and parse it as a single record.
with Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")

# Print the parsed record for a visual check.
print(record)

ID: XP_023403847.1
Name: XP_023403847
Description: histone H2B type 2-K1 [Loxodonta africana]
Database cross-references: BioProject:PRJNA1092696
Number of features: 7
/topology=linear
/data_file_division=MAM
/date=05-APR-2024
/accessions=['XP_023403847']
/sequence_version=1
/db_source=REFSEQ: accession XM_023548079.2
/keywords=['RefSeq']
/source=Loxodonta africana (African savanna elephant)
/organism=Loxodonta africana
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Afrotheria', 'Proboscidea', 'Elephantidae', 'Loxodonta']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_087363) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotation Sta

In [169]:
# Show the fetched sequence as a plain string (compare with the article sequence quoted above).
str(record.seq)

'MSAELGQQQQQQQSGGQRGRSSGDKKPKKRSRRKENYSVYIYKVLKQVHPDISISSKAMSIMNSFVNDVFERLAGEATRLAQYSGRTTLTSREVQTAARLLLPGELAKHAVSEGTKAVTKYISSK'

In [170]:
# Organism name taken from the annotations of the fetched GenBank record.
record.annotations["organism"]

'Loxodonta africana'

In [171]:
# Look up the taxonomy fields (organism, taxonomy_id, phylum, class) for the record.
# NOTE(review): the helper's log line prints "<built-in function id>" instead of a
# taxid — get_taxonomy_data (defined outside this section) seems to format the
# builtin `id`; verify there.
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Loxodonta africana',
 'taxonomy_id': 9785,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_023403847.1

In [172]:
# Row for the `sequence` table, keyed by column name.
# Every column starts as None (NULL); the known values are filled in afterwards.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence["accession"] = ACCESSION
data_sequence["variant"] = "H2B.K"
data_sequence["sequence"] = str(record.seq)
# Overwrite the taxonomy placeholders with the values returned by the taxonomy lookup.
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_023403847.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9785,
 'organism': 'Loxodonta africana',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSAELGQQQQQQQSGGQRGRSSGDKKPKKRSRRKENYSVYIYKVLKQVHPDISISSKAMSIMNSFVNDVFERLAGEATRLAQYSGRTTLTSREVQTAARLLLPGELAKHAVSEGTKAVTKYISSK',
 'variant_under_consideration': None}

In [173]:
# Show the Python type of every field before the row is handed to the INSERT.
for field_name, field_value in data_sequence.items():
    print(f"{field_name} {type(field_value)}")

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [174]:
# Insert the prepared `data_sequence` row using the `add_sequence` INSERT statement.
cursor.execute(add_sequence, data_sequence)

In [175]:
# Read back the row just inserted for the current accession; the LEFT JOIN keeps
# the row even when no publication is linked yet.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_023403847.1,H2B.K,,,,9785,Loxodonta africana,Chordata,Mammalia,,,MSAELGQQQQQQQSGGQRGRSSGDKKPKKRSRRKENYSVYIYKVLK...,,,


## Add publication

In [176]:
# Confirm the publication row exists before linking the sequence to it.
# `pid` is the single source of the id: it is bound as a query parameter here
# and reused by the linking INSERT in the next cell.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [177]:
# Link the current accession to publication `pid` via the sequence_has_publication INSERT.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [178]:
# Re-read the row for the current accession together with its publication link
# to confirm the link was stored.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_023403847.1,H2B.K,,,,9785,Loxodonta africana,Chordata,Mammalia,,,MSAELGQQQQQQQSGGQRGRSSGDKKPKKRSRRKENYSVYIYKVLK...,,XP_023403847.1,35099534


In [179]:
# List every H2B.K sequence with its linked publication to review the variant as a whole.
# The variant name is bound as a query parameter; the original string carried an
# `f` prefix without containing any placeholder.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_1,35099534
2,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
3,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
4,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
5,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
6,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534
7,XP_019651116.1,H2B.K,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVH...,,XP_019651116.1,35099534
8,XP_019681595.1,H2B.K,,,,9685,Felis catus,Chordata,Mammalia,,,MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_019681595.1,35099534
9,XP_022259586.1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGAEHGQQPQSGGRRGRGSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_022259586.1,35099534


In [180]:
# Make sure data is committed to the database
# (persists the INSERTs executed on this connection since the previous commit).
conn.commit()

# Add platypus H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Ornithorhynchus anatinus, ornAna2, chrUn_DS181150v1, ++, 14809945-14812026

**Sequence from article:**
```fasta
>Platypus_H2B.K
MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHPDTGISSKAMSIMNSFVNDVFEQLAGEAARLAQYLGRSTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK
```

BLASTP has one result with 100% coverage and 100% identity (is that the same gene as in the article?).

**Protein accession:** HISTDB_H2B_K_2

## Add HISTDB_H2B_K_2

In [185]:
# Identifier used for the platypus (Ornithorhynchus anatinus) H2B.K entry added in this section.
# NOTE(review): presumably a database-internal id rather than an NCBI accession — confirm.
ACCESSION = "HISTDB_H2B_K_2"

In [186]:
# Row for the `sequence` table, keyed by column name.
# Every column starts as None (NULL); only the known values are filled in below.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(
    {
        "accession": ACCESSION,
        "variant": "H2B.K",
        "taxonomy_id": 9258,
        "organism": "Ornithorhynchus anatinus",
        "phylum": "Chordata",
        "class": "Mammalia",
        "sequence": "MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHPDTGISSKAMSIMNSFVNDVFEQLAGEAARLAQYLGRSTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK",
    }
)
data_sequence

{'accession': 'HISTDB_H2B_K_2',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9258,
 'organism': 'Ornithorhynchus anatinus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHPDTGISSKAMSIMNSFVNDVFEQLAGEAARLAQYLGRSTLTSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [187]:
# Show the Python type of every field before the row is handed to the INSERT.
for field_name, field_value in data_sequence.items():
    print(f"{field_name} {type(field_value)}")

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [188]:
# Insert the prepared `data_sequence` row using the `add_sequence` INSERT statement.
cursor.execute(add_sequence, data_sequence)

In [189]:
# Read back the row just inserted for the current accession; the LEFT JOIN keeps
# the row even when no publication is linked yet.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_2,H2B.K,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHP...,,,


## Add publication

In [190]:
# Confirm the publication row exists before linking the sequence to it.
# `pid` is the single source of the id: it is bound as a query parameter here
# and reused by the linking INSERT in the next cell.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [191]:
# Link the current accession to publication `pid` via the sequence_has_publication INSERT.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [192]:
# Re-read the row for the current accession together with its publication link
# to confirm the link was stored.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_2,H2B.K,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHP...,,HISTDB_H2B_K_2,35099534


In [193]:
# List every H2B.K sequence with its linked publication to review the variant as a whole.
# The variant name is bound as a query parameter; the original string carried an
# `f` prefix without containing any placeholder.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_1,35099534
2,HISTDB_H2B_K_2,H2B.K,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHP...,,HISTDB_H2B_K_2,35099534
3,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
4,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
5,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
6,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
7,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534
8,XP_019651116.1,H2B.K,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVH...,,XP_019651116.1,35099534
9,XP_019681595.1,H2B.K,,,,9685,Felis catus,Chordata,Mammalia,,,MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_019681595.1,35099534


In [None]:
# Make sure data is committed to the database
# (persists the INSERTs executed on this connection since the previous commit).
conn.commit()

# Add chicken H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Gallus gallus, galGal6, chr2, ++, 174543-175540

**Protein accession:** XP_423715.4

In [9]:
# Protein accession of the chicken (Gallus gallus) H2B.K entry; reused by the cells of this section.
ACCESSION = "XP_423715.4"

## Get sequence XP_423715.4 from NCBI

In [10]:
# Download the GenBank-format text for ACCESSION from the NCBI protein database
# and parse it as a single record.
with Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")

# Print the parsed record for a visual check.
print(record)

ID: XP_423715.4
Name: XP_423715
Description: late histone H2B.L4 [Gallus gallus]
Database cross-references: BioProject:PRJNA698609
Number of features: 7
/topology=linear
/data_file_division=VRT
/date=01-MAR-2022
/accessions=['XP_423715']
/sequence_version=4
/db_source=REFSEQ: accession XM_423715.8
/keywords=['RefSeq']
/source=Gallus gallus (chicken)
/organism=Gallus gallus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Archelosauria', 'Archosauria', 'Dinosauria', 'Saurischia', 'Theropoda', 'Coelurosauria', 'Aves', 'Neognathae', 'Galloanserae', 'Galliformes', 'Phasianidae', 'Phasianinae', 'Gallus']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_052533.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
On May 17, 2018 this sequence version replaced XP_423715.3.
COMPLETENESS: full length.
/structured_c

In [11]:
# Show the fetched sequence as a plain string.
str(record.seq)

'MSAESGRMRGHPSSSGDKKSKRKPKRKETYSVYIYKVLKQVHPDTGISSKAMSIMNSFVNDIFERLAVEASRLAQYNHRSTITSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [12]:
# Organism name taken from the annotations of the fetched GenBank record.
record.annotations["organism"]

'Gallus gallus'

In [13]:
# Look up the taxonomy fields (organism, taxonomy_id, phylum, class) for the record.
# NOTE(review): the helper's log line prints "<built-in function id>" instead of a
# taxid — get_taxonomy_data (defined outside this section) seems to format the
# builtin `id`; verify there.
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Gallus gallus',
 'taxonomy_id': 9031,
 'phylum': 'Chordata',
 'class': 'Aves'}

## Adding XP_423715.4

In [14]:
# Row for the `sequence` table, keyed by column name.
# Every column starts as None (NULL); the known values are filled in afterwards.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence["accession"] = ACCESSION
data_sequence["variant"] = "H2B.K"
data_sequence["sequence"] = str(record.seq)
# Overwrite the taxonomy placeholders with the values returned by the taxonomy lookup.
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_423715.4',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9031,
 'organism': 'Gallus gallus',
 'phylum': 'Chordata',
 'class': 'Aves',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSAESGRMRGHPSSSGDKKSKRKPKRKETYSVYIYKVLKQVHPDTGISSKAMSIMNSFVNDIFERLAVEASRLAQYNHRSTITSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [15]:
# Show the Python type of every field before the row is handed to the INSERT.
for field_name, field_value in data_sequence.items():
    print(f"{field_name} {type(field_value)}")

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [16]:
# Insert the prepared `data_sequence` row using the `add_sequence` INSERT statement.
cursor.execute(add_sequence, data_sequence)

In [17]:
# Read back the row just inserted for the current accession; the LEFT JOIN keeps
# the row even when no publication is linked yet.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_423715.4,H2B.K,,,,9031,Gallus gallus,Chordata,Aves,,,MSAESGRMRGHPSSSGDKKSKRKPKRKETYSVYIYKVLKQVHPDTG...,,,


## Add publication

In [18]:
# Confirm the publication row exists before linking the sequence to it.
# `pid` is the single source of the id: it is bound as a query parameter here
# and reused by the linking INSERT in the next cell.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [19]:
# Link the current accession to publication `pid` via the sequence_has_publication INSERT.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [20]:
# Re-read the row for the current accession together with its publication link
# to confirm the link was stored.
# The accession is bound as a query parameter (`%s`, as in the INSERT statements)
# instead of being interpolated into the SQL text with an f-string.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_423715.4,H2B.K,,,,9031,Gallus gallus,Chordata,Aves,,,MSAESGRMRGHPSSSGDKKSKRKPKRKETYSVYIYKVLKQVHPDTG...,,XP_423715.4,35099534


In [21]:
# List every H2B.K sequence with its linked publication to review the variant as a whole.
# The variant name is bound as a query parameter; the original string carried an
# `f` prefix without containing any placeholder.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
# cursor.description has one entry per result column; element 0 is the column name.
df = pd.DataFrame(cursor.fetchall(), columns=[col[0] for col in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_1,35099534
2,HISTDB_H2B_K_2,H2B.K,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHP...,,HISTDB_H2B_K_2,35099534
3,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
4,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
5,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
6,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
7,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534
8,XP_019651116.1,H2B.K,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVH...,,XP_019651116.1,35099534
9,XP_019681595.1,H2B.K,,,,9685,Felis catus,Chordata,Mammalia,,,MSAEHGQQQQSGGRRGRSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_019681595.1,35099534


In [22]:
# Make sure data is committed to the database
# (persists the INSERTs executed on this connection since the previous commit).
conn.commit()

# Add zebra finch H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Taeniopygia guttata, taeGut2, chr2, ++, 118721-119732

**Protein accession:** XP_002190629.1

In [27]:
# Protein accession of the zebra finch (Taeniopygia guttata) H2B.K entry; reused by the cells of this section.
ACCESSION = "XP_002190629.1"

## Get sequence XP_002190629.1 from NCBI

In [28]:
# Download the GenBank-format text for ACCESSION from the NCBI protein database
# and parse it as a single record.
with Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")

# Print the parsed record for a visual check.
print(record)

ID: XP_002190629.1
Name: XP_002190629
Description: PREDICTED: late histone H2B.L4-like [Taeniopygia guttata]
Database cross-references: BioProject:PRJNA32405
Number of features: 3
/topology=linear
/data_file_division=VRT
/date=26-MAY-2015
/accessions=['XP_002190629']
/sequence_version=1
/db_source=REFSEQ: accession XM_002190593.1
/keywords=['RefSeq', 'includes ab initio']
/source=Taeniopygia guttata (zebra finch)
/organism=Taeniopygia guttata
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Archelosauria', 'Archosauria', 'Dinosauria', 'Saurischia', 'Theropoda', 'Coelurosauria', 'Aves', 'Neognathae', 'Passeriformes', 'Passeroidea', 'Estrildidae', 'Estrildinae', 'Taeniopygia']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NW_002198248.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
XP_002190629.2.
COMP

In [29]:
# Show the fetched sequence as a plain string.
str(record.seq)

'MSSERLKKRGHAVASGKKSSKRKPKRKEAFSVYIYKVLKQVHPDLAISSKAMSIMNSFVNDMLERLAAEASRLARYRCHTTVSSREVQAAARQLLPGQLAQHAVSEGTKAVTKYTTSK'

In [30]:
# Organism name taken from the annotations of the fetched GenBank record.
record.annotations["organism"]

'Taeniopygia guttata'

In [31]:
# Look up the taxonomy fields (organism, taxonomy_id, phylum, class) for the record.
# NOTE(review): the helper's log line prints "<built-in function id>" instead of a
# taxid — get_taxonomy_data (defined outside this section) seems to format the
# builtin `id`; verify there.
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Taeniopygia guttata',
 'taxonomy_id': 59729,
 'phylum': 'Chordata',
 'class': 'Aves'}

## Adding XP_002190629.1

In [32]:
# Row for the `sequence` table, keyed by column name.
# Every column starts as None (NULL); the known values are filled in afterwards.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence["accession"] = ACCESSION
data_sequence["variant"] = "H2B.K"
data_sequence["sequence"] = str(record.seq)
# Overwrite the taxonomy placeholders with the values returned by the taxonomy lookup.
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_002190629.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 59729,
 'organism': 'Taeniopygia guttata',
 'phylum': 'Chordata',
 'class': 'Aves',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSSERLKKRGHAVASGKKSSKRKPKRKEAFSVYIYKVLKQVHPDLAISSKAMSIMNSFVNDMLERLAAEASRLARYRCHTTVSSREVQAAARQLLPGQLAQHAVSEGTKAVTKYTTSK',
 'variant_under_consideration': None}

In [33]:
# Show the Python type of every field before the row is handed to the INSERT.
for field_name, field_value in data_sequence.items():
    print(f"{field_name} {type(field_value)}")

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [34]:
# Insert the assembled row into the `sequence` table; data_sequence supplies
# the parameters of the add_sequence INSERT statement. The change is
# persisted by the conn.commit() cell at the end of this section.
cursor.execute(add_sequence, data_sequence)

In [35]:
# Read back the freshly inserted row together with any linked publication.
# LEFT JOIN keeps the sequence row even though no link exists yet, so the
# publication columns are expected to be empty at this point.
# The accession is bound as a query parameter (not interpolated into the SQL
# text), matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_002190629.1,H2B.K,,,,59729,Taeniopygia guttata,Chordata,Aves,,,MSSERLKKRGHAVASGKKSSKRKPKRKEAFSVYIYKVLKQVHPDLA...,,,


## Add publication

In [36]:
# Check that the publication to be linked already exists in the
# `publication` table. The id is bound from `pid` so that the row checked
# here is always the one linked in the next cell (previously the literal was
# duplicated inside the SQL string).
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [37]:
# Link the sequence to the publication: inserts (sequence_accession,
# publication_id) into the sequence_has_publication table.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [38]:
# Re-read the row for the current accession; after the link insert the
# sequence_accession / publication_id columns should now be filled.
# The accession is bound as a query parameter (not interpolated into the SQL
# text), matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_002190629.1,H2B.K,,,,59729,Taeniopygia guttata,Chordata,Aves,,,MSSERLKKRGHAVASGKKSSKRKPKRKEAFSVYIYKVLKQVHPDLA...,,XP_002190629.1,35099534


In [39]:
# List every H2B.K row with its publication link, to confirm the new entry
# sits alongside the existing ones. The variant name is bound as a query
# parameter; the former f-string prefix had no placeholders and is dropped.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_1,35099534
2,HISTDB_H2B_K_2,H2B.K,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHP...,,HISTDB_H2B_K_2,35099534
3,XP_002190629.1,H2B.K,,,,59729,Taeniopygia guttata,Chordata,Aves,,,MSSERLKKRGHAVASGKKSSKRKPKRKEAFSVYIYKVLKQVHPDLA...,,XP_002190629.1,35099534
4,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
5,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
6,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
7,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
8,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534
9,XP_019651116.1,H2B.K,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MSAEHGRQQQPGGRRGRSSGDKKSRKRSRRKETYSMYIYKVLKQVH...,,XP_019651116.1,35099534


In [41]:
# Commit the open transaction so that the sequence row and its publication
# link inserted in the cells above are persisted in the database.
conn.commit()

# Add zebrafish H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Danio rerio, danRer11, chr24, ++, 34069746-34071923

**Protein accession:** NP_001002724.1

In [42]:
# Protein accession of the zebrafish H2B.K entry added in this section; it is
# reused by the fetch, insert and verification cells below.
ACCESSION = "NP_001002724.1"

## Get sequence NP_001002724.1 from NCBI

In [43]:
# Download the GenBank-format entry for ACCESSION from the NCBI protein
# database and parse it into a single record.
efetch_kwargs = {"db": "protein", "id": ACCESSION, "rettype": "gb", "retmode": "text"}
with Entrez.efetch(**efetch_kwargs) as handle:
    record = SeqIO.read(handle, "genbank")
# Printing only needs the parsed record, so it happens after the handle closes.
print(record)

ID: NP_001002724.1
Name: NP_001002724
Description: histone H2B type 2-K1 [Danio rerio]
Number of features: 7
/topology=linear
/data_file_division=VRT
/date=11-SEP-2024
/accessions=['NP_001002724', 'XP_685271']
/sequence_version=1
/db_source=REFSEQ: accession NM_001002724.2
/keywords=['RefSeq']
/source=Danio rerio (zebrafish)
/organism=Danio rerio
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Actinopterygii', 'Neopterygii', 'Teleostei', 'Ostariophysi', 'Cypriniformes', 'Danionidae', 'Danioninae', 'Danio']
/references=[Reference(title='Enhancer trap lines with GFP driven by smad6b and frizzled1 regulatory sequences for the study of epithelial morphogenesis in the developing zebrafish inner ear', ...), Reference(title='A cell cycle-coordinated Polymerase II transcription compartment encompasses gene expression before global genome activation', ...), Reference(title='Comparison of proteomic profiles in the zebrafish retina during experimental deg

In [44]:
# Display the record's sequence as a plain string; this is the value stored
# in the `sequence` column when the row is assembled below.
str(record.seq)

'MSNEGAKKKGKAPGDKKGSKRKSKRRETYAVYIYKVLKQVHPDTGISSRAMSIMNSFVNDVFERIATEASRLAHYNKRSTITSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [45]:
# Organism name as recorded in the fetched record's annotations.
record.annotations["organism"]

'Danio rerio'

In [46]:
# Look up the taxonomy fields for this record; per the output below the
# result is a dict with organism, taxonomy_id, phylum and class.
# NOTE(review): the helper's log line prints "<built-in function id>", which
# suggests it formats the builtin `id` instead of the fetched taxid - check
# the definition of get_taxonomy_data.
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Danio rerio',
 'taxonomy_id': 7955,
 'phylum': 'Chordata',
 'class': 'Actinopteri'}

## Adding NP_001002724.1

In [47]:
# Start from a row in which every `sequence` column is None (the key order
# mirrors the table's column order), then fill in what is known.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence["accession"] = ACCESSION
data_sequence["variant"] = "H2B.K"
data_sequence["sequence"] = str(record.seq)
# Overlay the organism / taxonomy_id / phylum / class values from the lookup.
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'NP_001002724.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 7955,
 'organism': 'Danio rerio',
 'phylum': 'Chordata',
 'class': 'Actinopteri',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MSNEGAKKKGKAPGDKKGSKRKSKRRETYAVYIYKVLKQVHPDTGISSRAMSIMNSFVNDVFERIATEASRLAHYNKRSTITSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [48]:
# Sanity check: print the Python type of each value before it is handed to
# the database driver.
for column, value in data_sequence.items():
    print(column, type(value))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [49]:
# Insert the assembled row into the `sequence` table; data_sequence supplies
# the parameters of the add_sequence INSERT statement. The change is
# persisted by the conn.commit() cell at the end of this section.
cursor.execute(add_sequence, data_sequence)

In [51]:
# Read back the freshly inserted row together with any linked publication.
# LEFT JOIN keeps the sequence row even though no link exists yet, so the
# publication columns are expected to be empty at this point.
# The accession is bound as a query parameter (not interpolated into the SQL
# text), matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,NP_001002724.1,H2B.K,,,,7955,Danio rerio,Chordata,Actinopteri,,,MSNEGAKKKGKAPGDKKGSKRKSKRRETYAVYIYKVLKQVHPDTGI...,,,


## Add publication

In [52]:
# Check that the publication to be linked already exists in the
# `publication` table. The id is bound from `pid` so that the row checked
# here is always the one linked in the next cell (previously the literal was
# duplicated inside the SQL string).
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [53]:
# Link the sequence to the publication: inserts (sequence_accession,
# publication_id) into the sequence_has_publication table.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [54]:
# Re-read the row for the current accession; after the link insert the
# sequence_accession / publication_id columns should now be filled.
# The accession is bound as a query parameter (not interpolated into the SQL
# text), matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,NP_001002724.1,H2B.K,,,,7955,Danio rerio,Chordata,Actinopteri,,,MSNEGAKKKGKAPGDKKGSKRKSKRRETYAVYIYKVLKQVHPDTGI...,,NP_001002724.1,35099534


In [55]:
# List every H2B.K row with its publication link, to confirm the new entry
# sits alongside the existing ones. The variant name is bound as a query
# parameter; the former f-string prefix had no placeholders and is dropped.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_1,35099534
2,HISTDB_H2B_K_2,H2B.K,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHP...,,HISTDB_H2B_K_2,35099534
3,NP_001002724.1,H2B.K,,,,7955,Danio rerio,Chordata,Actinopteri,,,MSNEGAKKKGKAPGDKKGSKRKSKRRETYAVYIYKVLKQVHPDTGI...,,NP_001002724.1,35099534
4,XP_002190629.1,H2B.K,,,,59729,Taeniopygia guttata,Chordata,Aves,,,MSSERLKKRGHAVASGKKSSKRKPKRKEAFSVYIYKVLKQVHPDLA...,,XP_002190629.1,35099534
5,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
6,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
7,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
8,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534
9,XP_014643104.1,H2B.K,,,,73337,Ceratotherium simum simum,Chordata,Mammalia,,,MSTEHGQQHHPGGRRGCSPGDKKFKKRSRRKETYSMYIYKVLKQVH...,,XP_014643104.1,35099534


In [56]:
# Commit the open transaction so that the sequence row and its publication
# link inserted in the cells above are persisted in the database.
conn.commit()

# Add coelacanth H2B.K

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Latimeria chalumnae, LatCha1, JH128031, +-, 495204-499714

**Protein accession:** XP_006007163.1

In [57]:
# Protein accession of the coelacanth H2B.K entry added in this section; it
# is reused by the fetch, insert and verification cells below.
ACCESSION = "XP_006007163.1"

## Get sequence XP_006007163.1 from NCBI

In [58]:
# Download the GenBank-format entry for ACCESSION from the NCBI protein
# database and parse it into a single record.
efetch_kwargs = {"db": "protein", "id": ACCESSION, "rettype": "gb", "retmode": "text"}
with Entrez.efetch(**efetch_kwargs) as handle:
    record = SeqIO.read(handle, "genbank")
# Printing only needs the parsed record, so it happens after the handle closes.
print(record)

ID: XP_006007163.1
Name: XP_006007163
Description: late histone H2B.L4 [Latimeria chalumnae]
Database cross-references: BioProject:PRJNA1100969
Number of features: 7
/topology=linear
/data_file_division=VRT
/date=25-APR-2024
/accessions=['XP_006007163']
/sequence_version=1
/db_source=REFSEQ: accession XM_006007101.2
/keywords=['RefSeq']
/source=Latimeria chalumnae (coelacanth)
/organism=Latimeria chalumnae
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Coelacanthiformes', 'Coelacanthidae', 'Latimeria']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_088140) annotated using gene prediction method: Gnomon,
supported by mRNA evidence.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotation Status', 'Full ann

In [59]:
# Display the record's sequence as a plain string; this is the value stored
# in the `sequence` column when the row is assembled below.
str(record.seq)

'MTNDPGKKKSKNPGEKKSSKKKAKRRETYSVYIYKVLKQVHPDTGISSKAMSIMNSFVNDVFERIATEASRLAQYNKRSTITSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK'

In [60]:
# Organism name as recorded in the fetched record's annotations.
record.annotations["organism"]

'Latimeria chalumnae'

In [61]:
# Look up the taxonomy fields for this record; per the output below the
# result is a dict with organism, taxonomy_id, phylum and class.
# NOTE(review): for this record `class` comes back as None, so the row is
# inserted below without a class value - TODO decide whether to set it by hand.
# NOTE(review): the helper's log line prints "<built-in function id>", which
# suggests it formats the builtin `id` instead of the fetched taxid - check
# the definition of get_taxonomy_data.
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Latimeria chalumnae',
 'taxonomy_id': 7897,
 'phylum': 'Chordata',
 'class': None}

## Adding XP_006007163.1

In [62]:
# Start from a row in which every `sequence` column is None (the key order
# mirrors the table's column order), then fill in what is known.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence["accession"] = ACCESSION
data_sequence["variant"] = "H2B.K"
data_sequence["sequence"] = str(record.seq)
# Overlay the organism / taxonomy_id / phylum / class values from the lookup.
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_006007163.1',
 'variant': 'H2B.K',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 7897,
 'organism': 'Latimeria chalumnae',
 'phylum': 'Chordata',
 'class': None,
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MTNDPGKKKSKNPGEKKSSKKKAKRRETYSVYIYKVLKQVHPDTGISSKAMSIMNSFVNDVFERIATEASRLAQYNKRSTITSREVQTAVRLLLPGELAKHAVSEGTKAVTKYTSSK',
 'variant_under_consideration': None}

In [63]:
# Sanity check: print the Python type of each value before it is handed to
# the database driver.
for column, value in data_sequence.items():
    print(column, type(value))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'NoneType'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [64]:
# Insert the assembled row into the `sequence` table; data_sequence supplies
# the parameters of the add_sequence INSERT statement. The change is
# persisted by the conn.commit() cell at the end of this section.
cursor.execute(add_sequence, data_sequence)

In [65]:
# Read back the freshly inserted row together with any linked publication.
# LEFT JOIN keeps the sequence row even though no link exists yet, so the
# publication columns are expected to be empty at this point.
# The accession is bound as a query parameter (not interpolated into the SQL
# text), matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_006007163.1,H2B.K,,,,7897,Latimeria chalumnae,Chordata,,,,MTNDPGKKKSKNPGEKKSSKKKAKRRETYSVYIYKVLKQVHPDTGI...,,,


## Add publication

In [66]:
# Check that the publication to be linked already exists in the
# `publication` table. The id is bound from `pid` so that the row checked
# here is always the one linked in the next cell (previously the literal was
# duplicated inside the SQL string).
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [67]:
# Link the sequence to the publication: inserts (sequence_accession,
# publication_id) into the sequence_has_publication table.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [68]:
# Re-read the row for the current accession; after the link insert the
# sequence_accession / publication_id columns should now be filled.
# The accession is bound as a query parameter (not interpolated into the SQL
# text), matching the parameterized INSERT statements used in this notebook.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_006007163.1,H2B.K,,,,7897,Latimeria chalumnae,Chordata,,,,MTNDPGKKKSKNPGEKKSSKKKAKRRETYSVYIYKVLKQVHPDTGI...,,XP_006007163.1,35099534


In [69]:
# List every H2B.K row with its publication link, to confirm the new entry
# sits alongside the existing ones. The variant name is bound as a query
# parameter; the former f-string prefix had no placeholders and is dropped.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.K",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_K_0,H2B.K,,,,9940,Ovis aries,Chordata,Mammalia,,,MSAEHGQLQQAGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_0,35099534
1,HISTDB_H2B_K_1,H2B.K,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MGTEHGQQPQSGGRRGHGSGDKKSKKHSRRKETYSMYIYKVLKQVH...,,HISTDB_H2B_K_1,35099534
2,HISTDB_H2B_K_2,H2B.K,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MSPEGGQQQQQQPRPRARGDRRPKRRTRRKETYSVYIYKVLKQVHP...,,HISTDB_H2B_K_2,35099534
3,NP_001002724.1,H2B.K,,,,7955,Danio rerio,Chordata,Actinopteri,,,MSNEGAKKKGKAPGDKKGSKRKSKRRETYAVYIYKVLKQVHPDTGI...,,NP_001002724.1,35099534
4,XP_002190629.1,H2B.K,,,,59729,Taeniopygia guttata,Chordata,Aves,,,MSSERLKKRGHAVASGKKSSKRKPKRKEAFSVYIYKVLKQVHPDLA...,,XP_002190629.1,35099534
5,XP_002715119.2,H2B.K,,,,9986,Oryctolagus cuniculus,Chordata,Mammalia,,,MSAERGQQQQQASSRRGRSSGNKKSRKRSKRKETYSMYIYKVLKQV...,,XP_002715119.2,35099534
6,XP_005609614.1,H2B.K,,,,9796,Equus caballus,Chordata,Mammalia,,,MSTEHGQQHQSGGRRGCSSGDKKSKKRSRRKETYSMYIYKVLKQVH...,,XP_005609614.1,35099534
7,XP_006007163.1,H2B.K,,,,7897,Latimeria chalumnae,Chordata,,,,MTNDPGKKKSKNPGEKKSSKKKAKRRETYSVYIYKVLKQVHPDTGI...,,XP_006007163.1,35099534
8,XP_010799227.1,H2B.K,,,,9913,Bos taurus,Chordata,Mammalia,,,MSAEHGQLQQSGGRRGRSPGDKKSRRRSRRKETYSMYIYKVLKQVH...,,XP_010799227.1,35099534
9,XP_013846203.1,H2B.K,,,,9823,Sus scrofa,Chordata,Mammalia,,,MSSAHGQQQQQQQQQQQQQQQGGGRRGRSSGEKKSKKRNRRKETYS...,,XP_013846203.1,35099534


In [70]:
# Commit the open transaction so that the sequence row and its publication
# link inserted in the cells above are persisted in the database.
conn.commit()

# Close connections

In [71]:
# Release resources in reverse order of acquisition: cursor, then the MySQL
# connection, then the SSH tunnel. The nested try/finally guarantees that the
# connection is closed and the tunnel is stopped even if an earlier close
# raises, so a failed cleanup step cannot leave the tunnel running.
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()