In [1]:
import pandas as pd
from Bio import Entrez
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

Entrez.email = "l.singh@intbio.org"

import re

from Bio import SeqIO

In [2]:
# Load the SSH / database connection settings from a KEY=VALUE text file.
# Blank lines and lines starting with "#" are ignored.
config = {}

with open("db_curated_server_info.txt", "r") as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if "=" not in line:
            # Report only the line number: the line itself may hold a secret.
            raise ValueError(
                "db_curated_server_info.txt line {}: expected KEY=VALUE".format(
                    line_number
                )
            )
        key, value = line.split("=", 1)
        # Strip the key as well, so "key = value" is stored under "key".
        config[key.strip()] = value.strip()

# Fail early, and by name, when a required setting is absent.
# The misspelled "srever_port" / "db_adress" are the keys this notebook reads
# and the variable names later cells use, so they are kept as they are.
missing_settings = [
    name
    for name in (
        "server_name",
        "srever_port",
        "ssh_password",
        "ssh_username",
        "db_adress",
        "db_port",
    )
    if name not in config
]
if missing_settings:
    raise KeyError(
        "Missing settings in db_curated_server_info.txt: {}".format(
            ", ".join(missing_settings)
        )
    )

server_name = config["server_name"]
srever_port = int(config["srever_port"])
ssh_password = config["ssh_password"]
ssh_username = config["ssh_username"]
db_adress = config["db_adress"]
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel: connect to the SSH server at (server_name, srever_port)
# with the configured user/password and bind the remote address
# (db_adress, db_port) to a local port.
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
# The local port is what the MySQL connection below connects to.
print(tunnel.local_bind_port)

39799


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# NOTE(review): user, password and database are string literals here;
# presumably redacted placeholders - confirm, and consider reading them from
# the settings file instead of the notebook source.
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
# Single cursor reused by every query cell below.
cursor = conn.cursor()

In [5]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
# Only the column names are needed, so request zero rows instead of pulling
# the whole `sequence` table across the tunnel.
query = "SELECT * FROM sequence LIMIT 0"
cursor.execute(query)
# Drain the (empty) result set so the cursor is ready for the next query.
cursor.fetchall()
", ".join([i[0] for i in cursor.description])

'accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration'

In [7]:
# Parked (currently unused) template for inserting a publication row:
#   INSERT INTO publication (id, title, doi, author, year)
#   VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)

# Positional-parameter template linking a sequence accession to a publication.
add_sequence_has_publication = " ".join(
    [
        "INSERT INTO sequence_has_publication",
        "(sequence_accession, publication_id)",
        "VALUES (%s, %s)",
    ]
)

# Columns of the `sequence` table, in insert order. The named placeholders of
# `add_sequence` are generated from this list, so the dict passed to
# cursor.execute() must provide exactly these keys.
_sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)

# Named-parameter template for inserting one row into `sequence`.
add_sequence = "INSERT INTO sequence ({}) VALUES ({})".format(
    ", ".join(_sequence_columns),
    ", ".join("%({})s".format(column) for column in _sequence_columns),
)

In [8]:
def get_taxonomy_data(record, max_attempts=10):
    """Collect organism, taxonomy id, phylum and class for a sequence record.

    The taxonomy id is read from the ``taxon:<id>`` entry among the
    ``db_xref`` qualifiers of the record's first feature. Phylum and class
    are then looked up in the NCBI ``taxonomy`` database via ``Entrez``.

    Args:
        record: Object exposing ``annotations["organism"]`` and ``features``
            (the records produced by ``SeqIO.read`` in this notebook).
        max_attempts: Number of times the NCBI lookup is tried before
            giving up.

    Returns:
        dict with the keys ``organism``, ``taxonomy_id``, ``phylum`` and
        ``class``. ``taxonomy_id`` is 1 when no taxon cross-reference could
        be read; NCBI is not queried in that case. ``phylum`` / ``class`` are
        ``None`` when they are unknown or the lookup failed.

    Raises:
        KeyError: If the record has no ``organism`` annotation.
    """
    taxonomy_data = {"organism": record.annotations["organism"]}

    # --- taxonomy id from the first feature's db_xref qualifiers -----------
    try:
        xrefs = record.features[0].qualifiers["db_xref"]
    except (IndexError, KeyError, AttributeError):
        xrefs = []

    taxid = None
    for xref in xrefs:
        db_name, _, value = str(xref).partition(":")
        if db_name != "taxon":
            continue
        try:
            taxid = int(value)
        except ValueError:
            continue
        print("Fetched taxid from NCBI {}".format(taxid))

    if taxid is None:
        print(
            "!!!!!!Unable to get TAXID for \n {} setting it to 1".format(
                getattr(record, "id", record)
            )
        )
        taxonomy_data["taxonomy_id"] = 1  # unable to identify
    else:
        taxonomy_data["taxonomy_id"] = taxid

    # --- phylum / class from NCBI taxonomy ---------------------------------
    lineage = {}
    if taxid is not None:
        for attempt in range(1, max_attempts + 1):
            try:
                with Entrez.efetch(
                    id=taxid, db="taxonomy", retmode="xml"
                ) as handle:
                    tax_data = Entrez.read(handle)
                lineage = {
                    d["Rank"]: d["ScientificName"]
                    for d in tax_data[0]["LineageEx"]
                    if d["Rank"] in ("class", "phylum")
                }
                break
            except Exception as exc:
                # Best effort: network and parsing failures are retried and,
                # in the end, reported rather than raised.
                if attempt < max_attempts:
                    print(
                        "Unexpected error: {!r}, Retrying, attempt {}".format(
                            exc, attempt
                        )
                    )
                else:
                    print(
                        f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxid}. Will add None for class and phylum!"
                    )

    # Store plain ``str`` values (or None) so the DB driver gets simple types.
    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        taxonomy_data[rank] = None if name is None else str(name)
    return taxonomy_data

# Add sheep H2B.N

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Ovis aries, oviAri4, chr11, +-, 16915435-16919543

**Sequence from article:**
```fasta
>Sheep_H2B.N
MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFNYMGKLLKQGHPDFSGCSWILDALRALEDWQLEWVSLEAVRLSLYNHRRTVTSREILEAVQQRCSQKTLGINEVALHGSVVEMIALVQKQKIGSFGGLS
```

BLASTP has one result with 100% coverage and 99% identity (1 substitution).

**Protein accession:** XP_060251208.1

In [10]:
ACCESSION = "HISTDB_H2B_N_0"

## Adding HISTDB_H2B_N_0

In [11]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": 9940 ,
    "organism": "Ovis aries",
    "phylum": "Chordata",
    "class": "Mammalia",
    "taxonomy_group": None,
    "info": None,
    "sequence": "MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFNYMGKLLKQGHPDFSGCSWILDALRALEDWQLEWVSLEAVRLSLYNHRRTVTSREILEAVQQRCSQKTLGINEVALHGSVVEMIALVQKQKIGSFGGLS",
    "variant_under_consideration": None,
}
data_sequence

{'accession': 'HISTDB_H2B_N_0',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9940,
 'organism': 'Ovis aries',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFNYMGKLLKQGHPDFSGCSWILDALRALEDWQLEWVSLEAVRLSLYNHRRTVTSREILEAVQQRCSQKTLGINEVALHGSVVEMIALVQKQKIGSFGGLS',
 'variant_under_consideration': None}

In [12]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [13]:
cursor.execute(add_sequence, data_sequence)

In [14]:
# Show the stored row together with any linked publication. The accession is
# sent as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,,


## Add publication

In [15]:
# Check that the publication row exists before linking it. The id is written
# once and bound as a parameter, so query and link always use the same value.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [16]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [17]:
# Re-check the row: the publication columns should now be filled in. The
# accession is sent as a bound parameter, not formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534


## Get sequence XP_060251208.1 from NCBI

In [18]:
ACCESSION = "XP_060251208.1"

In [19]:
with Entrez.efetch(
    db="protein", id=ACCESSION, rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_060251208.1
Name: XP_060251208
Description: histone H2A.N [Ovis aries]
Database cross-references: BioProject:PRJNA739192
Number of features: 3
/topology=linear
/data_file_division=MAM
/date=30-OCT-2023
/accessions=['XP_060251208']
/sequence_version=1
/db_source=REFSEQ: accession XM_060395225.1
/keywords=['RefSeq']
/source=Ovis aries (sheep)
/organism=Ovis aries
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Artiodactyla', 'Ruminantia', 'Pecora', 'Bovidae', 'Caprinae', 'Ovis']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_056064.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotation Status', 'Full annotat

In [20]:
str(record.seq)

'MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFNYMGKLLKQGHPDFSGCSWILDALRALEDWQLEWISLEAVRLSLYNHRRTVTSREILEAVQQRCSQKTLGINEVALHGSVVEMIALVQKQKIGSFGGLS'

In [21]:
record.annotations["organism"]

'Ovis aries'

In [22]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Ovis aries',
 'taxonomy_id': 9940,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_060251208.1

In [23]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_060251208.1',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9940,
 'organism': 'Ovis aries',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFNYMGKLLKQGHPDFSGCSWILDALRALEDWQLEWISLEAVRLSLYNHRRTVTSREILEAVQQRCSQKTLGINEVALHGSVVEMIALVQKQKIGSFGGLS',
 'variant_under_consideration': None}

In [24]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [25]:
cursor.execute(add_sequence, data_sequence)

In [26]:
# Show the stored row together with any linked publication. The accession is
# sent as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,,


## Add publication

In [27]:
# Check that the publication row exists before linking it. The id is written
# once and bound as a parameter, so query and link always use the same value.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [28]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [29]:
# Re-check the row: the publication columns should now be filled in. The
# accession is sent as a bound parameter, not formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,XP_060251208.1,35099534


In [31]:
# Overview of every H2B.N sequence stored so far, with linked publications.
# The variant name is bound as a parameter (the original used an f-string
# that had no placeholders).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.N",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,XP_060251208.1,35099534


In [32]:
# Make sure data is committed to the database
conn.commit()

# Add cow H2B.Ns

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Genes:**
- Bos taurus, bosTau9, chr19, +-, 17314140-17318442, H2B.N.1 (ancestral)
- Bos taurus, bosTau9, chr10, ++, 64118789-64119220, H2B.N.2

**Sequences from article:**
```fasta
>Cow_H2B.N.1
MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFNYMGKLLKQVHPDFSGCSWILDALRVLEDWQLEWVSLEAVRLSLYNHRRTITSREILEAVKQRCSQKSLGINEVDLHGSVVEMIALVQKQKIGSFGGLS
>Cow_H2B.N.2
MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFNYMEKLLKQVHPDFSGCSWILDALRVLEDWQLEWVSLEAVRLSFYNHRRTITTKEILKAVKQRCSQKSLGINEVDLHGSVVEMIALVQKQKIGSFGGLS
```

BLASTP results:
- one H2B.N.1 with 100% coverage and 100% identity;
- one H2B.N.2 with 100% coverage and 100% identity.

**Protein accessions:** XP_059734412.1, XP_059746706.1

## Accession XP_059734412.1

In [33]:
ACCESSION = "XP_059734412.1"

## Get sequence XP_059734412.1 from NCBI

In [34]:
with Entrez.efetch(
    db="protein", id=ACCESSION, rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_059734412.1
Name: XP_059734412
Description: histone H2A.N [Bos taurus]
Database cross-references: BioProject:PRJNA450837
Number of features: 4
/topology=linear
/data_file_division=MAM
/date=03-OCT-2023
/accessions=['XP_059734412']
/sequence_version=1
/db_source=REFSEQ: accession XM_059878429.1
/keywords=['RefSeq']
/source=Bos taurus (domestic cattle)
/organism=Bos taurus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Artiodactyla', 'Ruminantia', 'Pecora', 'Bovidae', 'Bovinae', 'Bos']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_037346.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotation Status', 'Full

In [35]:
str(record.seq)

'MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFNYMGKLLKQVHPDFSGCSWILDALRVLEDWQLEWVSLEAVRLSLYNHRRTITSREILEAVKQRCSQKSLGINEVDLHGSVVEMIALVQKQKIGSFGGLS'

In [36]:
record.annotations["organism"]

'Bos taurus'

In [37]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Bos taurus',
 'taxonomy_id': 9913,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_059734412.1

In [38]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_059734412.1',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9913,
 'organism': 'Bos taurus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFNYMGKLLKQVHPDFSGCSWILDALRVLEDWQLEWVSLEAVRLSLYNHRRTITSREILEAVKQRCSQKSLGINEVDLHGSVVEMIALVQKQKIGSFGGLS',
 'variant_under_consideration': None}

In [39]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [40]:
cursor.execute(add_sequence, data_sequence)

In [41]:
# Show the stored row together with any linked publication. The accession is
# sent as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,,


## Add publication

In [42]:
# Check that the publication row exists before linking it. The id is written
# once and bound as a parameter, so query and link always use the same value.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [43]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [44]:
# Re-check the row: the publication columns should now be filled in. The
# accession is sent as a bound parameter, not formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,XP_059734412.1,35099534


In [45]:
# Overview of every H2B.N sequence stored so far, with linked publications.
# The variant name is bound as a parameter (the original used an f-string
# that had no placeholders).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.N",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,XP_059734412.1,35099534
2,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,XP_060251208.1,35099534


## Accession XP_059746706.1

In [46]:
ACCESSION = "XP_059746706.1"

## Get sequence XP_059746706.1 from NCBI

In [47]:
with Entrez.efetch(
    db="protein", id=ACCESSION, rettype="gb", retmode="text"
) as handle:
    record = SeqIO.read(handle, "genbank")
    print(record)

ID: XP_059746706.1
Name: XP_059746706
Description: histone H2A.N-like [Bos taurus]
Database cross-references: BioProject:PRJNA450837
Number of features: 4
/topology=linear
/data_file_division=MAM
/date=03-OCT-2023
/accessions=['XP_059746706']
/sequence_version=1
/db_source=REFSEQ: accession XM_059890723.1
/keywords=['RefSeq']
/source=Bos taurus (domestic cattle)
/organism=Bos taurus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Laurasiatheria', 'Artiodactyla', 'Ruminantia', 'Pecora', 'Bovidae', 'Bovinae', 'Bos']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_037337.1) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotation Status', 

In [48]:
str(record.seq)

'MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFNYMEKLLKQVHPDFSGCSWILDALRVLEDWQLEWVSLEAVRLSFYNHRRTITTKEILKAVKQRCSQKSLGINEVDLHGSVVEMIALVQKQKIGSFGGLS'

In [49]:
record.annotations["organism"]

'Bos taurus'

In [50]:
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Bos taurus',
 'taxonomy_id': 9913,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_059746706.1

In [51]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
}
data_sequence.update(taxonomy_data)
data_sequence

{'accession': 'XP_059746706.1',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9913,
 'organism': 'Bos taurus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFNYMEKLLKQVHPDFSGCSWILDALRVLEDWQLEWVSLEAVRLSFYNHRRTITTKEILKAVKQRCSQKSLGINEVDLHGSVVEMIALVQKQKIGSFGGLS',
 'variant_under_consideration': None}

In [52]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [53]:
cursor.execute(add_sequence, data_sequence)

In [54]:
# Show the stored row together with any linked publication. The accession is
# sent as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_059746706.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFN...,,,


## Add publication

In [55]:
# Check that the publication row exists before linking it. The id is written
# once and bound as a parameter, so query and link always use the same value.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [56]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [57]:
# Re-check the row: the publication columns should now be filled in. The
# accession is sent as a bound parameter, not formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_059746706.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFN...,,XP_059746706.1,35099534


In [58]:
# Overview of every H2B.N sequence stored so far, with linked publications.
# The variant name is bound as a parameter (the original used an f-string
# that had no placeholders).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.N",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,XP_059734412.1,35099534
2,XP_059746706.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFN...,,XP_059746706.1,35099534
3,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,XP_060251208.1,35099534


In [59]:
# Make sure data is committed to the database
conn.commit()

# Add white rhinoceros H2B.N

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Ceratotherium simum, cerSim1, JH767835, +-, 2022797-2025311

**Sequence from article:**
```fasta
>Rhino_H2B.N
MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFSYMGKILEQIAHYRKLSRLCLILVPFLPRPTQTSVGAPGSWMHWALEAWRLEWVSLEAVRLSFCNHRRAVTSREILEAVKRRSSWKSF
```

BLASTP has NO results.

In [60]:
ACCESSION = "HISTDB_H2B_N_1"

## Adding HISTDB_H2B_N_1

In [62]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": 9807,
    "organism": "Ceratotherium simum",
    "phylum": "Chordata",
    "class": "Mammalia",
    "taxonomy_group": None,
    "info": None,
    "sequence": "MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFSYMGKILEQIAHYRKLSRLCLILVPFLPRPTQTSVGAPGSWMHWALEAWRLEWVSLEAVRLSFCNHRRAVTSREILEAVKRRSSWKSF",
    "variant_under_consideration": None,
}
data_sequence

{'accession': 'HISTDB_H2B_N_1',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9807,
 'organism': 'Ceratotherium simum',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFSYMGKILEQIAHYRKLSRLCLILVPFLPRPTQTSVGAPGSWMHWALEAWRLEWVSLEAVRLSFCNHRRAVTSREILEAVKRRSSWKSF',
 'variant_under_consideration': None}

In [63]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [64]:
cursor.execute(add_sequence, data_sequence)

In [65]:
# Show the stored row together with any linked publication. The accession is
# sent as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_1,H2B.N,,,,9807,Ceratotherium simum,Chordata,Mammalia,,,MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFS...,,,


## Add publication

In [66]:
# Check that the publication row exists before linking it. The id is written
# once and bound as a parameter, so query and link always use the same value.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [67]:
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [68]:
# Re-check the row: the publication columns should now be filled in. The
# accession is sent as a bound parameter, not formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_1,H2B.N,,,,9807,Ceratotherium simum,Chordata,Mammalia,,,MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFS...,,HISTDB_H2B_N_1,35099534


In [69]:
# Overview of every H2B.N sequence stored so far, with linked publications.
# The variant name is bound as a parameter (the original used an f-string
# that had no placeholders).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.N",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,HISTDB_H2B_N_1,H2B.N,,,,9807,Ceratotherium simum,Chordata,Mammalia,,,MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFS...,,HISTDB_H2B_N_1,35099534
2,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,XP_059734412.1,35099534
3,XP_059746706.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFN...,,XP_059746706.1,35099534
4,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,XP_060251208.1,35099534


In [70]:
# Make sure data is committed to the database
conn.commit()

# Add dog H2B.N

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Canis lupus familiaris, canFam3, chr9, +-, 40155561-40158159

**Sequences from article:**
```fasta
>Dog_H2B.N
MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSYMGKILKQTHPDFSGCSWILDALGSLEDWLLEQVSLEAVRLSFYNHRRAVTSREILGAIKQRSFLKSFCVNEVF
```

BLASTP has NO results.

In [71]:
ACCESSION = "HISTDB_H2B_N_2"

## Adding HISTDB_H2B_N_2

In [72]:
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": 9615,
    "organism": "Canis lupus familiaris",
    "phylum": "Chordata",
    "class": "Mammalia",
    "taxonomy_group": None,
    "info": None,
    "sequence": "MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSYMGKILKQTHPDFSGCSWILDALGSLEDWLLEQVSLEAVRLSFYNHRRAVTSREILGAIKQRSFLKSFCVNEVF",
    "variant_under_consideration": None,
}
data_sequence

{'accession': 'HISTDB_H2B_N_2',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9615,
 'organism': 'Canis lupus familiaris',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSYMGKILKQTHPDFSGCSWILDALGSLEDWLLEQVSLEAVRLSFYNHRRAVTSREILGAIKQRSFLKSFCVNEVF',
 'variant_under_consideration': None}

In [73]:
for k, v in data_sequence.items():
    print(k, type(v))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [74]:
# Run the `INSERT INTO sequence` statement (add_sequence, defined near the top of the notebook)
# with the values from data_sequence
cursor.execute(add_sequence, data_sequence)

In [75]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists yet).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_2,H2B.N,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSY...,,,


## Add publication

In [76]:
# Check that the publication to be linked exists in the `publication` table.
# The id is bound from `pid`, so this check always uses the same value that the next cell links.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [77]:
# Link the sequence to the publication: inserts (sequence_accession, publication_id)
# into sequence_has_publication
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [78]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_2,H2B.N,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSY...,,HISTDB_H2B_N_2,35099534


In [79]:
# Overview of all H2B.N rows with their publication links.
# The WHERE clause has no placeholders, so it is a plain string literal (no f-prefix).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant='H2B.N'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,HISTDB_H2B_N_1,H2B.N,,,,9807,Ceratotherium simum,Chordata,Mammalia,,,MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFS...,,HISTDB_H2B_N_1,35099534
2,HISTDB_H2B_N_2,H2B.N,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSY...,,HISTDB_H2B_N_2,35099534
3,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,XP_059734412.1,35099534
4,XP_059746706.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFN...,,XP_059746706.1,35099534
5,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,XP_060251208.1,35099534


In [80]:
# Commit the open transaction so the sequence row and its publication link inserted above are persisted
conn.commit()

# Add panda H2B.N

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Ailuropoda melanoleuca, ailMel1, GL192339.1, ++, 4628399-4630866

**Sequence from article:**
```fasta
>Panda_H2B.N
MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSYMGKILKQTHPDFSGCSWILDALGSLEDWLLEWVSLEAVRLSFYNHRRAVTSREILGAVKQRSFRKSFCINKVF
```

BLASTP has one result with 100% coverage and 100% identity (is that the same gene as in the article?).

**Protein accession:** HISTDB_H2B_N_3

In [81]:
# Accession under which this sequence is stored; reused by the insert and lookup cells below
ACCESSION = "HISTDB_H2B_N_3"

## Adding HISTDB_H2B_N_3

In [82]:
# Row for the `sequence` table: start with every column set to None (column order
# as in the table), then fill in the values known for this entry.
data_sequence = dict.fromkeys(
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ]
)
data_sequence.update(
    {
        "accession": ACCESSION,
        "variant": "H2B.N",
        "taxonomy_id": 9646,
        "organism": "Ailuropoda melanoleuca",
        "phylum": "Chordata",
        "class": "Mammalia",
        "sequence": "MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSYMGKILKQTHPDFSGCSWILDALGSLEDWLLEWVSLEAVRLSFYNHRRAVTSREILGAVKQRSFRKSFCINKVF",
    }
)
data_sequence

{'accession': 'HISTDB_H2B_N_3',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9646,
 'organism': 'Ailuropoda melanoleuca',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSYMGKILKQTHPDFSGCSWILDALGSLEDWLLEWVSLEAVRLSFYNHRRAVTSREILGAVKQRSFRKSFCINKVF',
 'variant_under_consideration': None}

In [83]:
# Print the Python type of every field as a sanity check before the INSERT
for field in data_sequence:
    print(field, type(data_sequence[field]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [84]:
# Run the `INSERT INTO sequence` statement (add_sequence, defined near the top of the notebook)
# with the values from data_sequence
cursor.execute(add_sequence, data_sequence)

In [85]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists yet).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_3,H2B.N,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSY...,,,


## Add publication

In [86]:
# Check that the publication to be linked exists in the `publication` table.
# The id is bound from `pid`, so this check always uses the same value that the next cell links.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [87]:
# Link the sequence to the publication: inserts (sequence_accession, publication_id)
# into sequence_has_publication
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [88]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_3,H2B.N,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSY...,,HISTDB_H2B_N_3,35099534


In [89]:
# Overview of all H2B.N rows with their publication links.
# The WHERE clause has no placeholders, so it is a plain string literal (no f-prefix).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant='H2B.N'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,HISTDB_H2B_N_1,H2B.N,,,,9807,Ceratotherium simum,Chordata,Mammalia,,,MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFS...,,HISTDB_H2B_N_1,35099534
2,HISTDB_H2B_N_2,H2B.N,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSY...,,HISTDB_H2B_N_2,35099534
3,HISTDB_H2B_N_3,H2B.N,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSY...,,HISTDB_H2B_N_3,35099534
4,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,XP_059734412.1,35099534
5,XP_059746706.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFN...,,XP_059746706.1,35099534
6,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,XP_060251208.1,35099534


In [90]:
# Commit the open transaction so the sequence row and its publication link inserted above are persisted
conn.commit()

# Add elephant H2B.N

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Loxodonta africana, loxAfr3, scaffold_31, ++, 2335286-2338599

**Sequence from article:**
```fasta
>Elephant_H2B.N
MYYVCLGGLKFPKKSEVHIPAKKKYEWANSAFEKKRRRRRRKKKEAHFCYMGKILKQTHPDFSGCSWVLEALGCLDDWQLEWVSLEAVRLSFYKHRRAITSREILEAMKQRSPRRSF
```

BLASTP has one result with 100% coverage and 100% identity (is that the same gene as in the article?).

**Protein accession:** HISTDB_H2B_N_4

In [91]:
# Accession under which this sequence is stored; reused by the insert and lookup cells below
ACCESSION = "HISTDB_H2B_N_4"

## Adding HISTDB_H2B_N_4

In [92]:
# Row for the `sequence` table: start with every column set to None (column order
# as in the table), then fill in the values known for this entry.
data_sequence = dict.fromkeys(
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ]
)
data_sequence.update(
    {
        "accession": ACCESSION,
        "variant": "H2B.N",
        "taxonomy_id": 9785,
        "organism": "Loxodonta africana",
        "phylum": "Chordata",
        "class": "Mammalia",
        "sequence": "MYYVCLGGLKFPKKSEVHIPAKKKYEWANSAFEKKRRRRRRKKKEAHFCYMGKILKQTHPDFSGCSWVLEALGCLDDWQLEWVSLEAVRLSFYKHRRAITSREILEAMKQRSPRRSF",
    }
)
data_sequence

{'accession': 'HISTDB_H2B_N_4',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9785,
 'organism': 'Loxodonta africana',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MYYVCLGGLKFPKKSEVHIPAKKKYEWANSAFEKKRRRRRRKKKEAHFCYMGKILKQTHPDFSGCSWVLEALGCLDDWQLEWVSLEAVRLSFYKHRRAITSREILEAMKQRSPRRSF',
 'variant_under_consideration': None}

In [93]:
# Print the Python type of every field as a sanity check before the INSERT
for field in data_sequence:
    print(field, type(data_sequence[field]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [94]:
# Run the `INSERT INTO sequence` statement (add_sequence, defined near the top of the notebook)
# with the values from data_sequence
cursor.execute(add_sequence, data_sequence)

In [95]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists yet).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_4,H2B.N,,,,9785,Loxodonta africana,Chordata,Mammalia,,,MYYVCLGGLKFPKKSEVHIPAKKKYEWANSAFEKKRRRRRRKKKEA...,,,


## Add publication

In [96]:
# Check that the publication to be linked exists in the `publication` table.
# The id is bound from `pid`, so this check always uses the same value that the next cell links.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [97]:
# Link the sequence to the publication: inserts (sequence_accession, publication_id)
# into sequence_has_publication
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [98]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_4,H2B.N,,,,9785,Loxodonta africana,Chordata,Mammalia,,,MYYVCLGGLKFPKKSEVHIPAKKKYEWANSAFEKKRRRRRRKKKEA...,,HISTDB_H2B_N_4,35099534


In [99]:
# Overview of all H2B.N rows with their publication links.
# The WHERE clause has no placeholders, so it is a plain string literal (no f-prefix).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant='H2B.N'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,HISTDB_H2B_N_1,H2B.N,,,,9807,Ceratotherium simum,Chordata,Mammalia,,,MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFS...,,HISTDB_H2B_N_1,35099534
2,HISTDB_H2B_N_2,H2B.N,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSY...,,HISTDB_H2B_N_2,35099534
3,HISTDB_H2B_N_3,H2B.N,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSY...,,HISTDB_H2B_N_3,35099534
4,HISTDB_H2B_N_4,H2B.N,,,,9785,Loxodonta africana,Chordata,Mammalia,,,MYYVCLGGLKFPKKSEVHIPAKKKYEWANSAFEKKRRRRRRKKKEA...,,HISTDB_H2B_N_4,35099534
5,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,XP_059734412.1,35099534
6,XP_059746706.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFN...,,XP_059746706.1,35099534
7,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,XP_060251208.1,35099534


In [100]:
# Commit the open transaction so the sequence row and its publication link inserted above are persisted
conn.commit()

# Add armadillo H2B.N

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Dasypus novemcinctus, dasNov3, JH566127, ++, 1120922-1123462

**Sequence from article:**
```fasta
>Armadillo_H2B.N
MYYVCLDSLKFPKKKTDVYSLAERKYEWARSAFGKRRRRRWRRKKKEVYFSYMRKILKQVHADFSGCSWVLDALGSLDDWRLEWVSLEAVRLSFYNHRRAVTSREILEAVKQRLSWKSF
```

BLASTP has one result with 100% coverage and 100% identity.

**Protein accession:** XP_058139847.1

## Accession XP_058139847.1

In [101]:
# Protein accession used for the NCBI fetch and for the insert and lookup cells below
ACCESSION = "XP_058139847.1"

## Get sequence XP_058139847.1 from NCBI

In [102]:
# Download the GenBank-format protein record for ACCESSION from NCBI and parse it
with Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")
# The record is already parsed, so it can be printed after the handle is closed
print(record)

ID: XP_058139847.1
Name: XP_058139847
Description: histone H2A.N [Dasypus novemcinctus]
Database cross-references: BioProject:PRJNA994966
Number of features: 4
/topology=linear
/data_file_division=MAM
/date=21-JUL-2023
/accessions=['XP_058139847']
/sequence_version=1
/db_source=REFSEQ: accession XM_058283864.1
/keywords=['RefSeq']
/source=Dasypus novemcinctus (nine-banded armadillo)
/organism=Dasypus novemcinctus
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Eutheria', 'Xenarthra', 'Cingulata', 'Dasypodidae', 'Dasypus']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_080693) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotation Status', 'Full 

In [103]:
# Display the fetched protein sequence as a plain string
str(record.seq)

'MYYVCLDSLKFPKKKTDVYSLAERKYEWARSAFGKRRRRRWRRKKKEVYFSYMRKILKQVHADFSGCSWVLDALGSLDDWRLEWVSLEAVRLSFYNHRRAVTSREILEAVKQRLSWKSF'

In [104]:
# Organism name as stored in the record's GenBank annotations
record.annotations["organism"]

'Dasypus novemcinctus'

In [105]:
# get_taxonomy_data is defined earlier in the notebook; as the output below shows, it returns
# a dict with organism, taxonomy_id, phylum and class for this record
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Dasypus novemcinctus',
 'taxonomy_id': 9361,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_058139847.1

In [106]:
# Row for the `sequence` table. The taxonomy columns are listed with None so they keep
# their position in the dict; the trailing unpacking of taxonomy_data overwrites their values.
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
    **taxonomy_data,
}
data_sequence

{'accession': 'XP_058139847.1',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9361,
 'organism': 'Dasypus novemcinctus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MYYVCLDSLKFPKKKTDVYSLAERKYEWARSAFGKRRRRRWRRKKKEVYFSYMRKILKQVHADFSGCSWVLDALGSLDDWRLEWVSLEAVRLSFYNHRRAVTSREILEAVKQRLSWKSF',
 'variant_under_consideration': None}

In [107]:
# Print the Python type of every field as a sanity check before the INSERT
for field in data_sequence:
    print(field, type(data_sequence[field]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [108]:
# Run the `INSERT INTO sequence` statement (add_sequence, defined near the top of the notebook)
# with the values from data_sequence
cursor.execute(add_sequence, data_sequence)

In [109]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists yet).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_058139847.1,H2B.N,,,,9361,Dasypus novemcinctus,Chordata,Mammalia,,,MYYVCLDSLKFPKKKTDVYSLAERKYEWARSAFGKRRRRRWRRKKK...,,,


## Add publication

In [110]:
# Check that the publication to be linked exists in the `publication` table.
# The id is bound from `pid`, so this check always uses the same value that the next cell links.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [111]:
# Link the sequence to the publication: inserts (sequence_accession, publication_id)
# into sequence_has_publication
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [112]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_058139847.1,H2B.N,,,,9361,Dasypus novemcinctus,Chordata,Mammalia,,,MYYVCLDSLKFPKKKTDVYSLAERKYEWARSAFGKRRRRRWRRKKK...,,XP_058139847.1,35099534


In [113]:
# Overview of all H2B.N rows with their publication links.
# The WHERE clause has no placeholders, so it is a plain string literal (no f-prefix).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant='H2B.N'"
)
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,HISTDB_H2B_N_1,H2B.N,,,,9807,Ceratotherium simum,Chordata,Mammalia,,,MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFS...,,HISTDB_H2B_N_1,35099534
2,HISTDB_H2B_N_2,H2B.N,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSY...,,HISTDB_H2B_N_2,35099534
3,HISTDB_H2B_N_3,H2B.N,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSY...,,HISTDB_H2B_N_3,35099534
4,HISTDB_H2B_N_4,H2B.N,,,,9785,Loxodonta africana,Chordata,Mammalia,,,MYYVCLGGLKFPKKSEVHIPAKKKYEWANSAFEKKRRRRRRKKKEA...,,HISTDB_H2B_N_4,35099534
5,XP_058139847.1,H2B.N,,,,9361,Dasypus novemcinctus,Chordata,Mammalia,,,MYYVCLDSLKFPKKKTDVYSLAERKYEWARSAFGKRRRRRWRRKKK...,,XP_058139847.1,35099534
6,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,XP_059734412.1,35099534
7,XP_059746706.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTIYIPAKEKDEWVCSATGKKRRKKKEAYFN...,,XP_059746706.1,35099534
8,XP_060251208.1,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,XP_060251208.1,35099534


In [114]:
# Commit the open transaction so the sequence row and its publication link inserted above are persisted
conn.commit()

# Add opossum H2B.Ns

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Genes:**
- Monodelphis domestica, monDom5, chr2, +-, 502881650-502884120, H2B.N.1 (ancestral)
- Monodelphis domestica, monDom5, chr2, +-, 384161946-384162260, H2B.N.2

**Sequences from article:**
```fasta
>Opposum_H2B.N.1
MKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIAKILKQVHQDFSGYSWVLDALWSLDYYLFEQATLEAVRLSFYNHRRVVTSREMLEALNKVPLEGWM
>Opposum_H2B.N.2
MKILRAGATVPRSFITTGKRRGYMRTVSGKKKDFYFSYIAKILKQVHQDFSGYSWVLDALWSLDYYLFEQATLEAVRLSFYNHRRVVTSREMLETLSKVPLEGWM
```

BLASTP results:
- two H2B.N.1 hits with 100% coverage and 100% identity (possibly a shifted reading frame or alternative start: 5–6 extra residues at the N-terminus);
- one H2B.N.2 hit with 100% coverage and 99% identity (1 mismatch: L at position 4 in the article sequence vs P in the RefSeq protein).

**Protein accessions:** XP_007485606.1, XP_007485607.1, XP_007485268.2

## Accession XP_007485606.1

In [117]:
# Protein accession used for the NCBI fetch and for the insert and lookup cells below
ACCESSION = "XP_007485606.1"

## Get sequence XP_007485606.1 from NCBI

In [118]:
# Download the GenBank-format protein record for ACCESSION from NCBI and parse it
with Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")
# The record is already parsed, so it can be printed after the handle is closed
print(record)

ID: XP_007485606.1
Name: XP_007485606
Description: histone H2A.N isoform X1 [Monodelphis domestica]
Database cross-references: BioProject:PRJNA967365
Number of features: 4
/topology=linear
/data_file_division=MAM
/date=05-JUN-2023
/accessions=['XP_007485606']
/sequence_version=1
/db_source=REFSEQ: accession XM_007485544.2
/keywords=['RefSeq']
/source=Monodelphis domestica (gray short-tailed opossum)
/organism=Monodelphis domestica
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Metatheria', 'Didelphimorphia', 'Didelphidae', 'Monodelphis']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_077228) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotatio

In [119]:
# Display the fetched protein sequence as a plain string
str(record.seq)

'MEVGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIAKILKQVHQDFSGYSWVLDALWSLDYYLFEQATLEAVRLSFYNHRRVVTSREMLEALNKVPLEGWM'

In [120]:
# Organism name as stored in the record's GenBank annotations
record.annotations["organism"]

'Monodelphis domestica'

In [121]:
# get_taxonomy_data is defined earlier in the notebook; as the output below shows, it returns
# a dict with organism, taxonomy_id, phylum and class for this record
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Monodelphis domestica',
 'taxonomy_id': 13616,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_007485606.1

In [122]:
# Row for the `sequence` table. The taxonomy columns are listed with None so they keep
# their position in the dict; the trailing unpacking of taxonomy_data overwrites their values.
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
    **taxonomy_data,
}
data_sequence

{'accession': 'XP_007485606.1',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 13616,
 'organism': 'Monodelphis domestica',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MEVGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIAKILKQVHQDFSGYSWVLDALWSLDYYLFEQATLEAVRLSFYNHRRVVTSREMLEALNKVPLEGWM',
 'variant_under_consideration': None}

In [123]:
# Print the Python type of every field as a sanity check before the INSERT
for field in data_sequence:
    print(field, type(data_sequence[field]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [124]:
# Run the `INSERT INTO sequence` statement (add_sequence, defined near the top of the notebook)
# with the values from data_sequence
cursor.execute(add_sequence, data_sequence)

In [125]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists yet).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_007485606.1,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MEVGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIA...,,,


## Add publication

In [126]:
# Check that the publication to be linked exists in the `publication` table.
# The id is bound from `pid`, so this check always uses the same value that the next cell links.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [127]:
# Link the sequence to the publication: inserts (sequence_accession, publication_id)
# into sequence_has_publication
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [128]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_007485606.1,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MEVGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIA...,,XP_007485606.1,35099534


## Accession XP_007485607.1

In [130]:
# Protein accession used for the NCBI fetch and for the insert and lookup cells below
ACCESSION = "XP_007485607.1"

## Get sequence XP_007485607.1 from NCBI

In [131]:
# Download the GenBank-format protein record for ACCESSION from NCBI and parse it
with Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")
# The record is already parsed, so it can be printed after the handle is closed
print(record)

ID: XP_007485607.1
Name: XP_007485607
Description: histone H2A.N isoform X2 [Monodelphis domestica]
Database cross-references: BioProject:PRJNA967365
Number of features: 4
/topology=linear
/data_file_division=MAM
/date=05-JUN-2023
/accessions=['XP_007485607']
/sequence_version=1
/db_source=REFSEQ: accession XM_007485545.2
/keywords=['RefSeq']
/source=Monodelphis domestica (gray short-tailed opossum)
/organism=Monodelphis domestica
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Metatheria', 'Didelphimorphia', 'Didelphidae', 'Monodelphis']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_077228) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Annotation-Data', OrderedDict([('Annotation Provider', 'NCBI RefSeq'), ('Annotatio

In [132]:
# Display the fetched protein sequence as a plain string
str(record.seq)

'MEGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIAKILKQVHQDFSGYSWVLDALWSLDYYLFEQATLEAVRLSFYNHRRVVTSREMLEALNKVPLEGWM'

In [133]:
# Organism name as stored in the record's GenBank annotations
record.annotations["organism"]

'Monodelphis domestica'

In [134]:
# get_taxonomy_data is defined earlier in the notebook; as the output below shows, it returns
# a dict with organism, taxonomy_id, phylum and class for this record
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Monodelphis domestica',
 'taxonomy_id': 13616,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_007485607.1

In [135]:
# Row for the `sequence` table. The taxonomy columns are listed with None so they keep
# their position in the dict; the trailing unpacking of taxonomy_data overwrites their values.
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
    **taxonomy_data,
}
data_sequence

{'accession': 'XP_007485607.1',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 13616,
 'organism': 'Monodelphis domestica',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MEGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIAKILKQVHQDFSGYSWVLDALWSLDYYLFEQATLEAVRLSFYNHRRVVTSREMLEALNKVPLEGWM',
 'variant_under_consideration': None}

In [136]:
# Print the Python type of every field as a sanity check before the INSERT
for field in data_sequence:
    print(field, type(data_sequence[field]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [137]:
# Run the `INSERT INTO sequence` statement (add_sequence, defined near the top of the notebook)
# with the values from data_sequence
cursor.execute(add_sequence, data_sequence)

In [139]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists yet).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_007485607.1,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MEGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIAK...,,,


## Add publication

In [140]:
# Check that the publication to be linked exists in the `publication` table.
# The id is bound from `pid`, so this check always uses the same value that the next cell links.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [141]:
# Link the sequence to the publication: inserts (sequence_accession, publication_id)
# into sequence_has_publication
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [142]:
# Look up the row for ACCESSION together with any linked publication
# (the LEFT JOIN keeps the sequence row even when no link exists).
# The accession is passed as a bound parameter instead of being formatted into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_007485607.1,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MEGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIAK...,,XP_007485607.1,35099534


## Accession XP_007485268.2

In [144]:
# Protein accession used for the NCBI fetch and for the insert and lookup cells below
ACCESSION = "XP_007485268.2"

## Get sequence XP_007485268.2 from NCBI

In [145]:
# Download the GenBank-format protein record for ACCESSION from NCBI and parse it
with Entrez.efetch(db="protein", id=ACCESSION, rettype="gb", retmode="text") as handle:
    record = SeqIO.read(handle, "genbank")
# The record is already parsed, so it can be printed after the handle is closed
print(record)

ID: XP_007485268.2
Name: XP_007485268
Description: histone H2A.N-like [Monodelphis domestica]
Database cross-references: BioProject:PRJNA967365
Number of features: 4
/topology=linear
/data_file_division=MAM
/date=05-JUN-2023
/accessions=['XP_007485268']
/sequence_version=2
/db_source=REFSEQ: accession XM_007485206.2
/keywords=['RefSeq', 'includes ab initio']
/source=Monodelphis domestica (gray short-tailed opossum)
/organism=Monodelphis domestica
/taxonomy=['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', 'Metatheria', 'Didelphimorphia', 'Didelphidae', 'Monodelphis']
/comment=MODEL REFSEQ:  This record is predicted by automated computational
analysis. This record is derived from a genomic sequence
(NC_077228) annotated using gene prediction method: Gnomon.
Also see:
    Documentation of NCBI's Annotation Process
On Jun 5, 2023 this sequence version replaced XP_007485268.1.
COMPLETENESS: full length.
/structured_comment=OrderedDict([('Genome-Anno

In [146]:
# Display the fetched protein sequence as a plain string
str(record.seq)

'MKIPRAGATVPRSFITTGKRRGYMRTVSGKKKDFYFSYIAKILKQVHQDFSGYSWVLDALWSLDYYLFEQATLEAVRLSFYNHRRVVTSREMLETLSKVPLEGWM'

In [147]:
# Organism name as stored in the record's GenBank annotations
record.annotations["organism"]

'Monodelphis domestica'

In [148]:
# get_taxonomy_data is defined earlier in the notebook; as the output below shows, it returns
# a dict with organism, taxonomy_id, phylum and class for this record
taxonomy_data = get_taxonomy_data(record)
taxonomy_data

Fetched taxid from NCBI <built-in function id>


{'organism': 'Monodelphis domestica',
 'taxonomy_id': 13616,
 'phylum': 'Chordata',
 'class': 'Mammalia'}

## Adding XP_007485268.2

In [149]:
# Row for the `sequence` table. The taxonomy columns are listed with None so they keep
# their position in the dict; the trailing unpacking of taxonomy_data overwrites their values.
data_sequence = {
    "accession": ACCESSION,
    "variant": "H2B.N",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": None,
    "organism": None,
    "phylum": None,
    "class": None,
    "taxonomy_group": None,
    "info": None,
    "sequence": str(record.seq),
    "variant_under_consideration": None,
    **taxonomy_data,
}
data_sequence

{'accession': 'XP_007485268.2',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 13616,
 'organism': 'Monodelphis domestica',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MKIPRAGATVPRSFITTGKRRGYMRTVSGKKKDFYFSYIAKILKQVHQDFSGYSWVLDALWSLDYYLFEQATLEAVRLSFYNHRRVVTSREMLETLSKVPLEGWM',
 'variant_under_consideration': None}

In [150]:
# Print the Python type of every field as a sanity check before the INSERT
for field in data_sequence:
    print(field, type(data_sequence[field]))

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [151]:
# Run the "INSERT INTO sequence" statement with data_sequence supplying the
# values; the change is committed further down via conn.commit().
cursor.execute(add_sequence, data_sequence)

In [152]:
# Read the freshly inserted row back, together with any linked publications,
# to verify the insert. The accession is passed as a bound parameter instead
# of being interpolated into the SQL text, matching the INSERT statements.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_007485268.2,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MKIPRAGATVPRSFITTGKRRGYMRTVSGKKKDFYFSYIAKILKQV...,,,


## Add publication

In [153]:
# Publication to link to the new sequence. Confirm it already exists in the
# `publication` table; `pid` is bound as a query parameter so the id is
# written in exactly one place.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [154]:
# Link the sequence to the publication: (ACCESSION, pid) are bound to the
# (sequence_accession, publication_id) columns of sequence_has_publication.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [155]:
# Re-read the row to confirm that the publication link is now present.
# The accession is passed as a bound parameter instead of being interpolated
# into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,XP_007485268.2,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MKIPRAGATVPRSFITTGKRRGYMRTVSGKKKDFYFSYIAKILKQV...,,XP_007485268.2,35099534


In [156]:
# Overview of every H2B.N sequence currently in the database, with its
# publication links. The variant name is bound as a parameter (the original
# used an f-string that had no placeholders).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.N",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,HISTDB_H2B_N_1,H2B.N,,,,9807,Ceratotherium simum,Chordata,Mammalia,,,MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFS...,,HISTDB_H2B_N_1,35099534
2,HISTDB_H2B_N_2,H2B.N,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSY...,,HISTDB_H2B_N_2,35099534
3,HISTDB_H2B_N_3,H2B.N,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSY...,,HISTDB_H2B_N_3,35099534
4,HISTDB_H2B_N_4,H2B.N,,,,9785,Loxodonta africana,Chordata,Mammalia,,,MYYVCLGGLKFPKKSEVHIPAKKKYEWANSAFEKKRRRRRRKKKEA...,,HISTDB_H2B_N_4,35099534
5,XP_007485268.2,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MKIPRAGATVPRSFITTGKRRGYMRTVSGKKKDFYFSYIAKILKQV...,,XP_007485268.2,35099534
6,XP_007485606.1,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MEVGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIA...,,XP_007485606.1,35099534
7,XP_007485607.1,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MEGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIAK...,,XP_007485607.1,35099534
8,XP_058139847.1,H2B.N,,,,9361,Dasypus novemcinctus,Chordata,Mammalia,,,MYYVCLDSLKFPKKKTDVYSLAERKYEWARSAFGKRRRRRWRRKKK...,,XP_058139847.1,35099534
9,XP_059734412.1,H2B.N,,,,9913,Bos taurus,Chordata,Mammalia,,,MYFICLHGLQFPKRKLTIYIPAKEKDEWVHSATGKKRRKKKETYFN...,,XP_059734412.1,35099534


In [157]:
# Make sure data is committed to the database
# (covers the statements executed on this connection since the last commit,
# i.e. the sequence row and its publication link).
conn.commit()

# Add platypus H2B.N

**Article:** https://academic.oup.com/mbe/article/39/2/msac019/6517784#333890704

**Gene:** Ornithorhynchus anatinus, ornAna2, chrUn_DS198020v1, +-, 11357-11548

**Sequence from article:**
```fasta
>Platypus_H2B.N
MGGPLGPGFLSLLETETKSPAVAPEAEMEGPVEARKEYRCVRTSLSKKKEAYSSYIAHVLKQTQPEPRGWGRAEGNLESRDGQLLERVAGEAVRLTLLQAAKTVTSRVVRGALELVLAELVEE
```

BLASTP returns a single hit with 100% coverage and 100% identity (is this the same gene as in the article?).

**Protein accession:** HISTDB_H2B_N_5

## Add HISTDB_H2B_N_5

In [158]:
# HISTDB-style accession assigned to the platypus sequence taken from the
# article; it is used as the primary identifier in the cells below.
ACCESSION = "HISTDB_H2B_N_5"

In [159]:
# Row for the platypus H2B.N sequence. Taxonomy values and the sequence are
# written out literally here rather than derived from a fetched record.
data_sequence = dict(
    [
        ("accession", ACCESSION),
        ("variant", "H2B.N"),
        ("gi", None),
        ("ncbi_gene_id", None),
        ("hgnc_gene_name", None),
        ("taxonomy_id", 9258),
        ("organism", "Ornithorhynchus anatinus"),
        ("phylum", "Chordata"),
        ("class", "Mammalia"),
        ("taxonomy_group", None),
        ("info", None),
        (
            "sequence",
            "MGGPLGPGFLSLLETETKSPAVAPEAEMEGPVEARKEYRCVRTSLSKKKEAYSSYIAHVLKQTQPEPRGWGRAEGNLESRDGQLLERVAGEAVRLTLLQAAKTVTSRVVRGALELVLAELVEE",
        ),
        ("variant_under_consideration", None),
    ]
)
data_sequence

{'accession': 'HISTDB_H2B_N_5',
 'variant': 'H2B.N',
 'gi': None,
 'ncbi_gene_id': None,
 'hgnc_gene_name': None,
 'taxonomy_id': 9258,
 'organism': 'Ornithorhynchus anatinus',
 'phylum': 'Chordata',
 'class': 'Mammalia',
 'taxonomy_group': None,
 'info': None,
 'sequence': 'MGGPLGPGFLSLLETETKSPAVAPEAEMEGPVEARKEYRCVRTSLSKKKEAYSSYIAHVLKQTQPEPRGWGRAEGNLESRDGQLLERVAGEAVRLTLLQAAKTVTSRVVRGALELVLAELVEE',
 'variant_under_consideration': None}

In [160]:
# Sanity check before the INSERT: list the Python type of every column value.
for k, v in data_sequence.items():
    print(f"{k} {type(v)}")

accession <class 'str'>
variant <class 'str'>
gi <class 'NoneType'>
ncbi_gene_id <class 'NoneType'>
hgnc_gene_name <class 'NoneType'>
taxonomy_id <class 'int'>
organism <class 'str'>
phylum <class 'str'>
class <class 'str'>
taxonomy_group <class 'NoneType'>
info <class 'NoneType'>
sequence <class 'str'>
variant_under_consideration <class 'NoneType'>


In [161]:
# Run the "INSERT INTO sequence" statement with data_sequence supplying the
# values; the change is committed further down via conn.commit().
cursor.execute(add_sequence, data_sequence)

In [162]:
# Read the freshly inserted row back, together with any linked publications,
# to verify the insert. The accession is passed as a bound parameter instead
# of being interpolated into the SQL text, matching the INSERT statements.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_5,H2B.N,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MGGPLGPGFLSLLETETKSPAVAPEAEMEGPVEARKEYRCVRTSLS...,,,


## Add publication

In [163]:
# Publication to link to the new sequence. Confirm it already exists in the
# `publication` table; `pid` is bound as a query parameter so the id is
# written in exactly one place.
pid = "35099534"
query = "SELECT * FROM publication WHERE id = %s"
cursor.execute(query, (pid,))
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,id,title,doi,author,year
0,35099534,,,,


In [164]:
# Link the sequence to the publication: (ACCESSION, pid) are bound to the
# (sequence_accession, publication_id) columns of sequence_has_publication.
cursor.execute(add_sequence_has_publication, (ACCESSION, pid))

In [165]:
# Re-read the row to confirm that the publication link is now present.
# The accession is passed as a bound parameter instead of being interpolated
# into the SQL text.
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.accession = %s"
)
cursor.execute(query, (ACCESSION,))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_5,H2B.N,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MGGPLGPGFLSLLETETKSPAVAPEAEMEGPVEARKEYRCVRTSLS...,,HISTDB_H2B_N_5,35099534


In [167]:
# Overview of every H2B.N sequence currently in the database, with its
# publication links. The variant name is bound as a parameter (the original
# used an f-string that had no placeholders).
query = (
    "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
    "WHERE s.variant = %s"
)
cursor.execute(query, ("H2B.N",))
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
0,HISTDB_H2B_N_0,H2B.N,,,,9940,Ovis aries,Chordata,Mammalia,,,MHFICLHGLQFPKRKLTISIPAKEKDEWVHSATGKKRRKKKEAYFN...,,HISTDB_H2B_N_0,35099534
1,HISTDB_H2B_N_1,H2B.N,,,,9807,Ceratotherium simum,Chordata,Mammalia,,,MYFICLRGLRFPKKTTNYILAKKKYEWTSSAIGKKRRRKKKEAYFS...,,HISTDB_H2B_N_1,35099534
2,HISTDB_H2B_N_2,H2B.N,,,,9615,Canis lupus familiaris,Chordata,Mammalia,,,MYYICLHGLRFPEKRTILYIPAREKYEWANSALRKKRKKKEVYFSY...,,HISTDB_H2B_N_2,35099534
3,HISTDB_H2B_N_3,H2B.N,,,,9646,Ailuropoda melanoleuca,Chordata,Mammalia,,,MYYVCLHDPRFPKKRTTLYIPAKAKYECANSALRHKRKKKEVYFSY...,,HISTDB_H2B_N_3,35099534
4,HISTDB_H2B_N_4,H2B.N,,,,9785,Loxodonta africana,Chordata,Mammalia,,,MYYVCLGGLKFPKKSEVHIPAKKKYEWANSAFEKKRRRRRRKKKEA...,,HISTDB_H2B_N_4,35099534
5,HISTDB_H2B_N_5,H2B.N,,,,9258,Ornithorhynchus anatinus,Chordata,Mammalia,,,MGGPLGPGFLSLLETETKSPAVAPEAEMEGPVEARKEYRCVRTSLS...,,HISTDB_H2B_N_5,35099534
6,XP_007485268.2,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MKIPRAGATVPRSFITTGKRRGYMRTVSGKKKDFYFSYIAKILKQV...,,XP_007485268.2,35099534
7,XP_007485606.1,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MEVGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIA...,,XP_007485606.1,35099534
8,XP_007485607.1,H2B.N,,,,13616,Monodelphis domestica,Chordata,Mammalia,,,MEGTEMKIPRAGATVPRSFLRTGKRRGYMRTVSGKKKDFYFSYIAK...,,XP_007485607.1,35099534
9,XP_058139847.1,H2B.N,,,,9361,Dasypus novemcinctus,Chordata,Mammalia,,,MYYVCLDSLKFPKKKTDVYSLAERKYEWARSAFGKRRRRRWRRKKK...,,XP_058139847.1,35099534


In [168]:
# Make sure data is committed to the database
# (covers the statements executed on this connection since the last commit,
# i.e. the sequence row and its publication link).
conn.commit()

# Close connections

In [169]:
# Release resources innermost-first. The nested try/finally blocks guarantee
# that the connection is closed and the SSH tunnel is stopped even if an
# earlier close call raises, so no tunnel is left running in the background.
try:
    try:
        cursor.close()
    finally:
        conn.close()
finally:
    tunnel.stop()