In [1]:
import io

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Set your email address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [2]:
# Read the SSH / database connection settings from a plain `key=value` file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        raise ValueError(
            f"Malformed line in db_curated_server_info.txt (expected key=value): {line!r}"
        )
    # Strip the key as well, so that `db_port = 3306` is stored under
    # "db_port" and not "db_port ".
    config[key.strip()] = value.strip()

# Both ports go through int(); report a missing entry by name instead of
# failing later with an opaque `int(None)` TypeError.
for required_key in ("srever_port", "db_port"):
    if required_key not in config:
        raise KeyError(f"{required_key!r} is missing from db_curated_server_info.txt")

# NOTE: the misspelled names `srever_port` / `db_adress` are kept on purpose:
# they are the keys read from the settings file and the variable names used
# by the tunnel cell below.
server_name = config.get("server_name")
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel to the server and forward a local port to the database
# address/port read from the settings file.
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_username=ssh_username,
    ssh_password=ssh_password,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
# Show the local port the tunnel is bound to (used by the MySQL connection).
print(tunnel.local_bind_port)

36237


In [4]:
# Connect to MySQL through the local end of the SSH tunnel and create the
# cursor shared by all following cells.
conn = connection.MySQLConnection(
    host="localhost",
    port=tunnel.local_bind_port,
    user="db_user",
    password="db_password",
    database="db_name",
)
cursor = conn.cursor()

In [5]:
# Sanity check of the connection: list the tables of the selected database.
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [121]:
# Parameterized INSERT templates used throughout the notebook.
# Templates with %(name)s placeholders take a dict; the one with %s
# placeholders takes a tuple.
add_histone = " ".join(
    [
        "INSERT INTO histone",
        "(id, level, taxonomic_span, taxonomic_span_id, description, parent)",
        "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)",
    ]
)
# Not used yet: template for the histone_description table.
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
add_publication = " ".join(
    [
        "INSERT INTO publication",
        "(id, title, doi, author, year)",
        "VALUES (%(id)s, %(title)s, %(doi)s, %(author)s, %(year)s)",
    ]
)
add_sequence = " ".join(
    [
        "INSERT INTO sequence",
        "(accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration)",
        "VALUES (%(accession)s, %(variant)s, %(gi)s, %(ncbi_gene_id)s, %(hgnc_gene_name)s, %(taxonomy_id)s, %(organism)s, %(phylum)s, %(class)s, %(taxonomy_group)s, %(info)s, %(sequence)s, %(variant_under_consideration)s)",
    ]
)
# Link table row: (sequence accession, publication id), passed positionally.
add_sequence_has_publication = " ".join(
    [
        "INSERT INTO sequence_has_publication",
        "(sequence_accession, publication_id)",
        "VALUES (%s, %s)",
    ]
)
add_alternate_names = " ".join(
    [
        "INSERT INTO alternative_name",
        "(name, taxonomy, gene, splice, histone)",
        "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)",
    ]
)
# Not used yet: template for the histone_has_publication link table.
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [None]:
# Empty histone_description row: every field starts out as None.
data_histone_description = dict.fromkeys(
    (
        "summary",
        "taxonomy",
        "genes",
        "evolution",
        "expression",
        "knock_out",
        "function",
        "sequence",
        "localization",
        "deposition",
        "structure",
        "interactions",
        "disease",
        "caveats",
    )
)

In [7]:
def get_taxonomy_data(record):
    """Collect organism, NCBI taxonomy id, phylum and class for a sequence record.

    The taxonomy id is read from the ``db_xref`` qualifiers of the record's
    first feature (entries of the form ``taxon:<id>``). Phylum and class are
    then looked up in the NCBI ``taxonomy`` database through ``Entrez``,
    retrying up to 10 times.

    Args:
        record: Object exposing ``annotations["organism"]`` and
            ``features[0].qualifiers["db_xref"]`` (the notebook passes the
            result of ``SeqIO.read(handle, "genbank")``).

    Returns:
        dict with the keys ``organism``, ``taxonomy_id`` (int; ``1`` when no
        usable ``taxon`` cross-reference is found), ``phylum`` and ``class``
        (``str``, or ``None`` when the rank is absent or the lookup failed).

    Raises:
        KeyError: If the record has no ``organism`` annotation.
    """
    import re

    max_attempts = 10
    xref_pattern = re.compile(r"(\S+):(\S+)")

    taxonomy_data = {"organism": record.annotations["organism"]}

    taxid = None
    try:
        for xref in record.features[0].qualifiers["db_xref"]:
            match = xref_pattern.search(xref)
            if match is None or match.group(1) != "taxon":
                continue
            print(f"Fetched taxid from NCBI {match.group(2)}")
            taxid = int(match.group(2))
    except Exception:
        # Best effort: any problem reading the cross-references (no features,
        # no db_xref qualifier, non-numeric id, ...) falls back to taxid 1.
        taxid = None
    if taxid is None:
        # Also reached when db_xref exists but holds no "taxon:<id>" entry;
        # previously that case left "taxonomy_id" unset and crashed below.
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxid = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxid

    lineage = {}
    for attempt in range(max_attempts):
        try:
            with Entrez.efetch(id=taxid, db="taxonomy", retmode="xml") as handle:
                tax_data = Entrez.read(handle)
            lineage = {
                node["Rank"]: node["ScientificName"]
                for node in tax_data[0]["LineageEx"]
                if node["Rank"] in ("class", "phylum")
            }
            break
        except Exception as exc:
            # `except Exception` (not a bare except) so Ctrl-C still
            # interrupts the retry loop.
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(type(exc), attempt)
            )
    else:
        # Only reached when every attempt failed (the loop never hit `break`).
        print(
            f"FATAL ERROR could not get class and phylum from NCBI after {max_attempts} attempts for taxid:{taxid}. Will add None for class and phylum!"
        )

    for rank in ("phylum", "class"):
        name = lineage.get(rank)
        # Convert the parsed value to a plain str for the DB driver.
        taxonomy_data[rank] = None if name is None else str(name)
    return taxonomy_data

# To Do H3

## <span style="color:green">Add sequences to H3-like_(Plants)</span>

Sequences fasta [alvarez-venegas_canonical_2019, hu_identification_2015]:

```fasta
>HTR701 LOC_Os06g06480.1
MRKGEEGGLDTDPHEADGVQVRDSRKQLATKATCKSAPATGGVKKPHRFRPGTVALREIR
KYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVAALQEAAEAYLVGLFEDTNLCAI
HAKRVTIMPKDIQLARRIRGERA
```
> поправила последовательность из [Rice Genome Annotation Project database](https://rice.uga.edu) так, чтобы она соответствовала данным из статьи (см. рис. 3) [hu_identification_2015]

```fasta
>HTR709 LOC_Os02g25910.1
MARTKQTARKSTGGKAPRKPLRAIAAVMPAPSSRGVARKSVPFIGVKKPRRHRPGTVALR
EIRKYQKNTELLIRKLPFQRLVREIAQHFKHDMRFQSHAVLALQEAAEAYLVGLFEDTNL
CAIHSKRVTIMSKDVQLARRIRGERL
```
> поправила последовательность из [Rice Genome Annotation Project database](https://rice.uga.edu) так, чтобы она соответствовала данным из статьи (см. рис. 3) [hu_identification_2015]

## <span style="color:green">Add sequences to H3.Y_(Primates)</span>

### <span style="color:green">Delete Macaque sequences HISTDB_H3_Y_0 and HISTDB_H3_Y_1 [seal_standardized_2022]</span>

### <span style="color:green">Add Macaque sequence XP_001110711.1 as H3.Y.1_(Primates) [seal_standardized_2022]</span>

### <span style="color:green">Change sequence identifier and variant: HISTDB_H3_Y_2 -> XP_024212320.2 -> H3.Y.2_(Primates) [seal_standardized_2022]</span>

### <span style="color:green">Change sequence identifier and variant: HISTDB_H3_Y_3 -> HISTDB_H3_Y_0 -> H3.Y.1_(Primates) [wiedemann_identification_2010]</span>

## <span style="color:green">Add sequences to H3.4_(Mammalia)</span>

### <span style="color:green">Add sequence NP_001304932.1 as H3.4_(Mammalia) [ueda_testis-specific_2017]</span>

### <span style="color:green">Add fasta sequences to H3.4_(Mammalia)</span>

Sequences fasta [dong_structural_2020]:

```fasta
>uncharacterized protein [rat]
MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGTVALREIRRYQKSTELLIRKLPFQR
LVREIAQDFKTDLRFQSSAVMALQEACESYLVGLFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA
```

## <span style="color:green">Add sequences to H3.5_(Hominidae)</span>

### <span style="color:green">Add orangutan sequence XP_002823134.1, gorilla sequence ADW85799.1 as H3.5_(Hominidae) [schenk_h35_2011]</span>

### <span style="color:green">Delete chimpanzee sequence XP_003954426.1 (it is pseudogene [seal_standardized_2022])</span>

### <span style="color:green">Add alternate name H3.3C to H3.5_(Homo_sapiens)</span>

## <span style="color:green">Add sequences to H3.7_(Homo_sapiens)</span>

### <span style="color:green">Add sequence NP_001359034.1 to H3.7_(Homo_sapiens) [taguchi_crystal_2017, seal_standardized_2022]</span>

## <span style="color:green">Add sequences to H3.B_(Giardia)</span>

### <span style="color:green">Add sequence XP_767393.1 to H3.B_(Giardia) [dawson_cenh3_2007]</span>

## <span style="color:green">Add new node cenH3_(Protists) to cenH3</span>

## <span style="color:green">Add sequences to cenH3_(Protists)</span>

### <span style="color:green">Add sequence XP_771620.1 to cenH3_(Protists) [dawson_cenh3_2007]</span>

## <span style="color:green">Add sequences to cH3_(Protists)</span>

### <span style="color:green">Add sequence AAF00592.1 to cH3_(Protists) [dawson_cenh3_2007]</span>

## <span style="color:green">Add sequences to H3.P_(Euplotes_crassus)</span>

### <span style="color:green">Add sequence AAC47441.1 to H3.P_(Euplotes_crassus) [jahn_unusual_1997]</span>

## <span style="color:green">Add sequences to H3.V_(Trypanosomatidae)</span>

### <span style="color:green">Add sequence AAO24601.1 to H3.V_(Trypanosomatidae) [lowell_variant_2004]</span>

### <span style="color:green">Add fasta sequences to H3.V_(Trypanosomatidae)</span>

Sequences fasta [lowell_variant_2004]:

```fasta
>ACC histone H3 variant [Trypanosoma cruzi]
MGSLKKVASVEKHSAFSSESKLPKPRKNLASRKIHTEGRIVAKKESASAGTRKKHRWRPGTVVLREVRRYQSSTEFLIAKAP
FRRLVREIVSNLKDSFRMSATCVEALQESTELYVTSVLADANLCTLHANRVTVYPKDIQLALKLRGERL
>Q9U196 UniProt [Leishmania major]
MAGITKAAVVASHPKKNVASRKMNKKSRSIAKKEAKAMRADSAGAKSRRWRPGTVALREVRKYQRSTELLIARTPFRRLVKE
IMSTFKDTMHMRHSALEAMQDATESYLVSLLCDANLCTIHAKRVTLYPKDLQLALRLRGERT
```

### <span style="color:green">Add sequence XP_828007.1 to H3.V_(Trypanosomatidae) [schulz_base_2016]</span>

### <span style="color:green">Add fasta sequences to H3.V_(Trypanosomatidae)</span>

Sequences fasta [siegel_four_2009]:

```fasta
>ACC histone H3 variant [Trypanosoma brucei]
MAQMKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWRPGTVALREIRRLQSSTDFLIQRAPFRRFLREVVSNLK
DSYRMSAACVDAIQEATETYITSVFMDANLCTLHANRVTLFPKDIQLALKLRGERN
```

## <span style="color:green">Add sequences to cH3_(Protists)</span>

### <span style="color:green">Add fasta sequences to cH3_(Protists)</span>

Sequences fasta [lowell_variant_2004]:

```fasta
>ACC histone H3 variant [Trypanosoma brucei]
MSRTKETARTKKTITSKKSKKASKGSDAASGVKTAQRRWRPGTVALREIROFQRSTDLLLQKAPFQRLVREVSGAQKEGLRF
QSSAILAAQEATESYIVSLLADTNRACIHSGRVTIQPKDIHLALCLRGERA
```

## <span style="color:green">Add sequences to H3.10_(Arabidopsis)</span>

### <span style="color:green">Add sequence NP_173418.1 to H3.10_(Arabidopsis) [borg_targeted_2020, alvarez-venegas_canonical_2019, okada_analysis_2005]</span>

## <span style="color:green">Add sequences to H3.14_(Arabidopsis)</span>

### <span style="color:green">Add sequence NP_177690.1 to H3.14_(Arabidopsis) [alvarez-venegas_canonical_2019, nunez-vazquez_histone_2025]</span>

# To Do H4

## <span style="color:green">Add sequences to H4.G_(Hominidae)</span>

### <span style="color:green">Add sequence NP_003538.1 as H4.G_(Hominidae) [ding_primate-specific_2021]</span>

## <span style="color:green">Add sequences to H4.V_(Trypanosomatidae)</span>

### <span style="color:green">Add fasta sequences to H4.V_(Trypanosomatidae)</span>

Sequences fasta [siegel_four_2009]:

```fasta
>ACC histone H4 variant [Trypanosoma brucei]
MAKGKRVGESKGAQKRQKKVLRDNVRGITRGSIRRLARRAGVKRISGVIYDEVRGVLKTFVESIVRDAGAYTEYSRKKTVTAAHVVFALRKRGKVLYGYD
```

## <span style="color:green">Add sequences to cH4_(Protists)</span>

### <span style="color:green">Add fasta sequences to cH4_(Protists)</span>

Sequences fasta [siegel_four_2009]:

```fasta
>ACC histone H4 variant [Trypanosoma brucei]
MAKGKKSGEAKGSQKRQKKVLRENVRGITRGSIRRLARRGGVKRISGVIYDEVRGVLKSFVEGVVRDATAYTEYSRKKTVTAVDVVNALRKRGKILYGYA
```

# DONE H3

## <span style="color:black">Add sequences to H3-like_(Plants)</span>

Sequences fasta [alvarez-venegas_canonical_2019, hu_identification_2015]:

```fasta
>HISTDB_H3_like_Plants_0 HTR701 LOC_Os06g06480.1 [Oryza sativa]
MRKGEEGGLDTDPHEADGVQVRDSRKQLATKATCKSAPATGGVKKPHRFRPGTVALREIR
KYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVAALQEAAEAYLVGLFEDTNLCAI
HAKRVTIMPKDIQLARRIRGERA
```
> поправила последовательность из [Rice Genome Annotation Project database](https://rice.uga.edu) так, чтобы она соответствовала данным из статьи (см. рис. 3) [hu_identification_2015]

```fasta
>HISTDB_H3_like_Plants_1 HTR709 LOC_Os02g25910.1 [Oryza sativa]
MARTKQTARKSTGGKAPRKPLRAIAAVMPAPSSRGVARKSVPFIGVKKPRRHRPGTVALR
EIRKYQKNTELLIRKLPFQRLVREIAQHFKHDMRFQSHAVLALQEAAEAYLVGLFEDTNL
CAIHSKRVTIMSKDVQLARRIRGERL
```
> поправила последовательность из [Rice Genome Annotation Project database](https://rice.uga.edu) так, чтобы она соответствовала данным из статьи (см. рис. 3) [hu_identification_2015]

In [8]:
# Show the sequences currently assigned to H3-like_(Plants) (before the insert).
query = "SELECT * FROM sequence WHERE variant='H3-like_(Plants)'"
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,ABA97899.1,H3-like_(Plants),,,,39947,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTAGNVPRKLLVMKVARKSAPMMAGLKKPHRFNPWI...,
1,ABA97902.1,H3-like_(Plants),,,,39947,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTASNVPRKLLVMKVARKSAPTMAGLKKPHRFKPGT...,
2,BAA96098.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARPRKEAPQRNLDRDENARQQPTEEPQDEAPRNQGRQQQQQRPPA...,
3,BAE48427.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKHTARMSTGGKAPRKQLASKALRKAPPPPTKGVKQPHHYHLR...,
4,BAE48431.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKLNARMSTGGKAPRKQLAYKAVRKAAPPTIGVKLPNSYRPGD...,
5,BAE48433.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSIPTGMGGMKRPRRYRPG...,
6,BAE48435.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSAPTTGGVKKPHRYRPGT...,
7,NP_172794.1,H3-like_(Plants),,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQSARKSHGGKAPTKQLATKAARKSAPTTGGVKKPHRFRPGT...,
8,NP_196795.1,H3-like_(Plants),,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARSNQTARKATGGKAPHFAMRVWQHSTPPLKKPYRYKPGTVALRE...,
9,NP_201338.1,H3-like_(Plants),,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARISTGGKAPRKQLAPKAARQSAPATGGVKKPHRFRPGT...,


In [10]:
# Start from a row with every column set to None, then fill in the fields
# known for HTR701 (LOC_Os06g06480.1) and insert it.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(
    {
        "accession": "HISTDB_H3_like_Plants_0",
        "variant": "H3-like_(Plants)",
        "taxonomy_id": 4530,
        "organism": "Oryza sativa",
        "phylum": "Streptophyta",
        "class": "Magnoliopsida",
        "sequence": "MRKGEEGGLDTDPHEADGVQVRDSRKQLATKATCKSAPATGGVKKPHRFRPGTVALREIRKYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVAALQEAAEAYLVGLFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA",
    }
)
cursor.execute(add_sequence, data_sequence)

In [11]:
# Start from a row with every column set to None, then fill in the fields
# known for HTR709 (LOC_Os02g25910.1) and insert it.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(
    {
        "accession": "HISTDB_H3_like_Plants_1",
        "variant": "H3-like_(Plants)",
        "taxonomy_id": 4530,
        "organism": "Oryza sativa",
        "phylum": "Streptophyta",
        "class": "Magnoliopsida",
        "sequence": "MARTKQTARKSTGGKAPRKPLRAIAAVMPAPSSRGVARKSVPFIGVKKPRRHRPGTVALREIRKYQKNTELLIRKLPFQRLVREIAQHFKHDMRFQSHAVLALQEAAEAYLVGLFEDTNLCAIHSKRVTIMSKDVQLARRIRGERL",
    }
)
cursor.execute(add_sequence, data_sequence)

In [12]:
# Re-query H3-like_(Plants) to confirm the two new rows are visible.
query = "SELECT * FROM sequence WHERE variant='H3-like_(Plants)'"
cursor.execute(query)
pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,ABA97899.1,H3-like_(Plants),,,,39947,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTAGNVPRKLLVMKVARKSAPMMAGLKKPHRFNPWI...,
1,ABA97902.1,H3-like_(Plants),,,,39947,Oryza sativa Japonica Group,Streptophyta,Magnoliopsida,,,MARTKQTAKKSTASNVPRKLLVMKVARKSAPTMAGLKKPHRFKPGT...,
2,BAA96098.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARPRKEAPQRNLDRDENARQQPTEEPQDEAPRNQGRQQQQQRPPA...,
3,BAE48427.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKHTARMSTGGKAPRKQLASKALRKAPPPPTKGVKQPHHYHLR...,
4,BAE48431.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARMKLNARMSTGGKAPRKQLAYKAVRKAAPPTIGVKLPNSYRPGD...,
5,BAE48433.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSIPTGMGGMKRPRRYRPG...,
6,BAE48435.1,H3-like_(Plants),,,,4690,Lilium longiflorum,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKQLATKAARKSAPTTGGVKKPHRYRPGT...,
7,HISTDB_H3_like_Plants_0,H3-like_(Plants),,,,4530,Oryza sativa,Streptophyta,Magnoliopsida,,,MRKGEEGGLDTDPHEADGVQVRDSRKQLATKATCKSAPATGGVKKP...,
8,HISTDB_H3_like_Plants_1,H3-like_(Plants),,,,4530,Oryza sativa,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKPLRAIAAVMPAPSSRGVARKSVPFIGV...,
9,NP_172794.1,H3-like_(Plants),,,,3702,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQSARKSHGGKAPTKQLATKAARKSAPTTGGVKKPHRFRPGT...,


In [13]:
# Make sure data is committed to the database
# (persists the two HISTDB_H3_like_Plants_* rows inserted by the cells above).
conn.commit()

## Add sequence publication

In [14]:
# Check that both cited publications already exist in the `publication` table.
pids = ["alvarez-venegas_canonical_2019", "hu_identification_2015"]
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,
106,hu_identification_2015,,,,,


In [15]:
# Link each new plant sequence to every publication in `pids`
# (one sequence_has_publication row per pair).
for acc, pid in (
    (accession, publication_id)
    for accession in ["HISTDB_H3_like_Plants_0", "HISTDB_H3_like_Plants_1"]
    for publication_id in pids
):
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [16]:
# Join sequences with their publication links and show the two new plant
# entries to verify that the links were created.
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin(["HISTDB_H3_like_Plants_0", "HISTDB_H3_like_Plants_1"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2575,HISTDB_H3_like_Plants_0,H3-like_(Plants),,,,4530.0,Oryza sativa,Streptophyta,Magnoliopsida,,,MRKGEEGGLDTDPHEADGVQVRDSRKQLATKATCKSAPATGGVKKP...,,HISTDB_H3_like_Plants_0,alvarez-venegas_canonical_2019
2576,HISTDB_H3_like_Plants_0,H3-like_(Plants),,,,4530.0,Oryza sativa,Streptophyta,Magnoliopsida,,,MRKGEEGGLDTDPHEADGVQVRDSRKQLATKATCKSAPATGGVKKP...,,HISTDB_H3_like_Plants_0,hu_identification_2015
2577,HISTDB_H3_like_Plants_1,H3-like_(Plants),,,,4530.0,Oryza sativa,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKPLRAIAAVMPAPSSRGVARKSVPFIGV...,,HISTDB_H3_like_Plants_1,alvarez-venegas_canonical_2019
2578,HISTDB_H3_like_Plants_1,H3-like_(Plants),,,,4530.0,Oryza sativa,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKAPRKPLRAIAAVMPAPSSRGVARKSVPFIGV...,,HISTDB_H3_like_Plants_1,hu_identification_2015


In [17]:
# Make sure data is committed to the database
# (persists the sequence_has_publication rows for the plant sequences).
conn.commit()

## <span style="color:black">Add sequences to H3.Y_(Primates)</span>

### <span style="color:black">Delete Macaque sequences HISTDB_H3_Y_0 and HISTDB_H3_Y_1 [seal_standardized_2022]</span>

In [18]:
# Show the two macaque H3.Y entries that are about to be deleted.
query = "SELECT * FROM sequence"
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin(["HISTDB_H3_Y_0", "HISTDB_H3_Y_1"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
2255,HISTDB_H3_Y_0,H3.Y_(Primates),NOGI,,,9544.0,Macaca mulatta,Chordata,Mammalia,,,ARTKQTARKATNWQAPRKPLATKAAAKRAPPRGGIKKPHRYKPGTQ...,
2256,HISTDB_H3_Y_1,H3.Y_(Primates),NOGI,,,9544.0,Macaca mulatta,Chordata,Mammalia,,,ARTKQTARKATNWQAPRKPLATKAPGKRLPPRGGIKKPHRYRPGTQ...,


In [19]:
# Remove the two macaque entries, one DELETE statement per accession.
for query in (
    "DELETE FROM sequence WHERE accession='HISTDB_H3_Y_0'",
    "DELETE FROM sequence WHERE accession='HISTDB_H3_Y_1'",
):
    cursor.execute(query)

In [20]:
# Confirm the deletion: the filtered frame is expected to be empty.
query = "SELECT * FROM sequence"
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin(["HISTDB_H3_Y_0", "HISTDB_H3_Y_1"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [21]:
# Make sure data is committed to the database
# (persists the deletion of HISTDB_H3_Y_0 and HISTDB_H3_Y_1).
conn.commit()

### <span style="color:black">Add Macaque sequence XP_001110711.1 as H3.Y.1_(Primates) [seal_standardized_2022]</span>

In [22]:
# Accession of the macaque sequence to fetch from NCBI and add as H3.Y.1_(Primates).
accessions = ["XP_001110711.1"]

## Add sequences to curatedDB

In [23]:
# Check whether the accession is already present (expected: no rows).
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [24]:
# Fetch each accession from NCBI as a GenBank record and assemble one
# `sequence` row per record for the H3.Y.1_(Primates) variant.
data_sequence_list = []
for a in accessions:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    data_sequence = {
        # Every column defaults to None ...
        **dict.fromkeys(
            (
                "accession",
                "variant",
                "gi",
                "ncbi_gene_id",
                "hgnc_gene_name",
                "taxonomy_id",
                "organism",
                "phylum",
                "class",
                "taxonomy_group",
                "info",
                "sequence",
                "variant_under_consideration",
            )
        ),
        # ... then the record-specific fields ...
        "accession": record.id,
        "variant": "H3.Y.1_(Primates)",
        "sequence": str(record.seq),
        # ... and finally the taxonomy fields returned by get_taxonomy_data.
        **taxonomy_data,
    }
    data_sequence_list.append(data_sequence)
# Print the last assembled row with value types as a sanity check.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 9544
accession XP_001110711.1 <class 'str'>
variant H3.Y.1_(Primates) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 9544 <class 'int'>
organism Macaca mulatta <class 'str'>
phylum Chordata <class 'str'>
class Mammalia <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTARKATNWQAPRKPLATKAPGKRLPPRGGIKKPHRYRPGTQALREIRKYQKSTQLLLRKLPFQRLVREIAQAISPDLRFQSAAIGALQEASEAYLVNLFEDTNLCAIHARRVTIMPRDMQLARRIRGEGA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [25]:
# Insert every assembled row into the `sequence` table.
for ds in data_sequence_list:
    cursor.execute(add_sequence, ds)

In [26]:
# Confirm that the new accession is now present in the `sequence` table.
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
5701,XP_001110711.1,H3.Y.1_(Primates),,,,9544.0,Macaca mulatta,Chordata,Mammalia,,,MARTKQTARKATNWQAPRKPLATKAPGKRLPPRGGIKKPHRYRPGT...,


In [27]:
# Make sure data is committed to the database
# (persists the XP_001110711.1 row inserted above).
conn.commit()

## Add sequence publication

In [28]:
# Check whether the publication is already registered (expected: no rows).
pids = ["seal_standardized_2022"]
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [29]:
# Register the publication as a stub: only the id is set, the other
# columns are inserted as NULL.
data_publication = dict.fromkeys(("id", "title", "doi", "author", "year"))
data_publication["id"] = "seal_standardized_2022"
cursor.execute(add_publication, data_publication)

In [30]:
# Confirm that the publication stub is now present.
pids = ["seal_standardized_2022"]
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
157,seal_standardized_2022,,,,,


In [31]:
# Link every accession in `accessions` to every publication in `pids`.
for acc, pid in (
    (accession, publication_id)
    for accession in accessions
    for publication_id in pids
):
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [32]:
# Join sequences with their publication links and show the rows for
# `accessions` to verify the new link.
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
6259,XP_001110711.1,H3.Y.1_(Primates),,,,9544.0,Macaca mulatta,Chordata,Mammalia,,,MARTKQTARKATNWQAPRKPLATKAPGKRLPPRGGIKKPHRYRPGT...,,XP_001110711.1,seal_standardized_2022


In [33]:
# Make sure data is committed to the database
# (persists the publication stub and the sequence-publication link).
conn.commit()

### <span style="color:black">Change sequence identifier and variant: HISTDB_H3_Y_2 -> XP_024212320.2 -> H3.Y.2_(Primates) [seal_standardized_2022]</span>

### <span style="color:black">Change sequence identifier and variant: HISTDB_H3_Y_3 -> HISTDB_H3_Y_0 -> H3.Y.1_(Primates) [wiedemann_identification_2010]</span>

In [34]:
# Show the two chimpanzee H3.Y entries that are about to be replaced/renamed.
query = "SELECT * FROM sequence"
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin(["HISTDB_H3_Y_2", "HISTDB_H3_Y_3"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
2255,HISTDB_H3_Y_2,H3.Y_(Primates),NOGI,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGTL...,
2256,HISTDB_H3_Y_3,H3.Y_(Primates),NOGI,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKPGTL...,


In [35]:
# Accession of the sequence to fetch from NCBI and add as H3.Y.2_(Primates);
# it replaces the local entry HISTDB_H3_Y_2.
accessions = ["XP_024212320.2"]

## Add sequences to curatedDB

In [36]:
# Check whether the accession is already present (expected: no rows).
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [38]:
# Fetch each accession from NCBI as a GenBank record and assemble one
# `sequence` row per record for the H3.Y.2_(Primates) variant.
data_sequence_list = []
for a in accessions:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    data_sequence = {
        # Every column defaults to None ...
        **dict.fromkeys(
            (
                "accession",
                "variant",
                "gi",
                "ncbi_gene_id",
                "hgnc_gene_name",
                "taxonomy_id",
                "organism",
                "phylum",
                "class",
                "taxonomy_group",
                "info",
                "sequence",
                "variant_under_consideration",
            )
        ),
        # ... then the record-specific fields ...
        "accession": record.id,
        "variant": "H3.Y.2_(Primates)",
        "sequence": str(record.seq),
        # ... and finally the taxonomy fields returned by get_taxonomy_data.
        **taxonomy_data,
    }
    data_sequence_list.append(data_sequence)
# Print the last assembled row with value types as a sanity check.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 9598
accession XP_024212320.2 <class 'str'>
variant H3.Y.2_(Primates) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 9598 <class 'int'>
organism Pan troglodytes <class 'str'>
phylum Chordata <class 'str'>
class Mammalia <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGTLALREIRKYQKSTQLLLRKLPFQRLVREIAQAISPDLRFQSAAIGALQEASEAYLVQLFEDTNLCAIHARRVTIMPRDMQLARRLRREGP <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [39]:
# Insert every assembled row into the `sequence` table.
for ds in data_sequence_list:
    cursor.execute(add_sequence, ds)

In [40]:
# Show the new accession next to the two old local entries.
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin([*accessions, "HISTDB_H3_Y_2", "HISTDB_H3_Y_3"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
2255,HISTDB_H3_Y_2,H3.Y_(Primates),NOGI,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGTL...,
2256,HISTDB_H3_Y_3,H3.Y_(Primates),NOGI,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKPGTL...,
5884,XP_024212320.2,H3.Y.2_(Primates),,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGT...,


In [41]:
# Remove the old local entry HISTDB_H3_Y_2; XP_024212320.2 was inserted above
# in its place.
query = "DELETE FROM sequence WHERE accession='HISTDB_H3_Y_2'"
cursor.execute(query)

In [42]:
# Confirm that HISTDB_H3_Y_2 is gone while the other two entries remain.
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin([*accessions, "HISTDB_H3_Y_2", "HISTDB_H3_Y_3"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
2255,HISTDB_H3_Y_3,H3.Y_(Primates),NOGI,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKPGTL...,
5883,XP_024212320.2,H3.Y.2_(Primates),,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGT...,


In [43]:
# Make sure data is committed to the database
# (persists the XP_024212320.2 insert and the HISTDB_H3_Y_2 deletion).
conn.commit()

## Add sequence publication

In [44]:
# Check whether any of the old HISTDB_H3_Y_* accessions still has
# publication links (expected: no rows).
query = "SELECT * FROM sequence_has_publication "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[
    df["sequence_accession"].isin(
        ["HISTDB_H3_Y_0", "HISTDB_H3_Y_1", "HISTDB_H3_Y_2", "HISTDB_H3_Y_3"]
    )
]

Unnamed: 0,sequence_accession,publication_id


In [45]:
# Look up the publication that will be linked to the new accession.
pids = ["seal_standardized_2022"]
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
157,seal_standardized_2022,,,,,


In [46]:
# Link every accession in `accessions` to every publication in `pids`.
for acc, pid in (
    (accession, publication_id)
    for accession in accessions
    for publication_id in pids
):
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [47]:
# Join sequences with their publication links and show the rows for
# `accessions` to verify the new link.
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
6498,XP_024212320.2,H3.Y.2_(Primates),,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGT...,,XP_024212320.2,seal_standardized_2022


In [48]:
# Make sure data is committed to the database
# (persists the XP_024212320.2 -> seal_standardized_2022 link).
conn.commit()

In [49]:
# Rename HISTDB_H3_Y_3 to HISTDB_H3_Y_0 and move it to H3.Y.1_(Primates).
# The HISTDB_H3_Y_0 accession was freed by the DELETE executed earlier in this notebook.
query = (
    "UPDATE sequence "
    "SET accession='HISTDB_H3_Y_0', variant='H3.Y.1_(Primates)' "
    "WHERE accession='HISTDB_H3_Y_3'"
)
cursor.execute(query)

In [51]:
# Confirm the rename: HISTDB_H3_Y_0 should be listed, HISTDB_H3_Y_3 should not.
query = "SELECT * FROM sequence"
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[
    df["accession"].isin(
        [*accessions, "HISTDB_H3_Y_0", "HISTDB_H3_Y_2", "HISTDB_H3_Y_3"]
    )
]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
2255,HISTDB_H3_Y_0,H3.Y.1_(Primates),NOGI,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKPGTL...,
5883,XP_024212320.2,H3.Y.2_(Primates),,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGT...,


## Add sequence publication

In [52]:
# Check whether the publication is already registered (expected: no rows).
pids = ["wiedemann_identification_2010"]
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[column[0] for column in cursor.description],
)
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [53]:
# Register the publication as a stub: only the id is set, the other
# columns are inserted as NULL.
data_publication = dict.fromkeys(("id", "title", "doi", "author", "year"))
data_publication["id"] = "wiedemann_identification_2010"
cursor.execute(add_publication, data_publication)

In [54]:
# Confirm that the publication row is now present in `publication`.
pids = ["wiedemann_identification_2010"]
query = "SELECT * FROM publication"
cursor.execute(query)
# Column names come from the cursor metadata of the statement just executed.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
183,wiedemann_identification_2010,,,,,


In [55]:
# Link sequence HISTDB_H3_Y_0 to every publication id in `pids`.
for publication_id in pids:
    link_params = ("HISTDB_H3_Y_0", publication_id)
    cursor.execute(add_sequence_has_publication, link_params)

In [56]:
# Show the sequence/publication join for the current accessions and the HISTDB_H3_Y_* ids.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
accessions_to_show = accessions + ["HISTDB_H3_Y_0", "HISTDB_H3_Y_2", "HISTDB_H3_Y_3"]
df.loc[df["accession"].isin(accessions_to_show)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2579,HISTDB_H3_Y_0,H3.Y.1_(Primates),NOGI,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKPGTL...,,HISTDB_H3_Y_0,wiedemann_identification_2010
6498,XP_024212320.2,H3.Y.2_(Primates),,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGT...,,XP_024212320.2,seal_standardized_2022


In [57]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

In [58]:
# Show all sequences (with publications) assigned to the two H3.Y primate variants.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
variants_to_show = ["H3.Y.1_(Primates)", "H3.Y.2_(Primates)"]
df.loc[df["variant"].isin(variants_to_show)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2579,HISTDB_H3_Y_0,H3.Y.1_(Primates),NOGI,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,ARTKQTARKATAWQAPRKPLATKAARKRASPTGGIKKPHRYKPGTL...,,HISTDB_H3_Y_0,wiedemann_identification_2010
6258,XP_001110711.1,H3.Y.1_(Primates),,,,9544.0,Macaca mulatta,Chordata,Mammalia,,,MARTKQTARKATNWQAPRKPLATKAPGKRLPPRGGIKKPHRYRPGT...,,XP_001110711.1,seal_standardized_2022
6498,XP_024212320.2,H3.Y.2_(Primates),,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKATAWQAPRKPLATKAAGKRAPPTGGIKKPHRYKPGT...,,XP_024212320.2,seal_standardized_2022


## <span style="color:black">Add sequences to H3.4_(Mammalia)</span>

### <span style="color:black">Add sequence NP_001304932.1 as H3.4_(Mammalia) [ueda_testis-specific_2017]</span>

In [59]:
# Accession to add under H3.4_(Mammalia); the cells below read this list.
accessions = ["NP_001304932.1"]

## Add sequences to curatedDB

In [60]:
# Check whether any of the accessions is already present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [61]:
# Download each accession's GenBank protein record and build one `sequence` row per record.
data_sequence_list = []
for accession_id in accessions:
    with Entrez.efetch(
        db="protein", id=accession_id, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only the fields known here are filled in afterwards.
    data_sequence = dict.fromkeys(
        (
            "accession",
            "variant",
            "gi",
            "ncbi_gene_id",
            "hgnc_gene_name",
            "taxonomy_id",
            "organism",
            "phylum",
            "class",
            "taxonomy_group",
            "info",
            "sequence",
            "variant_under_consideration",
        )
    )
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3.4_(Mammalia)"
    data_sequence["sequence"] = str(record.seq)
    # Keys returned by get_taxonomy_data overwrite the values set above.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Preview the last prepared row together with the Python type of each value.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 10090
accession NP_001304932.1 <class 'str'>
variant H3.4_(Mammalia) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 10090 <class 'int'>
organism Mus musculus <class 'str'>
phylum Chordata <class 'str'>
class Mammalia <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVMALQEACESYLVGLFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [62]:
# Insert every prepared row into the sequence table.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [63]:
# Confirm that the inserted accessions are now present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3461,NP_001304932.1,H3.4_(Mammalia),,,,10090.0,Mus musculus,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGT...,


In [64]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## Add sequence publication

In [65]:
# Check whether the publication id(s) to be linked already exist in `publication`.
pids = ["ueda_testis-specific_2017"]
query = "SELECT * FROM publication"
cursor.execute(query)
# Column names come from the cursor metadata of the statement just executed.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
181,ueda_testis-specific_2017,,,,,


In [66]:
# Link every accession to every publication id (all accession/publication pairs).
for accession_id in accessions:
    for publication_id in pids:
        link_params = (accession_id, publication_id)
        cursor.execute(add_sequence_has_publication, link_params)

In [67]:
# Show the sequence/publication join restricted to the current accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3846,NP_001304932.1,H3.4_(Mammalia),,,,10090.0,Mus musculus,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGT...,,NP_001304932.1,ueda_testis-specific_2017


In [68]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

### <span style="color:black">Add fasta sequences to H3.4_(Mammalia)</span>

Sequences fasta [dong_structural_2020]:

```fasta
>uncharacterized protein [rat]
MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGTVALREIRRYQKSTELLIRKLPFQR
LVREIAQDFKTDLRFQSSAVMALQEACESYLVGLFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA
```

In [69]:
# List every sequence currently assigned to the H3.4_(Mammalia) variant.
query = "SELECT * FROM sequence WHERE variant='H3.4_(Mammalia)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(fetched_rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,NP_001304932.1,H3.4_(Mammalia),,,,10090,Mus musculus,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGT...,
1,XP_003804825.1,H3.4_(Mammalia),397466137.0,,,9597,Pan paniscus,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLVTKVARKSAPATGGVKKPHRYRPGT...,


In [70]:
# Manually curated rat sequence from the FASTA block above, stored under the
# accession HISTDB_H3_4_Mammalia_0 and assigned to H3.4_(Mammalia).
data_sequence = {
    "accession": "HISTDB_H3_4_Mammalia_0",
    "variant": "H3.4_(Mammalia)",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": 10114,
    "organism": "Rattus",
    # No surrounding whitespace: the value must match the "Chordata" used by the other rows.
    "phylum": "Chordata",
    "class": "Mammalia",
    "taxonomy_group": None,
    "info": None,
    "sequence": "MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVMALQEACESYLVGLFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA",
    "variant_under_consideration": None,
}
cursor.execute(add_sequence, data_sequence)

In [71]:
# List the H3.4_(Mammalia) sequences again to confirm the new row is present.
query = "SELECT * FROM sequence WHERE variant='H3.4_(Mammalia)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(fetched_rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,HISTDB_H3_4_Mammalia_0,H3.4_(Mammalia),,,,10114,Rattus,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGT...,
1,NP_001304932.1,H3.4_(Mammalia),,,,10090,Mus musculus,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGT...,
2,XP_003804825.1,H3.4_(Mammalia),397466137.0,,,9597,Pan paniscus,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLVTKVARKSAPATGGVKKPHRYRPGT...,


In [72]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## Add sequence publication

In [73]:
# Check whether the publication id(s) to be linked already exist in `publication`.
pids = ["dong_structural_2020"]
query = "SELECT * FROM publication"
cursor.execute(query)
# Column names come from the cursor metadata of the statement just executed.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
90,dong_structural_2020,,,,,


In [74]:
# Link sequence HISTDB_H3_4_Mammalia_0 to every publication id in `pids`.
for publication_id in pids:
    link_params = ("HISTDB_H3_4_Mammalia_0", publication_id)
    cursor.execute(add_sequence_has_publication, link_params)

In [75]:
# Show the sequence/publication join for the manually added rat sequence.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(["HISTDB_H3_4_Mammalia_0"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2530,HISTDB_H3_4_Mammalia_0,H3.4_(Mammalia),,,,10114.0,Rattus,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKVARKSAPATGGVKKPHRYHPGT...,,HISTDB_H3_4_Mammalia_0,dong_structural_2020


In [76]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## <span style="color:black">Add sequences to H3.5_(Hominidae)</span>

### <span style="color:black">Add orangutan sequence XP_002823134.1, gorilla sequence ADW85799.1 as H3.5_(Hominidae) [schenk_h35_2011]</span>

In [77]:
# Accessions to add under H3.5_(Hominidae); the cells below read this list.
accessions = ["XP_002823134.1", "ADW85799.1"]

## Add sequences to curatedDB

In [78]:
# Check whether any of the accessions is already present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [79]:
# Download each accession's GenBank protein record and build one `sequence` row per record.
data_sequence_list = []
for accession_id in accessions:
    with Entrez.efetch(
        db="protein", id=accession_id, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only the fields known here are filled in afterwards.
    data_sequence = dict.fromkeys(
        (
            "accession",
            "variant",
            "gi",
            "ncbi_gene_id",
            "hgnc_gene_name",
            "taxonomy_id",
            "organism",
            "phylum",
            "class",
            "taxonomy_group",
            "info",
            "sequence",
            "variant_under_consideration",
        )
    )
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3.5_(Hominidae)"
    data_sequence["sequence"] = str(record.seq)
    # Keys returned by get_taxonomy_data overwrite the values set above.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Preview only the last prepared row together with the Python type of each value.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 9601
Fetched taxid from NCBI 9593
accession ADW85799.1 <class 'str'>
variant H3.5_(Hominidae) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 9593 <class 'int'>
organism Gorilla gorilla <class 'str'>
phylum Chordata <class 'str'>
class Mammalia <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKKPHRYRPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFNTGLSFQSAAIGALQEASEAYLVGLLEDTNLCAIHAKRVTIMPKDIQLARRIRGERA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [80]:
# Insert every prepared row into the sequence table.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [81]:
# Confirm that the inserted accessions are now present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
238,ADW85799.1,H3.5_(Hominidae),,,,9593.0,Gorilla gorilla,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKKPHRYRPGT...,
5763,XP_002823134.1,H3.5_(Hominidae),,,,9601.0,Pongo abelii,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTGGVKKPHRHGPGT...,


In [82]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## Add sequence publication

In [83]:
# Check whether the publication id(s) to be linked already exist in `publication`.
pids = ["schenk_h35_2011"]
query = "SELECT * FROM publication"
cursor.execute(query)
# Column names come from the cursor metadata of the statement just executed.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [84]:
# Register the publication with only its id; the remaining fields are left empty.
data_publication = dict(
    id="schenk_h35_2011",
    title=None,
    doi=None,
    author=None,
    year=None,
)
cursor.execute(add_publication, data_publication)

In [85]:
# Confirm that the publication row is now present in `publication`.
pids = ["schenk_h35_2011"]
query = "SELECT * FROM publication"
cursor.execute(query)
# Column names come from the cursor metadata of the statement just executed.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
155,schenk_h35_2011,,,,,


In [86]:
# Link every accession to every publication id (all accession/publication pairs).
for accession_id in accessions:
    for publication_id in pids:
        link_params = (accession_id, publication_id)
        cursor.execute(add_sequence_has_publication, link_params)

In [87]:
# Show the sequence/publication join restricted to the current accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
285,ADW85799.1,H3.5_(Hominidae),,,,9593.0,Gorilla gorilla,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTCGVKKPHRYRPGT...,,ADW85799.1,schenk_h35_2011
6342,XP_002823134.1,H3.5_(Hominidae),,,,9601.0,Pongo abelii,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTGGVKKPHRHGPGT...,,XP_002823134.1,schenk_h35_2011


In [88]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

### <span style="color:black">Delete chimpanzee sequence XP_003954426.1 (it is a pseudogene [seal_standardized_2022])</span>

In [89]:
# Look up the row that is about to be deleted.
query = "SELECT * FROM sequence"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(["XP_003954426.1"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
5810,XP_003954426.1,H3.5_(Hominidae),410046862,,,9598.0,Pan troglodytes,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSTPSTXGVKKPHRYRPGT...,


In [90]:
# Remove the sequence row with accession XP_003954426.1.
query = (
    "DELETE FROM sequence "
    "WHERE accession='XP_003954426.1'"
)
cursor.execute(query)

In [91]:
# Confirm that the deleted accession no longer appears in the sequence table.
query = "SELECT * FROM sequence"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(["XP_003954426.1"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [92]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

### <span style="color:black">Add alternate name H3.3C to H3.5_(Homo_sapiens)</span>

In [93]:
# List the alternative names currently stored for H3.5_(Homo_sapiens).
query = "SELECT * FROM alternative_name WHERE histone='H3.5_(Homo_sapiens)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(fetched_rows, columns=column_names)

Unnamed: 0,id,name,taxonomy,gene,splice,histone


In [94]:
# Add H3.3C as an alternative name of H3.5_(Homo_sapiens); the other fields are left empty.
data_alternate_name = dict(
    name="H3.3C",
    taxonomy=None,
    gene=None,
    splice=None,
    histone="H3.5_(Homo_sapiens)",
)
cursor.execute(add_alternate_names, data_alternate_name)

In [95]:
# List the alternative names of H3.5_(Homo_sapiens) again to confirm the new entry.
query = "SELECT * FROM alternative_name WHERE histone='H3.5_(Homo_sapiens)'"
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(fetched_rows, columns=column_names)

Unnamed: 0,id,name,taxonomy,gene,splice,histone
0,124,H3.3C,,,,H3.5_(Homo_sapiens)


In [96]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## <span style="color:black">Add sequences to H3.7_(Homo_sapiens)</span>

### <span style="color:black">Add sequence NP_001359034.1 to H3.7_(Homo_sapiens) [taguchi_crystal_2017, seal_standardized_2022]</span>

In [97]:
# Accession to add under H3.7_(Homo_sapiens); the cells below read this list.
accessions = ["NP_001359034.1"]

## Add sequences to curatedDB

In [98]:
# Check whether any of the accessions is already present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [99]:
# Download each accession's GenBank protein record and build one `sequence` row per record.
data_sequence_list = []
for accession_id in accessions:
    with Entrez.efetch(
        db="protein", id=accession_id, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only the fields known here are filled in afterwards.
    data_sequence = dict.fromkeys(
        (
            "accession",
            "variant",
            "gi",
            "ncbi_gene_id",
            "hgnc_gene_name",
            "taxonomy_id",
            "organism",
            "phylum",
            "class",
            "taxonomy_group",
            "info",
            "sequence",
            "variant_under_consideration",
        )
    )
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3.7_(Homo_sapiens)"
    data_sequence["sequence"] = str(record.seq)
    # Keys returned by get_taxonomy_data overwrite the values set above.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Preview the last prepared row together with the Python type of each value.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 9606
accession NP_001359034.1 <class 'str'>
variant H3.7_(Homo_sapiens) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 9606 <class 'int'>
organism Homo sapiens <class 'str'>
phylum Chordata <class 'str'>
class Mammalia <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQEFKTDLRFQSSAVMALQEAREAYLVGLFEDTNLCAIHAKRVTIMPKDIQLVSRIRGERA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [100]:
# Insert every prepared row into the sequence table.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [101]:
# Confirm that the inserted accessions are now present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3467,NP_001359034.1,H3.7_(Homo_sapiens),,,,9606.0,Homo sapiens,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGT...,


In [102]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## Add sequence publication

In [103]:
# Check whether the publication id(s) to be linked already exist in `publication`.
pids = ["taguchi_crystal_2017", "seal_standardized_2022"]
query = "SELECT * FROM publication"
cursor.execute(query)
# Column names come from the cursor metadata of the statement just executed.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
158,seal_standardized_2022,,,,,
174,taguchi_crystal_2017,,,,,


In [104]:
# Link every accession to every publication id (all accession/publication pairs).
for accession_id in accessions:
    for publication_id in pids:
        link_params = (accession_id, publication_id)
        cursor.execute(add_sequence_has_publication, link_params)

In [105]:
# Show the sequence/publication join restricted to the current accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3853,NP_001359034.1,H3.7_(Homo_sapiens),,,,9606.0,Homo sapiens,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGT...,,NP_001359034.1,seal_standardized_2022
3854,NP_001359034.1,H3.7_(Homo_sapiens),,,,9606.0,Homo sapiens,Chordata,Mammalia,,,MARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGT...,,NP_001359034.1,taguchi_crystal_2017


In [106]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## <span style="color:black">Add sequences to H3.B_(Giardia)</span>

### <span style="color:black">Add sequence XP_767393.1 to H3.B_(Giardia) [dawson_cenh3_2007]</span>

In [107]:
# Accession to add under H3.B_(Giardia); the cells below read this list.
accessions = ["XP_767393.1"]

## Add sequences to curatedDB

In [108]:
# Check whether any of the accessions is already present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [109]:
# Download each accession's GenBank protein record and build one `sequence` row per record.
data_sequence_list = []
for accession_id in accessions:
    with Entrez.efetch(
        db="protein", id=accession_id, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only the fields known here are filled in afterwards.
    data_sequence = dict.fromkeys(
        (
            "accession",
            "variant",
            "gi",
            "ncbi_gene_id",
            "hgnc_gene_name",
            "taxonomy_id",
            "organism",
            "phylum",
            "class",
            "taxonomy_group",
            "info",
            "sequence",
            "variant_under_consideration",
        )
    )
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3.B_(Giardia)"
    data_sequence["sequence"] = str(record.seq)
    # Keys returned by get_taxonomy_data overwrite the values set above.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Preview the last prepared row together with the Python type of each value.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 184922
accession XP_767393.1 <class 'str'>
variant H3.B_(Giardia) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 184922 <class 'int'>
organism Giardia lamblia ATCC 50803 <class 'str'>
phylum Fornicata <class 'str'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKNTAMDRSKSVHVNTARKGQHAPRKTILSKKTVARKAISKSEKAVTRRARPGSQVRKEITNMQRRVTSVIPIACFQRLVRDITCSLPSGGNEIRFQAQAIGALQEASEAMLSQVLGDCQILANHAHRVTIMDKDIQIYMRIVRPPWMNGIHGSML <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [110]:
# Insert every prepared row into the sequence table.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [111]:
# Confirm that the inserted accessions are now present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
5925,XP_767393.1,H3.B_(Giardia),,,,184922.0,Giardia lamblia ATCC 50803,Fornicata,,,,MARTKNTAMDRSKSVHVNTARKGQHAPRKTILSKKTVARKAISKSE...,


In [112]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## Add sequence publication

In [113]:
# Check whether the publication id(s) to be linked already exist in `publication`.
pids = ["dawson_cenh3_2007"]
query = "SELECT * FROM publication"
cursor.execute(query)
# Column names come from the cursor metadata of the statement just executed.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [114]:
# Register the publication with only its id; the remaining fields are left empty.
data_publication = dict(
    id="dawson_cenh3_2007",
    title=None,
    doi=None,
    author=None,
    year=None,
)
cursor.execute(add_publication, data_publication)

In [115]:
# Confirm that the publication row is now present in `publication`.
pids = ["dawson_cenh3_2007"]
query = "SELECT * FROM publication"
cursor.execute(query)
# Column names come from the cursor metadata of the statement just executed.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
89,dawson_cenh3_2007,,,,,


In [116]:
# Link every accession to every publication id (all accession/publication pairs).
for accession_id in accessions:
    for publication_id in pids:
        link_params = (accession_id, publication_id)
        cursor.execute(add_sequence_has_publication, link_params)

In [117]:
# Show the sequence/publication join restricted to the current accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
6550,XP_767393.1,H3.B_(Giardia),,,,184922.0,Giardia lamblia ATCC 50803,Fornicata,,,,MARTKNTAMDRSKSVHVNTARKGQHAPRKTILSKKTVARKAISKSE...,,XP_767393.1,dawson_cenh3_2007


In [118]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## <span style="color:black">Add new node cenH3_(Protists) to cenH3</span>

In [119]:
# Look up the cenH3_(Protists) histone node together with its description, if any.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Protists)'"
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(fetched_rows, columns=column_names)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations


In [122]:
# Create the cenH3_(Protists) variant node as a child of cenH3, without a description.
data_histone = dict(
    id="cenH3_(Protists)",
    level="variant",
    taxonomic_span="SAR,Metamonada,Discoba,Amoebozoa",
    taxonomic_span_id="2698737,2611341,2611352,554915",
    description=None,
    parent="cenH3",
)
cursor.execute(add_histone, data_histone)

In [123]:
# Read the cenH3_(Protists) node back to confirm it was inserted.
query = (
    "SELECT * FROM histone h "
    "LEFT JOIN histone_description hd "
    "ON h.description = hd.id "
    "WHERE h.id='cenH3_(Protists)'"
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(fetched_rows, columns=column_names)

Unnamed: 0,id,level,taxonomic_span,taxonomic_span_id,description,parent,id.1,summary,taxonomy,genes,...,knock_out,function,sequence,localization,deposition,structure,interactions,disease,caveats,relations
0,cenH3_(Protists),variant,"SAR,Metamonada,Discoba,Amoebozoa",269873726113412611352554915,,cenH3,,,,,...,,,,,,,,,,


In [124]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## <span style="color:black">Add sequences to cenH3_(Protists)</span>

### <span style="color:black">Add sequence XP_771620.1 to cenH3_(Protists) [dawson_cenh3_2007]</span>

In [125]:
# Accession to add under cenH3_(Protists); the cells below read this list.
accessions = ["XP_771620.1"]

## Add sequences to curatedDB

In [126]:
# Check whether any of the accessions is already present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [127]:
# Download each accession's GenBank protein record and build one `sequence` row per record.
data_sequence_list = []
for accession_id in accessions:
    with Entrez.efetch(
        db="protein", id=accession_id, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only the fields known here are filled in afterwards.
    data_sequence = dict.fromkeys(
        (
            "accession",
            "variant",
            "gi",
            "ncbi_gene_id",
            "hgnc_gene_name",
            "taxonomy_id",
            "organism",
            "phylum",
            "class",
            "taxonomy_group",
            "info",
            "sequence",
            "variant_under_consideration",
        )
    )
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "cenH3_(Protists)"
    data_sequence["sequence"] = str(record.seq)
    # Keys returned by get_taxonomy_data overwrite the values set above.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Preview the last prepared row together with the Python type of each value.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 184922
accession XP_771620.1 <class 'str'>
variant cenH3_(Protists) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 184922 <class 'int'>
organism Giardia lamblia ATCC 50803 <class 'str'>
phylum Fornicata <class 'str'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MSGGSRSQVARNTGHRRREISGRNMIPGVVVNARQSRSKLSSDPFSSVPRRPARVSHMEREIYHYQHNVDTLIQKLPFARLVQELVEQIAQRDGSKGPYRFQGMAMEALQSATEEYIVELFSTALLATYHANRVTLMSKDILLVLRIQQRNLNSLR <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [128]:
# Insert every prepared row into the sequence table.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [129]:
# Confirm that the inserted accessions are now present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
5926,XP_771620.1,cenH3_(Protists),,,,184922.0,Giardia lamblia ATCC 50803,Fornicata,,,,MSGGSRSQVARNTGHRRREISGRNMIPGVVVNARQSRSKLSSDPFS...,


In [130]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## Add sequence publication

In [131]:
# Check whether the publication id(s) to be linked already exist in `publication`.
pids = ["dawson_cenh3_2007"]
query = "SELECT * FROM publication"
cursor.execute(query)
# Column names come from the cursor metadata of the statement just executed.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
89,dawson_cenh3_2007,,,,,


In [132]:
# Link every accession to every publication id (all accession/publication pairs).
for accession_id in accessions:
    for publication_id in pids:
        link_params = (accession_id, publication_id)
        cursor.execute(add_sequence_has_publication, link_params)

In [133]:
# Show the sequence/publication join restricted to the current accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
6551,XP_771620.1,cenH3_(Protists),,,,184922.0,Giardia lamblia ATCC 50803,Fornicata,,,,MSGGSRSQVARNTGHRRREISGRNMIPGVVVNARQSRSKLSSDPFS...,,XP_771620.1,dawson_cenh3_2007


In [134]:
# Commit the current transaction so the statements executed above are stored permanently.
conn.commit()

## <span style="color:black">Add sequences to cH3_(Protists)</span>

### <span style="color:black">Add sequence AAF00592.1 to cH3_(Protists) [dawson_cenh3_2007]</span>

In [135]:
# Accession to add under cH3_(Protists); the cells below read this list.
accessions = ["AAF00592.1"]

## Add sequences to curatedDB

In [136]:
# Check whether any of the accessions is already present in the sequence table.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [137]:
# Download each accession's GenBank protein record and build one `sequence` row per record.
data_sequence_list = []
for accession_id in accessions:
    with Entrez.efetch(
        db="protein", id=accession_id, rettype="gb", retmode="text"
    ) as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only the fields known here are filled in afterwards.
    data_sequence = dict.fromkeys(
        (
            "accession",
            "variant",
            "gi",
            "ncbi_gene_id",
            "hgnc_gene_name",
            "taxonomy_id",
            "organism",
            "phylum",
            "class",
            "taxonomy_group",
            "info",
            "sequence",
            "variant_under_consideration",
        )
    )
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "cH3_(Protists)"
    data_sequence["sequence"] = str(record.seq)
    # Keys returned by get_taxonomy_data overwrite the values set above.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Preview the last prepared row together with the Python type of each value.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 5741
accession AAF00592.1 <class 'str'>
variant cH3_(Protists) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 5741 <class 'int'>
organism Giardia duodenalis <class 'str'>
phylum Fornicata <class 'str'>
class None <class 'NoneType'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKHTARKTTSATKAPRKTIARKAARKTASSTSGIKKTGRKKQGMVAVKEIKKYQKSTDLLIRKLPFSKLVRDIVTSGLSKSDIRFQGAAVEALQESAENYIISLFVDTQLCAEHAKRVTIMKPDMELATRIGKRIEPEYRKGK <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [138]:
# Execute the `add_sequence` statement once per prepared record; the commit happens in a later cell.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [139]:
# Post-insert check: the accessions of this section should now appear in the `sequence` table.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
27,AAF00592.1,cH3_(Protists),,,,5741.0,Giardia duodenalis,Fornicata,,,,MARTKHTARKTTSATKAPRKTIARKAARKTASSTSGIKKTGRKKQG...,


In [140]:
# Commit the sequence row(s) inserted above so they persist in the database.
conn.commit()

## Add sequence publication

In [141]:
# Check which of the cited publication ids already exist in the `publication` table.
pids = ["dawson_cenh3_2007"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
89,dawson_cenh3_2007,,,,,


In [142]:
# Link every accession of this section to every publication id in `pids`.
for acc, pid in [(a, p) for a in accessions for p in pids]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [143]:
# Verify the links: join sequences to their publications and show this section's accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
35,AAF00592.1,cH3_(Protists),,,,5741.0,Giardia duodenalis,Fornicata,,,,MARTKHTARKTTSATKAPRKTIARKAARKTASSTSGIKKTGRKKQG...,,AAF00592.1,dawson_cenh3_2007


In [144]:
# Commit the sequence-publication link(s) added above so they persist in the database.
conn.commit()

## <span style="color:black">Add sequences to H3.P_(Euplotes_crassus)</span>

### <span style="color:black">Add sequence AAC47441.1 to H3.P_(Euplotes_crassus) [jahn_unusual_1997]</span>

In [145]:
# Protein accession(s) handled in this section; the cells below fetch them from NCBI and insert them.
accessions = ["AAC47441.1"]

## Add sequences to curatedDB

In [146]:
# Pre-insert check: load the `sequence` table and show any rows that already use these accessions.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [147]:
# Fetch each accession's GenBank protein record and turn it into a row for the `sequence` table.
data_sequence_list = []
sequence_fields = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
for accession_id in accessions:
    with Entrez.efetch(db="protein", id=accession_id, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only accession, variant and sequence are set directly.
    data_sequence = dict.fromkeys(sequence_fields)
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3.P_(Euplotes_crassus)"
    data_sequence["sequence"] = str(record.seq)
    # Merge in whatever get_taxonomy_data() returned (it overrides matching keys).
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)

# Print the last record together with each value's type as a pre-insert sanity check.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 5936
accession AAC47441.1 <class 'str'>
variant H3.P_(Euplotes_crassus) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 5936 <class 'int'>
organism Moneuplotes crassus <class 'str'>
phylum Ciliophora <class 'str'>
class Spirotrichea <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTARKTTGQKAPRKSVGGSKAPIGAGKSVVKASRKNVPSIIAKQAIKKPHRFRPGTVALREIRKFQKSTDLLIRKLPFQRLVREIATEYKSDLRFQSQAVLALQEATEAYMVSLFEDTNLCAIHAKRVTIMPKDIHLARRIRGERS <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [148]:
# Execute the `add_sequence` statement once per prepared record; the commit happens in a later cell.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [149]:
# Post-insert check: the accessions of this section should now appear in the `sequence` table.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
26,AAC47441.1,H3.P_(Euplotes_crassus),,,,5936.0,Moneuplotes crassus,Ciliophora,Spirotrichea,,,MARTKQTARKTTGQKAPRKSVGGSKAPIGAGKSVVKASRKNVPSII...,


In [150]:
# Commit the sequence row(s) inserted above so they persist in the database.
conn.commit()

## Add sequence publication

In [151]:
# Check which of the cited publication ids already exist in the `publication` table.
pids = ["jahn_unusual_1997"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [152]:
# Register the missing publication by id only; title, doi, author and year are passed as None.
data_publication = {
    "id": "jahn_unusual_1997",
    **dict.fromkeys(("title", "doi", "author", "year")),
}
cursor.execute(add_publication, data_publication)

In [153]:
# Re-run the publication lookup to confirm the id inserted above is now present.
pids = ["jahn_unusual_1997"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
113,jahn_unusual_1997,,,,,


In [154]:
# Link every accession of this section to every publication id in `pids`.
for acc, pid in [(a, p) for a in accessions for p in pids]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [155]:
# Verify the links: join sequences to their publications and show this section's accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
34,AAC47441.1,H3.P_(Euplotes_crassus),,,,5936.0,Moneuplotes crassus,Ciliophora,Spirotrichea,,,MARTKQTARKTTGQKAPRKSVGGSKAPIGAGKSVVKASRKNVPSII...,,AAC47441.1,jahn_unusual_1997


In [156]:
# Commit the new publication row and the sequence-publication link(s) added above.
conn.commit()

## <span style="color:black">Add sequences to H3.V_(Trypanosomatidae)</span>

### <span style="color:black">Add sequence AAO24601.1 to H3.V_(Trypanosomatidae) [lowell_variant_2004]</span>

In [157]:
# Protein accession(s) handled in this section; the cells below fetch them from NCBI and insert them.
accessions = ["AAO24601.1"]

## Add sequences to curatedDB

In [158]:
# Pre-insert check: load the `sequence` table and show any rows that already use these accessions.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [159]:
# Fetch each accession's GenBank protein record and turn it into a row for the `sequence` table.
data_sequence_list = []
sequence_fields = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
for accession_id in accessions:
    with Entrez.efetch(db="protein", id=accession_id, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only accession, variant and sequence are set directly.
    data_sequence = dict.fromkeys(sequence_fields)
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3.V_(Trypanosomatidae)"
    data_sequence["sequence"] = str(record.seq)
    # Merge in whatever get_taxonomy_data() returned (it overrides matching keys).
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)

# Print the last record together with each value's type as a pre-insert sanity check.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 5702
accession AAO24601.1 <class 'str'>
variant H3.V_(Trypanosomatidae) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 5702 <class 'int'>
organism Trypanosoma brucei brucei <class 'str'>
phylum Euglenozoa <class 'str'>
class Kinetoplastea <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MAQMKKITPRPVRPKSVASRPIQAVARAPVKKVENTPPQKRHHRWRPGTVALREIRRLQSSTDFLIQRAPFRRFLREVVSNLKDSYRMSAACVDAIQEATETYITSVFMDANLCTLHANRVTLFPKDIQLALKLRGERN <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [160]:
# Execute the `add_sequence` statement once per prepared record; the commit happens in a later cell.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [161]:
# Post-insert check: the accessions of this section should now appear in the `sequence` table.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
42,AAO24601.1,H3.V_(Trypanosomatidae),,,,5702.0,Trypanosoma brucei brucei,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQAVARAPVKKVENTPPQKRHHRWR...,


In [162]:
# Commit the sequence row(s) inserted above so they persist in the database.
conn.commit()

## Add sequence publication

In [163]:
# Check which of the cited publication ids already exist in the `publication` table.
pids = ["lowell_variant_2004"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [164]:
# Register the missing publication by id only; title, doi, author and year are passed as None.
data_publication = {
    "id": "lowell_variant_2004",
    **dict.fromkeys(("title", "doi", "author", "year")),
}
cursor.execute(add_publication, data_publication)

In [165]:
# Re-run the publication lookup to confirm the id inserted above is now present.
pids = ["lowell_variant_2004"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
128,lowell_variant_2004,,,,,


In [166]:
# Link every accession of this section to every publication id in `pids`.
for acc, pid in [(a, p) for a in accessions for p in pids]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [167]:
# Verify the links: join sequences to their publications and show this section's accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
52,AAO24601.1,H3.V_(Trypanosomatidae),,,,5702.0,Trypanosoma brucei brucei,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQAVARAPVKKVENTPPQKRHHRWR...,,AAO24601.1,lowell_variant_2004


In [168]:
# Commit the new publication row and the sequence-publication link(s) added above.
conn.commit()

### <span style="color:black">Add fasta sequences to H3.V_(Trypanosomatidae)</span>

FASTA sequences from [lowell_variant_2004]:

```fasta
>ACC histone H3 variant [Trypanosoma cruzi]
MGSLKKVASVEKHSAFSSESKLPKPRKNLASRKIHTEGRIVAKKESASAGTRKKHRWRPGTVVLREVRRYQSSTEFLIAKAP
FRRLVREIVSNLKDSFRMSATCVEALQESTELYVTSVLADANLCTLHANRVTVYPKDIQLALKLRGERL
>Q9U196 UniProt [Leishmania major]
MAGITKAAVVASHPKKNVASRKMNKKSRSIAKKEAKAMRADSAGAKSRRWRPGTVALREVRKYQRSTELLIARTPFRRLVKE
IMSTFKDTMHMRHSALEAMQDATESYLVSLLCDANLCTIHAKRVTLYPKDLQLALRLRGERT
```

In [169]:
# List the sequences currently stored for this variant, before the manual records are added.
query = "SELECT * FROM sequence WHERE variant='H3.V_(Trypanosomatidae)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,AAO24601.1,H3.V_(Trypanosomatidae),,,,5702,Trypanosoma brucei brucei,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQAVARAPVKKVENTPPQKRHHRWR...,


In [170]:
# Hand-built record for the Trypanosoma cruzi sequence from the FASTA listing above; it is
# stored under a database-local HISTDB_* accession and its taxonomy fields are typed in manually.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
# Only the populated columns are listed here; every other column stays None.
data_sequence.update(
    {
        "accession": "HISTDB_H3_V_Trypanosomatidae_0",
        "variant": "H3.V_(Trypanosomatidae)",
        "taxonomy_id": 5693,
        "organism": "Trypanosoma cruzi",
        "phylum": "Euglenozoa",
        "class": "Kinetoplastea",
        "sequence": "MGSLKKVASVEKHSAFSSESKLPKPRKNLASRKIHTEGRIVAKKESASAGTRKKHRWRPGTVVLREVRRYQSSTEFLIAKAPFRRLVREIVSNLKDSFRMSATCVEALQESTELYVTSVLADANLCTLHANRVTVYPKDIQLALKLRGERL",
    }
)
cursor.execute(add_sequence, data_sequence)

In [171]:
# Hand-built record for the Leishmania major sequence from the FASTA listing above; it is
# stored under a database-local HISTDB_* accession and its taxonomy fields are typed in manually.
data_sequence = {
    "accession": "HISTDB_H3_V_Trypanosomatidae_1",
    "variant": "H3.V_(Trypanosomatidae)",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": 5664,
    "organism": "Leishmania major",
    "phylum": "Euglenozoa",
    # Fixed: the value used to be " Kinetoplastea" (leading space), which does not compare
    # equal to the "Kinetoplastea" stored for the sibling rows.
    # NOTE(review): a row already committed with the old value needs a separate UPDATE.
    "class": "Kinetoplastea",
    "taxonomy_group": None,
    "info": None,
    "sequence": "MAGITKAAVVASHPKKNVASRKMNKKSRSIAKKEAKAMRADSAGAKSRRWRPGTVALREVRKYQRSTELLIARTPFRRLVKEIMSTFKDTMHMRHSALEAMQDATESYLVSLLCDANLCTIHAKRVTLYPKDLQLALRLRGERT",
    "variant_under_consideration": None,
}
cursor.execute(add_sequence, data_sequence)

In [172]:
# List the variant's sequences again to confirm the manual records were inserted.
query = "SELECT * FROM sequence WHERE variant='H3.V_(Trypanosomatidae)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,AAO24601.1,H3.V_(Trypanosomatidae),,,,5702,Trypanosoma brucei brucei,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQAVARAPVKKVENTPPQKRHHRWR...,
1,HISTDB_H3_V_Trypanosomatidae_0,H3.V_(Trypanosomatidae),,,,5693,Trypanosoma cruzi,Euglenozoa,Kinetoplastea,,,MGSLKKVASVEKHSAFSSESKLPKPRKNLASRKIHTEGRIVAKKES...,
2,HISTDB_H3_V_Trypanosomatidae_1,H3.V_(Trypanosomatidae),,,,5664,Leishmania major,Euglenozoa,Kinetoplastea,,,MAGITKAAVVASHPKKNVASRKMNKKSRSIAKKEAKAMRADSAGAK...,


In [173]:
# Commit the two manually entered sequence rows inserted above.
conn.commit()

## Add sequence publication

In [174]:
# Check which of the cited publication ids already exist in the `publication` table.
pids = ["lowell_variant_2004"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
128,lowell_variant_2004,,,,,


In [175]:
# Link both manually entered HISTDB_* accessions to every publication id in `pids`.
manual_accessions = ["HISTDB_H3_V_Trypanosomatidae_0", "HISTDB_H3_V_Trypanosomatidae_1"]
for acc, pid in [(a, p) for a in manual_accessions for p in pids]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [176]:
# Verify the links: join sequences to their publications and show the two manual accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(["HISTDB_H3_V_Trypanosomatidae_0", "HISTDB_H3_V_Trypanosomatidae_1"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2584,HISTDB_H3_V_Trypanosomatidae_0,H3.V_(Trypanosomatidae),,,,5693.0,Trypanosoma cruzi,Euglenozoa,Kinetoplastea,,,MGSLKKVASVEKHSAFSSESKLPKPRKNLASRKIHTEGRIVAKKES...,,HISTDB_H3_V_Trypanosomatidae_0,lowell_variant_2004
2585,HISTDB_H3_V_Trypanosomatidae_1,H3.V_(Trypanosomatidae),,,,5664.0,Leishmania major,Euglenozoa,Kinetoplastea,,,MAGITKAAVVASHPKKNVASRKMNKKSRSIAKKEAKAMRADSAGAK...,,HISTDB_H3_V_Trypanosomatidae_1,lowell_variant_2004


In [177]:
# Commit the sequence-publication link(s) added above so they persist in the database.
conn.commit()

### <span style="color:black">Add sequence XP_828007.1 to H3.V_(Trypanosomatidae) [schulz_base_2016]</span>

In [178]:
# Protein accession(s) handled in this section; the cells below fetch them from NCBI and insert them.
accessions = ["XP_828007.1"]

## Add sequences to curatedDB

In [179]:
# Pre-insert check: load the `sequence` table and show any rows that already use these accessions.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [180]:
# Fetch each accession's GenBank protein record and turn it into a row for the `sequence` table.
data_sequence_list = []
sequence_fields = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
for accession_id in accessions:
    with Entrez.efetch(db="protein", id=accession_id, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only accession, variant and sequence are set directly.
    data_sequence = dict.fromkeys(sequence_fields)
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3.V_(Trypanosomatidae)"
    data_sequence["sequence"] = str(record.seq)
    # Merge in whatever get_taxonomy_data() returned (it overrides matching keys).
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)

# Print the last record together with each value's type as a pre-insert sanity check.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 185431
accession XP_828007.1 <class 'str'>
variant H3.V_(Trypanosomatidae) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 185431 <class 'int'>
organism Trypanosoma brucei brucei TREU927 <class 'str'>
phylum Euglenozoa <class 'str'>
class Kinetoplastea <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MAQMKKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWRPGTVALREIRRLQSSTDFLIQRAPFRRFLREVVSNLKDSYRMSAACVDAIQEATETYITSVFMDANLCTLHANRVTLFPKDIQLALKLRGERN <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [181]:
# Execute the `add_sequence` statement once per prepared record; the commit happens in a later cell.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [182]:
# Post-insert check: the accessions of this section should now appear in the `sequence` table.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
5933,XP_828007.1,H3.V_(Trypanosomatidae),,,,185431.0,Trypanosoma brucei brucei TREU927,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWR...,


In [183]:
# Commit the sequence row(s) inserted above so they persist in the database.
conn.commit()

## Add sequence publication

In [184]:
# Check which of the cited publication ids already exist in the `publication` table.
pids = ["schulz_base_2016"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [185]:
# Register the missing publication by id only; title, doi, author and year are passed as None.
data_publication = {
    "id": "schulz_base_2016",
    **dict.fromkeys(("title", "doi", "author", "year")),
}
cursor.execute(add_publication, data_publication)

In [186]:
# Re-run the publication lookup to confirm the id inserted above is now present.
pids = ["schulz_base_2016"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
159,schulz_base_2016,,,,,


In [187]:
# Link every accession of this section to every publication id in `pids`.
for acc, pid in [(a, p) for a in accessions for p in pids]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [188]:
# Verify the links: join sequences to their publications and show this section's accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
6558,XP_828007.1,H3.V_(Trypanosomatidae),,,,185431.0,Trypanosoma brucei brucei TREU927,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWR...,,XP_828007.1,schulz_base_2016


In [189]:
# Commit the new publication row and the sequence-publication link(s) added above.
conn.commit()

### <span style="color:black">Add fasta sequences to H3.V_(Trypanosomatidae)</span>

FASTA sequences from [siegel_four_2009]:

```fasta
>ACC histone H3 variant [Trypanosoma brucei]
MAQMKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWRPGTVALREIRRLQSSTDFLIQRAPFRRFLREVVSNLK
DSYRMSAACVDAIQEATETYITSVFMDANLCTLHANRVTLFPKDIQLALKLRGERN
```

In [190]:
# List the sequences currently stored for this variant, before the manual record is added.
query = "SELECT * FROM sequence WHERE variant='H3.V_(Trypanosomatidae)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,AAO24601.1,H3.V_(Trypanosomatidae),,,,5702,Trypanosoma brucei brucei,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQAVARAPVKKVENTPPQKRHHRWR...,
1,HISTDB_H3_V_Trypanosomatidae_0,H3.V_(Trypanosomatidae),,,,5693,Trypanosoma cruzi,Euglenozoa,Kinetoplastea,,,MGSLKKVASVEKHSAFSSESKLPKPRKNLASRKIHTEGRIVAKKES...,
2,HISTDB_H3_V_Trypanosomatidae_1,H3.V_(Trypanosomatidae),,,,5664,Leishmania major,Euglenozoa,Kinetoplastea,,,MAGITKAAVVASHPKKNVASRKMNKKSRSIAKKEAKAMRADSAGAK...,
3,XP_828007.1,H3.V_(Trypanosomatidae),,,,185431,Trypanosoma brucei brucei TREU927,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWR...,


In [191]:
# Hand-built record for the Trypanosoma brucei sequence from the FASTA listing above; it is
# stored under a database-local HISTDB_* accession and its taxonomy fields are typed in manually.
# NOTE(review): this sequence starts "MAQMKITPRPV", one K shorter than the "MAQMKKITPRPV" of
# XP_828007.1 printed earlier in the notebook - confirm it matches the cited paper.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
# Only the populated columns are listed here; every other column stays None.
data_sequence.update(
    {
        "accession": "HISTDB_H3_V_Trypanosomatidae_2",
        "variant": "H3.V_(Trypanosomatidae)",
        "taxonomy_id": 5691,
        "organism": "Trypanosoma brucei",
        "phylum": "Euglenozoa",
        "class": "Kinetoplastea",
        "sequence": "MAQMKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWRPGTVALREIRRLQSSTDFLIQRAPFRRFLREVVSNLKDSYRMSAACVDAIQEATETYITSVFMDANLCTLHANRVTLFPKDIQLALKLRGERN",
    }
)
cursor.execute(add_sequence, data_sequence)

In [192]:
# List the variant's sequences again to confirm the manual record was inserted.
query = "SELECT * FROM sequence WHERE variant='H3.V_(Trypanosomatidae)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,AAO24601.1,H3.V_(Trypanosomatidae),,,,5702,Trypanosoma brucei brucei,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQAVARAPVKKVENTPPQKRHHRWR...,
1,HISTDB_H3_V_Trypanosomatidae_0,H3.V_(Trypanosomatidae),,,,5693,Trypanosoma cruzi,Euglenozoa,Kinetoplastea,,,MGSLKKVASVEKHSAFSSESKLPKPRKNLASRKIHTEGRIVAKKES...,
2,HISTDB_H3_V_Trypanosomatidae_1,H3.V_(Trypanosomatidae),,,,5664,Leishmania major,Euglenozoa,Kinetoplastea,,,MAGITKAAVVASHPKKNVASRKMNKKSRSIAKKEAKAMRADSAGAK...,
3,HISTDB_H3_V_Trypanosomatidae_2,H3.V_(Trypanosomatidae),,,,5691,Trypanosoma brucei,Euglenozoa,Kinetoplastea,,,MAQMKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWRP...,
4,XP_828007.1,H3.V_(Trypanosomatidae),,,,185431,Trypanosoma brucei brucei TREU927,Euglenozoa,Kinetoplastea,,,MAQMKKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWR...,


In [193]:
# Commit the manually entered sequence row inserted above.
conn.commit()

## Add sequence publication

In [194]:
# Check which of the cited publication ids already exist in the `publication` table.
pids = ["siegel_four_2009"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [195]:
# Register the missing publication by id only; title, doi, author and year are passed as None.
data_publication = {
    "id": "siegel_four_2009",
    **dict.fromkeys(("title", "doi", "author", "year")),
}
cursor.execute(add_publication, data_publication)

In [196]:
# Re-run the publication lookup to confirm the id inserted above is now present.
pids = ["siegel_four_2009"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
166,siegel_four_2009,,,,,


In [197]:
# Link the manually entered HISTDB_* accession to every publication id in `pids`.
manual_accessions = ["HISTDB_H3_V_Trypanosomatidae_2"]
for acc, pid in [(a, p) for a in manual_accessions for p in pids]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [198]:
# Verify the link: join sequences to their publications and show the manual accession.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(["HISTDB_H3_V_Trypanosomatidae_2"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2586,HISTDB_H3_V_Trypanosomatidae_2,H3.V_(Trypanosomatidae),,,,5691.0,Trypanosoma brucei,Euglenozoa,Kinetoplastea,,,MAQMKITPRPVRPKSVASRPIQSVARAPVKKVENTPPQKRHHRWRP...,,HISTDB_H3_V_Trypanosomatidae_2,siegel_four_2009


In [199]:
# Commit the new publication row and the sequence-publication link(s) added above.
conn.commit()

## <span style="color:black">Add sequences to cH3_(Protists)</span>

### <span style="color:black">Add fasta sequences to cH3_(Protists)</span>

FASTA sequences from [lowell_variant_2004]:

```fasta
>ACC histone H3 variant [Trypanosoma brucei]
MSRTKETARTKKTITSKKSKKASKGSDAASGVKTAQRRWRPGTVALREIROFQRSTDLLLQKAPFQRLVREVSGAQKEGLRF
QSSAILAAQEATESYIVSLLADTNRACIHSGRVTIQPKDIHLALCLRGERA
```

In [200]:
# List the sequences currently stored for this variant, before the manual record is added.
query = "SELECT * FROM sequence WHERE variant='cH3_(Protists)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,AAF00592.1,cH3_(Protists),,,,5741,Giardia duodenalis,Fornicata,,,,MARTKHTARKTTSATKAPRKTIARKAARKTASSTSGIKKTGRKKQG...,


In [201]:
# Hand-built record for the Trypanosoma brucei sequence from the FASTA listing above; it is
# stored under a database-local HISTDB_* accession and its taxonomy fields are typed in manually.
data_sequence = {
    "accession": "HISTDB_cH3_Protists_0",
    "variant": "cH3_(Protists)",
    "gi": None,
    "ncbi_gene_id": None,
    "hgnc_gene_name": None,
    "taxonomy_id": 5691,
    "organism": "Trypanosoma brucei",
    "phylum": "Euglenozoa",
    "class": "Kinetoplastea",
    "taxonomy_group": None,
    "info": None,
    # NOTE(review): the motif "...REIROFQRSTD..." contains the letter "O", which is not one of
    # the 20 standard amino-acid codes. This looks like a transcription/OCR slip (possibly for
    # "Q") - verify against lowell_variant_2004 and correct the stored row if so.
    "sequence": "MSRTKETARTKKTITSKKSKKASKGSDAASGVKTAQRRWRPGTVALREIROFQRSTDLLLQKAPFQRLVREVSGAQKEGLRFQSSAILAAQEATESYIVSLLADTNRACIHSGRVTIQPKDIHLALCLRGERA",
    "variant_under_consideration": None,
}
# Executes the add_sequence statement with this record; nothing is committed in this cell.
cursor.execute(add_sequence, data_sequence)

In [202]:
# List the variant's sequences again to confirm the manual record was inserted.
query = "SELECT * FROM sequence WHERE variant='cH3_(Protists)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,AAF00592.1,cH3_(Protists),,,,5741,Giardia duodenalis,Fornicata,,,,MARTKHTARKTTSATKAPRKTIARKAARKTASSTSGIKKTGRKKQG...,
1,HISTDB_cH3_Protists_0,cH3_(Protists),,,,5691,Trypanosoma brucei,Euglenozoa,Kinetoplastea,,,MSRTKETARTKKTITSKKSKKASKGSDAASGVKTAQRRWRPGTVAL...,


In [203]:
# Commit the manually entered sequence row inserted above.
conn.commit()

## Add sequence publication

In [204]:
# Check which of the cited publication ids already exist in the `publication` table.
pids = ["lowell_variant_2004"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
128,lowell_variant_2004,,,,,


In [205]:
# Link the manually entered HISTDB_* accession to every publication id in `pids`.
manual_accessions = ["HISTDB_cH3_Protists_0"]
for acc, pid in [(a, p) for a in manual_accessions for p in pids]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [207]:
# Verify the link: join sequences to their publications and show the manual accession.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(["HISTDB_cH3_Protists_0"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2316,HISTDB_cH3_Protists_0,cH3_(Protists),,,,5691.0,Trypanosoma brucei,Euglenozoa,Kinetoplastea,,,MSRTKETARTKKTITSKKSKKASKGSDAASGVKTAQRRWRPGTVAL...,,HISTDB_cH3_Protists_0,lowell_variant_2004


In [208]:
# Commit the sequence-publication link(s) added above so they persist in the database.
conn.commit()

## <span style="color:black">Add sequences to H3.10_(Arabidopsis)</span>

### <span style="color:black">Add sequence NP_173418.1 to H3.10_(Arabidopsis) [borg_targeted_2020, alvarez-venegas_canonical_2019, okada_analysis_2005]</span>

In [209]:
# Protein accession(s) handled in this section; the cells below fetch them from NCBI and insert them.
accessions = ["NP_173418.1"]

## Add sequences to curatedDB

In [210]:
# Pre-insert check: load the `sequence` table and show any rows that already use these accessions.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [211]:
# Fetch each accession's GenBank protein record and turn it into a row for the `sequence` table.
data_sequence_list = []
sequence_fields = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
for accession_id in accessions:
    with Entrez.efetch(db="protein", id=accession_id, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only accession, variant and sequence are set directly.
    data_sequence = dict.fromkeys(sequence_fields)
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3.10_(Arabidopsis)"
    data_sequence["sequence"] = str(record.seq)
    # Merge in whatever get_taxonomy_data() returned (it overrides matching keys).
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)

# Print the last record together with each value's type as a pre-insert sanity check.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 3702
accession NP_173418.1 <class 'str'>
variant H3.10_(Arabidopsis) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 3702 <class 'int'>
organism Arabidopsis thaliana <class 'str'>
phylum Streptophyta <class 'str'>
class Magnoliopsida <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTARKSTGGKGPRKELATKAARKTRRPYRGGVKRAHRFRPGTVALREIRKYQKSTDLLIRKLPFQRLVREIAQDFKVDLRFQSHAVLALQEAAEAYLVGLFEDTNLCAIHAKRVTIMSKDIQLARRIRGERA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [212]:
# Execute the `add_sequence` statement once per prepared record; the commit happens in a later cell.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [213]:
# Post-insert check: the accessions of this section should now appear in the `sequence` table.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3575,NP_173418.1,H3.10_(Arabidopsis),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKGPRKELATKAARKTRRPYRGGVKRAHRFRPG...,


In [214]:
# Commit the sequence row(s) inserted above so they persist in the database.
conn.commit()

## Add sequence publication

In [216]:
# Check which of the cited publication ids already exist in the `publication` table.
pids = ["alvarez-venegas_canonical_2019", "okada_analysis_2005", "borg_targeted_2020"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,
148,okada_analysis_2005,,,,,


In [217]:
# Register the missing publication by id only; title, doi, author and year are passed as None.
data_publication = {
    "id": "borg_targeted_2020",
    **dict.fromkeys(("title", "doi", "author", "year")),
}
cursor.execute(add_publication, data_publication)

In [218]:
# Re-run the publication lookup to confirm all three cited ids are now present.
pids = ["alvarez-venegas_canonical_2019", "okada_analysis_2005", "borg_targeted_2020"]
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,
80,borg_targeted_2020,,,,,
149,okada_analysis_2005,,,,,


In [219]:
# Link every accession of this section to every publication id in `pids`.
for acc, pid in [(a, p) for a in accessions for p in pids]:
    cursor.execute(add_sequence_has_publication, (acc, pid))

In [220]:
# Verify the links: join sequences to their publications and show this section's accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
4008,NP_173418.1,H3.10_(Arabidopsis),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKGPRKELATKAARKTRRPYRGGVKRAHRFRPG...,,NP_173418.1,alvarez-venegas_canonical_2019
4009,NP_173418.1,H3.10_(Arabidopsis),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKGPRKELATKAARKTRRPYRGGVKRAHRFRPG...,,NP_173418.1,borg_targeted_2020
4010,NP_173418.1,H3.10_(Arabidopsis),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARKSTGGKGPRKELATKAARKTRRPYRGGVKRAHRFRPG...,,NP_173418.1,okada_analysis_2005


In [221]:
# Commit the new publication row and the sequence-publication link(s) added above.
conn.commit()

## <span style="color:black">Add sequences to H3.14_(Arabidopsis)</span>

### <span style="color:black">Add sequence NP_177690.1 to H3.14_(Arabidopsis) [alvarez-venegas_canonical_2019, nunez-vazquez_histone_2025]</span>

In [222]:
# Protein accession(s) handled in this section; the cells below fetch them from NCBI and insert them.
accessions = ["NP_177690.1"]

## Add sequences to curatedDB

In [223]:
# Pre-insert check: load the `sequence` table and show any rows that already use these accessions.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[col[0] for col in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [224]:
# Fetch each accession's GenBank protein record and turn it into a row for the `sequence` table.
data_sequence_list = []
sequence_fields = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
for accession_id in accessions:
    with Entrez.efetch(db="protein", id=accession_id, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Every column starts as None; only accession, variant and sequence are set directly.
    data_sequence = dict.fromkeys(sequence_fields)
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H3.14_(Arabidopsis)"
    data_sequence["sequence"] = str(record.seq)
    # Merge in whatever get_taxonomy_data() returned (it overrides matching keys).
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)

# Print the last record together with each value's type as a pre-insert sanity check.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 3702
accession NP_177690.1 <class 'str'>
variant H3.14_(Arabidopsis) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 3702 <class 'int'>
organism Arabidopsis thaliana <class 'str'>
phylum Streptophyta <class 'str'>
class Magnoliopsida <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MARTKQTARKSHGGKAPRTLLATKAARKSAPTTGGVKKPHRYRPGTVALREIRKYQKSTELLIRKLPFQRLVREIAQDYKTDLRFQSHAVLALQEAAEAYLVGLFEDTNLCAIHAKRVTIMPKDVQLARRIRGERA <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [225]:
# Run the `add_sequence` statement once per prepared row, passing the row
# dict as the statement's parameters.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [226]:
# After inserting: reload the `sequence` table and show the rows for the
# accessions of interest.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3579,NP_177690.1,H3.14_(Arabidopsis),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARKSHGGKAPRTLLATKAARKSAPTTGGVKKPHRYRPGT...,


In [227]:
# Commit the open transaction on `conn` so the sequence rows inserted above are persisted.
conn.commit()

## Add sequence publication

In [228]:
# Publication ids that the sequence(s) of this section should be linked to.
pids = ["alvarez-venegas_canonical_2019", "nunez-vazquez_histone_2025"]
# Plain string literal: the query has no placeholders, so no f-prefix is needed.
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# Show which of the wanted publication ids already exist in the table.
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,


In [229]:
# Register the missing publication by id only; every other field stays None.
data_publication = dict.fromkeys(("id", "title", "doi", "author", "year"))
data_publication["id"] = "nunez-vazquez_histone_2025"
cursor.execute(add_publication, data_publication)

In [230]:
# Publication ids that the sequence(s) of this section should be linked to.
pids = ["alvarez-venegas_canonical_2019", "nunez-vazquez_histone_2025"]
# Plain string literal: the query has no placeholders, so no f-prefix is needed.
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# Confirm that every wanted publication id is now present in the table.
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
74,alvarez-venegas_canonical_2019,,,,,
148,nunez-vazquez_histone_2025,,,,,


In [231]:
# Link every accession to every publication id (accession-major order),
# one `add_sequence_has_publication` execution per (accession, publication) pair.
links = [(acc, pid) for acc in accessions for pid in pids]
for link in links:
    cursor.execute(add_sequence_has_publication, link)

In [232]:
# Join sequences with their publication links and show the rows for the
# accessions of interest (one row per linked publication).
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
4017,NP_177690.1,H3.14_(Arabidopsis),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARKSHGGKAPRTLLATKAARKSAPTTGGVKKPHRYRPGT...,,NP_177690.1,alvarez-venegas_canonical_2019
4018,NP_177690.1,H3.14_(Arabidopsis),,,,3702.0,Arabidopsis thaliana,Streptophyta,Magnoliopsida,,,MARTKQTARKSHGGKAPRTLLATKAARKSAPTTGGVKKPHRYRPGT...,,NP_177690.1,nunez-vazquez_histone_2025


In [233]:
# Commit the open transaction on `conn` so the publication row and links added above are persisted.
conn.commit()

# DONE H4

## <span style="color:black">Add sequences to H4.G_(Hominidae)</span>

### <span style="color:black">Add sequence NP_003538.1 as H4.G_(Hominidae) [ding_primate-specific_2021]</span>

In [234]:
# Protein accession(s) to fetch from NCBI and store under H4.G_(Hominidae).
accessions = ["NP_003538.1"]

## Add sequences to curatedDB

In [235]:
# Before inserting: load the `sequence` table and show any rows already
# stored for the accessions of interest.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [236]:
data_sequence_list = []
for accession in accessions:
    # Download the GenBank protein record for this accession and parse it.
    with Entrez.efetch(db="protein", id=accession, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)
    # Start from a row whose fields are all None, then fill in what is known.
    data_sequence = dict.fromkeys(
        (
            "accession",
            "variant",
            "gi",
            "ncbi_gene_id",
            "hgnc_gene_name",
            "taxonomy_id",
            "organism",
            "phylum",
            "class",
            "taxonomy_group",
            "info",
            "sequence",
            "variant_under_consideration",
        )
    )
    data_sequence["accession"] = record.id
    data_sequence["variant"] = "H4.G_(Hominidae)"
    data_sequence["sequence"] = str(record.seq)
    # Keys returned by get_taxonomy_data overwrite the matching entries.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)
# Sanity check: print every field of the most recently built row with its type.
for field, value in data_sequence_list[-1].items():
    print(field, value, type(value))

Fetched taxid from NCBI 9606
accession NP_003538.1 <class 'str'>
variant H4.G_(Hominidae) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 9606 <class 'int'>
organism Homo sapiens <class 'str'>
phylum Chordata <class 'str'>
class Mammalia <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MSVRGKAGKGLGKGGAKCHRKVLSDNIQGITKCTIRRLARHGGVKRILGLIYEETRRVFKVFLENVIWYAVTNTEHAKRKTVTAMAVVYVLKRQGRTL <class 'str'>
variant_under_consideration None <class 'NoneType'>


In [237]:
# Run the `add_sequence` statement once per prepared row, passing the row
# dict as the statement's parameters.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [238]:
# After inserting: reload the `sequence` table and show the rows for the
# accessions of interest.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3520,NP_003538.1,H4.G_(Hominidae),,,,9606.0,Homo sapiens,Chordata,Mammalia,,,MSVRGKAGKGLGKGGAKCHRKVLSDNIQGITKCTIRRLARHGGVKR...,


In [239]:
# Commit the open transaction on `conn` so the sequence rows inserted above are persisted.
conn.commit()

## Add sequence publication

In [240]:
# Publication ids that the sequence(s) of this section should be linked to.
pids = ["ding_primate-specific_2021"]
# Plain string literal: the query has no placeholders, so no f-prefix is needed.
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# Show which of the wanted publication ids already exist in the table.
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [241]:
# Register the missing publication by id only; every other field stays None.
data_publication = dict.fromkeys(("id", "title", "doi", "author", "year"))
data_publication["id"] = "ding_primate-specific_2021"
cursor.execute(add_publication, data_publication)

In [242]:
# Publication ids that the sequence(s) of this section should be linked to.
pids = ["ding_primate-specific_2021"]
# Plain string literal: the query has no placeholders, so no f-prefix is needed.
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# Confirm that every wanted publication id is now present in the table.
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
91,ding_primate-specific_2021,,,,,


In [243]:
# Link every accession to every publication id (accession-major order),
# one `add_sequence_has_publication` execution per (accession, publication) pair.
links = [(acc, pid) for acc in accessions for pid in pids]
for link in links:
    cursor.execute(add_sequence_has_publication, link)

In [244]:
# Join sequences with their publication links and show the rows for the
# accessions of interest (one row per linked publication).
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3933,NP_003538.1,H4.G_(Hominidae),,,,9606.0,Homo sapiens,Chordata,Mammalia,,,MSVRGKAGKGLGKGGAKCHRKVLSDNIQGITKCTIRRLARHGGVKR...,,NP_003538.1,ding_primate-specific_2021


In [245]:
# Commit the open transaction on `conn` so the publication row and link added above are persisted.
conn.commit()

## <span style="color:black">Add sequences to H4.V_(Trypanosomatidae)</span>

### <span style="color:black">Add fasta sequences to H4.V_(Trypanosomatidae)</span>

FASTA sequence from [siegel_four_2009]:

```fasta
>ACC histone H4 variant [Trypanosoma brucei]
MAKGKRVGESKGAQKRQKKVLRDNVRGITRGSIRRLARRAGVKRISGVIYDEVRGVLKTFVESIVRDAGAYTEYSRKKTVTAAHVVFALRKRGKVLYGYD
```

In [246]:
# Before inserting: list the sequences already stored for this variant.
query = "SELECT * FROM sequence WHERE variant='H4.V_(Trypanosomatidae)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [247]:
# Hand-entered record: accession, taxonomy and sequence are literals here
# rather than values fetched through Entrez.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(
    {
        "accession": "HISTDB_H4_V_Trypanosomatidae_0",
        "variant": "H4.V_(Trypanosomatidae)",
        "taxonomy_id": 5691,
        "organism": "Trypanosoma brucei",
        "phylum": "Euglenozoa",
        "class": "Kinetoplastea",
        "sequence": "MAKGKRVGESKGAQKRQKKVLRDNVRGITRGSIRRLARRAGVKRISGVIYDEVRGVLKTFVESIVRDAGAYTEYSRKKTVTAAHVVFALRKRGKVLYGYD",
    }
)
cursor.execute(add_sequence, data_sequence)

In [248]:
# After inserting: list the sequences now stored for this variant.
query = "SELECT * FROM sequence WHERE variant='H4.V_(Trypanosomatidae)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,HISTDB_H4_V_Trypanosomatidae_0,H4.V_(Trypanosomatidae),,,,5691,Trypanosoma brucei,Euglenozoa,Kinetoplastea,,,MAKGKRVGESKGAQKRQKKVLRDNVRGITRGSIRRLARRAGVKRIS...,


In [249]:
# Commit the open transaction on `conn` so the sequence row inserted above is persisted.
conn.commit()

## Add sequence publication

In [250]:
# Publication ids that the sequence(s) of this section should be linked to.
pids = ["siegel_four_2009"]
# Plain string literal: the query has no placeholders, so no f-prefix is needed.
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# Show which of the wanted publication ids already exist in the table.
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
169,siegel_four_2009,,,,,


In [251]:
# Link the hand-entered accession to every publication id in `pids`,
# one `add_sequence_has_publication` execution per pair.
links = [("HISTDB_H4_V_Trypanosomatidae_0", pid) for pid in pids]
for link in links:
    cursor.execute(add_sequence_has_publication, link)

In [252]:
# Join sequences with their publication links and show the rows for the
# hand-entered accession (one row per linked publication).
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(["HISTDB_H4_V_Trypanosomatidae_0"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2675,HISTDB_H4_V_Trypanosomatidae_0,H4.V_(Trypanosomatidae),,,,5691.0,Trypanosoma brucei,Euglenozoa,Kinetoplastea,,,MAKGKRVGESKGAQKRQKKVLRDNVRGITRGSIRRLARRAGVKRIS...,,HISTDB_H4_V_Trypanosomatidae_0,siegel_four_2009


In [253]:
# Commit the open transaction on `conn` so the publication link added above is persisted.
conn.commit()

## <span style="color:black">Add sequences to cH4_(Protists)</span>

### <span style="color:black">Add fasta sequences to cH4_(Protists)</span>

FASTA sequence from [siegel_four_2009]:

```fasta
>ACC histone H4 variant [Trypanosoma brucei]
MAKGKKSGEAKGSQKRQKKVLRENVRGITRGSIRRLARRGGVKRISGVIYDEVRGVLKSFVEGVVRDATAYTEYSRKKTVTAVDVVNALRKRGKILYGYA
```

In [254]:
# Before inserting: list the sequences already stored for this variant.
query = "SELECT * FROM sequence WHERE variant='cH4_(Protists)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [255]:
# Hand-entered record: accession, taxonomy and sequence are literals here
# rather than values fetched through Entrez.
data_sequence = dict.fromkeys(
    (
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    )
)
data_sequence.update(
    {
        "accession": "HISTDB_cH4_Protists_0",
        "variant": "cH4_(Protists)",
        "taxonomy_id": 5691,
        "organism": "Trypanosoma brucei",
        "phylum": "Euglenozoa",
        "class": "Kinetoplastea",
        "sequence": "MAKGKKSGEAKGSQKRQKKVLRENVRGITRGSIRRLARRGGVKRISGVIYDEVRGVLKSFVEGVVRDATAYTEYSRKKTVTAVDVVNALRKRGKILYGYA",
    }
)
cursor.execute(add_sequence, data_sequence)

In [256]:
# After inserting: list the sequences now stored for this variant.
query = "SELECT * FROM sequence WHERE variant='cH4_(Protists)'"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
0,HISTDB_cH4_Protists_0,cH4_(Protists),,,,5691,Trypanosoma brucei,Euglenozoa,Kinetoplastea,,,MAKGKKSGEAKGSQKRQKKVLRENVRGITRGSIRRLARRGGVKRIS...,


In [257]:
# Commit the open transaction on `conn` so the sequence row inserted above is persisted.
conn.commit()

## Add sequence publication

In [258]:
# Publication ids that the sequence(s) of this section should be linked to.
pids = ["siegel_four_2009"]
# Plain string literal: the query has no placeholders, so no f-prefix is needed.
query = "SELECT * FROM publication"
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
# Show which of the wanted publication ids already exist in the table.
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
169,siegel_four_2009,,,,,


In [259]:
# Link the hand-entered accession to every publication id in `pids`,
# one `add_sequence_has_publication` execution per pair.
links = [("HISTDB_cH4_Protists_0", pid) for pid in pids]
for link in links:
    cursor.execute(add_sequence_has_publication, link)

In [260]:
# Join sequences with their publication links and show the rows for the
# hand-entered accession (one row per linked publication).
query = "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(["HISTDB_cH4_Protists_0"])]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2317,HISTDB_cH4_Protists_0,cH4_(Protists),,,,5691.0,Trypanosoma brucei,Euglenozoa,Kinetoplastea,,,MAKGKKSGEAKGSQKRQKKVLRENVRGITRGSIRRLARRGGVKRIS...,,HISTDB_cH4_Protists_0,siegel_four_2009


In [261]:
# Commit the open transaction on `conn` so the publication link added above is persisted.
conn.commit()

# Close connections

In [262]:
# Release resources in reverse order of acquisition: the cursor, then the
# MySQL connection, then the SSH tunnel. The nested try/finally ensures that
# a failure while closing one resource does not leave the later ones open
# (in particular, the tunnel is always stopped).
try:
    cursor.close()
finally:
    try:
        conn.close()
    finally:
        tunnel.stop()