In [62]:
import io
from io import StringIO

import pandas as pd
from Bio import Entrez, SeqIO
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

# Set your e-mail address (required for using Entrez)
Entrez.email = "your.email@example.com"

In [2]:
# Load the SSH / database connection settings from a plain ``key=value`` file.
# Blank lines and lines starting with "#" are ignored.
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if not line or line.startswith("#"):
        continue
    key, separator, value = line.partition("=")
    if not separator:
        raise ValueError(
            f"Malformed line in db_curated_server_info.txt (expected key=value): {line!r}"
        )
    # Strip the key too, so that "server_name = host" is stored under
    # "server_name" and not under "server_name ".
    config[key.strip()] = value.strip()

# Both ports are converted with int(), so they are mandatory; report a missing
# one by name instead of failing with an opaque ``int(None)`` TypeError.
missing_ports = [name for name in ("srever_port", "db_port") if name not in config]
if missing_ports:
    raise KeyError(
        f"Missing setting(s) in db_curated_server_info.txt: {', '.join(missing_ports)}"
    )

# NOTE: "srever_port" and "db_adress" are the spellings used as keys in the
# settings file and as variable names in later cells; do not "fix" only one side.
server_name = config.get("server_name")
srever_port = int(config["srever_port"])
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config["db_port"])

In [3]:
# Open an SSH tunnel to the server named in the settings file and forward a
# local port to the database address/port given there.
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
# Local end of the tunnel; the MySQL connection below is opened against this port.
print(tunnel.local_bind_port)

34153


In [4]:
# Connect to MySQL through the local end of the SSH tunnel.
# The database user, password and schema name are taken from the settings file
# (optional keys ``db_user``, ``db_password``, ``db_name``) so that real
# credentials never have to be written into the notebook; the previous literal
# values remain as fallbacks when a key is absent.
conn = connection.MySQLConnection(
    user=config.get("db_user", "db_user"),
    password=config.get("db_password", "db_password"),
    host="localhost",
    port=tunnel.local_bind_port,
    database=config.get("db_name", "db_name"),
)
cursor = conn.cursor()

In [5]:
# List the tables of the connected database.
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
def _insert_statement(table, columns, named=True):
    """Return ``INSERT INTO <table> (<columns>) VALUES (<placeholders>)``.

    With ``named=True`` every placeholder is ``%(<column>)s`` (parameters are
    then passed as a dict); otherwise plain ``%s`` placeholders are produced
    (parameters are then passed as a tuple).
    """
    if named:
        placeholders = [f"%({column})s" for column in columns]
    else:
        placeholders = ["%s"] * len(columns)
    return (
        f"INSERT INTO {table} ({', '.join(columns)}) "
        f"VALUES ({', '.join(placeholders)})"
    )


# Templates used in this notebook.
add_publication = _insert_statement(
    "publication", ["id", "title", "doi", "author", "year"]
)
add_sequence = _insert_statement(
    "sequence",
    [
        "accession",
        "variant",
        "gi",
        "ncbi_gene_id",
        "hgnc_gene_name",
        "taxonomy_id",
        "organism",
        "phylum",
        "class",
        "taxonomy_group",
        "info",
        "sequence",
        "variant_under_consideration",
    ],
)
add_sequence_has_publication = _insert_statement(
    "sequence_has_publication",
    ["sequence_accession", "publication_id"],
    named=False,
)

# Templates for the other tables, kept commented out as in the original cell.
# add_histone = (
#     "INSERT INTO histone "
#     "(id, level, taxonomic_span, taxonomic_span_id, description, parent) "
#     "VALUES (%(id)s, %(level)s, %(taxonomic_span)s, %(taxonomic_span_id)s, %(description)s, %(parent)s)"
# )
# add_histone_description = (
#     "INSERT INTO histone_description "
#     "(summary, taxonomy, genes, evolution, expression, knock_out, function, sequence, localization, deposition, structure, interactions, disease, caveats) "
#     "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
# )
# add_alternate_names = (
#     "INSERT INTO alternative_name "
#     "(name, taxonomy, gene, splice, histone) "
#     "VALUES (%(name)s, %(taxonomy)s, %(gene)s, %(splice)s, %(histone)s)"
# )
# add_histone_has_publication = (
#     "INSERT INTO histone_has_publication "
#     "(histone_id, publication_id) "
#     "VALUES (%s, %s)"
# )

In [7]:
def get_taxonomy_data(record, max_attempts=10, retry_delay=0.0):
    """Collect organism, NCBI taxonomy id, phylum and class for a sequence record.

    Parameters
    ----------
    record
        Object exposing ``annotations["organism"]`` and, on its first feature,
        ``qualifiers["db_xref"]`` entries of the form ``"<db>:<id>"``
        (this notebook passes records read with ``SeqIO.read(handle, "genbank")``).
    max_attempts : int, default 10
        How many times the ``taxonomy`` database is queried through
        ``Entrez.efetch`` before giving up.
    retry_delay : float, default 0.0
        Seconds to wait after a failed attempt before the next one.

    Returns
    -------
    dict
        Keys ``organism``, ``taxonomy_id``, ``phylum`` and ``class``.
        ``taxonomy_id`` is ``1`` when no usable ``taxon:<id>`` cross-reference
        is found; ``phylum`` / ``class`` are ``str`` or ``None``.

    Notes
    -----
    The lineage lookup is skipped when the taxonomy id is unknown: the
    placeholder id ``1`` cannot yield a phylum or class, so both stay ``None``
    without spending network requests.
    """
    import time

    taxonomy_data = {"organism": record.annotations["organism"]}

    # The cross-references are expected on the first feature; a record without
    # features or without db_xref simply has no taxid.
    try:
        db_xrefs = record.features[0].qualifiers["db_xref"]
    except (AttributeError, IndexError, KeyError, TypeError):
        db_xrefs = []

    taxid = None
    for xref in db_xrefs:
        db_name, _, identifier = str(xref).partition(":")
        if db_name != "taxon":
            continue
        try:
            taxid = int(identifier)
        except ValueError:
            # Malformed taxon entry: ignore it, keep any taxid found earlier.
            continue
        print(f"Fetched taxid from NCBI {identifier}")

    taxid_known = taxid is not None
    if not taxid_known:
        print("!!!!!!Unable to get TAXID for this record setting it to 1")
        taxid = 1  # unable to identify
    taxonomy_data["taxonomy_id"] = taxid

    lineage = {}
    attempts = max_attempts if taxid_known else 0
    for attempt in range(attempts):
        try:
            # Context manager closes the HTTP handle even when parsing fails.
            with Entrez.efetch(id=taxid, db="taxonomy", retmode="xml") as handle:
                tax_data = Entrez.read(handle)
            lineage = {
                node["Rank"]: node["ScientificName"]
                for node in tax_data[0]["LineageEx"]
                if node["Rank"] in ("class", "phylum")
            }
            break
        except Exception as error:  # network and parsing errors are retried
            print(
                "Unexpected error: {}, Retrying, attempt {}".format(
                    type(error), attempt
                )
            )
            if attempt == attempts - 1:
                print(
                    f"FATAL ERROR could not get class and phylum from NCBI after {attempts} attempts for taxid:{taxid}. Will add None for class and phylum!"
                )
            elif retry_delay > 0:
                time.sleep(retry_delay)

    # Convert the parsed values to plain ``str`` before they go into the database.
    for rank in ("phylum", "class"):
        value = lineage.get(rank)
        taxonomy_data[rank] = None if value is None else str(value)
    return taxonomy_data

# To Do

## <span style="color:green">Add sequences for nodes cenH3.1_(Drosophilidae), cenH3.2_(Drosophila_eugracilis), cenH3.3_(Montium), cenH3.4_(Montium), cenH3.5_(Drosophila), cenH3.6_(Repleta)</span>
    Accessions: ARC76671.1 to ARC76850.1, ARC76868.1 to ARC76927.1, NP_523730.2
    Publications: kursel_recurrent_2017 (all seqs), kursel_gametic_2021 (Cid1 and Cid5), teixeira_concurrent_2018 (NP_523730.2, see below for others)

## <span style="color:green">Add sequences for nodes cenH3.1_(Culicidae), cenH3.2_(Culicidae), cenH3.3_(Aedes)</span>
    See nucleotide sequences below


# <span style="color:black">Add sequences for nodes cenH3.1_(Drosophilidae), cenH3.2_(Drosophila_eugracilis), cenH3.3_(Montium), cenH3.4_(Montium), cenH3.5_(Drosophila), cenH3.6_(Repleta)</span>
    Accessions: ARC76671.1 to ARC76850.1, ARC76868.1 to ARC76927.1, NP_523730.2, see below for others
    Publications: kursel_recurrent_2017 (ARC76671.1 to ARC76850.1, ARC76868.1 to ARC76927.1, NP_523730.2), kursel_gametic_2021 (Cid1 and Cid5), teixeira_concurrent_2018 (NP_523730.2, see below for others)

## Add accessions ARC76671.1 to ARC76850.1, ARC76868.1 to ARC76927.1

Publications: kursel_recurrent_2017 (all seqs), kursel_gametic_2021 (Cid1 and Cid5)

In [10]:
# Inclusive numeric ranges of the accessions ARC76671.1-ARC76850.1 and
# ARC76868.1-ARC76927.1.
accession_ranges = [(671, 850), (868, 927)]
accessions = [
    f"ARC76{number}.1"
    for first, last in accession_ranges
    for number in range(first, last + 1)
]

### Add sequences to curatedDB

In [11]:
# Check which of the accessions are already present in the `sequence` table.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [19]:
# Variant node -> name prefixes that map to it. The prefix is the first word of
# a record description (see the lookup in the fetch loop).
aliases_by_variant = {
    "cenH3.1_(Drosophilidae)": ("Cid", "Cid1"),
    "cenH3.2_(Drosophila_eugracilis)": ("Cid2",),
    "cenH3.3_(Montium)": ("Cid3", "id3"),
    "cenH3.4_(Montium)": ("Cid4",),
    "cenH3.5_(Drosophila)": ("Cid5",),
}
var_name_dict = {
    alias: variant
    for variant, aliases in aliases_by_variant.items()
    for alias in aliases
}

In [22]:
# Fetch every accession from the NCBI protein database and turn it into one
# row (dict) for the `sequence` table.
data_sequence_list = []
for a in accessions:
    with Entrez.efetch(db="protein", id=a, rettype="gb", retmode="text") as handle:
        record = SeqIO.read(handle, "genbank")
    taxonomy_data = get_taxonomy_data(record)

    # Start with every column set to None, in the column order of `add_sequence`.
    data_sequence = dict.fromkeys(
        [
            "accession",
            "variant",
            "gi",
            "ncbi_gene_id",
            "hgnc_gene_name",
            "taxonomy_id",
            "organism",
            "phylum",
            "class",
            "taxonomy_group",
            "info",
            "sequence",
            "variant_under_consideration",
        ]
    )
    data_sequence["accession"] = record.id
    # The first word of the description selects the variant node.
    data_sequence["variant"] = var_name_dict[record.description.split()[0]]
    data_sequence["sequence"] = str(record.seq)
    # Fill organism / taxonomy_id / phylum / class.
    data_sequence.update(taxonomy_data)
    data_sequence_list.append(data_sequence)

# Show the last row together with the Python type of each value.
for column, value in data_sequence_list[-1].items():
    print(column, value, type(value))

Fetched taxid from NCBI 29029
Fetched taxid from NCBI 29029
Fetched taxid from NCBI 29029
Fetched taxid from NCBI 29029
Fetched taxid from NCBI 29029
Fetched taxid from NCBI 29029
Fetched taxid from NCBI 29029
Fetched taxid from NCBI 29029
Fetched taxid from NCBI 47315
Fetched taxid from NCBI 137354
Fetched taxid from NCBI 112146
Fetched taxid from NCBI 46829
Fetched taxid from NCBI 103765
Fetched taxid from NCBI 61426
Fetched taxid from NCBI 137072
Fetched taxid from NCBI 137074
Fetched taxid from NCBI 73157
Fetched taxid from NCBI 60717
Fetched taxid from NCBI 67533
Fetched taxid from NCBI 73917
Fetched taxid from NCBI 7274
Fetched taxid from NCBI 67534
Fetched taxid from NCBI 132243
Fetched taxid from NCBI 94109
Fetched taxid from NCBI 40366
Fetched taxid from NCBI 40368
Fetched taxid from NCBI 47313
Fetched taxid from NCBI 40367
Fetched taxid from NCBI 50033
Fetched taxid from NCBI 40369
Fetched taxid from NCBI 47316
Fetched taxid from NCBI 40371
Fetched taxid from NCBI 40370
Fetch

In [23]:
# Sanity check: number of requested accessions vs. number of rows built.
len(accessions), len(data_sequence_list)

(240, 240)

In [24]:
# Insert the prepared rows one by one with the named-parameter template
# (the commit happens in a later cell).
for ds in data_sequence_list:
    cursor.execute(add_sequence, ds)

In [25]:
# Read the `sequence` table back and show the rows for our accessions.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
494,ARC76671.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQAKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,
495,ARC76672.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQVKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,
496,ARC76673.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQAKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,
497,ARC76674.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQVKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,
498,ARC76675.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQAKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,ARC76923.1,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,
730,ARC76924.1,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,
731,ARC76925.1,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,
732,ARC76926.1,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,


In [28]:
# Number of inserted sequences per variant node.
df.loc[df["accession"].isin(accessions), "variant"].value_counts()

variant
cenH3.1_(Drosophilidae)            91
cenH3.3_(Montium)                  50
cenH3.4_(Montium)                  50
cenH3.5_(Drosophila)               41
cenH3.2_(Drosophila_eugracilis)     8
Name: count, dtype: int64

In [29]:
# Commit the open transaction so the sequence rows inserted above are persisted.
conn.commit()

### Add sequence publication

In [30]:
# Publication ids to attach to the new sequences; check whether they already
# exist in the `publication` table.
pids = ["kursel_recurrent_2017", "kursel_gametic_2021"]
query = "SELECT * FROM publication"  # plain string: nothing to interpolate
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [31]:
# One row per publication id; the bibliographic fields are left as None.
data_publication = [
    {"id": pid, **dict.fromkeys(("title", "doi", "author", "year"))}
    for pid in pids
]

In [32]:
# Insert the publication rows with the named-parameter template.
for dp in data_publication:
    cursor.execute(add_publication, dp)

In [33]:
# Read the `publication` table back and show the rows for `pids`.
query = "SELECT * FROM publication"  # plain string: nothing to interpolate
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"].isin(pids)]

Unnamed: 0,id,title,doi,author,year,pubmed_id
117,kursel_gametic_2021,,,,,
118,kursel_recurrent_2017,,,,,


In [38]:
# Every sequence is linked to kursel_recurrent_2017; sequences of the
# cenH3.1 / cenH3.5 nodes are additionally linked to kursel_gametic_2021.
gametic_variants = ["cenH3.1_(Drosophilidae)", "cenH3.5_(Drosophila)"]
for ds in data_sequence_list:
    linked_publications = ["kursel_recurrent_2017"]
    if ds["variant"] in gametic_variants:
        linked_publications.append("kursel_gametic_2021")
    for publication_id in linked_publications:
        cursor.execute(
            add_sequence_has_publication, (ds["accession"], publication_id)
        )

In [39]:
# Join sequences with their publication links and show our accessions.
query = " ".join(
    [
        "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp",
        "ON s.accession = sp.sequence_accession ",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
595,ARC76671.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQAKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,,ARC76671.1,kursel_recurrent_2017
596,ARC76672.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQVKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,,ARC76672.1,kursel_recurrent_2017
597,ARC76673.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQAKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,,ARC76673.1,kursel_recurrent_2017
598,ARC76674.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQVKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,,ARC76674.1,kursel_recurrent_2017
599,ARC76675.1,cenH3.2_(Drosophila_eugracilis),,,,29029.0,Drosophila eugracilis,Arthropoda,Insecta,,,MPRKSGAKRATNQAKPTLGDTDAESDDNTAFQSPEPDDDTDYGLEF...,,ARC76675.1,kursel_recurrent_2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
962,ARC76925.1,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,,ARC76925.1,kursel_recurrent_2017
963,ARC76926.1,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,,ARC76926.1,kursel_gametic_2021
964,ARC76926.1,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,,ARC76926.1,kursel_recurrent_2017
965,ARC76927.1,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,,ARC76927.1,kursel_gametic_2021


In [40]:
# Number of sequence links per publication.
df.loc[df["accession"].isin(accessions), "publication_id"].value_counts()

publication_id
kursel_recurrent_2017    240
kursel_gametic_2021      132
Name: count, dtype: int64

In [41]:
# Commit the open transaction so the publications and sequence-publication
# links inserted above are persisted.
conn.commit()

## Add accession NP_523730.2

Publications: kursel_recurrent_2017, kursel_gametic_2021, teixeira_concurrent_2018

In [45]:
# Accession handled separately in the cells below.
accession = "NP_523730.2"

In [46]:
# Show the current `sequence` row of the accession.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"] == accession]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3526,NP_523730.2,cenH3_(Animals),22024004,,,7227.0,Drosophila melanogaster,Arthropoda,Insecta,,,MPRHSRAKRAPRPSANNSKSPNDDDTAFRSPEPEDGTDYGLEFTTS...,


### Update sequence

In [48]:
# Re-assign the existing record to the cenH3.1_(Drosophilidae) node.
# The values are passed as bound parameters (as with the INSERT templates
# defined earlier in this notebook) instead of being formatted into the SQL text.
query = "UPDATE sequence SET variant=%s WHERE accession=%s"
params = ("cenH3.1_(Drosophilidae)", accession)
print(query, params)
cursor.execute(query, params)

UPDATE sequence SET variant='cenH3.1_(Drosophilidae)' WHERE accession='NP_523730.2'


In [49]:
# Read the row back to confirm the new variant value.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"] == accession]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
3526,NP_523730.2,cenH3.1_(Drosophilidae),22024004,,,7227.0,Drosophila melanogaster,Arthropoda,Insecta,,,MPRHSRAKRAPRPSANNSKSPNDDDTAFRSPEPEDGTDYGLEFTTS...,


In [50]:
# Commit the open transaction so the UPDATE executed above is persisted.
conn.commit()

### Add sequence publication

In [52]:
# Check whether the publication already exists in the `publication` table.
pid = "teixeira_concurrent_2018"
query = "SELECT * FROM publication"  # plain string: nothing to interpolate
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"] == pid]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [53]:
# Single publication row; the bibliographic fields are left as None.
data_publication = [dict(id=pid, title=None, doi=None, author=None, year=None)]

In [54]:
# Insert the publication row with the named-parameter template.
for dp in data_publication:
    cursor.execute(add_publication, dp)

In [55]:
# Read the `publication` table back and show the row for `pid`.
query = "SELECT * FROM publication"  # plain string: nothing to interpolate
cursor.execute(query)
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"] == pid]

Unnamed: 0,id,title,doi,author,year,pubmed_id
172,teixeira_concurrent_2018,,,,,


In [None]:
# Link the accession to each of its three publications, in this order.
for publication_id in (
    "kursel_recurrent_2017",
    "kursel_gametic_2021",
    "teixeira_concurrent_2018",
):
    cursor.execute(add_sequence_has_publication, (accession, publication_id))

In [60]:
# Join sequences with their publication links and show the accession's rows.
query = " ".join(
    [
        "SELECT * FROM sequence s LEFT JOIN sequence_has_publication sp",
        "ON s.accession = sp.sequence_accession ",
    ]
)
cursor.execute(query)
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=[column[0] for column in cursor.description])
df.loc[df["accession"] == accession]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
3961,NP_523730.2,cenH3.1_(Drosophilidae),22024004,,,7227.0,Drosophila melanogaster,Arthropoda,Insecta,,,MPRHSRAKRAPRPSANNSKSPNDDDTAFRSPEPEDGTDYGLEFTTS...,,NP_523730.2,kursel_gametic_2021
3962,NP_523730.2,cenH3.1_(Drosophilidae),22024004,,,7227.0,Drosophila melanogaster,Arthropoda,Insecta,,,MPRHSRAKRAPRPSANNSKSPNDDDTAFRSPEPEDGTDYGLEFTTS...,,NP_523730.2,kursel_recurrent_2017
3963,NP_523730.2,cenH3.1_(Drosophilidae),22024004,,,7227.0,Drosophila melanogaster,Arthropoda,Insecta,,,MPRHSRAKRAPRPSANNSKSPNDDDTAFRSPEPEDGTDYGLEFTTS...,,NP_523730.2,teixeira_concurrent_2018


In [61]:
# Commit the open transaction so the publication row and the links added
# above are persisted.
conn.commit()

## Add sequences from teixeira_concurrent_2018 (see below for others)



### Выравнивания из статьи teixeira_concurrent_2018

```
>Dnav_Cid1
ATGATTCATTCAGATAGCATTGCTGATGAGGAAAGTGCATTCCAAACGCCAGAGCACGACAACGAGACTGACTATGGATTAGAATTTACCACTAGTCGTACTAGTCGTACCACTCGTAAATGCTCAACGCTGCGCAAGAACTCTTCAGGAAACCGTGCTCAAAAGATCACAATAGATAATGTTAGCAGCGATGAAGAAAACTTATCACCGATAGCAACCCCTCCTAGTAGGCAACGCCAACAATCATCAGTATCCAACAAACATAACCCAAAGCAGCCGCCAGCAACCGCAAAATCATCGAGACGCAAAAAAAATGCTCCAGAGCATCGTTTGAAAAAACTGCACCGTGAAATTGAATGTTTACAGAAGCACCAGGGCTTTTTGATACCCCGATTAGCATTTTCACGCCTGTTACGTGAAATTTTGATACAACATTCAAAAATTCCATTCAAAATAACCACTGGCGCCCTAGAGGCTGTGCAGACCGCAACCGAGATGTATTTAACGCAACGCTTCCAAGATGCTTACTTGCTAACGCAATATCGCAGTCGGGTCACTTTAGAGGTTCGCGACATGGCATTAGTCGCATACTTCTGCAAAACCTACGGCAATCTGTAA

>Dmoj_Cid1
ATGATTCATTCAGATACCATTCCTGACGAGGAAAGTGCATTCCAAACGCCGGAGCACGAGAACGAGACCGACTATGGATTAGAATTTACCACTAGTCGTTTGGCAGAATTAAACGCATTTCCTCGAAGATGTTCAACGCTGCGCAAGAACTCTTCACGAAACCGTGCTCAAAAGATCACAATAGATAATGACAGCAGCGATGAAGAAAACTTATCACCGATAGCAACTCCTAGTAGGCAACGCCAACAAGCATTAGTATCCAGCAAACATAAGCCAAAGCAGCCGCCAGCAACCGCAAAGCCGCCGAGACGCAAAAAAAATGTTCCAGAGCATCGTTTGAAAAAACTACACCGTGAAATTGAATGTTTACAAAAGCACCAGGGATTTTTGATACCCCGATTAGCGTTTTCACGCCTGTTGCGTGAAATTCTGATACAACATTCAAAAATTCCATTCAAAATAACCACGGGCGCCCTGGAGGCTGTGCAGACCGCAACCGAGATGTACTTAACGCAGCGCTTTCAAGATGCTTACTTGCTAACGCAATATCGCAGTCGGGTCACTTTAGAGGTTCGCGACATGGCACTAGTCGCATATTTCTGCAAAACCTACGGCAATCTCTGA

>Dari_Cid1
ATGATTCATTCAGATACTATTCCTGACGAGGAAAGTGCATTCCAAACGCCGGAGCACGAGAACGAGACCGACTATGGATTAGAATTTACCACTAGCCGTTTGGCAGAATTAAACGCATTTCCTCGAAGATGTTCAACGCTGCGCAAGAACTCTTCACGAAACCGTGCTCAAAAGATCACAATAGATAATGACAGCAGCGATGAAGAAAACTTATCACCGATAGCAGCTCCTAGTAGGCAACGCCAACAAGCATCAGTATCCAACAAACATAAGCCAAAGCAGCCGCCAGCAACCGCAAAGCCGTCGAGACGCAAAAAAAATGTTCCAGAGCATCGTTTGAAAAAACTACACCGTGAAATCGAAGGTTTACAAAAGCACCAGGGATTTTTGATACCCCGATTAGCGTTTTCACGCCTGTTGCGTGAAATTCTGATACAACATTCAAAAATTCCATTCAAAATAACCACGGGCGCCCTGGAGGCTGTGCAGACCGCAACCGAGATGTATTTAACGCAGCGCTTTCAAGATGCTTACTTGCTAACGCAATATCGCAGTCGGGTCACTTTAGAGGTTCGCGACATGGCACTAGTCGCATATTTCTGCAAAACCTACGGAAATCTCTGA

>Dbuz_Cid6
ATGGGACGACCTGCAAAAAACTCAGCTAAAACCAAAACACAAAAGACTCAATCAGATTCAATTGGTTCTGATGATGAAACTGCATTCCAAACACCGGAGCATGAGAATGAAACTGACTACGGATTAGAATTTACCACTAGCCGTTTAGCACAATTAAACGCATTTCCTCGACAATGTTCAACGCTGCGCAAGAACTCTTCAAGGGACCGTGCTCAAAACACGACCACAACAGATGAAAGCAATGATGAAGAAAACTTGCCACCAGTATTAACCACTCCAACACGACAAAATTCTCGTAGCCAGCGCCAACAAGCAACAGTATCCCACAAATATAAACAAAAGCAGCCGCCAGAAACCGCAAAATCATCGAGACGCAAAAAAAATGCTCCAGAGCATCGTTTGAAAAAATTGCACCGTGAAATTGAATATTTACAAAAGCAACAAGGCTTCATGATACCACGATTGCCATTTTCACGTCTCTTGCGCGAAATTATGATCAAACATTCGAACACGCCATTTCAAATCACCGTGGGCGCCCTGGAGGCCGTGCAGACCGCAGCAGAGATGTATATAACGCAGCGATTCCAAGATGCTTACTTGCTGACCAAATATCGCAGTCGAGTCACATTAGAGGTACGCGACATGGCAATGGTCGCATATTTCTGCAAAACATATGGAAATCTCTGA

>Dser_Cid1 (опечатка?? это должен быть Cid6)
ATGGGACGACGACGACCTGCAAAAAACTCAACTAAGACCAAAACACAAACAACTCAATCAGATTCAATTGGTTCTGATGATGAAACTGCATTCCAAACACCGGAGCATGAAAATGAAACTGACTATGGATTAGAATTTACCACTAGCCGTTTAGCACAATTAAAGGCATTTCCACGAAGATGCTCAACGCTGCGCAAGAACTCTTCGAGAGACCGTGCTGAAAACAACACCTCAACAGATGATAGCAACGATGAAGAAAACTGGCCACCAGTATTAACCACTCCAACACGACAAAGTTCTCGTAGCCAACACCAGCAAGCATCAGTATCCCAGAAAAATATACAAAAGCAGCCGCCCGAAACCGCAAAATCGTCGAGACGCAAAAAAAATAATCCAGAGCATCGTTTGAAAAAATTGCACCGTGAAATTGAATATTTACAAAAGCAAAAAGGCTTCATGATACCACGATTGCCATTTTCACGTCTCGTGCGCGAAATTATGATAAACCATTCAATCACGCCATTTCAAATCACCACGGGCGCCCTGGAGGCCGTGCAGACCGCAGCAGAGATGTATGTAACGCAGCGCTTCCAAGATGCTTACTTGCTGACCAAATATCGCAGTCGAGTCACATTAGAGGTACGCGACATGGCAATGGTCGCATATTTCTGCAAAACATATGGAAATATCTGA

>Dvir_Cid1
ATGCGTCCACGCACTGTAAAAAATTCAACTGAAAAAAAGAAGAAATCAGAATCGCATTTAGATAATGTTGAGGATTCATATGAGAAAACAGCATTTCAAACACCGGATCGTGAAGACGAAACCGACTACGGCTTGGAGTTTACCACCAGCCGTTTGGCTGAATTGAACACATCTCCACGTCGGTGCTCTACGCTACGCAAAAACAATCCAAAAGACCGCCGTCGTGATATAGAACCATCCGAAGACAACAGTGATTCAGAGAATCAGCCACTGGCAGTACGACAAACACCCCGAAAAGTGCCGCTGCAAACACCCGCAGCGAGTATGAATAAGAAACATCAGGGGCCACTAACGTCAAGACCTGCTTCGAGACGCAAACAAAATAAACCGGAGCAACGTATAAAAAAATTGAACCGAGAAATTGAATGTTTACAAAAGAATGCAGGCTTCATGATACCGCGTTTACCTTTCTCGCGTTTGGTGCGCGAAATTATGATGAAACATACTTTAACGCCCTTTATGATAACTATGAGCGCCCTGGAGGCTATACAGACCGCGACAGAAATGTACTTAACCCAGCGCTTCCAGGATGCCTATTTACTTACTCAGTATCGCAGCCGTGTCACGCTAGAGGTGCGCGACATGGCGTTGGTGGCATATTTCTGCAAAACCTATGGTAATCTTTGA

>Dnav_Cid5
ATGCTTCGGTCTAACATGCCAAAATCTGATGATTCCGATTCGGACTTGTCTATTGCTTTTGGACTTGATGGTGTACCGCGCTGCTCCACGACTCGCAAGCAACAGAAGCACTTAGAGGAATCACAGAATGCAGACGGAATCAATGAGAACGAAGAGGAAGACGTTGCTGATGCCTGTCCAGCAAATTCTTCAATAAGAAACGTACTTTTACCAACGTTGCCGTGTCCACAAGAACTAACGTTTCCCCTAGAATCAGAGTGCCTAGTAGAACCAGCGTGTCCAACAGAACCAGCATGTGAACCAGAGTGTCCACTTGAACCAACATGTTCAAAACGCCGCAAGCAATCAAATCCTTTCAGACGAGCGCAAAAGTTCAAACGTGAAGTTCGCCTGCTGCAGCGTACGCCTAATTTTATGATTCCACGCATATCCTTTGGCCGGGTGGTTCGTGAGATTATGATGGAGAAGTGTGAATGTGAGCCGCATTTTCGCATCACAATGGGCGCCTTGGAGGCACTACAAACGGCGACAGAGATGTTCTTGACGCAACGCTTCCAAGACTCATATATGATGACCATGCACCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCCCTTATGGCATTCATATGTAAGCAGCACGGGCTGCTTTAA

>Dmoj_Cid5
ATGCGTCGGTCTGGTTTGCCAAATTCTGATGACTCCGGATCGGACTTGTCTATTGCGTTTGGACTTGATGGAGTTCCACGCTGCTCCACGACTCGCAAGCAACAGAAGCACTTAGATAAATCAAAGAATGCAGACGGAATTAATGACAACGAAGAGGAAGACGTTGTTGCTGTTTGTCCAGCAAATCCTTCAATTAGGAAAGGACTTTTACCGACGTTGCCGTGTCCACAAGAACCAGTGTTTCCACTAGATCCAGAGTGTCCACCAGAACCAGCGTGTGCAGTAGAACCGGAGCGTCCACTCGAACCAACAAGTTCAAAGCGCCGCAAGCAATCGAATCCTTTCAGACGAGCGCAGAAGTTCAAACGCGAAGTTCGTCTGCTGCAGCGTACGCCTAATTTTATGATTCCACGCATATCCTTTGGCCGGGTGGTTCGCGAGATTATGCTCGAGAGGTGTGACTGTGAGCCGCATTTTCGCATCACAATGGGCGCCTTGGAGGCGCTGCAAACGGCAACAGAGATGTTCTTGACGCAACGCTTCCAGGACTCCTATATGATGACCATGCACCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCCCTTATGGCATTCATATGTAAGCAGCACGGGCTGCTATGA

>Dari_Cid5
ATGCGTCGGTCTCGTTTGCAAAATTCGGATGACTCCGGATCGGACTTGTCTATTGCTTTTGGACTTGATGGAGTTCCACGCTGCTCCACGACTCGCAAGCAACAGAAGCACTTAGATAAATCAAAGAATGCAGACGGAATTAACGACAACGAAGATGAAGACGTTGTTGCTGTTTGTCCAGCAAATCCTTCAATTAGGAACGGACTGTTACCGACGTTGCCGTGTCCACAAGAACCAGTGTTTCCACTAGATCCAGAGTGTCCACCAGGCCCAGCGTGTGCAGTAGAACCGGAGTGTCCACTAGAACCAACAAGTTCAAAGCGCCGCAAGCAATCGAATCCTTTCAGACGAGCGCAGAAGTTCAAACGCGAAGTTCGTCTGCTGCAGCGTACACCTAATTTTATGATTCCACGCATATCCTTTGGCCGGGTGGTTCGCGAGATTATGCTGGAGAGGTGTGACTGTGAGCCGCATTTTCGCATCACAATGGGCGCCTTGGAGGCGCTGCAAACGGCGACAGAGATGTTCTTGACGCAACGCTTCCAGGACTCCTATATGATGACCATGCACCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCCCTTATGGCATTCATATGTAAGCAGCACGGGCTGCTATGA

>Dbuz_Cid5
ATGCGTCGATCTGCGTTACAAAATTCGGATGATTCAGATTCGGATTTGTCTATTGCCTTTGGAGCTGAGGGTGTCCCCTTTTGCTCCACGACTCGCAAGCAAAAGAATTGCAAAAAGCAACCGCAGGACCCAGATGCAATTATTGAGATGGCAGAGAATGATATTATTGAAGATTATCCACCAAATCCTCTCATGTTGCCGAGTACAACAACAGGAGTGTGTCCACCTGAAATAGCGTGTCCACCTGAACTAGAGTGCCCACCTGAACAAGCGTGTCCACAGGAGCCAGTGCATTCACCAGAACTAGTATGTCCATCAGAACCAGCGTGTCCAAGACGACGCAAGCAAACGAATCCTTTCAGACGAGCGCAGAGATTCAAACGCGAAGTTCGTCAACTGCAGCGTACGCCTAACTTTATGATTCCACGCCTCTCCTTTGGTCGCGTAGTTCGTGAGATTATGTTAGAGACTTCAGAATGTGAACCGCATTTTCGCATCACCATTGGTGCCTTGGAGGCACTGCAAACGGCAACAGAGATGTTCATGACGCAACGCTTCCAGGACTCCTATATGATGACCATGCATCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCTCTCATGGCATTCATATGCAAACAGCATGGGCTTCTATAA

>Dser_Cid5
ATGCGTCGATCTGCGTTACAAAAGTCGGATGATTCAGACTCGGATTTGTCTATTGCCTTTGGAGTTGACGGTGTTCCCTGCTGCTCCACGACTCGCAAGCAAAAGGAGCGCTTGAAGCAGCCGCAGGACCCAGATGCAATTATTGAGATGGCAGAGGATGATATTATTGATGATTATCCACCAAATCCTCTGATGTTGCCGTGTTCAACAGGAGTGTGTCCACCTGAAGTAATGTGTCCAACTGAACTAGAGTGTCCACTTGAACCAGAACTAGTATGTCCGACAGTACCGGCGTGTCCAAGACGACGCAAGCAAACAAATCCTTTCAGACGAGCACAGAGATTCAAACGCGAAGTTCGTCGACTGCAGCGTACGCCTAACTTTATGATTCCACGCCTCCCCTTTGGTCGTGTGGTTCGTGAGATTATGCTGGATAATTCACAATGTGAACCGCATTTTCGCATCACCATTGGCGCCTTGGAGGCACTGCAAACGGCAACAGAGATGTTCATGACGCAACGCTTCCAGGACTCCTATATGATGACCATGCATCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCTCTCATGGCATTCATATGCAAACAGCATGGGCTTTTATAA

>Dvir_Cid5
ATGAGTCAAGCTAATGCACAGAGCTCCAATGGATCCCTGGATGAATCAGACTTAACGGCGGCATTTGATTTGAACGTTCTGGGTATGTTGGCCATTGAACAACGCTGCTCGACGACACGCAAGCAGAAGCAACAATTGCAAGGCGAAGAGGAGACGGGTGTGGCAAATTTGGAGTCGCCAGTTGCAGGCGAGGAACCAGCACCTGATACCGTCGCTGTCACGGAACCACCGCCACCGTCACCGTCATCGCCACCGCCACCGCCACGGACACCGTCGCCGCCACAGTTACCGCCACCTACCCGAACAACACGCCGTAAACAGCCGTATCCTTTGCAGCGTGCCGCACTGTTCAGGCGCGAGGTGCGAACGCTGCAGCGTTCACCGCATTTTATGATACCGCGTTTGTCATTTGGGCGCGTGGTCCGTGAGATTATGCTGCAGCACACCGAATCGCCCTATCGCATCACCATTGGCGCTCTGGAGGCCCTACAGTCGGCCACGGAGATGTTTCTAACGCAACGCTTTCAGGACTCCTACCTGATGACCCTGCATCGCAGTCGGGTGACCCTAGAGGTGCGCGACATGGCCCTAATGGCATTCGTGTGCAAATTGCACGGACAACTCTGA
```


### Идентификаторы (не использовала) из статьи teixeira_concurrent_2018

| Species | Cid1/Cid6 | Cid5 |
|---|---|---|
| Drosophila mojavensis | XM_002006887.2 | XM_002006612.1 |
| Drosophila arizonae | XM_018010248.1 | XM_018009273.1 |
| Drosophila navojoa | XM_018104694.1 | XM_018103086.1 |
| Drosophila buzzatii | http://dbuz.uab.cat/blast.php (D. buzzatii Freeze 1 Scaffolds) | Smth |
| Drosophila seriema | See Supplementary File S1 | Smth |
| Drosophila virilis | XM_002058834.2 | FlyBase: FBgn0208168 |

In [120]:
cdna_fasta = """>Dnav_Cid1
ATGATTCATTCAGATAGCATTGCTGATGAGGAAAGTGCATTCCAAACGCCAGAGCACGACAACGAGACTGACTATGGATTAGAATTTACCACTAGTCGTACTAGTCGTACCACTCGTAAATGCTCAACGCTGCGCAAGAACTCTTCAGGAAACCGTGCTCAAAAGATCACAATAGATAATGTTAGCAGCGATGAAGAAAACTTATCACCGATAGCAACCCCTCCTAGTAGGCAACGCCAACAATCATCAGTATCCAACAAACATAACCCAAAGCAGCCGCCAGCAACCGCAAAATCATCGAGACGCAAAAAAAATGCTCCAGAGCATCGTTTGAAAAAACTGCACCGTGAAATTGAATGTTTACAGAAGCACCAGGGCTTTTTGATACCCCGATTAGCATTTTCACGCCTGTTACGTGAAATTTTGATACAACATTCAAAAATTCCATTCAAAATAACCACTGGCGCCCTAGAGGCTGTGCAGACCGCAACCGAGATGTATTTAACGCAACGCTTCCAAGATGCTTACTTGCTAACGCAATATCGCAGTCGGGTCACTTTAGAGGTTCGCGACATGGCATTAGTCGCATACTTCTGCAAAACCTACGGCAATCTGTAA

>Dmoj_Cid1
ATGATTCATTCAGATACCATTCCTGACGAGGAAAGTGCATTCCAAACGCCGGAGCACGAGAACGAGACCGACTATGGATTAGAATTTACCACTAGTCGTTTGGCAGAATTAAACGCATTTCCTCGAAGATGTTCAACGCTGCGCAAGAACTCTTCACGAAACCGTGCTCAAAAGATCACAATAGATAATGACAGCAGCGATGAAGAAAACTTATCACCGATAGCAACTCCTAGTAGGCAACGCCAACAAGCATTAGTATCCAGCAAACATAAGCCAAAGCAGCCGCCAGCAACCGCAAAGCCGCCGAGACGCAAAAAAAATGTTCCAGAGCATCGTTTGAAAAAACTACACCGTGAAATTGAATGTTTACAAAAGCACCAGGGATTTTTGATACCCCGATTAGCGTTTTCACGCCTGTTGCGTGAAATTCTGATACAACATTCAAAAATTCCATTCAAAATAACCACGGGCGCCCTGGAGGCTGTGCAGACCGCAACCGAGATGTACTTAACGCAGCGCTTTCAAGATGCTTACTTGCTAACGCAATATCGCAGTCGGGTCACTTTAGAGGTTCGCGACATGGCACTAGTCGCATATTTCTGCAAAACCTACGGCAATCTCTGA

>Dari_Cid1
ATGATTCATTCAGATACTATTCCTGACGAGGAAAGTGCATTCCAAACGCCGGAGCACGAGAACGAGACCGACTATGGATTAGAATTTACCACTAGCCGTTTGGCAGAATTAAACGCATTTCCTCGAAGATGTTCAACGCTGCGCAAGAACTCTTCACGAAACCGTGCTCAAAAGATCACAATAGATAATGACAGCAGCGATGAAGAAAACTTATCACCGATAGCAGCTCCTAGTAGGCAACGCCAACAAGCATCAGTATCCAACAAACATAAGCCAAAGCAGCCGCCAGCAACCGCAAAGCCGTCGAGACGCAAAAAAAATGTTCCAGAGCATCGTTTGAAAAAACTACACCGTGAAATCGAAGGTTTACAAAAGCACCAGGGATTTTTGATACCCCGATTAGCGTTTTCACGCCTGTTGCGTGAAATTCTGATACAACATTCAAAAATTCCATTCAAAATAACCACGGGCGCCCTGGAGGCTGTGCAGACCGCAACCGAGATGTATTTAACGCAGCGCTTTCAAGATGCTTACTTGCTAACGCAATATCGCAGTCGGGTCACTTTAGAGGTTCGCGACATGGCACTAGTCGCATATTTCTGCAAAACCTACGGAAATCTCTGA

>Dbuz_Cid6
ATGGGACGACCTGCAAAAAACTCAGCTAAAACCAAAACACAAAAGACTCAATCAGATTCAATTGGTTCTGATGATGAAACTGCATTCCAAACACCGGAGCATGAGAATGAAACTGACTACGGATTAGAATTTACCACTAGCCGTTTAGCACAATTAAACGCATTTCCTCGACAATGTTCAACGCTGCGCAAGAACTCTTCAAGGGACCGTGCTCAAAACACGACCACAACAGATGAAAGCAATGATGAAGAAAACTTGCCACCAGTATTAACCACTCCAACACGACAAAATTCTCGTAGCCAGCGCCAACAAGCAACAGTATCCCACAAATATAAACAAAAGCAGCCGCCAGAAACCGCAAAATCATCGAGACGCAAAAAAAATGCTCCAGAGCATCGTTTGAAAAAATTGCACCGTGAAATTGAATATTTACAAAAGCAACAAGGCTTCATGATACCACGATTGCCATTTTCACGTCTCTTGCGCGAAATTATGATCAAACATTCGAACACGCCATTTCAAATCACCGTGGGCGCCCTGGAGGCCGTGCAGACCGCAGCAGAGATGTATATAACGCAGCGATTCCAAGATGCTTACTTGCTGACCAAATATCGCAGTCGAGTCACATTAGAGGTACGCGACATGGCAATGGTCGCATATTTCTGCAAAACATATGGAAATCTCTGA

>Dser_Cid6 (опечатка?? это должен быть Cid6)
ATGGGACGACGACGACCTGCAAAAAACTCAACTAAGACCAAAACACAAACAACTCAATCAGATTCAATTGGTTCTGATGATGAAACTGCATTCCAAACACCGGAGCATGAAAATGAAACTGACTATGGATTAGAATTTACCACTAGCCGTTTAGCACAATTAAAGGCATTTCCACGAAGATGCTCAACGCTGCGCAAGAACTCTTCGAGAGACCGTGCTGAAAACAACACCTCAACAGATGATAGCAACGATGAAGAAAACTGGCCACCAGTATTAACCACTCCAACACGACAAAGTTCTCGTAGCCAACACCAGCAAGCATCAGTATCCCAGAAAAATATACAAAAGCAGCCGCCCGAAACCGCAAAATCGTCGAGACGCAAAAAAAATAATCCAGAGCATCGTTTGAAAAAATTGCACCGTGAAATTGAATATTTACAAAAGCAAAAAGGCTTCATGATACCACGATTGCCATTTTCACGTCTCGTGCGCGAAATTATGATAAACCATTCAATCACGCCATTTCAAATCACCACGGGCGCCCTGGAGGCCGTGCAGACCGCAGCAGAGATGTATGTAACGCAGCGCTTCCAAGATGCTTACTTGCTGACCAAATATCGCAGTCGAGTCACATTAGAGGTACGCGACATGGCAATGGTCGCATATTTCTGCAAAACATATGGAAATATCTGA

>Dvir_Cid1
ATGCGTCCACGCACTGTAAAAAATTCAACTGAAAAAAAGAAGAAATCAGAATCGCATTTAGATAATGTTGAGGATTCATATGAGAAAACAGCATTTCAAACACCGGATCGTGAAGACGAAACCGACTACGGCTTGGAGTTTACCACCAGCCGTTTGGCTGAATTGAACACATCTCCACGTCGGTGCTCTACGCTACGCAAAAACAATCCAAAAGACCGCCGTCGTGATATAGAACCATCCGAAGACAACAGTGATTCAGAGAATCAGCCACTGGCAGTACGACAAACACCCCGAAAAGTGCCGCTGCAAACACCCGCAGCGAGTATGAATAAGAAACATCAGGGGCCACTAACGTCAAGACCTGCTTCGAGACGCAAACAAAATAAACCGGAGCAACGTATAAAAAAATTGAACCGAGAAATTGAATGTTTACAAAAGAATGCAGGCTTCATGATACCGCGTTTACCTTTCTCGCGTTTGGTGCGCGAAATTATGATGAAACATACTTTAACGCCCTTTATGATAACTATGAGCGCCCTGGAGGCTATACAGACCGCGACAGAAATGTACTTAACCCAGCGCTTCCAGGATGCCTATTTACTTACTCAGTATCGCAGCCGTGTCACGCTAGAGGTGCGCGACATGGCGTTGGTGGCATATTTCTGCAAAACCTATGGTAATCTTTGA

>Dnav_Cid5
ATGCTTCGGTCTAACATGCCAAAATCTGATGATTCCGATTCGGACTTGTCTATTGCTTTTGGACTTGATGGTGTACCGCGCTGCTCCACGACTCGCAAGCAACAGAAGCACTTAGAGGAATCACAGAATGCAGACGGAATCAATGAGAACGAAGAGGAAGACGTTGCTGATGCCTGTCCAGCAAATTCTTCAATAAGAAACGTACTTTTACCAACGTTGCCGTGTCCACAAGAACTAACGTTTCCCCTAGAATCAGAGTGCCTAGTAGAACCAGCGTGTCCAACAGAACCAGCATGTGAACCAGAGTGTCCACTTGAACCAACATGTTCAAAACGCCGCAAGCAATCAAATCCTTTCAGACGAGCGCAAAAGTTCAAACGTGAAGTTCGCCTGCTGCAGCGTACGCCTAATTTTATGATTCCACGCATATCCTTTGGCCGGGTGGTTCGTGAGATTATGATGGAGAAGTGTGAATGTGAGCCGCATTTTCGCATCACAATGGGCGCCTTGGAGGCACTACAAACGGCGACAGAGATGTTCTTGACGCAACGCTTCCAAGACTCATATATGATGACCATGCACCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCCCTTATGGCATTCATATGTAAGCAGCACGGGCTGCTTTAA

>Dmoj_Cid5
ATGCGTCGGTCTGGTTTGCCAAATTCTGATGACTCCGGATCGGACTTGTCTATTGCGTTTGGACTTGATGGAGTTCCACGCTGCTCCACGACTCGCAAGCAACAGAAGCACTTAGATAAATCAAAGAATGCAGACGGAATTAATGACAACGAAGAGGAAGACGTTGTTGCTGTTTGTCCAGCAAATCCTTCAATTAGGAAAGGACTTTTACCGACGTTGCCGTGTCCACAAGAACCAGTGTTTCCACTAGATCCAGAGTGTCCACCAGAACCAGCGTGTGCAGTAGAACCGGAGCGTCCACTCGAACCAACAAGTTCAAAGCGCCGCAAGCAATCGAATCCTTTCAGACGAGCGCAGAAGTTCAAACGCGAAGTTCGTCTGCTGCAGCGTACGCCTAATTTTATGATTCCACGCATATCCTTTGGCCGGGTGGTTCGCGAGATTATGCTCGAGAGGTGTGACTGTGAGCCGCATTTTCGCATCACAATGGGCGCCTTGGAGGCGCTGCAAACGGCAACAGAGATGTTCTTGACGCAACGCTTCCAGGACTCCTATATGATGACCATGCACCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCCCTTATGGCATTCATATGTAAGCAGCACGGGCTGCTATGA

>Dari_Cid5
ATGCGTCGGTCTCGTTTGCAAAATTCGGATGACTCCGGATCGGACTTGTCTATTGCTTTTGGACTTGATGGAGTTCCACGCTGCTCCACGACTCGCAAGCAACAGAAGCACTTAGATAAATCAAAGAATGCAGACGGAATTAACGACAACGAAGATGAAGACGTTGTTGCTGTTTGTCCAGCAAATCCTTCAATTAGGAACGGACTGTTACCGACGTTGCCGTGTCCACAAGAACCAGTGTTTCCACTAGATCCAGAGTGTCCACCAGGCCCAGCGTGTGCAGTAGAACCGGAGTGTCCACTAGAACCAACAAGTTCAAAGCGCCGCAAGCAATCGAATCCTTTCAGACGAGCGCAGAAGTTCAAACGCGAAGTTCGTCTGCTGCAGCGTACACCTAATTTTATGATTCCACGCATATCCTTTGGCCGGGTGGTTCGCGAGATTATGCTGGAGAGGTGTGACTGTGAGCCGCATTTTCGCATCACAATGGGCGCCTTGGAGGCGCTGCAAACGGCGACAGAGATGTTCTTGACGCAACGCTTCCAGGACTCCTATATGATGACCATGCACCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCCCTTATGGCATTCATATGTAAGCAGCACGGGCTGCTATGA

>Dbuz_Cid5
ATGCGTCGATCTGCGTTACAAAATTCGGATGATTCAGATTCGGATTTGTCTATTGCCTTTGGAGCTGAGGGTGTCCCCTTTTGCTCCACGACTCGCAAGCAAAAGAATTGCAAAAAGCAACCGCAGGACCCAGATGCAATTATTGAGATGGCAGAGAATGATATTATTGAAGATTATCCACCAAATCCTCTCATGTTGCCGAGTACAACAACAGGAGTGTGTCCACCTGAAATAGCGTGTCCACCTGAACTAGAGTGCCCACCTGAACAAGCGTGTCCACAGGAGCCAGTGCATTCACCAGAACTAGTATGTCCATCAGAACCAGCGTGTCCAAGACGACGCAAGCAAACGAATCCTTTCAGACGAGCGCAGAGATTCAAACGCGAAGTTCGTCAACTGCAGCGTACGCCTAACTTTATGATTCCACGCCTCTCCTTTGGTCGCGTAGTTCGTGAGATTATGTTAGAGACTTCAGAATGTGAACCGCATTTTCGCATCACCATTGGTGCCTTGGAGGCACTGCAAACGGCAACAGAGATGTTCATGACGCAACGCTTCCAGGACTCCTATATGATGACCATGCATCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCTCTCATGGCATTCATATGCAAACAGCATGGGCTTCTATAA

>Dser_Cid5
ATGCGTCGATCTGCGTTACAAAAGTCGGATGATTCAGACTCGGATTTGTCTATTGCCTTTGGAGTTGACGGTGTTCCCTGCTGCTCCACGACTCGCAAGCAAAAGGAGCGCTTGAAGCAGCCGCAGGACCCAGATGCAATTATTGAGATGGCAGAGGATGATATTATTGATGATTATCCACCAAATCCTCTGATGTTGCCGTGTTCAACAGGAGTGTGTCCACCTGAAGTAATGTGTCCAACTGAACTAGAGTGTCCACTTGAACCAGAACTAGTATGTCCGACAGTACCGGCGTGTCCAAGACGACGCAAGCAAACAAATCCTTTCAGACGAGCACAGAGATTCAAACGCGAAGTTCGTCGACTGCAGCGTACGCCTAACTTTATGATTCCACGCCTCCCCTTTGGTCGTGTGGTTCGTGAGATTATGCTGGATAATTCACAATGTGAACCGCATTTTCGCATCACCATTGGCGCCTTGGAGGCACTGCAAACGGCAACAGAGATGTTCATGACGCAACGCTTCCAGGACTCCTATATGATGACCATGCATCGCCAGCGGGTCACCCTGGAGCTGCGTGACATGGCTCTCATGGCATTCATATGCAAACAGCATGGGCTTTTATAA

>Dvir_Cid5
ATGAGTCAAGCTAATGCACAGAGCTCCAATGGATCCCTGGATGAATCAGACTTAACGGCGGCATTTGATTTGAACGTTCTGGGTATGTTGGCCATTGAACAACGCTGCTCGACGACACGCAAGCAGAAGCAACAATTGCAAGGCGAAGAGGAGACGGGTGTGGCAAATTTGGAGTCGCCAGTTGCAGGCGAGGAACCAGCACCTGATACCGTCGCTGTCACGGAACCACCGCCACCGTCACCGTCATCGCCACCGCCACCGCCACGGACACCGTCGCCGCCACAGTTACCGCCACCTACCCGAACAACACGCCGTAAACAGCCGTATCCTTTGCAGCGTGCCGCACTGTTCAGGCGCGAGGTGCGAACGCTGCAGCGTTCACCGCATTTTATGATACCGCGTTTGTCATTTGGGCGCGTGGTCCGTGAGATTATGCTGCAGCACACCGAATCGCCCTATCGCATCACCATTGGCGCTCTGGAGGCCCTACAGTCGGCCACGGAGATGTTTCTAACGCAACGCTTTCAGGACTCCTACCTGATGACCCTGCATCGCAGTCGGGTGACCCTAGAGGTGCGCGACATGGCCCTAATGGCATTCGTGTGCAAATTGCACGGACAACTCTGA"""
# print(cdna_fasta)

In [121]:
# Translate every cDNA record to protein and print "<id> <protein>",
# with any trailing stop symbol ("*") stripped from the translation.
cdna_records = SeqIO.parse(StringIO(cdna_fasta), "fasta")
for record in cdna_records:
    protein = record.seq.translate().rstrip("*")
    print(f"{record.id} {protein}")

Dnav_Cid1 MIHSDSIADEESAFQTPEHDNETDYGLEFTTSRTSRTTRKCSTLRKNSSGNRAQKITIDNVSSDEENLSPIATPPSRQRQQSSVSNKHNPKQPPATAKSSRRKKNAPEHRLKKLHREIECLQKHQGFLIPRLAFSRLLREILIQHSKIPFKITTGALEAVQTATEMYLTQRFQDAYLLTQYRSRVTLEVRDMALVAYFCKTYGNL
Dmoj_Cid1 MIHSDTIPDEESAFQTPEHENETDYGLEFTTSRLAELNAFPRRCSTLRKNSSRNRAQKITIDNDSSDEENLSPIATPSRQRQQALVSSKHKPKQPPATAKPPRRKKNVPEHRLKKLHREIECLQKHQGFLIPRLAFSRLLREILIQHSKIPFKITTGALEAVQTATEMYLTQRFQDAYLLTQYRSRVTLEVRDMALVAYFCKTYGNL
Dari_Cid1 MIHSDTIPDEESAFQTPEHENETDYGLEFTTSRLAELNAFPRRCSTLRKNSSRNRAQKITIDNDSSDEENLSPIAAPSRQRQQASVSNKHKPKQPPATAKPSRRKKNVPEHRLKKLHREIEGLQKHQGFLIPRLAFSRLLREILIQHSKIPFKITTGALEAVQTATEMYLTQRFQDAYLLTQYRSRVTLEVRDMALVAYFCKTYGNL
Dbuz_Cid6 MGRPAKNSAKTKTQKTQSDSIGSDDETAFQTPEHENETDYGLEFTTSRLAQLNAFPRQCSTLRKNSSRDRAQNTTTTDESNDEENLPPVLTTPTRQNSRSQRQQATVSHKYKQKQPPETAKSSRRKKNAPEHRLKKLHREIEYLQKQQGFMIPRLPFSRLLREIMIKHSNTPFQITVGALEAVQTAAEMYITQRFQDAYLLTKYRSRVTLEVRDMAMVAYFCKTYGNL
Dser_Cid6 MGRRRPAKNSTKTKTQTTQSDSIGSDDETAFQTPEHENETDYGLEFTTSRLAQLKAFPRRCSTLRKNSSRDRAENNTSTDDSNDEENWPPVLTTPTRQS

### Add sequences to curatedDB

In [122]:
# Short variant tag (second half of a FASTA id) -> variant name used in the DB rows.
var_name_dict = dict(
    Cid1="cenH3.1_(Drosophilidae)",
    Cid5="cenH3.5_(Drosophila)",
    Cid6="cenH3.6_(Repleta)",
)

# Species tag (first half of a FASTA id) -> taxonomy columns of a sequence row.
# Every listed species shares the same phylum and class, so only the
# taxonomy id and the organism name are spelled out per entry.
tax_name_dict = {
    species_tag: {
        "taxonomy_id": taxonomy_id,
        "organism": organism,
        "phylum": "Arthropoda",
        "class": "Insecta",
    }
    for species_tag, taxonomy_id, organism in (
        ("Dnav", 7232, "Drosophila navojoa"),
        ("Dmoj", 7230, "Drosophila mojavensis"),
        ("Dari", 7263, "Drosophila arizonae"),
        ("Dbuz", 7264, "Drosophila buzzatii"),
        ("Dser", 271509, "Drosophila seriema"),
        ("Dvir", 7244, "Drosophila virilis"),
    )
}

In [123]:
# Build one row dict per cDNA record: translate the nucleotide sequence to
# protein, assign a per-variant running accession and attach the taxonomy
# columns looked up from the species tag.
data_sequence_list, accessions = [], []
var_count_dict = dict.fromkeys(("Cid1", "Cid5", "Cid6"), 0)

# Row keys in the order they are stored (and printed); all start out as None.
sequence_fields = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)

for record in SeqIO.parse(StringIO(cdna_fasta), "fasta"):
    # FASTA ids have the form "<species tag>_<variant tag>", e.g. "Dnav_Cid5".
    tax_name, var_name = record.id.split("_")
    print("***", tax_name, var_name, "***")

    # "cenH3.1_(Drosophilidae)" -> "cenH3_1"; the running count makes the id unique per variant.
    variant_tag = var_name_dict[var_name].split("_")[0].replace(".", "_")
    seq_id = f"HISTDB_{variant_tag}_{var_count_dict[var_name]}"
    accessions.append(seq_id)

    data_sequence = dict.fromkeys(sequence_fields)
    data_sequence["accession"] = seq_id
    data_sequence["variant"] = var_name_dict[var_name]
    data_sequence["sequence"] = str(record.seq.translate().rstrip("*"))
    # Fills taxonomy_id / organism / phylum / class for this species tag.
    data_sequence.update(tax_name_dict[tax_name])

    data_sequence_list.append(data_sequence)
    var_count_dict[var_name] += 1

    # Dump every field with its Python type for a visual sanity check.
    for k, v in data_sequence.items():
        print(k, v, type(v))

*** Dnav Cid1 ***
accession HISTDB_cenH3_1_0 <class 'str'>
variant cenH3.1_(Drosophilidae) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 7232 <class 'int'>
organism Drosophila navojoa <class 'str'>
phylum Arthropoda <class 'str'>
class Insecta <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MIHSDSIADEESAFQTPEHDNETDYGLEFTTSRTSRTTRKCSTLRKNSSGNRAQKITIDNVSSDEENLSPIATPPSRQRQQSSVSNKHNPKQPPATAKSSRRKKNAPEHRLKKLHREIECLQKHQGFLIPRLAFSRLLREILIQHSKIPFKITTGALEAVQTATEMYLTQRFQDAYLLTQYRSRVTLEVRDMALVAYFCKTYGNL <class 'str'>
variant_under_consideration None <class 'NoneType'>
*** Dmoj Cid1 ***
accession HISTDB_cenH3_1_1 <class 'str'>
variant cenH3.1_(Drosophilidae) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 7230 <class 'int'>
organism Drosophila mojavensis <class 'str'>
phylum Arthropoda <class '

In [124]:
# Display the accession ids collected while building the sequence rows.
accessions

['HISTDB_cenH3_1_0',
 'HISTDB_cenH3_1_1',
 'HISTDB_cenH3_1_2',
 'HISTDB_cenH3_6_0',
 'HISTDB_cenH3_6_1',
 'HISTDB_cenH3_1_3',
 'HISTDB_cenH3_5_0',
 'HISTDB_cenH3_5_1',
 'HISTDB_cenH3_5_2',
 'HISTDB_cenH3_5_3',
 'HISTDB_cenH3_5_4',
 'HISTDB_cenH3_5_5']

In [92]:
# Read the whole `sequence` table into a DataFrame and display only the rows
# whose accession is one of the generated `accessions`.
query = "SELECT * FROM sequence "
cursor.execute(query)
fetched_rows = cursor.fetchall()
# The first item of each cursor.description entry is used as the column label.
column_names = [column[0] for column in cursor.description]
df = pd.DataFrame(fetched_rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [93]:
# Run the `add_sequence` statement once per prepared row dict.
# Nothing is committed in this cell.
for ds in data_sequence_list:
    cursor.execute(add_sequence, params=ds)

In [94]:
# Reload the `sequence` table and display the rows matching `accessions`.
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(
    data=cursor.fetchall(),
    columns=[description[0] for description in cursor.description],
)
inserted_mask = df["accession"].isin(accessions)
df[inserted_mask]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
1971,HISTDB_cenH3_1_0,cenH3.1_(Drosophilidae),,,,7232.0,Drosophila navojoa,Arthropoda,Insecta,,,MIHSDSIADEESAFQTPEHDNETDYGLEFTTSRTSRTTRKCSTLRK...,
1972,HISTDB_cenH3_1_1,cenH3.1_(Drosophilidae),,,,7230.0,Drosophila mojavensis,Arthropoda,Insecta,,,MIHSDTIPDEESAFQTPEHENETDYGLEFTTSRLAELNAFPRRCST...,
1973,HISTDB_cenH3_1_2,cenH3.1_(Drosophilidae),,,,7263.0,Drosophila arizonae,Arthropoda,Insecta,,,MIHSDTIPDEESAFQTPEHENETDYGLEFTTSRLAELNAFPRRCST...,
1974,HISTDB_cenH3_1_3,cenH3.1_(Drosophilidae),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MRPRTVKNSTEKKKKSESHLDNVEDSYEKTAFQTPDREDETDYGLE...,
1975,HISTDB_cenH3_5_0,cenH3.5_(Drosophila),,,,7232.0,Drosophila navojoa,Arthropoda,Insecta,,,MLRSNMPKSDDSDSDLSIAFGLDGVPRCSTTRKQQKHLEESQNADG...,
1976,HISTDB_cenH3_5_1,cenH3.5_(Drosophila),,,,7230.0,Drosophila mojavensis,Arthropoda,Insecta,,,MRRSGLPNSDDSGSDLSIAFGLDGVPRCSTTRKQQKHLDKSKNADG...,
1977,HISTDB_cenH3_5_2,cenH3.5_(Drosophila),,,,7263.0,Drosophila arizonae,Arthropoda,Insecta,,,MRRSRLQNSDDSGSDLSIAFGLDGVPRCSTTRKQQKHLDKSKNADG...,
1978,HISTDB_cenH3_5_3,cenH3.5_(Drosophila),,,,7264.0,Drosophila buzzatii,Arthropoda,Insecta,,,MRRSALQNSDDSDSDLSIAFGAEGVPFCSTTRKQKNCKKQPQDPDA...,
1979,HISTDB_cenH3_5_4,cenH3.5_(Drosophila),,,,271509.0,Drosophila seriema,Arthropoda,Insecta,,,MRRSALQKSDDSDSDLSIAFGVDGVPCCSTTRKQKERLKQPQDPDA...,
1980,HISTDB_cenH3_5_5,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,


In [95]:
# Count, per variant, the rows of `df` whose accession is in `accessions`.
df.loc[df["accession"].isin(accessions), "variant"].value_counts()

variant
cenH3.5_(Drosophila)       6
cenH3.1_(Drosophilidae)    4
cenH3.6_(Repleta)          2
Name: count, dtype: int64

In [96]:
# Commit the open transaction on `conn` so that the statements executed
# through its cursor so far are persisted in the database.
conn.commit()

In [125]:
# Number of accession ids in `accessions`.
len(accessions)

12

In [131]:
# Delete every sequence_has_publication row that references one of the
# accessions in `accessions` (presumably so those accessions can be renamed
# in `sequence` afterwards -- TODO confirm against the schema's constraints).
#
# The accession is bound as a query parameter instead of being interpolated
# into the SQL text, so quoting is handled by the driver; this matches the
# parameterized cursor.execute(statement, params) calls used for the INSERTs
# in this notebook.
query = "DELETE FROM sequence_has_publication WHERE sequence_accession=%s"
for acc in accessions:
    # Log a readable form of the statement; the value actually sent is bound below.
    print(query % f"'{acc}'")
    cursor.execute(query, (acc,))

DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_1_0'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_1_1'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_1_2'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_6_0'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_6_1'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_1_3'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_5_0'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_5_1'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_5_2'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_5_3'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_5_4'
DELETE FROM sequence_has_publication WHERE sequence_accession='HISTDB_cenH3_5_5'


In [132]:
# Rename the generated accessions to "HISTDB_cenH3_Drosophilidae_<index>",
# where <index> is the position of the old accession in `accessions`.
#
# The number of new ids is derived from `accessions` rather than hard-coded:
# zip() stops at the shorter input, so a fixed count would silently skip
# renames if the number of records ever changed.
new_accessions = [
    f"HISTDB_cenH3_Drosophilidae_{i}" for i in range(len(accessions))
]

# Values are bound as parameters (new accession first, old accession second)
# so the driver takes care of quoting.
query = "UPDATE sequence SET accession=%s WHERE accession=%s"
for acc, new_acc in zip(accessions, new_accessions):
    # Log a readable form of the statement; the values actually sent are bound below.
    print(query % (f"'{new_acc}'", f"'{acc}'"))
    cursor.execute(query, (new_acc, acc))

UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_0' WHERE accession='HISTDB_cenH3_1_0'
UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_1' WHERE accession='HISTDB_cenH3_1_1'
UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_2' WHERE accession='HISTDB_cenH3_1_2'
UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_3' WHERE accession='HISTDB_cenH3_6_0'
UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_4' WHERE accession='HISTDB_cenH3_6_1'
UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_5' WHERE accession='HISTDB_cenH3_1_3'
UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_6' WHERE accession='HISTDB_cenH3_5_0'
UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_7' WHERE accession='HISTDB_cenH3_5_1'
UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_8' WHERE accession='HISTDB_cenH3_5_2'
UPDATE sequence SET accession='HISTDB_cenH3_Drosophilidae_9' WHERE accession='HISTDB_cenH3_5_3'
UPDATE sequence SET accession='HISTDB_ce

In [133]:
# Reload the `sequence` table and display the rows that still carry one of
# the accessions in `accessions`.
query = "SELECT * FROM sequence "
cursor.execute(query)
table_rows = cursor.fetchall()
df = pd.DataFrame(table_rows, columns=[col[0] for col in cursor.description])
old_accession_mask = df["accession"].isin(accessions)
df[old_accession_mask]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [134]:
# Reload the `sequence` table and display the rows whose accession is one of
# `new_accessions`.
query = "SELECT * FROM sequence "
cursor.execute(query)
df = pd.DataFrame(
    cursor.fetchall(),
    columns=[field[0] for field in cursor.description],
)
df.loc[df["accession"].isin(new_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
1971,HISTDB_cenH3_Drosophilidae_0,cenH3.1_(Drosophilidae),,,,7232.0,Drosophila navojoa,Arthropoda,Insecta,,,MIHSDSIADEESAFQTPEHDNETDYGLEFTTSRTSRTTRKCSTLRK...,
1972,HISTDB_cenH3_Drosophilidae_1,cenH3.1_(Drosophilidae),,,,7230.0,Drosophila mojavensis,Arthropoda,Insecta,,,MIHSDTIPDEESAFQTPEHENETDYGLEFTTSRLAELNAFPRRCST...,
1973,HISTDB_cenH3_Drosophilidae_10,cenH3.5_(Drosophila),,,,271509.0,Drosophila seriema,Arthropoda,Insecta,,,MRRSALQKSDDSDSDLSIAFGVDGVPCCSTTRKQKERLKQPQDPDA...,
1974,HISTDB_cenH3_Drosophilidae_11,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,
1975,HISTDB_cenH3_Drosophilidae_2,cenH3.1_(Drosophilidae),,,,7263.0,Drosophila arizonae,Arthropoda,Insecta,,,MIHSDTIPDEESAFQTPEHENETDYGLEFTTSRLAELNAFPRRCST...,
1976,HISTDB_cenH3_Drosophilidae_3,cenH3.6_(Repleta),,,,7264.0,Drosophila buzzatii,Arthropoda,Insecta,,,MGRPAKNSAKTKTQKTQSDSIGSDDETAFQTPEHENETDYGLEFTT...,
1977,HISTDB_cenH3_Drosophilidae_4,cenH3.6_(Repleta),,,,271509.0,Drosophila seriema,Arthropoda,Insecta,,,MGRRRPAKNSTKTKTQTTQSDSIGSDDETAFQTPEHENETDYGLEF...,
1978,HISTDB_cenH3_Drosophilidae_5,cenH3.1_(Drosophilidae),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MRPRTVKNSTEKKKKSESHLDNVEDSYEKTAFQTPDREDETDYGLE...,
1979,HISTDB_cenH3_Drosophilidae_6,cenH3.5_(Drosophila),,,,7232.0,Drosophila navojoa,Arthropoda,Insecta,,,MLRSNMPKSDDSDSDLSIAFGLDGVPRCSTTRKQQKHLEESQNADG...,
1980,HISTDB_cenH3_Drosophilidae_7,cenH3.5_(Drosophila),,,,7230.0,Drosophila mojavensis,Arthropoda,Insecta,,,MRRSGLPNSDDSGSDLSIAFGLDGVPRCSTTRKQQKHLDKSKNADG...,


### Add sequence publication

In [135]:
# Publication id that is looked up in the `publication` table and linked to
# the sequences in the cells that follow.
pid = "teixeira_concurrent_2018"

In [136]:
# Load the whole `publication` table and display the row whose id equals `pid`.
# The statement has no placeholders, so it is a plain string literal
# rather than an f-string.
query = "SELECT * FROM publication"
cursor.execute(query)
# The first item of each cursor.description entry is used as the column label.
df = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])
df[df["id"] == pid]

Unnamed: 0,id,title,doi,author,year,pubmed_id
172,teixeira_concurrent_2018,,,,,


In [138]:
# Run the `add_sequence_has_publication` statement once per new accession,
# pairing each accession with the publication id `pid`.
for acc in new_accessions:
    link_params = (acc, pid)
    cursor.execute(add_sequence_has_publication, link_params)

In [139]:
# LEFT JOIN every sequence with its publication links (sequences without a
# link keep NULLs in the link columns) and display the rows whose accession
# is one of `new_accessions`.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
joined_rows = cursor.fetchall()
joined_columns = [column[0] for column in cursor.description]
df = pd.DataFrame(joined_rows, columns=joined_columns)
df.loc[df["accession"].isin(new_accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2284,HISTDB_cenH3_Drosophilidae_0,cenH3.1_(Drosophilidae),,,,7232.0,Drosophila navojoa,Arthropoda,Insecta,,,MIHSDSIADEESAFQTPEHDNETDYGLEFTTSRTSRTTRKCSTLRK...,,HISTDB_cenH3_Drosophilidae_0,teixeira_concurrent_2018
2285,HISTDB_cenH3_Drosophilidae_1,cenH3.1_(Drosophilidae),,,,7230.0,Drosophila mojavensis,Arthropoda,Insecta,,,MIHSDTIPDEESAFQTPEHENETDYGLEFTTSRLAELNAFPRRCST...,,HISTDB_cenH3_Drosophilidae_1,teixeira_concurrent_2018
2286,HISTDB_cenH3_Drosophilidae_10,cenH3.5_(Drosophila),,,,271509.0,Drosophila seriema,Arthropoda,Insecta,,,MRRSALQKSDDSDSDLSIAFGVDGVPCCSTTRKQKERLKQPQDPDA...,,HISTDB_cenH3_Drosophilidae_10,teixeira_concurrent_2018
2287,HISTDB_cenH3_Drosophilidae_11,cenH3.5_(Drosophila),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MSQANAQSSNGSLDESDLTAAFDLNVLGMLAIEQRCSTTRKQKQQL...,,HISTDB_cenH3_Drosophilidae_11,teixeira_concurrent_2018
2288,HISTDB_cenH3_Drosophilidae_2,cenH3.1_(Drosophilidae),,,,7263.0,Drosophila arizonae,Arthropoda,Insecta,,,MIHSDTIPDEESAFQTPEHENETDYGLEFTTSRLAELNAFPRRCST...,,HISTDB_cenH3_Drosophilidae_2,teixeira_concurrent_2018
2289,HISTDB_cenH3_Drosophilidae_3,cenH3.6_(Repleta),,,,7264.0,Drosophila buzzatii,Arthropoda,Insecta,,,MGRPAKNSAKTKTQKTQSDSIGSDDETAFQTPEHENETDYGLEFTT...,,HISTDB_cenH3_Drosophilidae_3,teixeira_concurrent_2018
2290,HISTDB_cenH3_Drosophilidae_4,cenH3.6_(Repleta),,,,271509.0,Drosophila seriema,Arthropoda,Insecta,,,MGRRRPAKNSTKTKTQTTQSDSIGSDDETAFQTPEHENETDYGLEF...,,HISTDB_cenH3_Drosophilidae_4,teixeira_concurrent_2018
2291,HISTDB_cenH3_Drosophilidae_5,cenH3.1_(Drosophilidae),,,,7244.0,Drosophila virilis,Arthropoda,Insecta,,,MRPRTVKNSTEKKKKSESHLDNVEDSYEKTAFQTPDREDETDYGLE...,,HISTDB_cenH3_Drosophilidae_5,teixeira_concurrent_2018
2292,HISTDB_cenH3_Drosophilidae_6,cenH3.5_(Drosophila),,,,7232.0,Drosophila navojoa,Arthropoda,Insecta,,,MLRSNMPKSDDSDSDLSIAFGLDGVPRCSTTRKQQKHLEESQNADG...,,HISTDB_cenH3_Drosophilidae_6,teixeira_concurrent_2018
2293,HISTDB_cenH3_Drosophilidae_7,cenH3.5_(Drosophila),,,,7230.0,Drosophila mojavensis,Arthropoda,Insecta,,,MRRSGLPNSDDSGSDLSIAFGLDGVPRCSTTRKQQKHLDKSKNADG...,,HISTDB_cenH3_Drosophilidae_7,teixeira_concurrent_2018


In [140]:
# Count, per publication id, the rows of `df` whose accession is in `new_accessions`.
df.loc[df["accession"].isin(new_accessions), "publication_id"].value_counts()

publication_id
teixeira_concurrent_2018    12
Name: count, dtype: int64

In [141]:
# Commit the open transaction on `conn`; statements executed through its
# cursor since the previous commit are persisted at this point.
conn.commit()

# <span style="color:black">Add sequences for nodes cenH3.1_(Culicidae), cenH3.2_(Culicidae), cenH3.3_(Aedes)</span>
    See nucleotide sequences below

## Alignments from the paper kursel_ancient_2020

### mosqCid untrimmed nucleotide sequences

```
>Culex_quinquefasciatus_mosqCid2_CPIJ018900
ATGCCTCGCCGCGGACCTGCACCGAAAAAGGCGGGCCCCAAACGGGGCGGACCAGCCCCCAAAAATACCAGAACCAAATCGCCAGTGTCCCCTCGTGTGCCACCTCCTCCCCCGCCCCCACCACCACCGCCGGCACAATCTCACCAGCAGCCCGTCTCCCAGCGGGACGTATTCGACGAGATGATGGGCTCGGAGATCAGCAGTGACAACTCTAGTCAGGAAGCCCCGCCCCGGGTTGCACTCCCTTCCAAACGCAAGTCACCTCGCTTCCAGGATGGCGCCGGCGCCGGAGCCGTCGCCAGCGACGACAGCTCCCTGTCGGAAGCGAACCCCGACAGATCCCGCCAGCAGCAGCCGCCGCACCGCCGCAAGGCCCCCGCCCCCAAAAAGAGCCAAACGGCGGCCCTCAAGGAGATCGCCAAGCTGCAGCGCACCACGAACCCCGTCATCCCGAAGTTGCCCTTCGCGCGGCTCATCCGGGAGATCCTGATGGAGTACAGCCACCGGGAGCTGCGCATCACGCCGGAGAGCTTGCAGTGTCTGCAGGAGTCGGCGGAGGTGTTTGCGGTGCAGCTGATGGAGGACGCGTACCGGTGCACGCTGCACCGCGACCGGCTCACGCTGATGCCCAAGGACATGAAGCTGGCGGTGATGCTGCGCAAGGATAGTGTGATGGTGTGA
>Culex_quinquefasciatus_mosqCid1_CPIJ008605
ATGCCGCGCCGCGTAAGAACCCCACCACGACGCATTCCGCCCCAACCATCGGCCAAGGACGGCCAACGTGCCGGTTCGTCCCGCAATCAGCCATCCCAACGAGACTTGCAGGAAGCTGGGCCATCCCGGGCAGGCACTCGGTCATCCCGCCGTTCGCGGTCCGAACCGCGACGTTCCGCCAACAGAGACGACAGCAGTAGCTCCAGCGAGGACGATCGTAGCTACCGGTTGCCCCGAATGTCCCGATCGCGCTCGGAGCAGCGCAACGCGCGGCCAACTCGACTCCACGGAGCTCGAGTTCTCCGGGAAATCACCCGTCTCCAGCTGACCACAGACCTACTGATCCCGAAGCTACCCTTTGCCCGGCTTATTCGCGAAGTTCTGCAGCAGTATTCTCAGCGGAACCTGCGAATAACCCCGGAGGCCTTGCTTTGCCTGCAAGAATCGTCTGAAATCTACCTGACGCAGATGTTCGAGGACGCGTACCGGTGCACGCTTCACCGGGAACGCGTTACGATGATGCCCAAGGACATGAACCTGGCGCTGTACCTGCGCGAACGGTGGGCTCGCTGA
>Anopheles_albimanus_mosqCid2
ATGGCGCCCCGAAAGAAAATAACGAAATCAACAAATAAAGCGCCTGCGCGAGCCGCGACCCGCGATGATACACCATCGCCAGAAACAAGCCAAGCCAACCCGATACCGGAATTCCGACAGTTATCCGCAGCAGAAGTAGCCGAAGCGATGGGCAACGAAACGGACAGCGATCTCAGCGAGGATGACCCAACGTACACAACCCAGTCGAAACCGAACTTCTCCTTCCTGCCTTCTAACAGGCACTCCAGCCCGCGGCGGGCCAACCGAAACAGATCTGGAAGCAGCGATGCTCCTTCGCGTTCTCCTTCGGCCGTTGTTCATCGATTGGCAAGCGCTTCGACGGTACCGAACTCAGAGCTCACGAGAACGCCAACAAGACGCAGCGGTCCACAAGGTGCAGAACCCGCCTCGGAACCCCAGCCGCCGGGCAGGGCACACAGGAGAAAACAAGACCGGCCGACAGACTGGAAGATTGTCAGGGAAATAGTGAAACTGCAGGCTGGCGTGAACAGTCTCATACCGAGGCTTTCTTTCGGCCGAGTGATAAGGGAAATCCTTAGCGAATATAGCGACAGTGGTATGAGGGTAACGGCCGAAATGTTGACCTGCCTACAGGAGGCCGCGGAAGTGTACATCGTGCAGATGTTCGAGGACGCTTATCGTTGCACGCTACACCGAGGGCGAGTCACATTGATTCCCAAGGACATGGAACTAGCATTATTGATCAGACGAGATGCCAGCTAA
>Anopheles_gambiae_mosqCid1_AGAP007508
ATGCCACGTCCGAAAAGTGCACCGAGATCACTATCGGAAAGAAAGGAGCGGAAAAGCAAAGCCCGAACATTGCGTAGCCAGGCCCAATCGGATCTGTTTTCTTCCAGCGAGGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTTCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGAAGCCAACAGACCGATGAGCCCAGAGCATCACGCAGCGCCGCCCGCCAGCATACACCTTCTTCCACAAGCGATGAAGAGGAGGAAGACGAGGAGGAGCACGATCCATCTCAGCGTAATCGACGAAGTCGCAGCAGCACACGCACACCGTCCGAACCCGTTGCTTCCACCTCACAGCGTCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAAGCCCAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAGCTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCTTGCACCGTGCCAGAGTGACGCTCGCCCCGAAGGACATTGAGCTGGTAATTATACTACGACGAGGGATCAAATAA
>Anopheles_gambiae_mosqCid2
ATGGCACCGCGGAAAAACACCAAAAAGCAACCGAAACCGACGGCACGCGCCCGGAAGCAAACGGTGGAACGTACACCAAGTCCACCGAAAAGTGCTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACCACCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCCACCGCCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGTATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCCGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCAGCATATCCGCCGCAGCCAACACTGGACGGAATTCAACCAAAGGGAAGCCTACCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAGGCTGCAAGGGACGGTGCACAATCTGATTCCCAAGCTAAGCTTCGGACGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCCCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTGATGCTGCGTCGCAATTGA
>Aedes_aegypti_mosqCid2_AAEL007783
ATGCCACCAAGAATCACGAAGAAATCAAAGAGTAAAAAACAGATCTCTGCAATACCCCAAGACCTGGAATTCATGCTGGGAGAGGAGATATCTTCACCACTGGACAGCCCACTTTCACCAACAGAGGCCGAATACTCGCTTCTTACTGCCTCGCCTCATGACGTGCTGGAAAATCTGCGATTGGTTGGCCTGGCTGACAACGAGACAACGAATGAAAGGGACCAATATGTGACGAGCAGTGGTAACTATAAGCCCAGATCAATTGTCGAGGTCATCCCAAAGCATTCTGTGCCTCAAATAATCAGTAACAAATCTATATCTAAAGGAAAAACAAAACAAAAGAAACCAGAACCATCGGATCGGAGAATCGAGTACTCCGAAGGCTTTACGCAATCTCAAACATCATCTGGTATAGAAAGAGAAATCAGTTCTAACCGTGAATCATCATTTGAAACGGACACTTATAGTTCAAACCTAACGGATTCAAGCACACAGGCTGTTAGACCCAACGATGCGTCCGAAAAACCTTCAAAATCGTCAAAAGGCCAGAAAACCAGTACCGATCGACGTAAAAGCTCGCCCACCAAGAGGAATGTTCCTCTAGGGTCAAAGAGGCAAACGATAGCCAATGATAGAGAACTTCAATTACTGCACAGCATCGCACGTCATCAAACGAGCACCGAATGTTTGATCCCGAAGCTCCCGTTTTCCCGATTGATCAAGGAAACTATGCAACAGTACTGTGGCAGAAACCTTCGCATTACTCCGGAGTGTCTGTTGTGCCTGCAGGAAGCGGCCGAAATCTATGCCGTCCAGGTGATGGAAGATGCCTACCGCTGCACGTTGCATCGAGGAAGGATAACGCTCACTGCCAAGGATATGAGACTGGCTTTGCTTTTGCGTAACGATAGTGTGATGATGTAG
>Aedes_aegypti_mosqCid1_AAEL009296
ATGCCTCGTCGCCAAAATAGACCCCCAACCCGCAATCCGCGAGGACTTGGCGCAGCACCAAGGAACGACTCTCCCGACAGAAGTGCACGCGCTTCGGCATCTTGCAACCAAAGGCGATCGCTGTCGGAATCGAATCTTCCCCGTTCAGCTGCTGCTGAAACGCCGCAAGCTGGAAGATCAAGGGCAGCCTCCGCGGTAAGAGGAACCAAGGTCCTGAACGAGATACGGCATCTGCAGCGGAGCACCGGTTTGTTGATTCCCAAGCTGCCCTTTGGACGGGTCATCCGAGAGGTCATGCTAGAATACAACGGGCGCCATCTGCGAATCACATATGATGCCCTGATGGCCATCCAGGAAGCGGCGGAGATGTACCTGGTGATGCTTTTCGAGGACTGCCAAAAGCTGGCCCTGCATCGACAGCGGGTCACGATTACCAAACGGGACATGGACCTGGCGCTTTACTTCCGGCTTTGA
>Aedes_aegypti_mosqCid3_AAEL009284
ATGCCTCGCCGAGTTCGTCCACCGACACGTTACCCAGCAGGAGGAAAACTTCAAACATTGACGACGAAAGGAAGCAAGACGAAAGCAGTTCCTGAACCACCAGCACCTAAATCTAAATCAAAACCATCACAAGCATCTGGCGCAAAATCGAAGGCACCAAGCTCACCAAAACAACAGAAAGCTGCAGGACCAAAACGGTCGGAACCATCGACGAGCTTACCCAAGCAACGGGAAGTTCCTAGTCCAGTGGAACCACACCCGAGAAGATCCCGCTCCGAGTCGCGCTTGTCCAGCAACAGCGGCGACGAAGATTTCCAACCCTCGATTCGCGTTCGAAATGCTTCCGAGTCGCGTTTATACAGAAGTCGCCAACAGATCGCTCTACAGGACATTTACCGGCTGCAATCGACCACCCAGTTGCTAATTCCAAAGTTGTCATTTTCCCGAGTCATCCGCGAAGTGCTGATGGAGTACATGTACCGGGACTTTCGTATCACGACCGAATGCCTCAATGCCCTCCAGGAAGCGTCCGAAATGTACCTGGTGCAGGTGTTTGAGGATTCGTACCGCTGCTGTCTACACCGAAACCGGGTAACGCTGGATGTGCCGGACATGAAGCTGGCCCTGTACCTGCGGGAGAAATGGCGCCCTTAG
>Aedes_albopictus_mosqCid1_AALF025877
ATGCCTCGACGCTGGGGAAGACAACCAACCCGCAATCCACAAGGACTGGGCACTGAAGAACAACCAAGCGACACTTCCTCCGACAGCGGTGCCTCCAATTCTCCGCCAGCTGCTGCTTCTCGTCAGACAAGAAGGCGATCATCGTCGGCACCCGCTCGTCGTAGTAGCAGAGCACAAGCCCCGGAACCACGGGCAGCCTCGGCGTTCAGAGGTACCAAGGCGCTGGCCGAGATTCGACACTTGCAGCGAACGACCGATATGCTCATTCCCAAGTTGCCCTTTGCCCGGGTTATCCGAGAAGTTATGCTGGATTACAGTGGCCGCAATCTGCGTATCACAGCGGAAGCCCTGATGGCCGTCCAAGAGGCAGCGGAAATCTATCTGGTTATGCTGTTCGAGGACTGCGAGAAGTTGGCATTGCACCGGCAGCGGGTGACTATTACCAAGCGGGACATGGACCTTGCGGTGTACTTCCGGATTCATTGA
>Aedes_albopictus_mosqCid3_AALF025880
ATGCCTCGCCGAGTTCGTCCGCCTCAACGACACGTGACTGCAGCAAAACTCTCAACATTGAAGCCCAAAGCGGCAGCGGAAAAAGCAGCAGAAGCAGCGCCTGAGCCGCCAGCAAAACCAGCAAAGGCGCCGAGCCTACCCAAGCAACAAAAAGTTTCGGTACCACCGCGAGCAACTAGACGATCCCGCTCCGAGTCGCGAATTTCCAGCAACAGCAGTGACGACGACTACCAGCCATCGATTCGCGTCCGAAATGCTTCCGAGTCGCGATTCGACCGCAGCCGGCAGGATGTGCAAATTCTGCAGGACATTCACCGATTGCAATCGACCACGCAGCTGCTGATTCCGAAGTTGCCTTTTGCCCGGGTCATCCGGGAAGTGCTCATGCAATACATGTACCGAGACTTTCGCATCACCCCGGAGTGCCTGTGCGCCATTCAGGAAGCGGCCGAAATGTACATGGTGCAGGTGTTTGAAGACTCGTACCGGTGCTGCCTGCACCGAAGCCGGGTTACTCTGGGAGTGCCGGACATGAAGCTAGCCCTGTATCTGCGGGAGAAATGGCGCCCGTAA
>Aedes_albopictus_mosqCid2_AALF012514
ATGCCACCAAGGATCACAAAAAAATCAAAAACTAAAAAACAAAAATCTGCGATACCCCATGATCTGGAATTCATGCTCGGTGAAGAAATTTCTTCGCCTCTGGACAGCCCTGTTTCTCCAACAGAGGCAGAATATTCGCTTATCACTGCACCACCTCGAGACGTGCTGGCGAACCTGCGATTGGTTGGCCTGGCTGGGAACGATACATCGAGTGGAACTGGATCCACATCATCGTCCAGCAGTGACCAAAGGGATCAACTTGTAACGAGTATTAATAATTATATGACTAGATTTGTGCCAGAGGTTATCCCAGAGGAACCTGAGCCTCGAACAGCTCGTGCGAAACCCAAATCCAAACAAAAAACTACACAAAAGAAAGCAGAACGACCGATTGATCATTTCGATGACGTATCGGGATCTCTTAGTGATGAATCAATGATAAATGCTAGCGATCGCCCATCTTATGTAGAGAATAATCCAGGAAATCGCGGTAGATCATCAGAAAACGAAACCCGTTCTAATGGTGATGATACAAATGGCTATACGAGTCCACAAAATAACAACTATTTGGCTCCAGATATGACCTATTCAAGCACGCAATCTAATGTTCCGAGCATCACCTCCAGGAATGCTTCAAAGTCATCAAAGGCTAAGAAATCCACCGGTTCAGTTAGACGCAAGAGTTCACCCCCAAAGAAAAATACCTCGGCTGGCCCCAAAACCCAAACAAGGCAAACCATTGGCAACGATCGGGACGTTAAACTGCTGCAGAACATCGCACGTCTCCAGGCGAGCACAGAATGTTTGATTCCGAAGCTTCCCTTTGCCCGATTGATCCGGGAAACCATGCAGATGTACTGCGGGCGCGATCTGCGGATAACACCCGAGTGTCTCCAGTGTCTGCAGGAGGCAGCCGAAATCTACGCCGTACAGGTCATGGAAGATGCCTATCGGTGTACGTTGCACCGCGACAGGATAACGCTCACGGCCAAGGATATGAAGCTGGCTTTGCTGCTGCGAAACGATAGTGTGATGATGAATATGTAG
>Anopheles_arabiensis_mosqCid1
ATGCCACGGCCGAAAAGTGCACCGAGATCACTATCGGAAAGAAAGGAGCGGAAAAGCAAAGTCCGAACATTGCGTAGCCAGGCCCAATCGGATCTGTTTTCTTCCAGCGAGGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTGCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGAAGCCAACAGACCGATGAGTCCAGAGCATCACGCAGCGCCGCCCGCCAGCATACACCTTCCACAAGCGATGAAGAGGAGGTAGACGAGGAGGAGCACGATCCGTCTCAGCGTAATCGACGAAGTCGCAGCAGCACACGCACACCGTCCGAACCCGTTGCTTCCACCTCACAGCGTCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAAGCCCAGAATAGCACCACTATTAAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAGCTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCTTGCACCGTGCCAGAGTGACGCTCTCCCCAAAGGACATTGAGCTGGTAATAATACTACGACGAGGGATCAAATAA
>Anopheles_arabiensis_mosqCid2_AARA014434
ATGGCACCGCGGAAAAACACCAAAAAGCAACCGAAACCGACGGCACGCGCCCGGAAGCAAACGGTGGAACGTACACCAAGTCCACCGAAAAGTGCTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACCACCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACACAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCCACCGCCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGTATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCCGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCAGCATATCCTCCGCAGCCAACACTGGACGGAATTCAACCAAAGGGAAGCCTACCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAGGCTGCAAGGGACGGTGCACAATCTGATTCCCAAGCTAAGCTTCGGACGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCCCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTGATGCTGCGTCGCAATTGA
>Anopheles_atroparvus_mosqCid2
ATGGCACCCAGGAAAATCGGCAACACAAAAAACGGGAGACCTAAAAAGACGACGGAACCGCCACAGCCTGAAGTACCGTCCGTGGCCAGTGCCCGCGACGGAAACCGTCTATTCATCCCTTCGAACGAAGGGTTAGAACACATGATGGGCTTCGATGTGGAGGACAGCAGTGACTTGTCCGACGATGCTACATATCAATCGCAACCAAACTTTTCCTTCCTACCGTCCCACAAACACTCGAGCCCACGCAACTCAAAGAAAAAGAACATTCCGGCGGTGCACCGACTGGCCAGCATGTCAACTGTGCCAAACTCAGACTTGGCTCAACCGAACACATCCAATGAACCGTCTAGTGGTTCCAGAAATGTTTCCTCAAATTCCTCACCTAGCGTGCCAAAGCCACTCTCAACACCGCCGACCAACAAAAAGGCGGAAGCGAAAAAGCAAAATCGAAAGCAAAAGACTCCCACCAAGCTTAAAGCATTAAGAGAAATCGTTCGGCTCCAGGGGACGGTGACAACGCTAATTCCAAAACTGTCTTTCGGGCGAGTCATTCGGGAGATATTGGCAGACTACTCTAATAGCAATCTGAGGGTGACGGTCGACATGTTACAGTGTTTGCAGGAAGCTGCAGAAATCTACATAGTGCAATTGTTCGAGGACGCCTACAAGTGTACCGTTCATCGCGGGCGTATAACCTTGATCCCCAAAGATATGCATTTAACACTAATGATCCGACGCGAGTCGTAA
>Anopheles_atroparvus_mosqCid1
ATGCCACGGAGAAAGAGTGTGCCACGAGCATCGCATCAAAGGGACGAGAGGAAAACACGAAGCACAACATCAAGGAACAGTTCACTCAACTTGAGCACGGACAGTTCGCCCTCGGACACGGAATCGCATCGCGCATCTCGGTCGCCGTTGAATCGCAGCAACAGCAGTGCAGCCTTGGGACCAACGTCGACGACGACGGCAGCTCAACCTAGCCGTAGATCCGTTTCTGCAGGTCCTCCTACTTCATCGAGAAGAGGACCAAAACTGCCACCGCTCCAGAAAGAGATGTGGAAACTGCAAAATAGCACGAAGCTTCTTATACCTAAATCGAGCATTTGTCGAGTGATACGTGAAGTAATGCTCTCCTACGGACAGTACAGAATAACGTTAGATGCGCTCGCTGCCCTACACGAGTCGAGCGAAATGTACTTGGTGAATCTATTCGAAGCATCGCACCGGTGCGCCCTACACCGCCAACGGGTTACGTTGATGCCGAAGGATATGCAGCTGGCGCTGTTTCTGAGGGGCGACGGGTGA
>Anopheles_chrysti_mosqCid2_ACHR014087
ATGGCACCACGAAAAAACACTAAAAAACAATCCAAGACGAGTGCCGGCGTCAGACAGCAAGCAACGGAACGTACTCCAAGCCCACCACGTAGAAGTCCCGTTGAAGAGCCAGCCTTTCGGTCGCTTAGAACCGTGAATGAGCTTTGTGATGTGATGGGTGACGAATCGGCAAGCGGCAGCGATATGGAATCGTACCGGGACAATACATCCCAATCCCGTCCAAACTTTTCCTTCCTGCCCTCGCACAAACATTCCAGCCCAAACCACCAAAACTACAGACAGGCCAAACAACCGCCAGCTACCGTACATCGACTCACCAGTATGCCCACCGTGCCCAACACAGGGCTAAATCACGAGGATACTGAGTCACCAGATCGAAGCAATCGTCATGGTACATCGTCCAGCAGCAGTATGTCTACCTTTACCAAAAGTGGACGGAATCATGAGGACACTCAACCGCCCGGTCCAAGCTCTAGAAGTCGTAAGACATCCAGAAGCGAGCGAGGCAACAGCAGCAACATTGGCCAGCCTACAGCTAGCTCCAGTGCTCCGCCAACATCGCAACCCGCGCGTCGAAAGCAGAAAACCCCTTCCAATCTTCAAGCGCTGAAAGAAATCCATAGGTTGCAAGGGACGGTACACAATCTGATACCTAAGCTATCTTTCGCACGTTTGATACGCGAAGTATTGAGCGAATATTCGCATCGACAGTTGAGGGTGACCGTGACGATGCTGGAATGTTTACAAGAATCGGCCGAAGTGTATTTAGTGCAGCTATTCGGCGACTCTTATCGGTGCACACTTCACCGGGAACGAGTGACTCTTATGCCCAAAGACATGCAATTGGCTGCAATGCTTCGGCGTGACTGA
>Anopheles_chrysti_mosqCid1
ATGCCACGACAGAAAAGTGCTCCAAGATCAATGTCGCGAAAAGCAGAAAAGAATAACGAATCCAGATCATCACGTAGCCGAGGCCGGGAGCTTACAACTTCTTCTGAAAGTGATGAGGAGGGAGAGGATGCGTCTCAGCGTAACCGACGCAGTCTAAGCAGCACCAGTTCGCTCTCTCCTAGATCTACTGCCTCAGGAACAACACGCCGATCGCGGTCAGTCGATCTACGACGTGACCCCAGAACATCACGTAGCCAATCCCGGCAGCTTACGCCTTTCAGTGATGAGGAGGAAGAAGATGCATTTCGGCGTGACCGACGCAGTCGAAGTATCACCAGTTCGTCAAACTCCGCAGAGCCTGTAGCCTCCAGCTCACAACGTCGATCATTGTCAGCCGATCCCCCGCTTCGTCCAACTTCGAAAAATGTGCCCCGGTCCAAAAAGGGAAAACAACGAGTAGCACCATTTTTAAAGGACATCCTTAAACTTCAGCTAACCTGGAACATGCTCATTCCGCGGGCGGCCTTTGGAAGACTGGTGCGTGAATTGTTTGACTATCGGTATCGCATCACGCCACAGGCACTGGAGGCAATGCATGAATCGACAGAACTATTTATGGTGCAACTATTTGAGGATGCGTACAAGTGCTGCCTGCATCGTGCCAGAATAACGCTGTCTCCGAAGGACGTCGAGCTGGTAATTTTACTAAGAAGAGGAATAAAATAA
>Anopheles_coluzzi_mosqCid1
ATGCCACGGCCGAAAAGTGCACCGAGATCACTATCGGAAAGAAAGGAGCGGAAAAGCAAAGCCCGAACATTGCGTAGCCAGGCCCAATCGGATCTGTTTTCTTCCAGCGAGGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTTCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGAAGCCAACAGACCGATGAGCCCAGAGCATCACGCAGCGCCGCCCGCCAGCATACACCTTCTTCCACAAGCGATGAAGAGGAGGAAGACGAGGAGGAGCACGATCCATCTCAGCGTAATCGACGAAGTCGCAGCAGCACACGCACACCGTCCGAACCCGTTGCTTCCACCTCACAGCGTCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAAGCCCAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAGCTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCTTGCACCGTGCCAGAGTGACGCTCGCCCCAAAGGACATTGAGCTGGTAATTATACTACGACGAGGGATCAAATAA
>Anopheles_coluzzi_mosqCid2_ACOM030600
ATGGCACCGCGGAAAAACACCAAAAAGCAACCGAAACCGACGGCACGCGCCCGGAAGCAAACGGTGGAACGGTCCCCAAGCCCACCGAGAAGTGCTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAACGTGATGGGCGACGAATCGGACGACCAGACAACCACCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCAGCCGCCGCCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGCATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCCGAACCCGGTCCCAGCTCCAGCCGCCGAAGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCAGCATATCCGCCGCAGCCAACACTGGACGGAATTCAACCAAAGGGAAGCCTACCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAGGCTGCAAGGGACGGTGCACAATCTGATTCCCAAGCTAAGCTTCGGACGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCCCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTGATGCTGCGTCGCAATTGA
>Anopheles_culicifacesA_mosqCid2
ATGGCGCCCAGGAAAAACAACGCAAAAGCAACGAAAGCAAAAAAACCACCCACAAGGGAACAGCCCAGATCACCATCCACCTTGGCAGAAGAATCGAATAGAAGCAGTCGAGCTAGGGAATACCGCTCCTTGAAAACAGCGGATGAGCTTCGCGATGTAATGGGCGCTGAATCGGACAGTTCCATTAGCAATATCGAGAATGATTCGTACCGCAGCAACACCATTCAATCGCGTCCAAACTTCTCGTTTCTGCCCTCGCACCAACATTCCAGCCCCAATAAAAACCACAGTCCGAAACGGACCAACGCTGCTGCTCCACATCGACTCGCCAGTATGCCCACCGTGCCCAACACGGATCATGAGGAAAACGATTCCACTTCCGCCGCTAGTACGCCCCGTAAATCATCACGCAAACCATCACCAACGAAAAGCAAAACAAAAAACCGTCAACAGGCACAGTCAAGCAATGATAGGCAAGCGCATACCCGAAAACAGAGAGCACCCGGACAACTGAAGGTACTCAAAGAGATAATAAATCTCCAAGGCACAGTGCACAATCTGATACCCAAGTTATCCTTTTCGCGTGTGATTCGAGAGGTCTTGTCCGAGTATTCGACTCGTTCCCTGAGGGTCACCCCACAAATGCTACTCTGTTTGCAGGAAGCGACGGAGATATATTTGGTGCAACTGTTCGAGGATTCCTATCGGTGTACGCTCCACCGAGACCGTGTAACGCTTATTCCCAAAGACATGCAATTAGCTTATATGCTTCGGCGAACTAACACTTAA
>Anopheles_culicifacesA_mosqCid1
ATGCCGCGACGGAAAAGTGTGCCAAAATCAGTACCACAAAGCAAAAACGATTCCAACGCTACACGCAGTCGTTCCAAAATCGATTTGGAACATGCCGCACAACGAAGCCGTCGCAGCCGTTCTACTGAGAGGGAGTTAGATGATGGCAACCGTGGCGGATCACACCGTAGATCGATATCGGCAGAAGCAGCACGTTCCACCAGCCGAGACAACCCAAGAACGACTAAAAGCTCACGGATAGCACCGTTTCTAAAGGAAATGCTTCATTTGCAACAAACGTACCACACGCTCATCCCTAAGGCAGCTTTCGGAAGGGTAGTACGGGAACTGTTTGACAGTCAGTATCGGATCACTGCTGAAGCCTTCGCCGCATTGCATGAAGCCTCTGAAATGTACTTGGTGAACCTGTTTACTGATGCCTACTTCTGCTGCCTGCATCGAACTAGGGTTACGCTAACCCCGAAAGATATGCAGCTGGTTCTTTCATTACGAAAACCATTCAACTAA
>Anopheles_darlingi_mosqCid2
ATGGCGCCACGGAAAAAAACAACGAAAGCCACGAGGGCAACGCCGGCACGGCCTGCTGCAGACGCCCGCGAGGAAACGCCATCGCCCGAACCAAACCAAGCCAACCCGATACCGGAATTCCGACAGTTAACCGCCGCGGAGATAGCGTCGGCGATGGGCAGCGAAACGGATAGTGATCTCAGCGAGGACGACCCAACGTACACCACCCAGTCGAAACCGAACTTCTCCTTCCTGCCCTCGCACAGGCACTCCAGCCCACGGCGGTCGGCATCCGGAAACCGTTCTGGAAGCAGCGATCCAGCTTCGCGATCTCCCGCCACCGTAGTCCATCGGTTGGCCAGCGCTTCAACGGTACCGAATTCAGATCTCACAAGCACGCCCACTAGACGCACCACTCAACAAAGCGCAGAGCCCGCATCGGAACCCCAGCCACGGGGCAGACCTAATCACAGGAGAAAACAAGATCGTCCGACAAACTGGAAGACCATCAAGGAAATCATCAATCTGCAGGCGACCGTGAACACTCTCATACCGAGGCTCACTTTCGGCCGAGTTATACGGGAAATCCTTACCGAATACAGCAGCAGCGATATGCGGGTGACGGCCGAAATGTTGACCTGTCTACAGGAGGCCGCGGAAGTGTACATCGTGCAGATGTTCGAAGACGCCTATCGTTGCACGCTACACCGAGGGCGAGTAACATTGATTCCCAAGGACATGGAGCTAGCGTTACTGATCCGACGAGATGCCAACTAA
>Anopheles_dirus_mosqCid1
ATGCCTCGACGCAAAAGTGTCCCAAGATCTCTCCCCCGACGAGAAACAGTAAATACGCAGAAACCCAATGAAGCTCGAGCATCACGCAGCGCCTCCCGACGGGAAGCGTCGTCGGAAAGTTCTAGCGAATCTCCATCCACTTCACGACGCAGCGTAAGCAGCACGGATTCACGCAGCACCCCGCAACGGGATAAATCCGATCGCTCGTCATCGAGGCGATCCTCATCGGCCGAATCACCGCGTAGTCGTCCTTCACCCAGACTCGCAAAATTTCTTAAGGAAACTCTAGCACTACAGTCGTCGACGCATTTGCTCATTCCGAAGGCATGCTTCGCCCGGGTTTTGCGAGAACTGCTAGACGGGCATCGGATCACATTTGAGGCGGTTGCCGCACTACATGAAGCGACCGAAACCTATCTGACGCAGCTGTTTCAGGACGCGAACATGTGCGCCCTGCATCGGACCAGAGTAACGCTTATGCCGAAAGATATCGATTTGGTGCTATTCCTAAGGCGTCACTGTGTTTAA
>Anopheles_dirus_mosqCid2
ATGGCACCAAGGAAAAAAGCACAAAAACCCAGTGCATCATCAACACACAAGAAACCAAGCCCGAAACCTCCACGAAACACAAATCCACAAGCTGAATCTTCAAACTCGAGCGTATCAAGGAATGGAACCAGAACCTCGAGATCGACTAGTATTATGAGTAATCCTATGGGTGATGTATCGGATATATCCTCTTGTAGTAGCGTAGAGCAAAGAAATGAGGAACCACAGCCTGAACCCTCAAACCCAACCGGAACAGCGACATTCTTCGGAGCCATGAAATCGACCGCTACTCTGAGCGAAATAATGGGTGATACAACAGATTCATCTTCCAGCAGTAACGTGGTGCAAGCATCTAACCAGGGTAGAGCTACGGATGAGGAAAGCGTGTCGTCCGAAGAAGAACCGAGCAGCAATCCGGCCACAAAAACAACGACCCCCGGACCCAAAAGCGCACCAAAACAACAGCGACGGAAACGAAAACAACCAAACAAGCTGAAAGTGCTGAAAGAAATGCTACACCTCCAAGGCACGACGCATCTTCTTATTCCTAAGCTAAGCTTCGGGCGTGTGATACGTGAAATTTTGTACGAATATTCACCGAACGGGATAAGGGTGACGCCCGAAATGCTAATGTGCCTGCAGGAAGCGGCTGAGATGTATACGGTGCAGCTCTTGCAGGACTCCTACCGGTGTACGTTCCATCGGGATCGGATAACCCTACAACCGAAGGATATTCAGCTGGCTCTAAGTCTTCGGAGGGAGTTATGA
>Anopheles_epiroticus_mosqCid1_AEPI009159
ATGCCACGACGTAAAAGTGTACCAAGATCGCAATCGAAGCGAGAAGAGCAGAAAAACATGGCCAGAGCATCACGTAGTCGATCGCGGCAGCTTTCGTCTGATTCCAGCTCAAGCGAGGGTGACGAGAGGGAGGAGCAGGAAGCGTCCCAACGTAACCGACGCAGCCAAAGTAGCACCAGATCACACACGCCGGAGACGAGTGCCTCTAGCTCACAGCGTCGATCACTGTCTGCTGACCCACCACGTTCGCGAGCGAATGCTGCGCCCCAGTCCCGAAACCAGGGCCATCGACGCATCGCACCATTCCTGAAGGAAATGCTACACCTACAGCAAACCTGGCATCTACTCATTCCAAAGGCAGCATTTGGACGCGTCGTGCGAGAGGTTTTCGATAACCGGTTTCGCATCACGACCGAGGCACTGCGTGCATTGCATGAATCGTCGGAAGTGTTTCTCGTGCAACTGTTCGAGGATGCGTACAAGTGCTGCATGCATCGAGCAAGGGTAACGCTGTCACCGATGGACATCCGGCTGGTAATCGACTTAAGGGGCGGAATCAAATAA
>Anopheles_epiroticus_mosqCid2_AEPI014069
ATGGCACCCCGTAAAAAGGATAACAAGCAGCCCAAACCACGGGCCCGACAGATAACCCCGGAACCTACCCATCATCCACCGAGACAACCCAATTCCGATGAACAATTTCGGTCGTTAAAACCAATAGACGATTTGCGTAATGTAATGGGAGAGGAATCTGACAACTCGGCGGCTACCGGCAGCGAGATGGAATCGTACCGTGACAATACATCCAATTCACGTCCAAACTTTTCCTTTCTGCCGTCCCACAAACACTCCAGCCCAAACCACAACGACAAACGGGCCAACGCACCTTCCACCAACGTACATCGTCTTGCCAGCATGTCCACGGTACCCAATTCTGGGCTGGATCACGAGGACACAGAATCTCCCAGGCCTACCACCAGTGCTCGCCGACCGTCCCGAAACGAACGTAGAGAGACTAGCAGTTTTAACGCTACAAACACCAACAGGACCACTACAACGCCAGCATCCAAGTCCGCTCAGCCTCACGGACGAAAGCAAAAAACACCCAGTAAGCTGAAGGTTCTGAAGGAAATCATTGATCTCCAGGGCACAGTGCACAATATCATTCCCAAGCTGAGTTTCGGACGCGTGATTCGTGAAGTTTTGAGCGAGTATTCGGACCGACCGTTAAGGGTGACCGTACAAATGTTGGAGTGTCTGCAAGAATCTGCTGAAATATTTCTGGTCCAGCTGTTCGAGGATTCTTACCGATGCACACTTCACCGCAATCGAGCGACGCTCATCCCCAAAGACATGCAGCTAGCTTACATGCTTCGCGGCAACTGA
>Anopheles_farauti_mosqCid1
ATGCCTAGACGTAAAAGTGCCCCGAAGCAACTCTCCAGACGAGAAGAAGATAAAGCCAGAGCCGCACGCAGCACTTCCAGACGCATGGCATTCGATAACGAGGAAACTTCGTCATCCCATTCACGACGCAGCAGTAGCGCTTCAAATTCAACTGCCACTTCGCACAACAGCGCCTCGCAGTCGCCAGGGAGGCGAAGCTCATCGGTTGGCCCACCAGCACGTTCTTCCATTGCACGGATGAGGCGAGAGGATCCGAAACTCGTAAAATTCCTAAAGAGTATGTTACATCTACAGGGCACAACGAATTTACTCATTCCGAAAGCCGCGTTCGCACGGGTATTACGTGAGCTACTCGACGGATACAGGGTAACGTTGGAGGCTGTCATGGCGCTTCACGAAGCGGCTGAAACCTATCTGGTTCAGCTGTTTCAAGATGCAAACCAGTGCGCCATGCACCGGGCCAAAGTAACGCTAATGCCGAAAGATATCGAATTGGTTCTATACATAAGGGCTCATACCTCGAGGTAA
>Anopheles_farauti_mosqCid2_AFAF009137
ATGGCTCCAAGGAAGAAATCGACAATGAAAGTGAAATCTAGCGAAACAACGAAAGGGAAGACAGGTACATCGAACGTTGCAACCGCCAGCAGGAGCGACGTGGGACAAATGACAAATAATAAACCCGTCTCCTTGAGTGATATTTTGGGCTCCTCAATAAGTACATCGGACAGCAATAGTACGGACTTATCCAACCGGAGCTTACTTGCAAATGAGGAGAACGTATCGATTGAAGAACCATATGGTAGCAACAATCGAGGAAGCTACGATCGGAGCAACAACACTACGAATGAGAATCTAGACGAACAACCGTCGAACAGCGCTCATACCACGAACTCTGTAACCGCCAAACCGAAAAAAAATCGACGGAAACCTTCAAAACCCTCAGACTGGAAGCTGATAAAAGATATGTTACATCTGCAGGGTACGGTGCACTACCTCATTCCCAAGCTAAGTTTCGGGCGTGTAATACGTGAAATCTTATCCGAATTTGCTCCCACCGGGCTGAGAGTGACGCCGCAAGCGCTTGAGTGTCTACAGGAATCGGCTGAGCTCTACACGGTGCAGCTTTTCCAAGACGCTTACCGCTGCACGTTCCATCGAGACCGGATAACGCTGCAACCGAAGGACATCCAGTTAGCTCTAATGCTTCGTCGAGAGCTGTAA
>Anopheles_funestus_mosqCid1
ATGACGCGACGCAAAAGTATACCCAGATCGCTGTTAAAAACTGCGCAAAGCAAAAGCGAATCCAGAAATACACGTAGCCAATCCCGGACGGCCAGCCAACCAGCCTCGGAATCGTCATACAGTTCACAACGAAGCCGTCGCAGCAGAAGTTCTTCGGACACGCAGTCCTCAGACGGAAATGTTCGTGGATCATACCGTAGATCGATATCGGCTGACATGGAACCTTCCTCTAGCAACAATAACCCAAGATCGGCTAGAGCTCCACGCATTGCCCCGTATCTCAAGGAAATGCTTCATCTGCAACAAACGTACCACATGCTCATCCCAAAGTTAGCTTTTGGAAGGGTGGTACGGGAGTTGTTTAACAATCGGTATCGGATCACAATGGAAGCTCTCACCGCATTGCACGAAGCTGCCGAAATGTACTTGGTGCACCTGTTTACTGATGCCTACATGTGCTGCATGCATCGGTCTAGAGTTACTTTAAGCAAAGAAGACATGCGGCTGGTTCTTTTAATACGAAAATCTACAATCTAA
>Anopheles_funestus_mosqCid2_AFUN003801
ATGGCACCAAGAAAAAAGACTGCAAAAGTAAAACCACCAGCCAAACGGAACATTCCTAATGCGTCCTCAGAAGAAACGGGAAGGGCCGAAGGAAAGACAGATTACCGCGCCCTTCGAACAGCTGAGGAGCTTCGCGATGTAATGGGTGCAGAATCGGACGATTCCCTAAGCAATAGCGAGAATGAATCTTTCCGCAACAATACGAATCAATCGCGCCCGAATTTCTCTTTCCTACCGTCTCACAAACACTCCAGCCCGAATAAAAACTACAGCCCGAAACGGACCAACGTTGGTTCAGCTCATCGACTCGCCAGCATGCCCACCGTGCCCAATACAGATCACGAGGAAACCTCTTCTACTGCCAGCACTAGCGCAACTCGTAAAACACCACGCAAATCAACAGCAGCCAGAGAAACGGTCCAACCGAGCACTTCAAGACAACCGCACATACGGAAACAGAAAATACCCGGCCAACTGAAGGTTCTTAAGGATATAATAAACCTTCAAAGTACGGTTCATAATCTGATTCCAAAGCTGTGTTTTGGACGCGTGATTCGCGAGATCTTATCCGAGTATTCGAACCGCTCGCTGAAGGTAACTCCAGATATGCTGCTTTGTTTGCAGGAAGCGTCGGAAATATATTTGGTACAGCTGTTCGAGGACGCCTATCGGTGTACGCTTCACCGGGACCGTGTAACGCTTATTCCCAAAGACATGCAACTAGCTTTCATGCTTCGGCGCAATTAA
>Anopheles_melas_mosqCid2_AMEC001928
ATGGCACCGCGGAAAAACACCAAAAAGCAACCCAAACCGATGGCACGCGCCCGGCAGCAAACGGTGGAACGTACCCCAAGCCCACCGAGAAGTGCTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACGGCCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGGCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGCATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCCGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCAGCAGCAGCATATCTGCCGCAGCCAACACTGTACGGAATTCAACCAAAGGGAAGCCTTCCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAAGCTGCAAGGGACGGTGCACAATCTGATTCCCAAGCTAAGCTTCGGACGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCGTTGAAGGTGACTCCGCAGATGCTGGAATGTCTGCAGGAATCGGCCGAAGTATACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTGCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTAATGCTGCGGCGCAATTGA
>Anopheles_melas_mosqCid1
ATGCCACGGCCGAAAAGTGCACCGAGATCACTGTCGGAAAGAACGGAGCGGAAAAGCAAAGCCCGAGCATTGCGTAGCCAGGCCCAGTCGGGTCCGTTTTCTTCCAGCGAGGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTGCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGCAGCCAACAGGCCGATGAGTCCAGAGCATCACGCAGCGTCTCCCGCCAGCATACACCTTCTTCCACAAGCGATGAAGAGGAGGAAGACGAGGAGGAGCACGATCCGTCTCAGCGTAATCGGCGAAGTCGCAGCAGCACCCGCGCACAGCCCGAACCCGTTGCTTCCAGCTCACAGCGCCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAGGCCGAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAACTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTATCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCCTGCACCGTGCCAGAGTGACGCTCGCCCCAAAGGACATAGAGCTGGTAATTATACTGCGACGAGGAATAAAATAA
>Anopheles_merus_mosqCid1
ATGCCACGGCCGAAAAGTGCACCAAGATCACTGTCGGAACGAGAGGAGCAGAAAAGCAAAGCCCGAACATTGCGTAGCCAGGCCCAATCGGGTCTGCTTTCTTCGAGCGACGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTGCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAATATCTACACAATCACAATCACAAAGCAGCCAACAGGCCGAAGAGTCCAGAGCATCACGTAGCGCCACCCGCCAGCATACACCTTCTTCCACAAGCGATGAAGAAGAGGAAGACGGGAAGGAGCACGATCCGTCGCAGCATAGTCGACGAAGTCGCAGCAGCACCCGCACACCGCCCGAACCCGTTGCTTCCAGCTCACAGCGCCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAAACCAAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCATCATCTCATCCCGAAGGCGAGCTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCAATGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCATACAAGTGCTGCCTGCACCGTGCCAGAGTGACGCTCGCCCCAAAGGACATTGAGCTGGTAATTATACTACGACGAGGGATCAAATAA
>Anopheles_merus_mosqCid2_AMEM014318
ATGGCACCGCGGAAAAACACCAAAAAGCAACCCAAACCGACGGCACGCGCCCGACAGCAAAGGGTGGAACGTACCCCAAGCCCACCGAGAAGTGTTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACGGCCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCCGCCGCCGCCGCCGCTGACCAACCTCCAGCAACGGTACACCGGCTCACCAGCATGTCCACCGTCCCCAACAATGGACTGGAACACGAGGACACTGACTCCGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCCGCATATCCGCCGCTTCCAACACTGGAGGGAATTCAACCAAAGGCCAGCCTTCCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAAGCTGCAAGGGACGGTGCATAATCTGATTCCCAAGCTAAGCTTCGGGCGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCTCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTAATGCTGCGGCGCAATTGA
>Anopheles_minimus_mosqCid1
ATGCCGCGTAAAAATGAATCCAAGACGGTGCAAGGAAAAACTAACGATTCCAAAGCTACACGTAGTCGTTCCAGAATCGAACAGCCGGACACCTCACAACGAAGCCGGCGCAGTAGAAGCAACGATGAGACGCCGTCTACGGGCGAAAATAGTCGAGGATCACACCGTAGATCGCTATCGGCTGATAAATATTCCACTGAACAAAGCAAGAGAACGCCAGGAGCTCAAAGAATAGCACCTTACATAAAGGAAATGCTTTATCTACAGCAAACATTCCACTTGCTCATCCCGAAGGCAGCTATCGGAAGGGTAATACGGGAACTGTTTGACAATCAGTTCCGGATAACACCAATGGCCTTGTGCGCATTGCATGAAGCTGCCGAAATGTATTTAGTTAACCTGTTCGCTGATGCTGACTTGTGCTGCAAACACCGCTCAAAAGTGACGCTAAAGCCGGATGATATACGGCTGGTTCTTTCCATGCGGAAAGGAAAGTAA
>Anopheles_minimus_mosqCid2
ATGGCGCCAAGAAAACCCAACAAAAAAGTGACGAAACCAGGAAAACTGCCAACCAGACAACAAAGCCCTTCCGCTTCAGATGAAGAAAAAAACGATCAAGGTAGAGAATACCGCTCCCTAAAATCAAGGGAAGACCTCCGTGATGTGATGGGCGCCGAATTGAACTATTCCCAAAGCGATGGCGAGAATGATTCCTACCGCAGCAACACCAATCAATCGCGTCCCAACTTTTCCTTTCTGCCGTCACACAAACACTCCAGCCCGAAACGAATCAACGCAGGAGTTGCTCATCGACTCGCCAGTATGCCAACCGTGCCCAACACAGATGACGACGAAAGCGACGCCACTTCCACTTCTACTTCCAACTCTGTCAGTTCGCCACGAAGATCGCCAAGAAAAGCAGCCGGAGCAAATAGCAAAACTAAGGACCGAGAGCAGGCACAACCGAACACTGCAAAAAAACCTCATTCCCGGAAGCAGAAAACACCCAAACAAATGAAACTAATCAAGGAAATAATAAATCTCCAAGGCACGGTACATAATCTGATTCCCAAGCTGAGCTTTGCGCGCGTGATTCGGGAGATCTTGCATGAGTTTTCGAACTGTTCGCTGAGGGTCACTCCAGAGACGCTACTGTGCCTGCAGGAAGCGACGGAGATATATTTGGTACAACTGTTTGAGGATTCTTATCGGTGTACGCTTCACCGAGACCGTGTGACGCTTATGCCCAAAGACATGCAGCTAGCTCATATGCTTCGGCGAGGCAATAGTTGA
>Anopheles_quadriannulatus_mosqCid1_AQUA004414
ATGCCAAGGCCAAAAAGTGCACCAAGATCACTGTCGGAAAGAGAGGAGCAGAAAAGCAAAGCCCGAACATTGCGTAGCCAGGTCCAATCGGGTCTGCTTTCTTCCAGCGACGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTGCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGCAGCCAACAGGCCGATGAGTCCAGAGCATCACGCAGCGTCTCCCGCCACCATACACCTTCTTCCACAAGCGATGAGGAGGAGGAAGACGAGGAGGAGCACGATCCGTCTCAGCGTAATCGACGAAGTCGCAGCAGCACCCGCACACCGCACGCGCCCGAACCCGTTGCTTCCAGCTCACAGCGCCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAGGCCCAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAACTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCCTGCACCGTGCCAGAGTGACGCTCGCCCCAAAGGACATTGAGCTGGTAATTATACTACGACGAGGGATCAAATAA
>Anopheles_quadriannulatus_mosqCid2_AQUA014106
ATGGCACCGCGGAAAAACACCAAAAAGCAACCCAAACCGACGGCACGCGCCCGGAAGCAAACGGTGGAACGGTCCCCAAGCCCACCGAGAAGTGCTGACCTAGAGCTAGCTTACCGGCCCCTGAAAACCGTCAACGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACCACCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCAGCCGCCGCCGCCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGCATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCTGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCAGCATATCCGCCGCTGCCAACACTGGACGGAATTCAACCAAAGGGAAGCCTACCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAGGCTGCAAGGGACGGTGCACAATCTGATTCCTAAGCTAAGCTTCGGGCGCTTGATCCGCGAAGTATTAAGCGAATATTCGCACCGTTCCCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTGATGCTGCGTCGCAATTGA
>Anopheles_sinensis_mosqCid1
ATGCCGCGAAGAAAGAGTGTGCCAAGAAGACAGTCAGAAGATGAATTTAAACGGGACACAACAGCAACAAGGGCAGCGAACACATCTACCTCATCGATTTCTGCATCGGACACATCCGACAGTGAACGGCATAGCCGAACAAGATCTCAACAACGGCGCAGCAGAAGCAGCGAAAGCTATGGGCATCCACCATCGACTTCCGCCAATCGAACCCATCGTAGAGCTGCTTCCGCAAATCCACCAAGGCGCCACCGGCAGTTAACACCGGCGCAGAGAGAAATAATACAGCTGCAGAAGACCACCAACCTGCTCATTCCAAAACTCAGCATTTCACGTGTAATTAGGGAGGTGATACACGCATTTGGCAACTTTAGACTCACAATGGGTGCTCTTGGGGCGCTACACGAGAGTAGCGAAATGTTTCTTATAGATTTGTTCGAAAGGGCACAAATGTGTGCCACACACCGGAATAGGGTCACTTTACAACCGAAGGACATGAAACTGGCACTGGCTCTCAGTGATAGATAA
>Anopheles_sinensis_mosqCid2
ATGGCTCCCAGAAAATCGATCACTGGCAAAAACAAGCGTGCTAAAACAGCACCAGAGTCACAAAAACCGCCACCGAAAGGTTCCTCACCTGGCAAAACGCAGAACCAAATCCGTCCATTCCCTTCGAACGCGGGACTGGAACAGATGATGGGCTTCGAAATGGATGACCAAAGTGAAATATCCGATGATAACACCATACAATCGCGACCCAACTTTTCGTACCTTCCGTCCCACCAACACTCCAGTCCACAGAAGGTGAAACAAGCTTACTTTCCAACGGTGCACCGCCTGGGTAGCATGTCAACGGTACCAAACTCGGACTTGGTCCAACCGGACACTTCACATGAACCTTCCACGAGTTATGGCATCGAAATGACGCCTGAATCCTCAAATTCGGTGCCAAATAAAGCACCGGCCAAGACAAAGGAGAATCAAAAGAGACAAAGTCGAAAGTCGAAGACGCCAATGAAGATGAACATAATGAAGGAAATTGTTAAACTGCAAAACACAGGTGACAGAATCATTCCAAAGTTGCCTTTCGGACGTGTAATTCGCGAAATTCTAACGGAGTATTCTGACTCCGGACTGAGAGTTACGTTAGAAATGCTAGAATGTTTGCAGGAAGCTGCCGAAATCTACATCGTGCAATTGTTTGAGGACGCCTACCGGTGTACGGTGCATCGCGGTCGGGTCACCTTAATTCCCAAGGATATACAATTAGCTTTAATGATCCGACGCGAATCGTAG
>Anopheles_stephensi_mosqCid1
ATGACCCGGCGAAAAAGCATACCTCGATCACTAGCGAGCACTGCTGCAAGTAAAAGCAGCGACAGGGCTACGCGTAGCCGATCCAGAACGGTACAACCAGATCCTGAAACGCCGCCCGTTTCGCCAAGAAGCCGTCGCAGCCGAAGCTCATCATATTTCACATCGCCCACGGAGGAAAATGATCGTGGAAGAAACGCGCGTAGATCGCTTTCGGTTGATGCGCCCCGTGTAGCTCCCAAAACTGTGCCAACAACATCGGCAGGCAGTCAGCGCATTGCACCGTGGATAAGGGAAATGATTGAGCTGCAGCAGACATGGCATCTGCTCATACCAAAAGCATGCTTCGCAAGGCTGGTAAGGGAACTGTTTAACTACCAGTACAGGATAACTGTAGAAGCACTCAGCGCATTGCACGAATCCTGCGAACTGTACATGGTGGAACTGTTTTCCGATGCCGATCTGTGCTGTAAGCACCGCAACAAGGTAACGTTGACCCTGCGCGATTTCAGACTGGCCTGTTTGTTTCGAGAAAAACGTTAA
>Anopheles_stephensi_mosqCid2
ATGATGGCCCTGCATTCCGCCAGACGACGACGGTTAGATTTGAAATTCGATGCGATTCATGGAATCGGTTTGACAGCAGTTGGTACAAAGAAAAAAGTAACGGTCCCGAAAGCCCGCATTCAGTTTTGGCGCACATTTCATCCCGTCCGGAGGTGTTTCTGGCTCTGTCTGAACCGAAATAAAGCATCGCTTTCATTTTCTGCCCGTACGACAATGGCTCCAAGAAAAAAGACAACGAAAAAGGCTCCTGCCAAACCGACTAATCCACCAGCCAGACGAGAAGCGCCAGAATCACCAGCGGAAACTGTGCGACGAACCGATCGGGGCGGCGGAGAATTCCGCTCACTGCGAACGGGGGACGAACTTCGAAACGTAATGGGCACCGAGACGGATGATTCCCTCAGTAACAGCGAGAATGAGTCGTACCGCAGCAATACCATCCAGTCTCGGCCCAACTTCTCCTTCCTGCCGTCGCACAAGCATTCCAGTCCGAACACCGACAAACGGACGCTTCCGACAGCCCATCGTCTCACCAGCATGTCCACCGTACCGAACACGGGTTTGGAACAACAGGAAAGTTCTTCCGCTTCGCGTACTCCTAGTGCCGGTTCGAACACCAACCGCAAATCGTCACGAGCAGTAAACCGCGAAGCCAGCAGCAGCACATCTAGTAGCCAACGGCCGCAGCGCGAAGAACCCCAGCCAAGCAATTCGAAGCAACCGCACAGCCGCAAACAGCAAAAACCCAACCAGCTGAAAATGCTGAAGGACGTCATCTATCTGCAAAGCACGGTGCACAATCTAATTCCGAAGATGTGCTTTGCGCGCGTGATTCGCGAAATTCTGAGCGAGTATTCGAGCCGGGCGATGCGGGTCACACCGGAGATGCTGTACTGCTTGCAGGAAGCGGCCGAGATCTACCTGGTGCAGCTGTTTGAAGATTCGTACCGCTGCACTATGCACCGGGATCGGATAACGCTGATGCCCAAGGACATGCAGCTAGCTTGCATACTGCGGCGCAAGTA
```

In [143]:
cdna_fasta = """>Culex_quinquefasciatus_mosqCid2_CPIJ018900
ATGCCTCGCCGCGGACCTGCACCGAAAAAGGCGGGCCCCAAACGGGGCGGACCAGCCCCCAAAAATACCAGAACCAAATCGCCAGTGTCCCCTCGTGTGCCACCTCCTCCCCCGCCCCCACCACCACCGCCGGCACAATCTCACCAGCAGCCCGTCTCCCAGCGGGACGTATTCGACGAGATGATGGGCTCGGAGATCAGCAGTGACAACTCTAGTCAGGAAGCCCCGCCCCGGGTTGCACTCCCTTCCAAACGCAAGTCACCTCGCTTCCAGGATGGCGCCGGCGCCGGAGCCGTCGCCAGCGACGACAGCTCCCTGTCGGAAGCGAACCCCGACAGATCCCGCCAGCAGCAGCCGCCGCACCGCCGCAAGGCCCCCGCCCCCAAAAAGAGCCAAACGGCGGCCCTCAAGGAGATCGCCAAGCTGCAGCGCACCACGAACCCCGTCATCCCGAAGTTGCCCTTCGCGCGGCTCATCCGGGAGATCCTGATGGAGTACAGCCACCGGGAGCTGCGCATCACGCCGGAGAGCTTGCAGTGTCTGCAGGAGTCGGCGGAGGTGTTTGCGGTGCAGCTGATGGAGGACGCGTACCGGTGCACGCTGCACCGCGACCGGCTCACGCTGATGCCCAAGGACATGAAGCTGGCGGTGATGCTGCGCAAGGATAGTGTGATGGTGTGA
>Culex_quinquefasciatus_mosqCid1_CPIJ008605
ATGCCGCGCCGCGTAAGAACCCCACCACGACGCATTCCGCCCCAACCATCGGCCAAGGACGGCCAACGTGCCGGTTCGTCCCGCAATCAGCCATCCCAACGAGACTTGCAGGAAGCTGGGCCATCCCGGGCAGGCACTCGGTCATCCCGCCGTTCGCGGTCCGAACCGCGACGTTCCGCCAACAGAGACGACAGCAGTAGCTCCAGCGAGGACGATCGTAGCTACCGGTTGCCCCGAATGTCCCGATCGCGCTCGGAGCAGCGCAACGCGCGGCCAACTCGACTCCACGGAGCTCGAGTTCTCCGGGAAATCACCCGTCTCCAGCTGACCACAGACCTACTGATCCCGAAGCTACCCTTTGCCCGGCTTATTCGCGAAGTTCTGCAGCAGTATTCTCAGCGGAACCTGCGAATAACCCCGGAGGCCTTGCTTTGCCTGCAAGAATCGTCTGAAATCTACCTGACGCAGATGTTCGAGGACGCGTACCGGTGCACGCTTCACCGGGAACGCGTTACGATGATGCCCAAGGACATGAACCTGGCGCTGTACCTGCGCGAACGGTGGGCTCGCTGA
>Anopheles_albimanus_mosqCid2
ATGGCGCCCCGAAAGAAAATAACGAAATCAACAAATAAAGCGCCTGCGCGAGCCGCGACCCGCGATGATACACCATCGCCAGAAACAAGCCAAGCCAACCCGATACCGGAATTCCGACAGTTATCCGCAGCAGAAGTAGCCGAAGCGATGGGCAACGAAACGGACAGCGATCTCAGCGAGGATGACCCAACGTACACAACCCAGTCGAAACCGAACTTCTCCTTCCTGCCTTCTAACAGGCACTCCAGCCCGCGGCGGGCCAACCGAAACAGATCTGGAAGCAGCGATGCTCCTTCGCGTTCTCCTTCGGCCGTTGTTCATCGATTGGCAAGCGCTTCGACGGTACCGAACTCAGAGCTCACGAGAACGCCAACAAGACGCAGCGGTCCACAAGGTGCAGAACCCGCCTCGGAACCCCAGCCGCCGGGCAGGGCACACAGGAGAAAACAAGACCGGCCGACAGACTGGAAGATTGTCAGGGAAATAGTGAAACTGCAGGCTGGCGTGAACAGTCTCATACCGAGGCTTTCTTTCGGCCGAGTGATAAGGGAAATCCTTAGCGAATATAGCGACAGTGGTATGAGGGTAACGGCCGAAATGTTGACCTGCCTACAGGAGGCCGCGGAAGTGTACATCGTGCAGATGTTCGAGGACGCTTATCGTTGCACGCTACACCGAGGGCGAGTCACATTGATTCCCAAGGACATGGAACTAGCATTATTGATCAGACGAGATGCCAGCTAA
>Anopheles_gambiae_mosqCid1_AGAP007508
ATGCCACGTCCGAAAAGTGCACCGAGATCACTATCGGAAAGAAAGGAGCGGAAAAGCAAAGCCCGAACATTGCGTAGCCAGGCCCAATCGGATCTGTTTTCTTCCAGCGAGGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTTCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGAAGCCAACAGACCGATGAGCCCAGAGCATCACGCAGCGCCGCCCGCCAGCATACACCTTCTTCCACAAGCGATGAAGAGGAGGAAGACGAGGAGGAGCACGATCCATCTCAGCGTAATCGACGAAGTCGCAGCAGCACACGCACACCGTCCGAACCCGTTGCTTCCACCTCACAGCGTCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAAGCCCAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAGCTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCTTGCACCGTGCCAGAGTGACGCTCGCCCCGAAGGACATTGAGCTGGTAATTATACTACGACGAGGGATCAAATAA
>Anopheles_gambiae_mosqCid2
ATGGCACCGCGGAAAAACACCAAAAAGCAACCGAAACCGACGGCACGCGCCCGGAAGCAAACGGTGGAACGTACACCAAGTCCACCGAAAAGTGCTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACCACCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCCACCGCCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGTATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCCGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCAGCATATCCGCCGCAGCCAACACTGGACGGAATTCAACCAAAGGGAAGCCTACCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAGGCTGCAAGGGACGGTGCACAATCTGATTCCCAAGCTAAGCTTCGGACGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCCCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTGATGCTGCGTCGCAATTGA
>Aedes_aegypti_mosqCid2_AAEL007783
ATGCCACCAAGAATCACGAAGAAATCAAAGAGTAAAAAACAGATCTCTGCAATACCCCAAGACCTGGAATTCATGCTGGGAGAGGAGATATCTTCACCACTGGACAGCCCACTTTCACCAACAGAGGCCGAATACTCGCTTCTTACTGCCTCGCCTCATGACGTGCTGGAAAATCTGCGATTGGTTGGCCTGGCTGACAACGAGACAACGAATGAAAGGGACCAATATGTGACGAGCAGTGGTAACTATAAGCCCAGATCAATTGTCGAGGTCATCCCAAAGCATTCTGTGCCTCAAATAATCAGTAACAAATCTATATCTAAAGGAAAAACAAAACAAAAGAAACCAGAACCATCGGATCGGAGAATCGAGTACTCCGAAGGCTTTACGCAATCTCAAACATCATCTGGTATAGAAAGAGAAATCAGTTCTAACCGTGAATCATCATTTGAAACGGACACTTATAGTTCAAACCTAACGGATTCAAGCACACAGGCTGTTAGACCCAACGATGCGTCCGAAAAACCTTCAAAATCGTCAAAAGGCCAGAAAACCAGTACCGATCGACGTAAAAGCTCGCCCACCAAGAGGAATGTTCCTCTAGGGTCAAAGAGGCAAACGATAGCCAATGATAGAGAACTTCAATTACTGCACAGCATCGCACGTCATCAAACGAGCACCGAATGTTTGATCCCGAAGCTCCCGTTTTCCCGATTGATCAAGGAAACTATGCAACAGTACTGTGGCAGAAACCTTCGCATTACTCCGGAGTGTCTGTTGTGCCTGCAGGAAGCGGCCGAAATCTATGCCGTCCAGGTGATGGAAGATGCCTACCGCTGCACGTTGCATCGAGGAAGGATAACGCTCACTGCCAAGGATATGAGACTGGCTTTGCTTTTGCGTAACGATAGTGTGATGATGTAG
>Aedes_aegypti_mosqCid1_AAEL009296
ATGCCTCGTCGCCAAAATAGACCCCCAACCCGCAATCCGCGAGGACTTGGCGCAGCACCAAGGAACGACTCTCCCGACAGAAGTGCACGCGCTTCGGCATCTTGCAACCAAAGGCGATCGCTGTCGGAATCGAATCTTCCCCGTTCAGCTGCTGCTGAAACGCCGCAAGCTGGAAGATCAAGGGCAGCCTCCGCGGTAAGAGGAACCAAGGTCCTGAACGAGATACGGCATCTGCAGCGGAGCACCGGTTTGTTGATTCCCAAGCTGCCCTTTGGACGGGTCATCCGAGAGGTCATGCTAGAATACAACGGGCGCCATCTGCGAATCACATATGATGCCCTGATGGCCATCCAGGAAGCGGCGGAGATGTACCTGGTGATGCTTTTCGAGGACTGCCAAAAGCTGGCCCTGCATCGACAGCGGGTCACGATTACCAAACGGGACATGGACCTGGCGCTTTACTTCCGGCTTTGA
>Aedes_aegypti_mosqCid3_AAEL009284
ATGCCTCGCCGAGTTCGTCCACCGACACGTTACCCAGCAGGAGGAAAACTTCAAACATTGACGACGAAAGGAAGCAAGACGAAAGCAGTTCCTGAACCACCAGCACCTAAATCTAAATCAAAACCATCACAAGCATCTGGCGCAAAATCGAAGGCACCAAGCTCACCAAAACAACAGAAAGCTGCAGGACCAAAACGGTCGGAACCATCGACGAGCTTACCCAAGCAACGGGAAGTTCCTAGTCCAGTGGAACCACACCCGAGAAGATCCCGCTCCGAGTCGCGCTTGTCCAGCAACAGCGGCGACGAAGATTTCCAACCCTCGATTCGCGTTCGAAATGCTTCCGAGTCGCGTTTATACAGAAGTCGCCAACAGATCGCTCTACAGGACATTTACCGGCTGCAATCGACCACCCAGTTGCTAATTCCAAAGTTGTCATTTTCCCGAGTCATCCGCGAAGTGCTGATGGAGTACATGTACCGGGACTTTCGTATCACGACCGAATGCCTCAATGCCCTCCAGGAAGCGTCCGAAATGTACCTGGTGCAGGTGTTTGAGGATTCGTACCGCTGCTGTCTACACCGAAACCGGGTAACGCTGGATGTGCCGGACATGAAGCTGGCCCTGTACCTGCGGGAGAAATGGCGCCCTTAG
>Aedes_albopictus_mosqCid1_AALF025877
ATGCCTCGACGCTGGGGAAGACAACCAACCCGCAATCCACAAGGACTGGGCACTGAAGAACAACCAAGCGACACTTCCTCCGACAGCGGTGCCTCCAATTCTCCGCCAGCTGCTGCTTCTCGTCAGACAAGAAGGCGATCATCGTCGGCACCCGCTCGTCGTAGTAGCAGAGCACAAGCCCCGGAACCACGGGCAGCCTCGGCGTTCAGAGGTACCAAGGCGCTGGCCGAGATTCGACACTTGCAGCGAACGACCGATATGCTCATTCCCAAGTTGCCCTTTGCCCGGGTTATCCGAGAAGTTATGCTGGATTACAGTGGCCGCAATCTGCGTATCACAGCGGAAGCCCTGATGGCCGTCCAAGAGGCAGCGGAAATCTATCTGGTTATGCTGTTCGAGGACTGCGAGAAGTTGGCATTGCACCGGCAGCGGGTGACTATTACCAAGCGGGACATGGACCTTGCGGTGTACTTCCGGATTCATTGA
>Aedes_albopictus_mosqCid3_AALF025880
ATGCCTCGCCGAGTTCGTCCGCCTCAACGACACGTGACTGCAGCAAAACTCTCAACATTGAAGCCCAAAGCGGCAGCGGAAAAAGCAGCAGAAGCAGCGCCTGAGCCGCCAGCAAAACCAGCAAAGGCGCCGAGCCTACCCAAGCAACAAAAAGTTTCGGTACCACCGCGAGCAACTAGACGATCCCGCTCCGAGTCGCGAATTTCCAGCAACAGCAGTGACGACGACTACCAGCCATCGATTCGCGTCCGAAATGCTTCCGAGTCGCGATTCGACCGCAGCCGGCAGGATGTGCAAATTCTGCAGGACATTCACCGATTGCAATCGACCACGCAGCTGCTGATTCCGAAGTTGCCTTTTGCCCGGGTCATCCGGGAAGTGCTCATGCAATACATGTACCGAGACTTTCGCATCACCCCGGAGTGCCTGTGCGCCATTCAGGAAGCGGCCGAAATGTACATGGTGCAGGTGTTTGAAGACTCGTACCGGTGCTGCCTGCACCGAAGCCGGGTTACTCTGGGAGTGCCGGACATGAAGCTAGCCCTGTATCTGCGGGAGAAATGGCGCCCGTAA
>Aedes_albopictus_mosqCid2_AALF012514
ATGCCACCAAGGATCACAAAAAAATCAAAAACTAAAAAACAAAAATCTGCGATACCCCATGATCTGGAATTCATGCTCGGTGAAGAAATTTCTTCGCCTCTGGACAGCCCTGTTTCTCCAACAGAGGCAGAATATTCGCTTATCACTGCACCACCTCGAGACGTGCTGGCGAACCTGCGATTGGTTGGCCTGGCTGGGAACGATACATCGAGTGGAACTGGATCCACATCATCGTCCAGCAGTGACCAAAGGGATCAACTTGTAACGAGTATTAATAATTATATGACTAGATTTGTGCCAGAGGTTATCCCAGAGGAACCTGAGCCTCGAACAGCTCGTGCGAAACCCAAATCCAAACAAAAAACTACACAAAAGAAAGCAGAACGACCGATTGATCATTTCGATGACGTATCGGGATCTCTTAGTGATGAATCAATGATAAATGCTAGCGATCGCCCATCTTATGTAGAGAATAATCCAGGAAATCGCGGTAGATCATCAGAAAACGAAACCCGTTCTAATGGTGATGATACAAATGGCTATACGAGTCCACAAAATAACAACTATTTGGCTCCAGATATGACCTATTCAAGCACGCAATCTAATGTTCCGAGCATCACCTCCAGGAATGCTTCAAAGTCATCAAAGGCTAAGAAATCCACCGGTTCAGTTAGACGCAAGAGTTCACCCCCAAAGAAAAATACCTCGGCTGGCCCCAAAACCCAAACAAGGCAAACCATTGGCAACGATCGGGACGTTAAACTGCTGCAGAACATCGCACGTCTCCAGGCGAGCACAGAATGTTTGATTCCGAAGCTTCCCTTTGCCCGATTGATCCGGGAAACCATGCAGATGTACTGCGGGCGCGATCTGCGGATAACACCCGAGTGTCTCCAGTGTCTGCAGGAGGCAGCCGAAATCTACGCCGTACAGGTCATGGAAGATGCCTATCGGTGTACGTTGCACCGCGACAGGATAACGCTCACGGCCAAGGATATGAAGCTGGCTTTGCTGCTGCGAAACGATAGTGTGATGATGAATATGTAG
>Anopheles_arabiensis_mosqCid1
ATGCCACGGCCGAAAAGTGCACCGAGATCACTATCGGAAAGAAAGGAGCGGAAAAGCAAAGTCCGAACATTGCGTAGCCAGGCCCAATCGGATCTGTTTTCTTCCAGCGAGGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTGCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGAAGCCAACAGACCGATGAGTCCAGAGCATCACGCAGCGCCGCCCGCCAGCATACACCTTCCACAAGCGATGAAGAGGAGGTAGACGAGGAGGAGCACGATCCGTCTCAGCGTAATCGACGAAGTCGCAGCAGCACACGCACACCGTCCGAACCCGTTGCTTCCACCTCACAGCGTCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAAGCCCAGAATAGCACCACTATTAAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAGCTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCTTGCACCGTGCCAGAGTGACGCTCTCCCCAAAGGACATTGAGCTGGTAATAATACTACGACGAGGGATCAAATAA
>Anopheles_arabiensis_mosqCid2_AARA014434
ATGGCACCGCGGAAAAACACCAAAAAGCAACCGAAACCGACGGCACGCGCCCGGAAGCAAACGGTGGAACGTACACCAAGTCCACCGAAAAGTGCTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACCACCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACACAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCCACCGCCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGTATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCCGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCAGCATATCCTCCGCAGCCAACACTGGACGGAATTCAACCAAAGGGAAGCCTACCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAGGCTGCAAGGGACGGTGCACAATCTGATTCCCAAGCTAAGCTTCGGACGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCCCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTGATGCTGCGTCGCAATTGA
>Anopheles_atroparvus_mosqCid2
ATGGCACCCAGGAAAATCGGCAACACAAAAAACGGGAGACCTAAAAAGACGACGGAACCGCCACAGCCTGAAGTACCGTCCGTGGCCAGTGCCCGCGACGGAAACCGTCTATTCATCCCTTCGAACGAAGGGTTAGAACACATGATGGGCTTCGATGTGGAGGACAGCAGTGACTTGTCCGACGATGCTACATATCAATCGCAACCAAACTTTTCCTTCCTACCGTCCCACAAACACTCGAGCCCACGCAACTCAAAGAAAAAGAACATTCCGGCGGTGCACCGACTGGCCAGCATGTCAACTGTGCCAAACTCAGACTTGGCTCAACCGAACACATCCAATGAACCGTCTAGTGGTTCCAGAAATGTTTCCTCAAATTCCTCACCTAGCGTGCCAAAGCCACTCTCAACACCGCCGACCAACAAAAAGGCGGAAGCGAAAAAGCAAAATCGAAAGCAAAAGACTCCCACCAAGCTTAAAGCATTAAGAGAAATCGTTCGGCTCCAGGGGACGGTGACAACGCTAATTCCAAAACTGTCTTTCGGGCGAGTCATTCGGGAGATATTGGCAGACTACTCTAATAGCAATCTGAGGGTGACGGTCGACATGTTACAGTGTTTGCAGGAAGCTGCAGAAATCTACATAGTGCAATTGTTCGAGGACGCCTACAAGTGTACCGTTCATCGCGGGCGTATAACCTTGATCCCCAAAGATATGCATTTAACACTAATGATCCGACGCGAGTCGTAA
>Anopheles_atroparvus_mosqCid1
ATGCCACGGAGAAAGAGTGTGCCACGAGCATCGCATCAAAGGGACGAGAGGAAAACACGAAGCACAACATCAAGGAACAGTTCACTCAACTTGAGCACGGACAGTTCGCCCTCGGACACGGAATCGCATCGCGCATCTCGGTCGCCGTTGAATCGCAGCAACAGCAGTGCAGCCTTGGGACCAACGTCGACGACGACGGCAGCTCAACCTAGCCGTAGATCCGTTTCTGCAGGTCCTCCTACTTCATCGAGAAGAGGACCAAAACTGCCACCGCTCCAGAAAGAGATGTGGAAACTGCAAAATAGCACGAAGCTTCTTATACCTAAATCGAGCATTTGTCGAGTGATACGTGAAGTAATGCTCTCCTACGGACAGTACAGAATAACGTTAGATGCGCTCGCTGCCCTACACGAGTCGAGCGAAATGTACTTGGTGAATCTATTCGAAGCATCGCACCGGTGCGCCCTACACCGCCAACGGGTTACGTTGATGCCGAAGGATATGCAGCTGGCGCTGTTTCTGAGGGGCGACGGGTGA
>Anopheles_chrysti_mosqCid2_ACHR014087
ATGGCACCACGAAAAAACACTAAAAAACAATCCAAGACGAGTGCCGGCGTCAGACAGCAAGCAACGGAACGTACTCCAAGCCCACCACGTAGAAGTCCCGTTGAAGAGCCAGCCTTTCGGTCGCTTAGAACCGTGAATGAGCTTTGTGATGTGATGGGTGACGAATCGGCAAGCGGCAGCGATATGGAATCGTACCGGGACAATACATCCCAATCCCGTCCAAACTTTTCCTTCCTGCCCTCGCACAAACATTCCAGCCCAAACCACCAAAACTACAGACAGGCCAAACAACCGCCAGCTACCGTACATCGACTCACCAGTATGCCCACCGTGCCCAACACAGGGCTAAATCACGAGGATACTGAGTCACCAGATCGAAGCAATCGTCATGGTACATCGTCCAGCAGCAGTATGTCTACCTTTACCAAAAGTGGACGGAATCATGAGGACACTCAACCGCCCGGTCCAAGCTCTAGAAGTCGTAAGACATCCAGAAGCGAGCGAGGCAACAGCAGCAACATTGGCCAGCCTACAGCTAGCTCCAGTGCTCCGCCAACATCGCAACCCGCGCGTCGAAAGCAGAAAACCCCTTCCAATCTTCAAGCGCTGAAAGAAATCCATAGGTTGCAAGGGACGGTACACAATCTGATACCTAAGCTATCTTTCGCACGTTTGATACGCGAAGTATTGAGCGAATATTCGCATCGACAGTTGAGGGTGACCGTGACGATGCTGGAATGTTTACAAGAATCGGCCGAAGTGTATTTAGTGCAGCTATTCGGCGACTCTTATCGGTGCACACTTCACCGGGAACGAGTGACTCTTATGCCCAAAGACATGCAATTGGCTGCAATGCTTCGGCGTGACTGA
>Anopheles_chrysti_mosqCid1
ATGCCACGACAGAAAAGTGCTCCAAGATCAATGTCGCGAAAAGCAGAAAAGAATAACGAATCCAGATCATCACGTAGCCGAGGCCGGGAGCTTACAACTTCTTCTGAAAGTGATGAGGAGGGAGAGGATGCGTCTCAGCGTAACCGACGCAGTCTAAGCAGCACCAGTTCGCTCTCTCCTAGATCTACTGCCTCAGGAACAACACGCCGATCGCGGTCAGTCGATCTACGACGTGACCCCAGAACATCACGTAGCCAATCCCGGCAGCTTACGCCTTTCAGTGATGAGGAGGAAGAAGATGCATTTCGGCGTGACCGACGCAGTCGAAGTATCACCAGTTCGTCAAACTCCGCAGAGCCTGTAGCCTCCAGCTCACAACGTCGATCATTGTCAGCCGATCCCCCGCTTCGTCCAACTTCGAAAAATGTGCCCCGGTCCAAAAAGGGAAAACAACGAGTAGCACCATTTTTAAAGGACATCCTTAAACTTCAGCTAACCTGGAACATGCTCATTCCGCGGGCGGCCTTTGGAAGACTGGTGCGTGAATTGTTTGACTATCGGTATCGCATCACGCCACAGGCACTGGAGGCAATGCATGAATCGACAGAACTATTTATGGTGCAACTATTTGAGGATGCGTACAAGTGCTGCCTGCATCGTGCCAGAATAACGCTGTCTCCGAAGGACGTCGAGCTGGTAATTTTACTAAGAAGAGGAATAAAATAA
>Anopheles_coluzzi_mosqCid1
ATGCCACGGCCGAAAAGTGCACCGAGATCACTATCGGAAAGAAAGGAGCGGAAAAGCAAAGCCCGAACATTGCGTAGCCAGGCCCAATCGGATCTGTTTTCTTCCAGCGAGGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTTCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGAAGCCAACAGACCGATGAGCCCAGAGCATCACGCAGCGCCGCCCGCCAGCATACACCTTCTTCCACAAGCGATGAAGAGGAGGAAGACGAGGAGGAGCACGATCCATCTCAGCGTAATCGACGAAGTCGCAGCAGCACACGCACACCGTCCGAACCCGTTGCTTCCACCTCACAGCGTCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAAGCCCAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAGCTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCTTGCACCGTGCCAGAGTGACGCTCGCCCCAAAGGACATTGAGCTGGTAATTATACTACGACGAGGGATCAAATAA
>Anopheles_coluzzi_mosqCid2_ACOM030600
ATGGCACCGCGGAAAAACACCAAAAAGCAACCGAAACCGACGGCACGCGCCCGGAAGCAAACGGTGGAACGGTCCCCAAGCCCACCGAGAAGTGCTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAACGTGATGGGCGACGAATCGGACGACCAGACAACCACCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCAGCCGCCGCCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGCATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCCGAACCCGGTCCCAGCTCCAGCCGCCGAAGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCAGCATATCCGCCGCAGCCAACACTGGACGGAATTCAACCAAAGGGAAGCCTACCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAGGCTGCAAGGGACGGTGCACAATCTGATTCCCAAGCTAAGCTTCGGACGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCCCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTGATGCTGCGTCGCAATTGA
>Anopheles_culicifacesA_mosqCid2
ATGGCGCCCAGGAAAAACAACGCAAAAGCAACGAAAGCAAAAAAACCACCCACAAGGGAACAGCCCAGATCACCATCCACCTTGGCAGAAGAATCGAATAGAAGCAGTCGAGCTAGGGAATACCGCTCCTTGAAAACAGCGGATGAGCTTCGCGATGTAATGGGCGCTGAATCGGACAGTTCCATTAGCAATATCGAGAATGATTCGTACCGCAGCAACACCATTCAATCGCGTCCAAACTTCTCGTTTCTGCCCTCGCACCAACATTCCAGCCCCAATAAAAACCACAGTCCGAAACGGACCAACGCTGCTGCTCCACATCGACTCGCCAGTATGCCCACCGTGCCCAACACGGATCATGAGGAAAACGATTCCACTTCCGCCGCTAGTACGCCCCGTAAATCATCACGCAAACCATCACCAACGAAAAGCAAAACAAAAAACCGTCAACAGGCACAGTCAAGCAATGATAGGCAAGCGCATACCCGAAAACAGAGAGCACCCGGACAACTGAAGGTACTCAAAGAGATAATAAATCTCCAAGGCACAGTGCACAATCTGATACCCAAGTTATCCTTTTCGCGTGTGATTCGAGAGGTCTTGTCCGAGTATTCGACTCGTTCCCTGAGGGTCACCCCACAAATGCTACTCTGTTTGCAGGAAGCGACGGAGATATATTTGGTGCAACTGTTCGAGGATTCCTATCGGTGTACGCTCCACCGAGACCGTGTAACGCTTATTCCCAAAGACATGCAATTAGCTTATATGCTTCGGCGAACTAACACTTAA
>Anopheles_culicifacesA_mosqCid1
ATGCCGCGACGGAAAAGTGTGCCAAAATCAGTACCACAAAGCAAAAACGATTCCAACGCTACACGCAGTCGTTCCAAAATCGATTTGGAACATGCCGCACAACGAAGCCGTCGCAGCCGTTCTACTGAGAGGGAGTTAGATGATGGCAACCGTGGCGGATCACACCGTAGATCGATATCGGCAGAAGCAGCACGTTCCACCAGCCGAGACAACCCAAGAACGACTAAAAGCTCACGGATAGCACCGTTTCTAAAGGAAATGCTTCATTTGCAACAAACGTACCACACGCTCATCCCTAAGGCAGCTTTCGGAAGGGTAGTACGGGAACTGTTTGACAGTCAGTATCGGATCACTGCTGAAGCCTTCGCCGCATTGCATGAAGCCTCTGAAATGTACTTGGTGAACCTGTTTACTGATGCCTACTTCTGCTGCCTGCATCGAACTAGGGTTACGCTAACCCCGAAAGATATGCAGCTGGTTCTTTCATTACGAAAACCATTCAACTAA
>Anopheles_darlingi_mosqCid2
ATGGCGCCACGGAAAAAAACAACGAAAGCCACGAGGGCAACGCCGGCACGGCCTGCTGCAGACGCCCGCGAGGAAACGCCATCGCCCGAACCAAACCAAGCCAACCCGATACCGGAATTCCGACAGTTAACCGCCGCGGAGATAGCGTCGGCGATGGGCAGCGAAACGGATAGTGATCTCAGCGAGGACGACCCAACGTACACCACCCAGTCGAAACCGAACTTCTCCTTCCTGCCCTCGCACAGGCACTCCAGCCCACGGCGGTCGGCATCCGGAAACCGTTCTGGAAGCAGCGATCCAGCTTCGCGATCTCCCGCCACCGTAGTCCATCGGTTGGCCAGCGCTTCAACGGTACCGAATTCAGATCTCACAAGCACGCCCACTAGACGCACCACTCAACAAAGCGCAGAGCCCGCATCGGAACCCCAGCCACGGGGCAGACCTAATCACAGGAGAAAACAAGATCGTCCGACAAACTGGAAGACCATCAAGGAAATCATCAATCTGCAGGCGACCGTGAACACTCTCATACCGAGGCTCACTTTCGGCCGAGTTATACGGGAAATCCTTACCGAATACAGCAGCAGCGATATGCGGGTGACGGCCGAAATGTTGACCTGTCTACAGGAGGCCGCGGAAGTGTACATCGTGCAGATGTTCGAAGACGCCTATCGTTGCACGCTACACCGAGGGCGAGTAACATTGATTCCCAAGGACATGGAGCTAGCGTTACTGATCCGACGAGATGCCAACTAA
>Anopheles_dirus_mosqCid1
ATGCCTCGACGCAAAAGTGTCCCAAGATCTCTCCCCCGACGAGAAACAGTAAATACGCAGAAACCCAATGAAGCTCGAGCATCACGCAGCGCCTCCCGACGGGAAGCGTCGTCGGAAAGTTCTAGCGAATCTCCATCCACTTCACGACGCAGCGTAAGCAGCACGGATTCACGCAGCACCCCGCAACGGGATAAATCCGATCGCTCGTCATCGAGGCGATCCTCATCGGCCGAATCACCGCGTAGTCGTCCTTCACCCAGACTCGCAAAATTTCTTAAGGAAACTCTAGCACTACAGTCGTCGACGCATTTGCTCATTCCGAAGGCATGCTTCGCCCGGGTTTTGCGAGAACTGCTAGACGGGCATCGGATCACATTTGAGGCGGTTGCCGCACTACATGAAGCGACCGAAACCTATCTGACGCAGCTGTTTCAGGACGCGAACATGTGCGCCCTGCATCGGACCAGAGTAACGCTTATGCCGAAAGATATCGATTTGGTGCTATTCCTAAGGCGTCACTGTGTTTAA
>Anopheles_dirus_mosqCid2
ATGGCACCAAGGAAAAAAGCACAAAAACCCAGTGCATCATCAACACACAAGAAACCAAGCCCGAAACCTCCACGAAACACAAATCCACAAGCTGAATCTTCAAACTCGAGCGTATCAAGGAATGGAACCAGAACCTCGAGATCGACTAGTATTATGAGTAATCCTATGGGTGATGTATCGGATATATCCTCTTGTAGTAGCGTAGAGCAAAGAAATGAGGAACCACAGCCTGAACCCTCAAACCCAACCGGAACAGCGACATTCTTCGGAGCCATGAAATCGACCGCTACTCTGAGCGAAATAATGGGTGATACAACAGATTCATCTTCCAGCAGTAACGTGGTGCAAGCATCTAACCAGGGTAGAGCTACGGATGAGGAAAGCGTGTCGTCCGAAGAAGAACCGAGCAGCAATCCGGCCACAAAAACAACGACCCCCGGACCCAAAAGCGCACCAAAACAACAGCGACGGAAACGAAAACAACCAAACAAGCTGAAAGTGCTGAAAGAAATGCTACACCTCCAAGGCACGACGCATCTTCTTATTCCTAAGCTAAGCTTCGGGCGTGTGATACGTGAAATTTTGTACGAATATTCACCGAACGGGATAAGGGTGACGCCCGAAATGCTAATGTGCCTGCAGGAAGCGGCTGAGATGTATACGGTGCAGCTCTTGCAGGACTCCTACCGGTGTACGTTCCATCGGGATCGGATAACCCTACAACCGAAGGATATTCAGCTGGCTCTAAGTCTTCGGAGGGAGTTATGA
>Anopheles_epiroticus_mosqCid1_AEPI009159
ATGCCACGACGTAAAAGTGTACCAAGATCGCAATCGAAGCGAGAAGAGCAGAAAAACATGGCCAGAGCATCACGTAGTCGATCGCGGCAGCTTTCGTCTGATTCCAGCTCAAGCGAGGGTGACGAGAGGGAGGAGCAGGAAGCGTCCCAACGTAACCGACGCAGCCAAAGTAGCACCAGATCACACACGCCGGAGACGAGTGCCTCTAGCTCACAGCGTCGATCACTGTCTGCTGACCCACCACGTTCGCGAGCGAATGCTGCGCCCCAGTCCCGAAACCAGGGCCATCGACGCATCGCACCATTCCTGAAGGAAATGCTACACCTACAGCAAACCTGGCATCTACTCATTCCAAAGGCAGCATTTGGACGCGTCGTGCGAGAGGTTTTCGATAACCGGTTTCGCATCACGACCGAGGCACTGCGTGCATTGCATGAATCGTCGGAAGTGTTTCTCGTGCAACTGTTCGAGGATGCGTACAAGTGCTGCATGCATCGAGCAAGGGTAACGCTGTCACCGATGGACATCCGGCTGGTAATCGACTTAAGGGGCGGAATCAAATAA
>Anopheles_epiroticus_mosqCid2_AEPI014069
ATGGCACCCCGTAAAAAGGATAACAAGCAGCCCAAACCACGGGCCCGACAGATAACCCCGGAACCTACCCATCATCCACCGAGACAACCCAATTCCGATGAACAATTTCGGTCGTTAAAACCAATAGACGATTTGCGTAATGTAATGGGAGAGGAATCTGACAACTCGGCGGCTACCGGCAGCGAGATGGAATCGTACCGTGACAATACATCCAATTCACGTCCAAACTTTTCCTTTCTGCCGTCCCACAAACACTCCAGCCCAAACCACAACGACAAACGGGCCAACGCACCTTCCACCAACGTACATCGTCTTGCCAGCATGTCCACGGTACCCAATTCTGGGCTGGATCACGAGGACACAGAATCTCCCAGGCCTACCACCAGTGCTCGCCGACCGTCCCGAAACGAACGTAGAGAGACTAGCAGTTTTAACGCTACAAACACCAACAGGACCACTACAACGCCAGCATCCAAGTCCGCTCAGCCTCACGGACGAAAGCAAAAAACACCCAGTAAGCTGAAGGTTCTGAAGGAAATCATTGATCTCCAGGGCACAGTGCACAATATCATTCCCAAGCTGAGTTTCGGACGCGTGATTCGTGAAGTTTTGAGCGAGTATTCGGACCGACCGTTAAGGGTGACCGTACAAATGTTGGAGTGTCTGCAAGAATCTGCTGAAATATTTCTGGTCCAGCTGTTCGAGGATTCTTACCGATGCACACTTCACCGCAATCGAGCGACGCTCATCCCCAAAGACATGCAGCTAGCTTACATGCTTCGCGGCAACTGA
>Anopheles_farauti_mosqCid1
ATGCCTAGACGTAAAAGTGCCCCGAAGCAACTCTCCAGACGAGAAGAAGATAAAGCCAGAGCCGCACGCAGCACTTCCAGACGCATGGCATTCGATAACGAGGAAACTTCGTCATCCCATTCACGACGCAGCAGTAGCGCTTCAAATTCAACTGCCACTTCGCACAACAGCGCCTCGCAGTCGCCAGGGAGGCGAAGCTCATCGGTTGGCCCACCAGCACGTTCTTCCATTGCACGGATGAGGCGAGAGGATCCGAAACTCGTAAAATTCCTAAAGAGTATGTTACATCTACAGGGCACAACGAATTTACTCATTCCGAAAGCCGCGTTCGCACGGGTATTACGTGAGCTACTCGACGGATACAGGGTAACGTTGGAGGCTGTCATGGCGCTTCACGAAGCGGCTGAAACCTATCTGGTTCAGCTGTTTCAAGATGCAAACCAGTGCGCCATGCACCGGGCCAAAGTAACGCTAATGCCGAAAGATATCGAATTGGTTCTATACATAAGGGCTCATACCTCGAGGTAA
>Anopheles_farauti_mosqCid2_AFAF009137
ATGGCTCCAAGGAAGAAATCGACAATGAAAGTGAAATCTAGCGAAACAACGAAAGGGAAGACAGGTACATCGAACGTTGCAACCGCCAGCAGGAGCGACGTGGGACAAATGACAAATAATAAACCCGTCTCCTTGAGTGATATTTTGGGCTCCTCAATAAGTACATCGGACAGCAATAGTACGGACTTATCCAACCGGAGCTTACTTGCAAATGAGGAGAACGTATCGATTGAAGAACCATATGGTAGCAACAATCGAGGAAGCTACGATCGGAGCAACAACACTACGAATGAGAATCTAGACGAACAACCGTCGAACAGCGCTCATACCACGAACTCTGTAACCGCCAAACCGAAAAAAAATCGACGGAAACCTTCAAAACCCTCAGACTGGAAGCTGATAAAAGATATGTTACATCTGCAGGGTACGGTGCACTACCTCATTCCCAAGCTAAGTTTCGGGCGTGTAATACGTGAAATCTTATCCGAATTTGCTCCCACCGGGCTGAGAGTGACGCCGCAAGCGCTTGAGTGTCTACAGGAATCGGCTGAGCTCTACACGGTGCAGCTTTTCCAAGACGCTTACCGCTGCACGTTCCATCGAGACCGGATAACGCTGCAACCGAAGGACATCCAGTTAGCTCTAATGCTTCGTCGAGAGCTGTAA
>Anopheles_funestus_mosqCid1
ATGACGCGACGCAAAAGTATACCCAGATCGCTGTTAAAAACTGCGCAAAGCAAAAGCGAATCCAGAAATACACGTAGCCAATCCCGGACGGCCAGCCAACCAGCCTCGGAATCGTCATACAGTTCACAACGAAGCCGTCGCAGCAGAAGTTCTTCGGACACGCAGTCCTCAGACGGAAATGTTCGTGGATCATACCGTAGATCGATATCGGCTGACATGGAACCTTCCTCTAGCAACAATAACCCAAGATCGGCTAGAGCTCCACGCATTGCCCCGTATCTCAAGGAAATGCTTCATCTGCAACAAACGTACCACATGCTCATCCCAAAGTTAGCTTTTGGAAGGGTGGTACGGGAGTTGTTTAACAATCGGTATCGGATCACAATGGAAGCTCTCACCGCATTGCACGAAGCTGCCGAAATGTACTTGGTGCACCTGTTTACTGATGCCTACATGTGCTGCATGCATCGGTCTAGAGTTACTTTAAGCAAAGAAGACATGCGGCTGGTTCTTTTAATACGAAAATCTACAATCTAA
>Anopheles_funestus_mosqCid2_AFUN003801
ATGGCACCAAGAAAAAAGACTGCAAAAGTAAAACCACCAGCCAAACGGAACATTCCTAATGCGTCCTCAGAAGAAACGGGAAGGGCCGAAGGAAAGACAGATTACCGCGCCCTTCGAACAGCTGAGGAGCTTCGCGATGTAATGGGTGCAGAATCGGACGATTCCCTAAGCAATAGCGAGAATGAATCTTTCCGCAACAATACGAATCAATCGCGCCCGAATTTCTCTTTCCTACCGTCTCACAAACACTCCAGCCCGAATAAAAACTACAGCCCGAAACGGACCAACGTTGGTTCAGCTCATCGACTCGCCAGCATGCCCACCGTGCCCAATACAGATCACGAGGAAACCTCTTCTACTGCCAGCACTAGCGCAACTCGTAAAACACCACGCAAATCAACAGCAGCCAGAGAAACGGTCCAACCGAGCACTTCAAGACAACCGCACATACGGAAACAGAAAATACCCGGCCAACTGAAGGTTCTTAAGGATATAATAAACCTTCAAAGTACGGTTCATAATCTGATTCCAAAGCTGTGTTTTGGACGCGTGATTCGCGAGATCTTATCCGAGTATTCGAACCGCTCGCTGAAGGTAACTCCAGATATGCTGCTTTGTTTGCAGGAAGCGTCGGAAATATATTTGGTACAGCTGTTCGAGGACGCCTATCGGTGTACGCTTCACCGGGACCGTGTAACGCTTATTCCCAAAGACATGCAACTAGCTTTCATGCTTCGGCGCAATTAA
>Anopheles_melas_mosqCid2_AMEC001928
ATGGCACCGCGGAAAAACACCAAAAAGCAACCCAAACCGATGGCACGCGCCCGGCAGCAAACGGTGGAACGTACCCCAAGCCCACCGAGAAGTGCTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACGGCCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGGCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGCATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCCGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCAGCAGCAGCATATCTGCCGCAGCCAACACTGTACGGAATTCAACCAAAGGGAAGCCTTCCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAAGCTGCAAGGGACGGTGCACAATCTGATTCCCAAGCTAAGCTTCGGACGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCGTTGAAGGTGACTCCGCAGATGCTGGAATGTCTGCAGGAATCGGCCGAAGTATACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTGCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTAATGCTGCGGCGCAATTGA
>Anopheles_melas_mosqCid1
ATGCCACGGCCGAAAAGTGCACCGAGATCACTGTCGGAAAGAACGGAGCGGAAAAGCAAAGCCCGAGCATTGCGTAGCCAGGCCCAGTCGGGTCCGTTTTCTTCCAGCGAGGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTGCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGCAGCCAACAGGCCGATGAGTCCAGAGCATCACGCAGCGTCTCCCGCCAGCATACACCTTCTTCCACAAGCGATGAAGAGGAGGAAGACGAGGAGGAGCACGATCCGTCTCAGCGTAATCGGCGAAGTCGCAGCAGCACCCGCGCACAGCCCGAACCCGTTGCTTCCAGCTCACAGCGCCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAGGCCGAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAACTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTATCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCCTGCACCGTGCCAGAGTGACGCTCGCCCCAAAGGACATAGAGCTGGTAATTATACTGCGACGAGGAATAAAATAA
>Anopheles_merus_mosqCid1
ATGCCACGGCCGAAAAGTGCACCAAGATCACTGTCGGAACGAGAGGAGCAGAAAAGCAAAGCCCGAACATTGCGTAGCCAGGCCCAATCGGGTCTGCTTTCTTCGAGCGACGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTGCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAATATCTACACAATCACAATCACAAAGCAGCCAACAGGCCGAAGAGTCCAGAGCATCACGTAGCGCCACCCGCCAGCATACACCTTCTTCCACAAGCGATGAAGAAGAGGAAGACGGGAAGGAGCACGATCCGTCGCAGCATAGTCGACGAAGTCGCAGCAGCACCCGCACACCGCCCGAACCCGTTGCTTCCAGCTCACAGCGCCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAAACCAAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCATCATCTCATCCCGAAGGCGAGCTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCAATGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCATACAAGTGCTGCCTGCACCGTGCCAGAGTGACGCTCGCCCCAAAGGACATTGAGCTGGTAATTATACTACGACGAGGGATCAAATAA
>Anopheles_merus_mosqCid2_AMEM014318
ATGGCACCGCGGAAAAACACCAAAAAGCAACCCAAACCGACGGCACGCGCCCGACAGCAAAGGGTGGAACGTACCCCAAGCCCACCGAGAAGTGTTGACCTAGAGCTAGCTTACCGGCCCTTGAAAACCGTCAATGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACGGCCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCCGCCGCCGCCGCCGCTGACCAACCTCCAGCAACGGTACACCGGCTCACCAGCATGTCCACCGTCCCCAACAATGGACTGGAACACGAGGACACTGACTCCGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCCGCATATCCGCCGCTTCCAACACTGGAGGGAATTCAACCAAAGGCCAGCCTTCCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAAGCTGCAAGGGACGGTGCATAATCTGATTCCCAAGCTAAGCTTCGGGCGCTTGATCCGCGAAGTATTGAGCGAATATTCGCACCGTTCTCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTAATGCTGCGGCGCAATTGA
>Anopheles_minimus_mosqCid1
ATGCCGCGTAAAAATGAATCCAAGACGGTGCAAGGAAAAACTAACGATTCCAAAGCTACACGTAGTCGTTCCAGAATCGAACAGCCGGACACCTCACAACGAAGCCGGCGCAGTAGAAGCAACGATGAGACGCCGTCTACGGGCGAAAATAGTCGAGGATCACACCGTAGATCGCTATCGGCTGATAAATATTCCACTGAACAAAGCAAGAGAACGCCAGGAGCTCAAAGAATAGCACCTTACATAAAGGAAATGCTTTATCTACAGCAAACATTCCACTTGCTCATCCCGAAGGCAGCTATCGGAAGGGTAATACGGGAACTGTTTGACAATCAGTTCCGGATAACACCAATGGCCTTGTGCGCATTGCATGAAGCTGCCGAAATGTATTTAGTTAACCTGTTCGCTGATGCTGACTTGTGCTGCAAACACCGCTCAAAAGTGACGCTAAAGCCGGATGATATACGGCTGGTTCTTTCCATGCGGAAAGGAAAGTAA
>Anopheles_minimus_mosqCid2
ATGGCGCCAAGAAAACCCAACAAAAAAGTGACGAAACCAGGAAAACTGCCAACCAGACAACAAAGCCCTTCCGCTTCAGATGAAGAAAAAAACGATCAAGGTAGAGAATACCGCTCCCTAAAATCAAGGGAAGACCTCCGTGATGTGATGGGCGCCGAATTGAACTATTCCCAAAGCGATGGCGAGAATGATTCCTACCGCAGCAACACCAATCAATCGCGTCCCAACTTTTCCTTTCTGCCGTCACACAAACACTCCAGCCCGAAACGAATCAACGCAGGAGTTGCTCATCGACTCGCCAGTATGCCAACCGTGCCCAACACAGATGACGACGAAAGCGACGCCACTTCCACTTCTACTTCCAACTCTGTCAGTTCGCCACGAAGATCGCCAAGAAAAGCAGCCGGAGCAAATAGCAAAACTAAGGACCGAGAGCAGGCACAACCGAACACTGCAAAAAAACCTCATTCCCGGAAGCAGAAAACACCCAAACAAATGAAACTAATCAAGGAAATAATAAATCTCCAAGGCACGGTACATAATCTGATTCCCAAGCTGAGCTTTGCGCGCGTGATTCGGGAGATCTTGCATGAGTTTTCGAACTGTTCGCTGAGGGTCACTCCAGAGACGCTACTGTGCCTGCAGGAAGCGACGGAGATATATTTGGTACAACTGTTTGAGGATTCTTATCGGTGTACGCTTCACCGAGACCGTGTGACGCTTATGCCCAAAGACATGCAGCTAGCTCATATGCTTCGGCGAGGCAATAGTTGA
>Anopheles_quadriannulatus_mosqCid1_AQUA004414
ATGCCAAGGCCAAAAAGTGCACCAAGATCACTGTCGGAAAGAGAGGAGCAGAAAAGCAAAGCCCGAACATTGCGTAGCCAGGTCCAATCGGGTCTGCTTTCTTCCAGCGACGAGGAAGAAGATGCTTCCCAGCGTAACCGACGCGGTGCAAGTAGCGCAAGTGCACAATCTTCGGGCACACAAAACCGGTCACGCTCGGTCGAGACACCACGTCCAACATCTACACAATCACAATCACAAAGCAGCCAACAGGCCGATGAGTCCAGAGCATCACGCAGCGTCTCCCGCCACCATACACCTTCTTCCACAAGCGATGAGGAGGAGGAAGACGAGGAGGAGCACGATCCGTCTCAGCGTAATCGACGAAGTCGCAGCAGCACCCGCACACCGCACGCGCCCGAACCCGTTGCTTCCAGCTCACAGCGCCGGTCAGTGTCTGCCGGTCCGCCGGGAGGTGGTTCAACATCACAAAATGTACGACCGAACCGAAGGCCCAGAATAGCACCACTATTGAAGGAAATGCTCAGACTACAGCTATCCTGGCACCATCTCATCCCGAAGGCGAACTTCGGCCGATTGGTGCGGGAACTGTTTGACCACCGGTATCGCATCACACCGCAGGCACTGGAGGCATTGCATGAAGCGACGGAAGTGTTTCTCGTGCAGCTGTTTGAGGATGCGTACAAGTGCTGCCTGCACCGTGCCAGAGTGACGCTCGCCCCAAAGGACATTGAGCTGGTAATTATACTACGACGAGGGATCAAATAA
>Anopheles_quadriannulatus_mosqCid2_AQUA014106
ATGGCACCGCGGAAAAACACCAAAAAGCAACCCAAACCGACGGCACGCGCCCGGAAGCAAACGGTGGAACGGTCCCCAAGCCCACCGAGAAGTGCTGACCTAGAGCTAGCTTACCGGCCCCTGAAAACCGTCAACGAGCTTCGTAATGTGATGGGCGACGAATCGGACGACCAGACAACCACCGGGAGTGTGATGGAATCGTACCGCGACAGTACAACCCAATCGCGGCCTAACTTTTCCTTTCTGCCGTCGCATAAACACTCCAGCCCAAACAATCACGGCAATAAACGGACCGCAGCCGCCGCCGCCGCCGCCGCCGCTGACCAACCTCCAGCGACGGTACACCGGCTCACCAGCATGTCCACCGTCCCCAACACTGGACTGGAACACGAGGACACTGAATCTGAACCCGGTCCCAGCACCAGCCGCCGGGGATCATCCCGAAGCGAACGCGACGAGACAGGCGGCAGCAGCATATCCGCCGCTGCCAACACTGGACGGAATTCAACCAAAGGGAAGCCTACCAGAAGTCGAAAGCAGAAAATCCCTACCAATCTGAATGTGCTGAAAGAAATCCATAGGCTGCAAGGGACGGTGCACAATCTGATTCCTAAGCTAAGCTTCGGGCGCTTGATCCGCGAAGTATTAAGCGAATATTCGCACCGTTCCCTGAAGGTGACTCCGCAGATGCTGGAATGTTTGCAGGAATCGGCCGAAGTGTACCTAATGCAGGTGTTCAGTGACTCCTATCGTTGCACCCTTCACCGGGGCCGAGTGACGCTCATTCCCAAAGACATGGAGCTGGCTTTGATGCTGCGTCGCAATTGA
>Anopheles_sinensis_mosqCid1
ATGCCGCGAAGAAAGAGTGTGCCAAGAAGACAGTCAGAAGATGAATTTAAACGGGACACAACAGCAACAAGGGCAGCGAACACATCTACCTCATCGATTTCTGCATCGGACACATCCGACAGTGAACGGCATAGCCGAACAAGATCTCAACAACGGCGCAGCAGAAGCAGCGAAAGCTATGGGCATCCACCATCGACTTCCGCCAATCGAACCCATCGTAGAGCTGCTTCCGCAAATCCACCAAGGCGCCACCGGCAGTTAACACCGGCGCAGAGAGAAATAATACAGCTGCAGAAGACCACCAACCTGCTCATTCCAAAACTCAGCATTTCACGTGTAATTAGGGAGGTGATACACGCATTTGGCAACTTTAGACTCACAATGGGTGCTCTTGGGGCGCTACACGAGAGTAGCGAAATGTTTCTTATAGATTTGTTCGAAAGGGCACAAATGTGTGCCACACACCGGAATAGGGTCACTTTACAACCGAAGGACATGAAACTGGCACTGGCTCTCAGTGATAGATAA
>Anopheles_sinensis_mosqCid2
ATGGCTCCCAGAAAATCGATCACTGGCAAAAACAAGCGTGCTAAAACAGCACCAGAGTCACAAAAACCGCCACCGAAAGGTTCCTCACCTGGCAAAACGCAGAACCAAATCCGTCCATTCCCTTCGAACGCGGGACTGGAACAGATGATGGGCTTCGAAATGGATGACCAAAGTGAAATATCCGATGATAACACCATACAATCGCGACCCAACTTTTCGTACCTTCCGTCCCACCAACACTCCAGTCCACAGAAGGTGAAACAAGCTTACTTTCCAACGGTGCACCGCCTGGGTAGCATGTCAACGGTACCAAACTCGGACTTGGTCCAACCGGACACTTCACATGAACCTTCCACGAGTTATGGCATCGAAATGACGCCTGAATCCTCAAATTCGGTGCCAAATAAAGCACCGGCCAAGACAAAGGAGAATCAAAAGAGACAAAGTCGAAAGTCGAAGACGCCAATGAAGATGAACATAATGAAGGAAATTGTTAAACTGCAAAACACAGGTGACAGAATCATTCCAAAGTTGCCTTTCGGACGTGTAATTCGCGAAATTCTAACGGAGTATTCTGACTCCGGACTGAGAGTTACGTTAGAAATGCTAGAATGTTTGCAGGAAGCTGCCGAAATCTACATCGTGCAATTGTTTGAGGACGCCTACCGGTGTACGGTGCATCGCGGTCGGGTCACCTTAATTCCCAAGGATATACAATTAGCTTTAATGATCCGACGCGAATCGTAG
>Anopheles_stephensi_mosqCid1
ATGACCCGGCGAAAAAGCATACCTCGATCACTAGCGAGCACTGCTGCAAGTAAAAGCAGCGACAGGGCTACGCGTAGCCGATCCAGAACGGTACAACCAGATCCTGAAACGCCGCCCGTTTCGCCAAGAAGCCGTCGCAGCCGAAGCTCATCATATTTCACATCGCCCACGGAGGAAAATGATCGTGGAAGAAACGCGCGTAGATCGCTTTCGGTTGATGCGCCCCGTGTAGCTCCCAAAACTGTGCCAACAACATCGGCAGGCAGTCAGCGCATTGCACCGTGGATAAGGGAAATGATTGAGCTGCAGCAGACATGGCATCTGCTCATACCAAAAGCATGCTTCGCAAGGCTGGTAAGGGAACTGTTTAACTACCAGTACAGGATAACTGTAGAAGCACTCAGCGCATTGCACGAATCCTGCGAACTGTACATGGTGGAACTGTTTTCCGATGCCGATCTGTGCTGTAAGCACCGCAACAAGGTAACGTTGACCCTGCGCGATTTCAGACTGGCCTGTTTGTTTCGAGAAAAACGTTAA
>Anopheles_stephensi_mosqCid2
ATGATGGCCCTGCATTCCGCCAGACGACGACGGTTAGATTTGAAATTCGATGCGATTCATGGAATCGGTTTGACAGCAGTTGGTACAAAGAAAAAAGTAACGGTCCCGAAAGCCCGCATTCAGTTTTGGCGCACATTTCATCCCGTCCGGAGGTGTTTCTGGCTCTGTCTGAACCGAAATAAAGCATCGCTTTCATTTTCTGCCCGTACGACAATGGCTCCAAGAAAAAAGACAACGAAAAAGGCTCCTGCCAAACCGACTAATCCACCAGCCAGACGAGAAGCGCCAGAATCACCAGCGGAAACTGTGCGACGAACCGATCGGGGCGGCGGAGAATTCCGCTCACTGCGAACGGGGGACGAACTTCGAAACGTAATGGGCACCGAGACGGATGATTCCCTCAGTAACAGCGAGAATGAGTCGTACCGCAGCAATACCATCCAGTCTCGGCCCAACTTCTCCTTCCTGCCGTCGCACAAGCATTCCAGTCCGAACACCGACAAACGGACGCTTCCGACAGCCCATCGTCTCACCAGCATGTCCACCGTACCGAACACGGGTTTGGAACAACAGGAAAGTTCTTCCGCTTCGCGTACTCCTAGTGCCGGTTCGAACACCAACCGCAAATCGTCACGAGCAGTAAACCGCGAAGCCAGCAGCAGCACATCTAGTAGCCAACGGCCGCAGCGCGAAGAACCCCAGCCAAGCAATTCGAAGCAACCGCACAGCCGCAAACAGCAAAAACCCAACCAGCTGAAAATGCTGAAGGACGTCATCTATCTGCAAAGCACGGTGCACAATCTAATTCCGAAGATGTGCTTTGCGCGCGTGATTCGCGAAATTCTGAGCGAGTATTCGAGCCGGGCGATGCGGGTCACACCGGAGATGCTGTACTGCTTGCAGGAAGCGGCCGAGATCTACCTGGTGCAGCTGTTTGAAGATTCGTACCGCTGCACTATGCACCGGGATCGGATAACGCTGATGCCCAAGGACATGCAGCTAGCTTGCATACTGCGGCGCAAGTA"""
# print(cdna_fasta)

In [144]:
# List the species label encoded in every FASTA header: the text before
# "mosqCid", with trailing underscores removed and underscores shown as spaces.
for record in SeqIO.parse(StringIO(cdna_fasta), format="fasta"):
    header_prefix = record.id.split("mosqCid")[0]
    species_label = header_prefix.rstrip("_").replace("_", " ")
    print(species_label)
    # Alternative quick checks that can be swapped in here:
    #   translated protein: print(f"{record.id} {record.seq.translate().rstrip('*')}")
    #   gene number:        print(f"{record.id.split('mosqCid')[1].split('_')[0]}")

Culex quinquefasciatus
Culex quinquefasciatus
Anopheles albimanus
Anopheles gambiae
Anopheles gambiae
Aedes aegypti
Aedes aegypti
Aedes aegypti
Aedes albopictus
Aedes albopictus
Aedes albopictus
Anopheles arabiensis
Anopheles arabiensis
Anopheles atroparvus
Anopheles atroparvus
Anopheles chrysti
Anopheles chrysti
Anopheles coluzzi
Anopheles coluzzi
Anopheles culicifacesA
Anopheles culicifacesA
Anopheles darlingi
Anopheles dirus
Anopheles dirus
Anopheles epiroticus
Anopheles epiroticus
Anopheles farauti
Anopheles farauti
Anopheles funestus
Anopheles funestus
Anopheles melas
Anopheles melas
Anopheles merus
Anopheles merus
Anopheles minimus
Anopheles minimus
Anopheles quadriannulatus
Anopheles quadriannulatus
Anopheles sinensis
Anopheles sinensis
Anopheles stephensi
Anopheles stephensi


### Add sequences to curatedDB

In [152]:
# Gene number parsed from a FASTA header (the digits right after "mosqCid")
# -> histone variant name stored in the `variant` column.
var_name_dict = {
    "1": "cenH3.1_(Culicidae)",
    "2": "cenH3.2_(Culicidae)",
    "3": "cenH3.3_(Aedes)",
}

# One row per species: (label as spelled in the FASTA headers, taxonomy id,
# organism name to store). The header label and the stored organism name differ
# for a few species (e.g. "Anopheles chrysti" -> "Anopheles christyi"), so both
# are listed explicitly.
_species_taxonomy = [
    ("Culex quinquefasciatus", 7176, "Culex quinquefasciatus"),
    ("Anopheles albimanus", 7167, "Anopheles albimanus"),
    ("Anopheles gambiae", 7165, "Anopheles gambiae"),
    ("Aedes aegypti", 7159, "Aedes aegypti"),
    ("Aedes albopictus", 7160, "Aedes albopictus"),
    ("Anopheles arabiensis", 7173, "Anopheles arabiensis"),
    ("Anopheles atroparvus", 41427, "Anopheles atroparvus"),
    ("Anopheles chrysti", 43041, "Anopheles christyi"),
    ("Anopheles coluzzi", 1518534, "Anopheles coluzzii"),
    ("Anopheles culicifacesA", 63366, "Anopheles culicifacies A"),
    ("Anopheles darlingi", 43151, "Anopheles darlingi"),
    ("Anopheles dirus", 7168, "Anopheles dirus"),
    ("Anopheles epiroticus", 199890, "Anopheles epiroticus"),
    ("Anopheles farauti", 69004, "Anopheles farauti"),
    ("Anopheles funestus", 62324, "Anopheles funestus"),
    ("Anopheles melas", 34690, "Anopheles melas"),
    ("Anopheles merus", 30066, "Anopheles merus"),
    ("Anopheles minimus", 112268, "Anopheles minimus"),
    ("Anopheles quadriannulatus", 34691, "Anopheles quadriannulatus"),
    ("Anopheles sinensis", 74873, "Anopheles sinensis"),
    ("Anopheles stephensi", 30069, "Anopheles stephensi"),
]

# Species label -> taxonomy fields merged into each sequence row.
# Every species listed here shares the same phylum and class.
tax_name_dict = {
    header_label: {
        "taxonomy_id": tax_id,
        "organism": organism_name,
        "phylum": "Arthropoda",
        "class": "Insecta",
    }
    for header_label, tax_id, organism_name in _species_taxonomy
}

In [153]:
# Build one row dict per FASTA record: parse species and gene number from the
# header, translate the cDNA, and merge in the taxonomy fields for the species.
# Keys of every row, in a fixed order; all start as None and are filled below.
sequence_columns = (
    "accession",
    "variant",
    "gi",
    "ncbi_gene_id",
    "hgnc_gene_name",
    "taxonomy_id",
    "organism",
    "phylum",
    "class",
    "taxonomy_group",
    "info",
    "sequence",
    "variant_under_consideration",
)
data_sequence_list = []
accessions = []
seq_count = 0
for record in SeqIO.parse(StringIO(cdna_fasta), format="fasta"):
    # Header layout: <species with underscores>_mosqCid<gene number>[_<optional suffix>]
    species, gene_num = record.id.split("mosqCid")
    species = species.rstrip("_").replace("_", " ")
    gene_num = gene_num.split("_")[0]
    print("***", species, gene_num, "***")

    # Accession IDs are numbered in FASTA order, starting from 0.
    seq_id = f"HISTDB_Culicidae_{seq_count}"
    accessions.append(seq_id)

    variant_name = var_name_dict[gene_num]
    # Translate the cDNA and strip trailing stop-codon markers ("*").
    protein = str(record.seq.translate().rstrip("*"))

    data_sequence = dict.fromkeys(sequence_columns)
    data_sequence.update(accession=seq_id, variant=variant_name, sequence=protein)
    # Overwrites taxonomy_id / organism / phylum / class with the species values.
    data_sequence.update(tax_name_dict[species])
    data_sequence_list.append(data_sequence)
    seq_count += 1

    # Dump every field with its Python type as a sanity check.
    for field, value in data_sequence.items():
        print(field, value, type(value))

*** Culex quinquefasciatus 2 ***
accession HISTDB_Culicidae_0 <class 'str'>
variant cenH3.2_(Culicidae) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 7176 <class 'int'>
organism Culex quinquefasciatus <class 'str'>
phylum Arthropoda <class 'str'>
class Insecta <class 'str'>
taxonomy_group None <class 'NoneType'>
info None <class 'NoneType'>
sequence MPRRGPAPKKAGPKRGGPAPKNTRTKSPVSPRVPPPPPPPPPPPAQSHQQPVSQRDVFDEMMGSEISSDNSSQEAPPRVALPSKRKSPRFQDGAGAGAVASDDSSLSEANPDRSRQQQPPHRRKAPAPKKSQTAALKEIAKLQRTTNPVIPKLPFARLIREILMEYSHRELRITPESLQCLQESAEVFAVQLMEDAYRCTLHRDRLTLMPKDMKLAVMLRKDSVMV <class 'str'>
variant_under_consideration None <class 'NoneType'>
*** Culex quinquefasciatus 1 ***
accession HISTDB_Culicidae_1 <class 'str'>
variant cenH3.1_(Culicidae) <class 'str'>
gi None <class 'NoneType'>
ncbi_gene_id None <class 'NoneType'>
hgnc_gene_name None <class 'NoneType'>
taxonomy_id 7176 <class 'int'>
organism Culex quinq

In [154]:
# Display the accession IDs generated for the new sequence rows.
accessions

['HISTDB_Culicidae_0',
 'HISTDB_Culicidae_1',
 'HISTDB_Culicidae_2',
 'HISTDB_Culicidae_3',
 'HISTDB_Culicidae_4',
 'HISTDB_Culicidae_5',
 'HISTDB_Culicidae_6',
 'HISTDB_Culicidae_7',
 'HISTDB_Culicidae_8',
 'HISTDB_Culicidae_9',
 'HISTDB_Culicidae_10',
 'HISTDB_Culicidae_11',
 'HISTDB_Culicidae_12',
 'HISTDB_Culicidae_13',
 'HISTDB_Culicidae_14',
 'HISTDB_Culicidae_15',
 'HISTDB_Culicidae_16',
 'HISTDB_Culicidae_17',
 'HISTDB_Culicidae_18',
 'HISTDB_Culicidae_19',
 'HISTDB_Culicidae_20',
 'HISTDB_Culicidae_21',
 'HISTDB_Culicidae_22',
 'HISTDB_Culicidae_23',
 'HISTDB_Culicidae_24',
 'HISTDB_Culicidae_25',
 'HISTDB_Culicidae_26',
 'HISTDB_Culicidae_27',
 'HISTDB_Culicidae_28',
 'HISTDB_Culicidae_29',
 'HISTDB_Culicidae_30',
 'HISTDB_Culicidae_31',
 'HISTDB_Culicidae_32',
 'HISTDB_Culicidae_33',
 'HISTDB_Culicidae_34',
 'HISTDB_Culicidae_35',
 'HISTDB_Culicidae_36',
 'HISTDB_Culicidae_37',
 'HISTDB_Culicidae_38',
 'HISTDB_Culicidae_39',
 'HISTDB_Culicidae_40',
 'HISTDB_Culicidae_41']

In [155]:
# Pre-insert check: load the `sequence` table and show any rows that already
# use one of the generated accession IDs.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]  # first item of each entry is the column name
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [156]:
# Insert each prepared row dict using the `add_sequence` statement.
for sequence_row in data_sequence_list:
    cursor.execute(add_sequence, sequence_row)

In [157]:
# Post-insert check: reload the `sequence` table and show the rows whose
# accession is one of the generated IDs.
query = "SELECT * FROM sequence "
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]  # first item of each entry is the column name
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration
1985,HISTDB_Culicidae_0,cenH3.2_(Culicidae),,,,7176.0,Culex quinquefasciatus,Arthropoda,Insecta,,,MPRRGPAPKKAGPKRGGPAPKNTRTKSPVSPRVPPPPPPPPPPPAQ...,
1986,HISTDB_Culicidae_1,cenH3.1_(Culicidae),,,,7176.0,Culex quinquefasciatus,Arthropoda,Insecta,,,MPRRVRTPPRRIPPQPSAKDGQRAGSSRNQPSQRDLQEAGPSRAGT...,
1987,HISTDB_Culicidae_10,cenH3.2_(Culicidae),,,,7160.0,Aedes albopictus,Arthropoda,Insecta,,,MPPRITKKSKTKKQKSAIPHDLEFMLGEEISSPLDSPVSPTEAEYS...,
1988,HISTDB_Culicidae_11,cenH3.1_(Culicidae),,,,7173.0,Anopheles arabiensis,Arthropoda,Insecta,,,MPRPKSAPRSLSERKERKSKVRTLRSQAQSDLFSSSEEEEDASQRN...,
1989,HISTDB_Culicidae_12,cenH3.2_(Culicidae),,,,7173.0,Anopheles arabiensis,Arthropoda,Insecta,,,MAPRKNTKKQPKPTARARKQTVERTPSPPKSADLELAYRPLKTVNE...,
1990,HISTDB_Culicidae_13,cenH3.2_(Culicidae),,,,41427.0,Anopheles atroparvus,Arthropoda,Insecta,,,MAPRKIGNTKNGRPKKTTEPPQPEVPSVASARDGNRLFIPSNEGLE...,
1991,HISTDB_Culicidae_14,cenH3.1_(Culicidae),,,,41427.0,Anopheles atroparvus,Arthropoda,Insecta,,,MPRRKSVPRASHQRDERKTRSTTSRNSSLNLSTDSSPSDTESHRAS...,
1992,HISTDB_Culicidae_15,cenH3.2_(Culicidae),,,,43041.0,Anopheles christyi,Arthropoda,Insecta,,,MAPRKNTKKQSKTSAGVRQQATERTPSPPRRSPVEEPAFRSLRTVN...,
1993,HISTDB_Culicidae_16,cenH3.1_(Culicidae),,,,43041.0,Anopheles christyi,Arthropoda,Insecta,,,MPRQKSAPRSMSRKAEKNNESRSSRSRGRELTTSSESDEEGEDASQ...,
1994,HISTDB_Culicidae_17,cenH3.1_(Culicidae),,,,1518534.0,Anopheles coluzzii,Arthropoda,Insecta,,,MPRPKSAPRSLSERKERKSKARTLRSQAQSDLFSSSEEEEDASQRN...,


In [158]:
# Count how many of the rows with generated accessions fall under each variant.
is_generated_accession = df["accession"].isin(accessions)
df.loc[is_generated_accession, "variant"].value_counts()

variant
cenH3.2_(Culicidae)    21
cenH3.1_(Culicidae)    19
cenH3.3_(Aedes)         2
Name: count, dtype: int64

In [159]:
# Commit the current transaction so the sequence rows inserted through `cursor`
# are persisted in the database.
conn.commit()

### Add sequence publication

In [160]:
# Publication id that the new sequences will be linked to.
pid = "kursel_ancient_2020"

# Pre-insert check: load the `publication` table and show the row with this id, if any.
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]  # first item of each entry is the column name
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["id"] == pid]

Unnamed: 0,id,title,doi,author,year,pubmed_id


In [161]:
# Placeholder publication record: only the id is filled in; the remaining
# bibliographic fields are left as None.
data_publication = [
    dict(id=pid, title=None, doi=None, author=None, year=None),
]

In [162]:
# Insert each publication record using the `add_publication` statement.
for publication_row in data_publication:
    cursor.execute(add_publication, publication_row)

In [163]:
# Post-insert check: reload the `publication` table and show the row for `pid`.
query = "SELECT * FROM publication"
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]  # first item of each entry is the column name
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["id"] == pid]

Unnamed: 0,id,title,doi,author,year,pubmed_id
117,kursel_ancient_2020,,,,,


In [164]:
# Link every generated sequence accession to the publication id `pid`.
for accession in accessions:
    cursor.execute(add_sequence_has_publication, (accession, pid))

In [165]:
# Verify the links: join each sequence with its publication link (LEFT JOIN keeps
# sequences without one) and show the rows for the generated accessions.
query = (
    "SELECT * FROM sequence s "
    "LEFT JOIN sequence_has_publication sp "
    "ON s.accession = sp.sequence_accession "
)
cursor.execute(query)
rows = cursor.fetchall()
column_names = [col[0] for col in cursor.description]  # first item of each entry is the column name
df = pd.DataFrame(rows, columns=column_names)
df.loc[df["accession"].isin(accessions)]

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration,sequence_accession,publication_id
2298,HISTDB_Culicidae_0,cenH3.2_(Culicidae),,,,7176.0,Culex quinquefasciatus,Arthropoda,Insecta,,,MPRRGPAPKKAGPKRGGPAPKNTRTKSPVSPRVPPPPPPPPPPPAQ...,,HISTDB_Culicidae_0,kursel_ancient_2020
2299,HISTDB_Culicidae_1,cenH3.1_(Culicidae),,,,7176.0,Culex quinquefasciatus,Arthropoda,Insecta,,,MPRRVRTPPRRIPPQPSAKDGQRAGSSRNQPSQRDLQEAGPSRAGT...,,HISTDB_Culicidae_1,kursel_ancient_2020
2300,HISTDB_Culicidae_10,cenH3.2_(Culicidae),,,,7160.0,Aedes albopictus,Arthropoda,Insecta,,,MPPRITKKSKTKKQKSAIPHDLEFMLGEEISSPLDSPVSPTEAEYS...,,HISTDB_Culicidae_10,kursel_ancient_2020
2301,HISTDB_Culicidae_11,cenH3.1_(Culicidae),,,,7173.0,Anopheles arabiensis,Arthropoda,Insecta,,,MPRPKSAPRSLSERKERKSKVRTLRSQAQSDLFSSSEEEEDASQRN...,,HISTDB_Culicidae_11,kursel_ancient_2020
2302,HISTDB_Culicidae_12,cenH3.2_(Culicidae),,,,7173.0,Anopheles arabiensis,Arthropoda,Insecta,,,MAPRKNTKKQPKPTARARKQTVERTPSPPKSADLELAYRPLKTVNE...,,HISTDB_Culicidae_12,kursel_ancient_2020
2303,HISTDB_Culicidae_13,cenH3.2_(Culicidae),,,,41427.0,Anopheles atroparvus,Arthropoda,Insecta,,,MAPRKIGNTKNGRPKKTTEPPQPEVPSVASARDGNRLFIPSNEGLE...,,HISTDB_Culicidae_13,kursel_ancient_2020
2304,HISTDB_Culicidae_14,cenH3.1_(Culicidae),,,,41427.0,Anopheles atroparvus,Arthropoda,Insecta,,,MPRRKSVPRASHQRDERKTRSTTSRNSSLNLSTDSSPSDTESHRAS...,,HISTDB_Culicidae_14,kursel_ancient_2020
2305,HISTDB_Culicidae_15,cenH3.2_(Culicidae),,,,43041.0,Anopheles christyi,Arthropoda,Insecta,,,MAPRKNTKKQSKTSAGVRQQATERTPSPPRRSPVEEPAFRSLRTVN...,,HISTDB_Culicidae_15,kursel_ancient_2020
2306,HISTDB_Culicidae_16,cenH3.1_(Culicidae),,,,43041.0,Anopheles christyi,Arthropoda,Insecta,,,MPRQKSAPRSMSRKAEKNNESRSSRSRGRELTTSSESDEEGEDASQ...,,HISTDB_Culicidae_16,kursel_ancient_2020
2307,HISTDB_Culicidae_17,cenH3.1_(Culicidae),,,,1518534.0,Anopheles coluzzii,Arthropoda,Insecta,,,MPRPKSAPRSLSERKERKSKARTLRSQAQSDLFSSSEEEEDASQRN...,,HISTDB_Culicidae_17,kursel_ancient_2020


In [166]:
# Commit the current transaction so the publication row and the
# sequence-to-publication links inserted through `cursor` are persisted.
conn.commit()

# Close connections

In [167]:
# Release resources in reverse order of acquisition: cursor, then the MySQL
# connection, then the SSH tunnel. The nested try/finally ensures that an error
# raised while closing the cursor or the connection does not leave the
# connection or the tunnel open.
try:
    try:
        cursor.close()
    finally:
        conn.close()
finally:
    tunnel.stop()