In [54]:
import io
import json
import os
import shutil
import subprocess
import sys

import numpy as np
import pandas as pd
from Bio import Entrez, SearchIO, SeqIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.AlignInfo import SummaryInfo
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from ete3 import Tree
from mysql.connector import connection
from sshtunnel import SSHTunnelForwarder

In [2]:
with open("db_curated_server_info.txt", "r") as file:
    lines = file.readlines()

config = {}

for line in lines:
    line = line.strip()
    if line and not line.startswith("#"):
        key, value = line.split("=", 1)
        config[key] = value.strip()

server_name = config.get("server_name")
srever_port = int(config.get("srever_port"))
ssh_password = config.get("ssh_password")
ssh_username = config.get("ssh_username")
db_adress = config.get("db_adress")
db_port = int(config.get("db_port"))

In [3]:
tunnel = SSHTunnelForwarder(
    (server_name, srever_port),
    ssh_password=ssh_password,
    ssh_username=ssh_username,
    remote_bind_address=(db_adress, db_port),
)
tunnel.start()
print(tunnel.local_bind_port)

44071


In [4]:
conn = connection.MySQLConnection(
    user="db_user",
    password="db_password",
    host="localhost",
    port=tunnel.local_bind_port,
    database="db_name",
)
cursor = conn.cursor()

In [5]:
query = "SHOW TABLES;"
cursor.execute(query)
cursor.fetchall()

[('alternative_name',),
 ('histone',),
 ('histone_description',),
 ('histone_has_publication',),
 ('publication',),
 ('sequence',),
 ('sequence_has_publication',)]

In [6]:
query = "SELECT * FROM histone"
cursor.execute(query)
cursor.fetchall()
", ".join([i[0] for i in cursor.description])

'id, level, taxonomic_span, taxonomic_span_id, description, parent'

In [7]:
query = "SELECT * FROM sequence"
cursor.execute(query)
cursor.fetchall()
", ".join([i[0] for i in cursor.description])

'accession, variant, gi, ncbi_gene_id, hgnc_gene_name, taxonomy_id, organism, phylum, class, taxonomy_group, info, sequence, variant_under_consideration'

In [13]:
query = (
    "SELECT h.id, h.level, COUNT(accession) as count FROM histone h LEFT JOIN sequence s ON h.id = s.variant "
    "GROUP BY h.id "
)
cursor.execute(query)
count_df = pd.DataFrame(
    cursor.fetchall(), columns=[i[0] for i in cursor.description]
).sort_values(["count"])
count_df

Unnamed: 0,id,level,count
0,Archaeal,type,0
141,H2A.Z.2_(Primates),variant,0
139,H2A.Z.2.s2_(Primates),variant,0
137,H2A.Z.2.s1_(Primates),variant,0
135,H2A.Z.1_(Primates),variant,0
...,...,...,...
132,H2A.Z,variant_group,20
59,cH3,variant_group,23
120,H2A.Q,variant,28
50,cH2B_(Chlorophyta),variant,35


In [16]:
count_df[count_df["count"] == 0]

Unnamed: 0,id,level,count
0,Archaeal,type,0
141,H2A.Z.2_(Primates),variant,0
139,H2A.Z.2.s2_(Primates),variant,0
137,H2A.Z.2.s1_(Primates),variant,0
135,H2A.Z.1_(Primates),variant,0
...,...,...,...
27,cH2A_(Mus_musculus),variant,0
53,cH2B_(Homo_sapiens),variant,0
31,cH2B,variant_group,0
25,cH2A_(Homo_sapiens),variant,0


In [14]:
query = "SELECT * FROM sequence WHERE variant='H2A.Z.2_(Primates)'"
cursor.execute(query)
pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description])

Unnamed: 0,accession,variant,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,sequence,variant_under_consideration


In [24]:
def get_tree_dict(cursor, paretnt_id):
    cursor.execute(f"SELECT id FROM histone WHERE parent = '{paretnt_id}'")
    res = cursor.fetchall()
    if len(res) < 1:
        return "null"
    return {v: get_tree_dict(cursor, v) for v, *_ in res}

In [25]:
def dict2tree(tree, d):
    """
    converts tree from classification.json to a ete3 object
    d is
    with open('classification.json') as json_file:
        data = json.load(json_file)
    d=data['tree']
    """
    for k, v in d.items():
        CH = tree.add_child(name=k)
        if isinstance(v, dict):
            dict2tree(CH, v)

In [30]:
def muscle_aln(sequences, options=[], debug=False, sort=True):
    muscle = os.path.join(os.path.dirname(sys.executable), "muscle")
    process = subprocess.Popen(
        [muscle] + options,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    aln, error = process.communicate(sequences.encode("utf-8"))
    if debug:
        print(sequences)
        print()
        print("Stderr:")
        print(error.decode("utf-8"))
        print("Stdout:")
        print(aln.decode("utf-8"))
    seqFile = io.StringIO()
    seqFile.write(aln.decode("utf-8"))
    seqFile.seek(0)
    sequences_ids = [s.split(" ", 1)[0] for s in sequences.split(">")]
    sequences = list(
        SeqIO.parse(seqFile, "fasta")
    )  # Not in same order, but does it matter?
    if sort:
        sequences.sort(key=lambda x: sequences_ids.index(x.id))  # Yes, it matters
    msa = MultipleSeqAlignment(sequences)
    return msa

In [70]:
def muscle_p2p_aln(msa1, msa2, options=[], debug=False):
    """
    align two alignments
    :return: MultipleSeqAlignment object
    """
    os.makedirs("tmp/")  # create tmp dir to save msa profiles
    try:
        with open("tmp/one.afa", "w") as f:
            f.write(format(msa1, "fasta"))
        with open("tmp/two.afa", "w") as f:
            f.write(format(msa2, "fasta"))

        muscle = os.path.join(os.path.dirname(sys.executable), "muscle")
        process = subprocess.Popen(
            [muscle]
            + options
            + ["-profile", "-in1", "tmp/one.afa", "-in2", "tmp/two.afa"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        aln, error = process.communicate()
        if debug:
            print("Stderr:")
            print(error.decode("utf-8"))
            print("Stdout:")
            print(aln.decode("utf-8"))

        seqFile = io.StringIO()
        seqFile.write(aln.decode("utf-8"))
        seqFile.seek(0)
        sequences = list(
            SeqIO.parse(seqFile, "fasta")
        )  # Not in same order, but does it matter?
        msa = MultipleSeqAlignment(sequences)
    except:
        shutil.rmtree("tmp/")
        raise
    shutil.rmtree("tmp/")  # rm -rf tmp dir
    return msa

In [49]:
def get_fasta_seq(data):
    return "\n".join(
        [
            SeqRecord(
                Seq(row["sequence"]),
                id=f"{row['organism'].split()[0]}|{row['accession']}|{row['variant']}",
                name=row["accession"],
                description=f"organism={row['organism']} phylum={row['phylum']} class={row['class']}",
            ).format("fasta")
            for i, row in data.iterrows()
        ]
    )

In [88]:
def generate_draft_seeds(hist_tree, sequence_data, save_directory):
    # generate draft seeds - needs debugging
    draft_seeds_msa = {}

    # create directory if not exist or rewrite the directory
    try:
        os.makedirs(save_directory)
    except FileExistsError:
        # directory already exists
        shutil.rmtree(save_directory)
        os.makedirs(save_directory)

    # hist_tree traversal
    for node in hist_tree.traverse("postorder"):
        print("Processing ", node.name)
        if node.is_root(): continue
        if node.is_leaf():  # we get sequences for that variant and align them.
            draft_seeds_msa[node.name] = muscle_aln(
                get_fasta_seq(
                    sequence_data.query(
                        f'type=="{node.name}" | variant=="{node.name}"',
                        engine="python",
                    )
                )
            )
            print(node.name, "Alignment length:", len(draft_seeds_msa[node.name]))
            with open(f"{save_directory}/{node.name}.fasta", "w") as f:
                f.write(format(draft_seeds_msa[node.name], "fasta"))
        elif not node.is_root():  # we will do profile to profile alignment
            # we should first check if there are seqs with this subvariant as the most specific one
            print(f"\t Node is internal, progressive alignment:")
            msa = muscle_aln(
                get_fasta_seq(
                    sequence_data.query(
                        f'variant=="{node.name}"'
                    )
                )
            )
            draft_seeds_msa[node.name + "_only"] = msa
            print(f"\t\t For {node.name} aligned {len(msa)} sequences")
            # progressively align
            for ch in node.get_children():
                if len(msa) == 0:
                    msa = draft_seeds_msa[ch.name]
                    print(
                        f"\t\t Adding child {node.name} aligned {len(draft_seeds_msa[ch.name])} sequences"
                    )
                    continue
                elif len(draft_seeds_msa[ch.name]) != 0:
                    msa = muscle_p2p_aln(msa, draft_seeds_msa[ch.name])
                    print(
                        f"\t\t Adding child {node.name} aligned {len(draft_seeds_msa[ch.name])} sequences"
                    )
                else:
                    continue
            draft_seeds_msa[node.name] = msa
            print(node.name, "Alignment length:", len(draft_seeds_msa[node.name]))
            with open(f"{save_directory}/{node.name}.fasta", "w") as f:
                f.write(format(draft_seeds_msa[node.name], "fasta"))
            with open(f"{save_directory}/{node.name}_only.fasta", "w") as f:
                f.write(format(draft_seeds_msa[node.name + "_only"], "fasta"))
        #         print(f"\t\t Final for {node.name} aligned {len(draft_seeds_msa[node.name])} sequences")

In [43]:
query = "SELECT id FROM histone WHERE parent IS NULL"
cursor.execute(query)
types = cursor.fetchall()
variants_tree = {}
for t, *_ in types:
    variants_tree[t] = get_tree_dict(cursor, t)
variants_tree

{'Archaeal': 'null',
 'H1': {'cH1': 'null',
  'generic_H1': 'null',
  'H1.0': {'H1.0_(Homo_sapiens)': 'null'},
  'H1.1': {'H1.1_(Homo_sapiens)': 'null'},
  'H1.10': {'H1.10_(Homo_sapiens)': 'null'},
  'H1.2': {'H1.2_(Homo_sapiens)': 'null'},
  'H1.3': {'H1.3_(Homo_sapiens)': 'null'},
  'H1.4': {'H1.4_(Homo_sapiens)': 'null'},
  'H1.5': {'H1.5_(Homo_sapiens)': 'null'},
  'OO_H1.8': {'H1.8_(Homo_sapiens)': 'null'},
  'scH1': 'null',
  'TS_H1.6': {'H1.6_(Homo_sapiens)': 'null'},
  'TS_H1.7': {'H1.7_(Homo_sapiens)': 'null'},
  'TS_H1.9': 'null'},
 'H2A': {'cH2A': {'cH2A_(Animals)': {'cH2A_(Vertebrata)': {'cH2A_(Mammalia)': {'cH2A_(Homo_sapiens)': {'cH2A.10_(Homo_sapiens)': 'null',
       'cH2A.11_(Homo_sapiens)': 'null',
       'cH2A.1_(Homo_sapiens)': 'null',
       'cH2A.2_(Homo_sapiens)': 'null',
       'cH2A.3_(Homo_sapiens)': 'null',
       'cH2A.4_(Homo_sapiens)': 'null',
       'cH2A.5_(Homo_sapiens)': 'null',
       'cH2A.6_(Homo_sapiens)': 'null',
       'cH2A.7_(Homo_sapiens)': '

In [44]:
hist_tree = Tree()
dict2tree(hist_tree, variants_tree)
print(hist_tree.get_ascii(show_internal=True))


   /-Archaeal
  |
  |   /-cH1
  |  |
  |  |--generic_H1
  |  |
  |  |-H1.0-H1.0_(Homo_sapiens)
  |  |
  |  |-H1.1-H1.1_(Homo_sapiens)
  |  |
  |  |-H1.10-H1.10_(Homo_sapiens)
  |  |
  |  |-H1.2-H1.2_(Homo_sapiens)
  |  |
  |  |-H1.3-H1.3_(Homo_sapiens)
  |-H1
  |  |-H1.4-H1.4_(Homo_sapiens)
  |  |
  |  |-H1.5-H1.5_(Homo_sapiens)
  |  |
  |  |-OO_H1.8-H1.8_(Homo_sapiens)
  |  |
  |  |--scH1
  |  |
  |  |-TS_H1.6-H1.6_(Homo_sapiens)
  |  |
  |  |-TS_H1.7-H1.7_(Homo_sapiens)
  |  |
  |   \-TS_H1.9
  |
  |                                                                        /-cH2A.10_(Homo_sapiens)
  |                                                                       |
  |                                                                       |--cH2A.11_(Homo_sapiens)
  |                                                                       |
  |                                                                       |--cH2A.1_(Homo_sapiens)
  |                                         

In [45]:
sequence_df = pd.read_csv(
    "/home/l_singh/_scratch/hdb/project_dir/histonedb/CURATED_SET/histones.csv"
).fillna("")
sequence_df.index = list(sequence_df["accession"])

In [80]:
sequence_df[sequence_df["variant"].str.contains("H3_(Lilly\?\?\?)")]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence


In [75]:
sequence_df.query(
    f'type=="H3_(Lilly???)" | variant.str.contains("H3_(Lilly???)")',
    engine="python",
)

error: multiple repeat at position 11

In [87]:
sequence_df[sequence_df["variant"].str.contains("cH2A.1_\(Homo_sapiens\)")]

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence
NP_734466.1,NP_734466.1,H2A,cH2A,cH2A.1_(Homo_sapiens),,,221613.0,H2AC1,9606.0,Homo sapiens,Chordata,Mammalia,Mammalia,,2011515 24506885 7068607,MSGRGKQGGKARAKSKSRSSRAGLQFPVGRIHRLLRKGNYAERIGA...


In [85]:
sequence_df.query(
    f'type=="cH2A.1_(Homo_sapiens)" | variant.str.contains("cH2A.1_(Homo_sapiens)")',
    engine="python",
)

Unnamed: 0,accession,type,variant_group,variant,doublet,gi,ncbi_gene_id,hgnc_gene_name,taxonomy_id,organism,phylum,class,taxonomy_group,info,references,sequence


In [89]:
generate_draft_seeds(
    hist_tree,
    sequence_df,
    "/home/l_singh/_scratch/hdb/project_dir/histonedb/CURATED_SET/draft_seeds",
)

Processing  Archaeal
Archaeal Alignment length: 0
Processing  cH1
cH1 Alignment length: 0
Processing  generic_H1
generic_H1 Alignment length: 13
Processing  H1.0_(Homo_sapiens)
H1.0_(Homo_sapiens) Alignment length: 1
Processing  H1.0
	 Node is internal, progressive alignment:
		 For H1.0 aligned 14 sequences
		 Adding child H1.0 aligned 1 sequences
H1.0 Alignment length: 15
Processing  H1.1_(Homo_sapiens)
H1.1_(Homo_sapiens) Alignment length: 1
Processing  H1.1
	 Node is internal, progressive alignment:
		 For H1.1 aligned 0 sequences
		 Adding child H1.1 aligned 1 sequences
H1.1 Alignment length: 1
Processing  H1.10_(Homo_sapiens)
H1.10_(Homo_sapiens) Alignment length: 1
Processing  H1.10
	 Node is internal, progressive alignment:
		 For H1.10 aligned 5 sequences
		 Adding child H1.10 aligned 1 sequences
H1.10 Alignment length: 6
Processing  H1.2_(Homo_sapiens)
H1.2_(Homo_sapiens) Alignment length: 1
Processing  H1.2
	 Node is internal, progressive alignment:
		 For H1.2 aligned 0 seq

# Close connection

In [15]:
cursor.close()
conn.close()
tunnel.stop()