In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import dill
import re
import networkx as nx
import pandas as pd

from tqdm import tqdm
from itertools import chain
from sklearn.manifold import TSNE
from collections import defaultdict, Counter

from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve

from f723.tools.show.classification import plot_precision_recall_curve
from f723.tools.models.application import ClassificationResult
from f723.tools.urs.extraction import assemble_chains
from f723.tools.dataset.entities import make_pair

In [2]:
NRLIST_PATH = '/home/mikhail/bioinformatics/data/nrlist_3.76_3.0A.csv'
CIF_DIR = '/home/mikhail/bioinformatics/data/NR_3.0/cif/'
OUT_DIR = '/home/mikhail/bioinformatics/data/NR_3.0/out/'
NRLIST_PATH = '/home/mikhail/bioinformatics/data/nrlist_3.76_3.0A.csv'
SEC_STRUCT_DIR = '/home/mikhail/bioinformatics/data/sec_struct'
DATASET_DIR = '/home/mikhail/bioinformatics/data/dataset_all_60'

In [3]:
def get_batch(index):
    with open(os.path.join(DATASET_DIR, 'batch_{}'.format(index)), 'rb') as infile:
        return dill.load(infile)
    

def get_data():
    return chain.from_iterable((get_batch(i) for i in tqdm(range(30))))

In [4]:
with open('/home/mikhail/bioinformatics/data/nonredundant.txt', 'r') as infile:
    nonredundant_chain_ids = {tuple(chain_id.split('.cif1_')) for chain_id in infile.read().splitlines()}

Сначала посмотрим на элементы вторичной структуры, в которых находятся SM-HC-AG пары.

Соберем граф, в котором вершинами будут нуклеотиды, а ребрами -- SM-пары. Тогда, компоненты связности получившегося графа будут элементами вторичной структуры. Есть предположение, что так как SM-HC-AG пары кластеризуются куда лучше других, то им соответствует некоторый очень частотный элемент вторичной структуры, который при этом других неканонических пар не содержит, иначе, вероятно, эти соседствующие пары тоже бы классифицировались хорошо.

In [5]:
graph = nx.Graph()
noncanonical_pairs = set()
ag_hc_pairs = set()

for pair_data in get_data():
    meta = pair_data.meta
    
    if (meta.pdb_id, meta.pair.nt_left.chain_id) in nonredundant_chain_ids:
        nt_left = meta.pair.nt_left
        nt_right = meta.pair.nt_right
        nt_left_features = pair_data.features.neighbours_left[5]
        nt_right_features = pair_data.features.neighbours_right[5]
        
        node_left = meta.pdb_id, nt_left, nt_left_features.fragment_length, nt_left_features.fragment_index
        node_right = meta.pdb_id, nt_right, nt_right_features.fragment_length, nt_right_features.fragment_index
        
        graph.add_nodes_from([node_left, node_right])

        if pair_data.features.relation == 'SM':
            graph.add_edge(node_left, node_right)
            
            if meta.type == 'noncanonical_bps':
                noncanonical_pairs.add(node_left)
                noncanonical_pairs.add(node_right)
                
                ss = nt_left_features.secondary_structure
                if ss == 'HC' and {nt_left.base, nt_right.base} == {'A', 'G'}:
                    ag_hc_pairs.add(node_left)
                    ag_hc_pairs.add(node_right)

100%|██████████| 30/30 [01:54<00:00,  3.83s/it]


In [6]:
components = list(nx.connected_components(graph))

In [7]:
ag_hc_components = [c for c in components if c & ag_hc_pairs]

In [8]:
Counter(map(len, ag_hc_components)).most_common()

[(4, 55),
 (5, 9),
 (7, 6),
 (8, 4),
 (6, 3),
 (15, 3),
 (10, 2),
 (9, 2),
 (13, 2),
 (12, 1),
 (19, 1),
 (122, 1),
 (17, 1),
 (22, 1),
 (31, 1)]

In [9]:
for c in ag_hc_components:
    print(len(noncanonical_pairs & c), len(ag_hc_pairs & c))
    print('\n'.join(map(str, c)))
    print()

2 2
('3rw6', Nucleotide(id='H.G.28.', base='G', chain_id='H', index=27), 4, 0)
('3rw6', Nucleotide(id='H.A.29.', base='A', chain_id='H', index=28), 4, 1)
('3rw6', Nucleotide(id='H.A.31.', base='A', chain_id='H', index=30), 4, 3)
('3rw6', Nucleotide(id='H.A.30.', base='A', chain_id='H', index=29), 4, 2)

2 2
('4p95', Nucleotide(id='A.A.570.', base='A', chain_id='A', index=185), 4, 2)
('4p95', Nucleotide(id='A.A.571.', base='A', chain_id='A', index=186), 4, 3)
('4p95', Nucleotide(id='A.G.568.', base='G', chain_id='A', index=183), 4, 0)
('4p95', Nucleotide(id='A.A.569.', base='A', chain_id='A', index=184), 4, 1)

2 2
('4y4o', Nucleotide(id='1a.A.162.', base='A', chain_id='1a', index=155), 4, 3)
('4y4o', Nucleotide(id='1a.A.160.', base='A', chain_id='1a', index=153), 4, 1)
('4y4o', Nucleotide(id='1a.A.161.', base='A', chain_id='1a', index=154), 4, 2)
('4y4o', Nucleotide(id='1a.G.159.', base='G', chain_id='1a', index=152), 4, 0)

2 2
('4y4o', Nucleotide(id='1a.G.299.', base='G', chain_id='1

In [10]:
len(ag_hc_pairs)

198

In [11]:
sum([len(c & ag_hc_pairs) for c in components if 
     len(c) == 4 and len(c & noncanonical_pairs) == len(c & ag_hc_pairs)]) / len(ag_hc_pairs)

0.5555555555555556

Итого 55% SM-AG-HC пар лежат в маленьких повторяющихся структурах, где кроме них ничего нет.

In [12]:
all_chains = assemble_chains(NRLIST_PATH, CIF_DIR, OUT_DIR, SEC_STRUCT_DIR)

100%|██████████| 1074/1074 [00:47<00:00, 22.64it/s]


In [14]:
noncanonical_lw = {}

for c in all_chains:
    if (c.pdb_id, c.id) in nonredundant_chain_ids:
        for bp in c.noncanonical_bps:
            noncanonical_lw[c.pdb_id, make_pair(bp.nt_left, bp.nt_right)] = bp.type.lw

In [15]:
regex = re.compile('[ct][SWH]{2}')
symmetric_keys = {}

for key in set(noncanonical_lw.values()):
    if regex.match(key):
        bond = ''.join(sorted(key[1:]))
        symmetric_keys[key] = key[0] + bond

In [16]:
stats = []

for pair_data in get_data():
    meta = pair_data.meta
    
    noncanonical = meta.type == 'noncanonical_bps'
    nonredundant = (meta.pdb_id, meta.pair.nt_left.chain_id) in nonredundant_chain_ids
    bases = [meta.pair.nt_left.base, meta.pair.nt_right.base] 
    standard = all([b in 'AUGC' for b in bases])
    
    if all([noncanonical, nonredundant, standard]):
        lw = symmetric_keys.get(noncanonical_lw[meta.pdb_id, make_pair(meta.pair.nt_left, meta.pair.nt_right)])
        
        if lw is not None:
            stats.append((''.join(sorted(bases)), pair_data.features.relation, lw))

100%|██████████| 30/30 [03:02<00:00,  6.10s/it]


In [17]:
table_data = defaultdict(Counter)

for base, relation, lw in stats:
    table_data[base, lw][relation] += 1
    table_data['total', lw][relation] += 1
    table_data[base, 'total'][relation] += 1
    table_data['total', 'total'][relation] += 1

In [18]:
all_bases = list({a[0] for a in stats}) + ['total']
all_lws = list({a[2] for a in stats}) + ['total']

table = pd.DataFrame(index=all_bases, columns=all_lws)
for key, cntr in table_data.items():
    num = sum(cntr.values())
    table.loc[key] = tuple([int(100 * cntr[k] / num) for k in ['SM', 'LC', 'LR']]) + (num,)

In [19]:
table

Unnamed: 0,tSS,tSW,tHH,tHS,cWW,cSW,cSS,cHS,tWW,cHH,tHW,cHW,total
AG,"(7, 51, 40, 52)","(38, 40, 20, 54)","(60, 0, 40, 5)","(95, 1, 3, 250)","(88, 2, 8, 45)","(45, 35, 18, 37)","(5, 47, 47, 17)","(42, 52, 5, 19)","(100, 0, 0, 1)","(71, 28, 0, 7)","(91, 0, 8, 12)","(80, 5, 15, 20)","(70, 16, 12, 519)"
CU,,"(100, 0, 0, 1)",,"(50, 50, 0, 4)","(100, 0, 0, 24)","(33, 33, 33, 3)",,"(22, 77, 0, 9)","(66, 0, 33, 3)","(50, 50, 0, 4)","(50, 50, 0, 6)","(0, 100, 0, 3)","(64, 31, 3, 57)"
CG,"(100, 0, 0, 1)","(40, 0, 60, 10)","(0, 100, 0, 1)","(50, 50, 0, 2)","(87, 1, 10, 156)","(50, 50, 0, 8)","(13, 33, 53, 15)","(60, 40, 0, 5)","(75, 0, 25, 8)","(0, 66, 33, 3)","(25, 75, 0, 4)","(81, 18, 0, 11)","(74, 10, 15, 224)"
AU,"(100, 0, 0, 1)","(100, 0, 0, 4)","(100, 0, 0, 2)","(77, 22, 0, 9)","(74, 0, 25, 35)","(11, 55, 33, 9)","(100, 0, 0, 2)","(53, 23, 23, 13)","(90, 0, 10, 20)","(100, 0, 0, 1)","(94, 0, 4, 121)","(51, 23, 25, 39)","(79, 7, 12, 256)"
AA,,"(33, 33, 33, 3)","(96, 3, 0, 30)","(86, 6, 6, 15)","(77, 11, 11, 9)","(63, 36, 0, 11)",,"(100, 0, 0, 8)","(89, 0, 10, 19)","(100, 0, 0, 1)","(82, 3, 14, 28)","(100, 0, 0, 1)","(85, 7, 7, 125)"
CC,,,,"(100, 0, 0, 6)","(88, 11, 0, 18)","(12, 75, 12, 8)",,"(33, 66, 0, 6)","(50, 50, 0, 2)",,"(100, 0, 0, 2)",,"(66, 30, 2, 42)"
GG,"(43, 12, 43, 16)","(66, 33, 0, 3)","(50, 50, 0, 2)","(86, 13, 0, 15)","(83, 16, 0, 6)","(20, 80, 0, 5)","(0, 20, 80, 5)","(53, 33, 13, 15)","(100, 0, 0, 2)",,"(75, 20, 5, 20)","(44, 40, 14, 27)","(56, 27, 15, 116)"
AC,,"(28, 50, 21, 14)","(100, 0, 0, 2)","(88, 11, 0, 9)","(94, 0, 5, 36)","(12, 87, 0, 24)","(0, 60, 40, 10)","(39, 60, 0, 23)","(50, 0, 50, 6)",,"(79, 20, 0, 34)","(50, 50, 0, 10)","(56, 36, 7, 168)"
UU,"(50, 0, 50, 2)","(0, 100, 0, 1)",,"(100, 0, 0, 2)","(96, 0, 3, 32)","(0, 100, 0, 1)",,"(0, 50, 50, 2)","(83, 0, 16, 6)",,"(100, 0, 0, 6)","(0, 0, 100, 2)","(83, 5, 11, 54)"
GU,"(100, 0, 0, 1)","(75, 25, 0, 16)",,"(100, 0, 0, 4)","(92, 7, 0, 27)","(33, 41, 25, 12)","(0, 50, 50, 4)","(89, 10, 0, 48)","(37, 25, 37, 8)","(100, 0, 0, 1)","(75, 0, 25, 8)","(85, 14, 0, 7)","(77, 15, 7, 136)"
