In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import dill
import numpy as np
import multiprocessing_on_dill as mp
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from collections import defaultdict, Counter
from tqdm import tqdm
from itertools import chain
from cityhash import CityHash64
from itertools import groupby

from f723.tools.urs.extraction import assemble_chains, get_sec_struct_model
from f723.tools.dataset.entities import NucleotideFeatures, PairFeatures, PairMeta, PairData, make_pair, Pair

In [3]:
DATASET_DIR = '/home/mikhail/bioinformatics/data/dataset_all_60'

In [4]:
def get_batch(index):
    with open(os.path.join(DATASET_DIR, 'batch_{}'.format(index)), 'rb') as infile:
        return dill.load(infile)
    

def get_data():
    return chain.from_iterable((get_batch(i) for i in tqdm(range(30))))

Добавим к фичам ещё расстояние между нуклеотидами пары в смысле индексов нуклеотидов в цепи.

In [5]:
class FeaturesExtractor:
    def extract(self, item):
        raise NotImplementedError
    
    def description(self, item):
        raise NotImplementedError


class NucleotideFeaturesExtractor(FeaturesExtractor):
    SECONDARY_STRUCTURES = ['BC', 'BI', 'BP', 'HC', 'HI', 'HP', 'IC', 'II', 'IP', 'JC', 'JI', 'JP', 'S']
    BASES = ['a', 'u', 'g', 'c']
    NUCLEOTIDE_FEATURES_LEN = len(SECONDARY_STRUCTURES) + len(BASES) + 1 + 2
    
    def extract(self, nf):
        if nf is None:
            features = [0] * (self.NUCLEOTIDE_FEATURES_LEN + 1)
        else:
            features = [1]

            nucleotide_secondary_structures = nf.secondary_structure.split(';')
            for secondary_structure in self.SECONDARY_STRUCTURES:
                features.append(int(secondary_structure in nucleotide_secondary_structures))

            for base in self.BASES:
                features.append(int(nf.base == base))

            features.append(int(nf.base in self.BASES))

            features.extend([nf.fragment_length, nf.fragment_index])

        return features
    
    def description(self, nf):
        meta_features = [('nucleotide', 'is_dummy')]
        secondary_structure_features = [('secondary_structure', typ) for typ in self.SECONDARY_STRUCTURES]
        base_features = [('base', base) for base in self.BASES] + [('base', 'rare')]
        fragment_features = [('fragment', 'length'), ('fragment', 'index')]

        return meta_features + secondary_structure_features + base_features + fragment_features


class RelationFeaturesExtractor(FeaturesExtractor):
    RELATIONS = ['LC', 'LR', 'SM']
    
    def extract(self, pair):
        relation = [int(pair.features.relation == relation) for relation in self.RELATIONS] 
        distance = [abs(pair.meta.pair.nt_left.index - pair.meta.pair.nt_right.index)]
        
        return relation + distance
    
    def description(self, pair):
        return [('relation', relation) for relation in self.RELATIONS] + [('distance',)]
    

class PairFeaturesExtractor(FeaturesExtractor):
    def __init__(self):
        self._nucleotide_features_extractor = NucleotideFeaturesExtractor()
        self._relation_features_extractor = RelationFeaturesExtractor()
    
    def extract(self, pair):
        result_features = []
        neighbour_sets = [pair.features.neighbours_left, pair.features.neighbours_right]

        for neighbour_set_permutation in [neighbour_sets, reversed(neighbour_sets)]:
            features = []
            
            for nf in chain.from_iterable(neighbour_set_permutation):
                features.extend(self._nucleotide_features_extractor.extract(nf))
            features.extend(self._relation_features_extractor.extract(pair))
            
            result_features.append(features)
        
        return result_features
    
    def description(self, pair):
        features_description = []
        
        for lr, nfs in [('left', pair.features.neighbours_left), 
                        ('right', pair.features.neighbours_right)]:
            for index, nf in enumerate(nfs):
                for feature_description in self._nucleotide_features_extractor.description(pair):
                    features_description.append((lr, index - (len(nfs) - 1) // 2, feature_description))
        
        features_description.extend(self._relation_features_extractor.description(pair))
        
        return features_description

In [6]:
FEATURES_PATH = os.path.join(DATASET_DIR, 'features_with_distance.npy')
TARGET_PATH = os.path.join(DATASET_DIR, 'target_with_distance.npy')
GROUPS_PATH = os.path.join(DATASET_DIR, 'groups_with_distance.npy')

PAIR_TYPES = ['ss_bps', 'noncanonical_bps', 'random']

In [7]:
pair_sample = get_batch(0)[0]
FEATURES_SHAPE = (2 * sum(1 for _ in get_data()), 
                  len(PairFeaturesExtractor().description(pair_sample)))

100%|██████████| 30/30 [02:45<00:00,  2.42s/it]


In [10]:
def prepare_features():
    pair_features_extractor = PairFeaturesExtractor()
    pair_sample = get_batch(0)[0]
    features = np.memmap(FEATURES_PATH, shape=FEATURES_SHAPE, mode='w+')
    
    index = 0
    for pair_data in get_data():
        for pair_features in pair_features_extractor.extract(pair_data):
            features[index] = pair_features
            
            index += 1
            
    target = np.repeat([PAIR_TYPES.index(pair_data.meta.type) for pair_data in get_data()], 2)
    pdb_ids = np.repeat([pair_data.meta.pdb_id for pair_data in get_data()], 2)
    
    np.save(TARGET_PATH, target)
    np.save(GROUPS_PATH, pdb_ids)

In [11]:
prepare_features()

100%|██████████| 30/30 [14:33<00:00, 22.99s/it]
100%|██████████| 30/30 [01:30<00:00,  2.54s/it]
100%|██████████| 30/30 [01:29<00:00,  2.50s/it]


In [8]:
def load_features():
    features = np.memmap(FEATURES_PATH, shape=FEATURES_SHAPE)
    target = np.load(TARGET_PATH)
    pdb_ids = np.load(GROUPS_PATH)
    
    return features, target, pdb_ids

In [9]:
features, target, pdb_ids = load_features()

In [10]:
mask = target != 0  # throw away secondary structure pairs

features = features[mask]
target = target[mask]
pdb_ids = pdb_ids[mask]

In [11]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

In [12]:
group_kfold = GroupKFold(n_splits=5)
group_kfold.get_n_splits(features, target, pdb_ids)
feature_importances = []

for train_index, test_index in group_kfold.split(features, target, pdb_ids):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = target[train_index], target[test_index]
    print('Train positive rate: {}, test positive rate: {}'.format(
        np.mean(y_train == 1), np.mean(y_test == 1)))
    
    model = RandomForestClassifier(class_weight='balanced', n_estimators=100, n_jobs=8)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    feature_importances.append(model.feature_importances_)
    
    print(precision_recall_fscore_support(y_test, y_pred))
    
    break

Train positive rate: 0.0026536722659665614, test positive rate: 0.002601588276253125
(array([0.89151599, 0.99833828]), array([0.36194241, 0.99988512]), array([0.51485944, 0.9991111 ]), array([   3542, 1357934]))


Никакой разницы эта фича не делает

In [15]:
description = PairFeaturesExtractor().description(pair_sample)
feature_importances = np.array(feature_importances)
mean_feature_importances = feature_importances.mean(axis=0)

In [16]:
[(description[i], mean_feature_importances[i]) for i in np.argsort(mean_feature_importances)[-50:]]

[(('right', -3, ('fragment', 'index')), 0.005200427607819375),
 (('left', 0, ('fragment', 'index')), 0.005250879806340551),
 (('right', -4, ('fragment', 'length')), 0.0054191361016154346),
 (('right', 0, ('fragment', 'index')), 0.0054558460694541155),
 (('right', 1, ('secondary_structure', 'JC')), 0.005575806803694708),
 (('right', 1, ('secondary_structure', 'HC')), 0.0056328425208703415),
 (('right', -3, ('fragment', 'length')), 0.005794128276486197),
 (('left', -5, ('fragment', 'length')), 0.005854986018202906),
 (('left', 1, ('secondary_structure', 'IC')), 0.005914806926669246),
 (('left', -3, ('fragment', 'length')), 0.005981165060443585),
 (('right', -2, ('fragment', 'length')), 0.006013305962351617),
 (('right', 5, ('fragment', 'index')), 0.006061284148834449),
 (('left', -2, ('fragment', 'length')), 0.006325700292497207),
 (('right', -5, ('fragment', 'length')), 0.006355754244356948),
 (('right', -1, ('fragment', 'length')), 0.006511810452424899),
 (('left', 5, ('fragment', 'ind

Хоть distance и находится в ряду наиболее значимых фичей, попробуем объяснить, почему она не поднимает качество. Мне кажется, дело в том, что distance хорошо помогает отбрасывать большое количество случайных пар на больших расстояниях. Но так как мы работаем с парами на расстоянии не более 60 нуклеотидов, то здесь уже распределение неканонических пар по расстоянию гораздо более похоже на равномерное, а сама по себе фича distance не позволяет отличить неканоническое спаривание от случайной пары. Фактически мы уже использовали главный потенциал distance, когда решили работать с парами на малых расстояниях.

Потюним RandomForest по гриду

In [13]:
for num_estimators in [50, 100, 200]:
    for max_features in [10, 20, 40, 80, 120]:
        print('num_estimators={}; max_features={}'.format(num_estimators, max_features))
        
        model = RandomForestClassifier(class_weight='balanced', n_estimators=num_estimators, 
                                       n_jobs=8, max_features=max_features)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
    
        print(precision_recall_fscore_support(y_test, y_pred))

num_estimators=50; max_features=10
(array([0.91122914, 0.99827888]), array([0.33907397, 0.99991384]), array([0.49423868, 0.99909569]), array([   3542, 1357934]))
num_estimators=50; max_features=20
(array([0.88590604, 0.99836616]), array([0.37267081, 0.99987481]), array([0.52464229, 0.99911992]), array([   3542, 1357934]))
num_estimators=50; max_features=40
(array([0.87654321, 0.99838743]), array([0.38085827, 0.99986008]), array([0.53099784, 0.99912321]), array([   3542, 1357934]))
num_estimators=50; max_features=80
(array([0.87557604, 0.99837348]), array([0.37549407, 0.99986082]), array([0.52558783, 0.99911659]), array([   3542, 1357934]))
num_estimators=50; max_features=120
(array([0.88420348, 0.99836689]), array([0.37295313, 0.9998726 ]), array([0.52462272, 0.99911918]), array([   3542, 1357934]))
num_estimators=100; max_features=10
(array([0.90845617, 0.99825686]), array([0.33060418, 0.9999131 ]), array([0.48478576, 0.99908429]), array([   3542, 1357934]))
num_estimators=100; max_fe

KeyboardInterrupt: 

Пока результаты не очень: дефолтные параметры дают качество на том же уровне.