In [1]:
import pandas as pd
import sys
sys.path.append('/mnt/wartmann_pe940/git/AllExons/Repository/Scripts/')
import utility as ut
import numpy as np
import re

In [2]:
# Project root; every input path below is derived from it so the project
# location only has to be changed in one place.
root_dir = '/mnt/wartmann_pe940/git/AllExons/'
dir_data = root_dir + 'Data/datamatrix/list/dataMatrix_std.txt'
dir_doid = root_dir + 'Data/DOID_labelsList_mapping.pkl'

# exon data matrix, read with the default comma separator
data_mit = pd.read_csv(dir_data)
# NOTE(review): read_pickle can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
doid_labels = pd.read_pickle(dir_doid)

# number of hierarchy levels used by the depth filter and the tree pruning below
max_depth = 4


In [3]:
# set exon names as index (drop=False keeps the 'exons' column as well)
data_mit = data_mit.set_index('exons', drop=False)

# exons whose largest numeric value exceeds 1 (TPM > 1).
# numeric_only=True keeps the string metadata columns out of the row-wise
# max; recent pandas versions raise on mixed str/float rows instead of
# silently skipping the non-numeric columns.
# NOTE(review): a numeric metadata column would also enter this max -- confirm
has_expression = data_mit.max(axis=1, numeric_only=True) > 1

# exons with a valid (non-missing) labeling
has_labels = data_mit['labels'].notna()

# .values makes the mask positional, so duplicated exon names cannot
# interfere with index alignment
data_mit = data_mit.loc[(has_expression & has_labels).values]



# ut.pre_process: according to the original notes it drops zeros and
# ambiguous exons (exons annotated with multiple diseases) and applies
# max-min scaling
data_mit = ut.pre_process(data_mit)

# turn each label string into a list of labels
data_mit['labels'] = data_mit['labels'].apply(ut.str_to_list)

# depth (number of entries in the label list) of every exon
labeling_depth = [len(exon_labels) for exon_labels in data_mit['labels'].values]

# True for exons whose label list has at least max_depth entries
min_length = [depth >= max_depth for depth in labeling_depth]

# keep only the exons that are labeled deeply enough
data_mit = data_mit[min_length]

df_tree = ut.build_tree(data_mit, doid_labels)

In [4]:
# Prune the full tree down to the desired subtree: 'disease' is the only
# entry on lvl 0, and on lvl 1 only the branches listed here are kept.
kept_lvl1_labels = ['disease of anatomical entity', 'disease of metabolism']
is_lvl1 = df_tree['lvl'] == 1
is_kept_label = df_tree['label'].isin(kept_lvl1_labels)
use_this_tree = df_tree.drop(index=df_tree.loc[is_lvl1 & ~is_kept_label].index)

In [5]:
# drop every node with fewer than 200 members (count_all); the filter is
# applied to all levels that are still in the tree
too_small = use_this_tree['count_all'] < 200
use_this_tree = use_this_tree.drop(index=use_this_tree.loc[too_small].index)

In [6]:
# cut the tree off below the deepest level in use (lvl max_depth-1)
deepest_lvl = max_depth - 1
too_deep = use_this_tree['lvl'] > deepest_lvl
use_this_tree = use_this_tree.drop(index=use_this_tree.loc[too_deep].index)

In [7]:
# Some of the top nodes are gone now, so sweep downwards and drop every
# node whose parent id is no longer present in the tree. The sweep starts
# on lvl 2, one level below the new top nodes, so those are never touched.
for lvl in range(2, max_depth + 1):
    on_this_lvl = use_this_tree['lvl'] == lvl
    has_parent = use_this_tree['parent'].isin(use_this_tree['id'].values)
    orphans = use_this_tree.loc[on_this_lvl & ~has_parent].index
    use_this_tree = use_this_tree.drop(index=orphans)

In [8]:
# remove leaf nodes w/o siblings -- step 1: count the nodes per parent.
# For every node, number_of_sib holds how many nodes share its parent; the
# node itself is included, so a value of 1 means "no siblings".
# The count is taken per row from the node's own 'parent' value: one pass
# over the tree, and independent of labels being unique (a lookup by label
# would always pick the parent of the first row carrying that label).
nodes_per_parent = use_this_tree['parent'].value_counts()
number_of_sib = (use_this_tree['parent']
                 .map(nodes_per_parent)
                 .fillna(0)      # a missing parent value matches no row
                 .astype(int)
                 .tolist())

In [9]:
# number_of_sib counts the node itself, so a value of 1 means no siblings
use_this_tree = use_this_tree.assign(number_of_sib=number_of_sib)

# drop the nodes on the deepest level (max_depth-1) that have no siblings
is_only_child = use_this_tree['number_of_sib'] < 2
on_deepest_lvl = use_this_tree['lvl'] == max_depth - 1
use_this_tree = use_this_tree.drop(
    index=use_this_tree.loc[is_only_child & on_deepest_lvl].index)

In [10]:
# Walk upwards from lvl max_depth-2 and drop every node that no remaining
# node names as its parent.
# NOTE(review): the range stops before lvl 1, so lvl-1 nodes are never
# checked here -- confirm this is intended.
for lvl in range(max_depth - 2, 1, -1):
    on_this_lvl = use_this_tree['lvl'] == lvl
    has_children = use_this_tree['id'].isin(use_this_tree['parent'].values)
    childless = use_this_tree.loc[on_this_lvl & ~has_children].index
    use_this_tree = use_this_tree.drop(index=childless)

In [11]:
# Re-index the tree so that idx is the axis=1 coordinate in the label matrix.
# NOTE: the root node (lvl 0) is not a label, so it is removed first; the
# remaining nodes are ordered by lvl and numbered 0..n-1.
root_rows = use_this_tree.loc[use_this_tree['lvl'] == 0].index
use_this_tree = use_this_tree.drop(index=root_rows).sort_values('lvl')
use_this_tree = (use_this_tree
                 .assign(idx=range(len(use_this_tree)))
                 .set_index('idx'))

# Leaf nodes are the labels on the deepest level (max_depth-1); they are
# used to make sure only data labeled for this tree is kept.
on_leaf_lvl = use_this_tree['lvl'] == max_depth - 1
leaf_nodes = use_this_tree.loc[on_leaf_lvl, 'label'].values

In [12]:
# label found at position max_depth-1 of every exon's label list
list_leaf_data = [exon_labels[max_depth - 1] for exon_labels in data_mit['labels']]

In [13]:
# keep only the exons whose label at the leaf level is one of the leaf nodes
bool_keep_data = [leaf_label in leaf_nodes for leaf_label in list_leaf_data]
data_mit = data_mit.loc[bool_keep_data]


In [25]:
# one row per exon, one column per tree node, initialised with zeros
n_exons, n_nodes = len(data_mit), len(use_this_tree)
label_matrix = np.zeros((n_exons, n_nodes))

In [26]:
# Fill label_matrix: for every exon, set a 1 in the column of each of its
# labels on the levels below the root.

# Label -> column lookup, built once instead of scanning the tree for every
# single label. setdefault keeps the first occurrence of a label, which is
# the row a positional "first match" lookup would return. The tree index is
# used directly as the column coordinate (the root node is not part of the
# tree any more, so no shift is needed).
label_to_col = {}
for col, tree_label in zip(use_this_tree.index, use_this_tree['label']):
    label_to_col.setdefault(tree_label, col)

# loop through each exon and assign its labels to label_matrix
for i, list_labels in enumerate(data_mit['labels']):
    # position 0 is skipped (first lvl after the root node onwards); the
    # upper bound follows max_depth instead of a hard-coded 4
    for label in list_labels[1:max_depth]:
        # a label that is missing from the tree raises a KeyError naming it
        label_matrix[i, label_to_col[label]] = 1

In [27]:
# strip the meta data columns and append the label matrix column-wise
meta_columns = ['labels', 'COID', 'phenString', 'DOID', 'exons']
for_np = data_mit.drop(columns=meta_columns)
np_data_mit = np.hstack((for_np.values, label_matrix))

In [28]:
# Write the combined matrix (data columns followed by the label columns)
# as comma separated text. The target path is derived from root_dir so the
# project location is defined in a single place.
out_path = root_dir + 'Data/datamatrix/data_labeled__2_3_8.csv'
np.savetxt(out_path, np_data_mit, delimiter=',')

In [20]:
# display the (rows, columns) shape of data_mit
data_mit.shape

(3390, 35)

In [24]:
# display the current state of the label tree
use_this_tree

Unnamed: 0_level_0,id,parent,label,count_all,lvl,number_of_sib
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,0,disease of anatomical entity,4217,1,3
1,71,0,disease of metabolism,1636,1,3
2,2,1,musculoskeletal system disease,953,2,5
3,11,1,nervous system disease,1551,2,5
4,72,71,inherited metabolic disorder,1582,2,1
5,12,11,central nervous system disease,738,3,2
6,22,2,connective tissue disease,353,3,2
7,30,11,sensory system disease,577,3,2
8,73,72,amino acid metabolic disorder,363,3,4
9,90,72,carbohydrate metabolic disorder,308,3,4
