In [1]:
#--- imports
import pypermr
import pyranges as pr
import numpy as np
import pandas as pd
import time
from itertools import repeat
import random
import multiprocessing
from multiprocessing import Pool
from scipy.stats import rankdata
import inspect
import time, sys
from IPython.display import clear_output

In [2]:
#--- constants/preset
n_cores=12
prefix='vip_'
min_feature_per_set=15
win_length_kb=25

In [3]:
#--- features
genes=pd.read_table('/Users/joshuaschmidt/Projects/pypermr/genes.txt',
    header=0, names=['Chromosome',"Start","End","feature"],
    dtype={"Chromosome":str,"Start":int,"End":int,"feature":str})
genes['idx'] = np.arange(len(genes))
genes_2kb=pypermr.gene_definition(genes,updown=2000)

In [4]:
#--- backgrounds
central_background_file='/Users/joshuaschmidt/Projects/pypermr/central_background.txt'
central_background=pypermr.read_variant_file(central_background_file)
eastern_background_file='/Users/joshuaschmidt/Projects/pypermr/eastern_background.txt'
eastern_background=pypermr.read_variant_file(eastern_background_file)
internal_background_file='/Users/joshuaschmidt/Projects/pypermr/internal_background.txt'
internal_background=pypermr.read_variant_file(internal_background_file)

In [5]:
central_background[:10]

Unnamed: 0,Chromosome,Start,End
0,1,253151,253151
1,1,267919,267919
2,1,979359,979359
3,1,982942,982942
4,1,992018,992018
5,1,995690,995690
6,1,998413,998413
7,1,1003427,1003427
8,1,1004788,1004788
9,1,1004941,1004941


In [95]:
#internal_bg_genes[['Chromosome', 'Start', 'End', 'idx']]

In [6]:
def chrom_windows(chrom,start,end, window):
    starts=np.arange(int(start),int(end),window)
    out_df=pd.DataFrame(starts,columns =['Start'])
    out_df['End']=out_df['Start']+window-1
    out_df['Chromosome']=chrom
    return(out_df)

In [12]:
#--- common feature background
frames = [central_background, eastern_background, internal_background]
all_bg = pd.concat(frames)
all_bg = all_bg.drop_duplicates()
all_bg = all_bg.groupby('Chromosome').agg({'Start':'min', 'End':'max'}).reset_index()
all_bg = [chrom_windows(x, y, z, win_length_kb*1000) for x, y, z in zip(all_bg['Chromosome'], all_bg['Start'], all_bg['End'])]
all_bg = pd.concat(all_bg)

In [24]:
e_bg = pr.PyRanges(all_bg).join(pr.PyRanges(eastern_background)).df[['Chromosome', 'Start', 'End']].drop_duplicates()
c_bg = pr.PyRanges(all_bg).join(pr.PyRanges(central_background)).df[['Chromosome', 'Start', 'End']].drop_duplicates()
i_bg = pr.PyRanges(all_bg).join(pr.PyRanges(internal_background)).df[['Chromosome', 'Start', 'End']].drop_duplicates()
e_c_bg = pd.merge(e_bg, c_bg, how='inner', on=['Chromosome', 'Start', 'End'])
all_bg = pd.merge(e_c_bg, i_bg, how='inner', on=['Chromosome', 'Start', 'End'])

In [26]:
all_bg['id']=np.arange(all_bg.shape[0])
all_bg

Unnamed: 0,Chromosome,Start,End,id
0,1,1060732,1085731,0
1,1,1085732,1110731,1
2,1,1110732,1135731,2
3,1,1160732,1185731,3
4,1,1185732,1210731,4
...,...,...,...,...
101374,22,48738985,48763984,101374
101375,22,48763985,48788984,101375
101376,22,48788985,48813984,101376
101377,22,48863985,48888984,101377


In [110]:
#--- intersect all_bg windows with genes
all_bg_window_genes=pypermr.intersect_variants_features(all_bg,genes_2kb)
#all_bg_window_genes=pypermr.collapse_feature_list(all_bg_window_genes)
#all_bg_window_genes.groupby('id')['idx'].unique()
all_bg_window_genes
np.mean(all_bg_window_genes.groupby("id").agg({"idx": pd.Series.nunique})['idx'])

1.156403049273881

In [47]:
all_bg_window_genes_c=all_bg_window_genes.groupby('id')['idx'].apply(np.hstack).to_frame().reset_index()
all_bg_window_genes_c['id']=np.arange(all_bg_window_genes_c.shape[0])

In [48]:
bg_list = all_bg_window_genes_c['idx'].tolist()

In [42]:
#--- intersect with windows
#eastern_all_bg=pr.PyRanges(all_bg).join(pr.PyRanges(eastern_background)).df
#central_all_bg=pr.PyRanges(all_bg).join(pr.PyRanges(central_background)).df
#internal_all_bg=pr.PyRanges(all_bg).join(pr.PyRanges(internal_background)).df

In [43]:
#bg_window_list = [eastern_all_bg['id'].values,central_all_bg['id'].values, internal_all_bg['id'].values  ]

In [44]:
#--- get all bg windows that have a feature from each of our statistics
#from functools import reduce
#bg_window_intersect = reduce(np.intersect1d, (bg_window_list))
#np.size(bg_window_intersect)

In [56]:
#--- filter genes to common window bg
genes_in_common_bg = all_bg_window_genes['feature'].unique()
genes_in_common_bg = genes[genes['feature'].isin(genes_in_common_bg)]
genes_in_common_bg

Unnamed: 0,Chromosome,Start,End,feature,idx
2,1,1099891,1117284,TTLL10,2
3,1,1157907,1160677,B3GALT6,3
4,1,1198009,1202485,SCNN1D,4
5,1,1203263,1216123,ACAP3,5
6,1,1235676,1236528,CPTP,6
...,...,...,...,...,...
16193,9,137155596,137177228,DPH7,16193
16194,9,137181216,137188334,ZMYND19,16194
16195,9,137204153,137215129,ARRDC1,16195
16196,9,137218797,137439599,EHMT1,16196


In [60]:
#--- annotation sets
annotation_sets=pd.read_table('/Users/joshuaschmidt/Projects/pypermr/vip.txt',
	dtype={"id":str,"feature":str,"name":str})
s_annotation_sets=pypermr.collapse_annotation_sets(annotation_sets,genes_in_common_bg,min_feature_per_set)
s_annotation_tups=pypermr.collapse_annotation_sets_tuple(annotation_sets,genes_in_common_bg,min_feature_per_set)

In [61]:
#--- candidates
central_candidates_file='/Users/joshuaschmidt/Projects/pypermr/central_candidates.txt'
central_candidates=pypermr.read_variant_file(central_candidates_file)
eastern_candidates_file='/Users/joshuaschmidt/Projects/pypermr/eastern_candidates.txt'
eastern_candidates=pypermr.read_variant_file(eastern_candidates_file)
internal_candidates_file='/Users/joshuaschmidt/Projects/pypermr/internal_candidates.txt'
internal_candidates=pypermr.read_variant_file(internal_candidates_file)

In [70]:
#--- candidate genes
eastern_candidate_genes=pr.PyRanges(all_bg_window_genes).join(pr.PyRanges(eastern_candidates)).df['idx'].unique()
n_eastern_candidates=np.size(eastern_candidate_genes)
central_candidate_genes=pr.PyRanges(all_bg_window_genes).join(pr.PyRanges(central_candidates)).df['idx'].unique()
n_central_candidates=np.size(central_candidate_genes)
internal_candidate_genes=pr.PyRanges(all_bg_window_genes).join(pr.PyRanges(internal_candidates)).df['idx'].unique()
n_internal_candidates=np.size(internal_candidate_genes)

In [129]:
def get_N_unique_genes_blocks(gene_list,n_genes,n_resample):
    gene_sample=pd.unique(np.concatenate(random.sample(gene_list,n_resample)))
    if(np.size(gene_sample) >= n_genes):
        return(gene_sample[:n_genes])
    while(np.size(gene_sample) < n_genes):
        n_resample = n_genes - np.size(gene_sample)
        new_sample=pd.unique(np.concatenate(random.sample(gene_list,n_resample)))
        gene_sample=np.concatenate([gene_sample,new_sample])
        gene_sample=pd.unique(gene_sample)
    return(gene_sample[:n_genes])


In [132]:
def get_permutations_blocks(gene_list,n_permutations,n_candidates,sample_factor):
    M=range(n_permutations)
    n_resample = round(n_candidates*sample_factor)
    L=np.array([get_N_unique_genes_blocks(gene_list,n_candidates,n_resample) for m in M])
    return(L)


In [133]:
get_permutations_blocks(bg_list,10,100,0.8)

array([[ 5126,    62,  1964,  5596,  1874,  3160, 15002,  7555, 11819,
        13276, 10300,  5164,  5165, 13787,  9566,  9567,  3945,  6293,
         6294,  6295,  1700, 13686,  4463,  4705, 11730,  4813,  6135,
         6136, 15478, 15250,  4015, 13909, 16058, 16059, 12118, 12119,
         1909,  9357, 12437,  1794,  1860, 15203, 15085,  3320, 15401,
        15876,  1585, 13996,  7366,  5897, 14019, 11313, 12889,  2051,
        15851,  5868,  4970, 15256,  5509, 12571,  7336,  1432, 16011,
        16012, 16013,  7098, 10634,  3256,  3257,  3258,  3259, 15942,
        12455,  4051,  1923,  9515,  9904,  4619,  4620,  8845, 11900,
        11902, 10707,  7107,  7108,  7298,  7307,  4242,  4243,  3319,
        14831,  1935,    70, 11366,  8599,  8601,  7516,  7517, 14475,
        14476],
       [ 8858,  3023, 15404, 12155,  5800, 10066, 10160,  1893,  9473,
         9474,  6251,  5606, 15003,  5739,  5740,  6037, 12020, 14904,
        11972,  1259, 11284, 15998,  8163, 11832, 13994,  447

In [155]:
import concurrent.futures
import logging
threads = list()
with concurrent.futures.ThreadPoolExecutor() as executor:
    future = executor.submit(get_permutations_blocks, args=(bg_list,10000,100,0.8))
    return_value = future.result()
    

In [156]:
for x in threads:    
    x.start()

In [157]:
for x in threads:    
    x.join()

In [159]:
threads[0].result

AttributeError: 'Thread' object has no attribute 'result'