https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt

In [1]:
#import pickle as pkl
import numpy as np
import pandas as pd, collections
import _pickle as cPickle
import gc
from itertools import compress
import os, re

In [40]:
TF = 'FOXA1'
kawakami_motif_regions = '/projects/covid-ct/imlab/users/temi/projects/TFXcan/data/train-test-val'
kawakami_enformer_outputs = '/projects/covid-ct/imlab/users/temi/projects/TFXcan/enformer_predictions/kawakami-test'

In [37]:
available_enformer_predictions = [a.split('_')[3] for a in os.listdir(kawakami_enformer_outputs)]
len(available_enformer_predictions)

1856

In [46]:
kawakami_motifs = pd.read_table(f'{kawakami_motif_regions}//kawakami_test_motif_regions.csv', sep=' ')
available_kawakami_regions = kawakami_motifs.loc[kawakami_motifs.motif_name.isin(available_enformer_predictions), ]
available_kawakami_regions.head()

Unnamed: 0,chr,motif_center_start,motif_center_end,binding_count,motif_name,start,end
0,chr1,243407231,243407233,2,TP35198,243210624,243603840
1,chr1,231173964,231173966,7,TP33352,230977357,231370573
2,chr1,172557731,172557733,5,TP22333,172361124,172754340
3,chr1,228416840,228416842,1,TP32959,228220233,228613449
4,chr1,191122600,191122602,10,TP26228,190925993,191319209


In [50]:
# how many true positives vs true negatives are available
collections.Counter(['TP' if a.startswith('TP') else 'TN' for a in available_kawakami_regions['motif_name']])

Counter({'TP': 928, 'TN': 928})

### Splitting into train, test and by aggregation methods

I want 1000 training examples, evenly split between TP and TN; the rest can be used for testing.

In [None]:
aggMethods

Unnamed: 0,chr,motif_center_start,motif_center_end,binding_count,motif_name,start,end
0,chr1,198885387,198885389,1,TP27935,198688780,199081996
1,chr1,212629681,212629683,2,TP30057,212433074,212826290
2,chr1,239047719,239047721,8,TP34563,238851112,239244328
3,chr1,172416278,172416280,6,TP22285,172219671,172612887
4,chr1,148811751,148811753,4,TP18629,148615144,149008360
...,...,...,...,...,...,...,...
9929,chrX,115200375,115200377,1,TP438978,115003768,115396984
9930,chrX,147210539,147210541,25,TP441903,147013932,147407148
9931,chrX,143206250,143206252,2,TP441730,143009643,143402859
9932,chrX,138134863,138134865,8,TP441303,137938256,138331472


In [13]:
kawakami_motifs.motif_name.isin(available_enformer_predictions)

0       False
1       False
2       False
3       False
4       False
        ...  
9929    False
9930    False
9931    False
9932    False
9933    False
Name: motif_name, Length: 9934, dtype: bool

## train and test motifs

## Test how well Kawakami does on Freedman

In [54]:
save_dir = '../defined_regions'

In [58]:
kawakami_regions = pd.read_table(f'{save_dir}/kawakami_tp_tn_regions.txt', sep=' ')
freedman_regions = pd.read_table(f'{save_dir}/freedman_tp_tn_regions.txt', sep=' ')

In [60]:
# need to select these regions from the kawakami-test directory
kawakami_regions.motif_name

0          TN700
1         TN1740
2         TN1207
3         TN1412
4         TN1192
          ...   
1727    TP438712
1728    TP436083
1729    TP435841
1730    TP435836
1731    TP438468
Name: motif_name, Length: 1732, dtype: object

In [3]:
# Track indices of the three GATA3 ChIP tracks
GATA3_tracks = [
    983,   # CHIP:GATA3:T47D treated with 0.02% dimethyl sulfoxide for 1 hour
    1417,  # CHIP:GATA3:SH-SY5Y
    2740,  # CHIP:GATA3:MCF-7
]
Gata3_first, Gata3_second, Gata3_third = GATA3_tracks

tracks_to_remove = GATA3_tracks # OR FALSE

# every one of the 5313 track indices except the ones listed for removal
tracks_to_select = [track for track in range(5313) if track not in tracks_to_remove]
# tracks_to_select

In [4]:
# I need to have a naming conventions for the tracks e.g. `f_{track}`
tracks_colnames = [f'f_{index}' for index in range(0, 5313)]
len(tracks_colnames)

5313

In [5]:
def threshold_tracks(selected_tracks, thresh=None):
    """Binarize prediction tracks: 1 where a value is >= the threshold, else 0.

    Parameters
    ----------
    selected_tracks : dict, numpy.ndarray or list
        A dict whose values are arrays, a single array, or a list of arrays
        (e.g. predictions from 2 haplotypes). Dicts and lists are updated in
        place and returned; for an ndarray a new array is returned. Any other
        type is returned unchanged.
    thresh : scalar or array-like, optional
        Cut-off to compare against. When None, every array is thresholded
        against its own mean taken over axis 0.

    Returns
    -------
    The same kind of container that was passed in, holding 0/1 arrays.
    """

    def _binarize(values):
        # The cut-off is resolved per array so that one array's mean is never
        # reused for another array when no explicit threshold is given.
        cutoff = np.mean(values, axis=0) if thresh is None else thresh
        return np.where(values >= cutoff, 1, 0)

    if isinstance(selected_tracks, dict):
        for key, values in selected_tracks.items():
            selected_tracks[key] = _binarize(values)
    elif isinstance(selected_tracks, np.ndarray):
        selected_tracks = _binarize(selected_tracks)
    elif isinstance(selected_tracks, list):
        for i, values in enumerate(selected_tracks):
            selected_tracks[i] = _binarize(values)

    return selected_tracks

def select_tracks(list_or_np_array, select_tracks=None, select_bins=None, motif_bin=None):
    """Subset prediction arrays by column (track) and, optionally, by row (bin).

    Parameters
    ----------
    list_or_np_array : numpy.ndarray or list of numpy.ndarray
        Array(s) indexed as [row, column]. A list is updated in place and
        returned; for an ndarray a new array is returned. Any other type is
        returned unchanged.
    select_tracks : sequence of int, optional
        Column indices to keep. None keeps every column.
    select_bins : int, optional
        Number of rows to keep on each side of `motif_bin`; the result has
        2 * select_bins + 1 rows. When not an int, all rows are kept.
    motif_bin : int, optional
        Row index of the center bin. Required when `select_bins` is an int.

    Raises
    ------
    ValueError
        If `select_bins` is an int but `motif_bin` is None.
    """

    def _subset(values):
        if isinstance(select_bins, int):
            if motif_bin is None:
                raise ValueError('motif_bin must be provided when select_bins is given')
            # Rows are cut first so the column selection copies only the kept
            # rows. An explicit index array keeps the same out-of-range
            # behaviour (IndexError) as indexing with range().
            rows = np.arange(motif_bin - select_bins, motif_bin + select_bins + 1)
            values = values[rows]
        if select_tracks is not None:
            values = values[:, select_tracks]
        return values

    if isinstance(list_or_np_array, list):
        for i, np_array in enumerate(list_or_np_array):
            list_or_np_array[i] = _subset(np_array)
    elif isinstance(list_or_np_array, np.ndarray):
        list_or_np_array = _subset(list_or_np_array)

    return list_or_np_array


def load_personalized_predictions_pickle(individuals, TF_info, motifs, motif_bin, track_headers, tracks_to_select=None, bins_to_select=8, apply_threshold=False):
    """Load per-individual, per-motif prediction pickles into a nested dict.

    Files are read from
    `{personalized_predictions_path}/{ind}/{TF}_{cell_line}_{m}_personalized_predictions.pkl`,
    where `personalized_predictions_path` is a module-level variable that must
    be defined before this function is called.

    Parameters
    ----------
    individuals : iterable of str
        Individual identifiers (sub-directory names).
    TF_info : sequence
        `TF_info[0]` is the TF name and `TF_info[1]` the cell line.
    motifs : iterable of str
        Motif names used to build the file names.
    motif_bin : int
        Center row passed on to `select_tracks`.
    track_headers : sequence of str
        One header per track; subset to `tracks_to_select` when that is given.
    tracks_to_select : sequence of int, optional
        Track indices to keep. None keeps the unpickled object as is (no track
        or bin selection is applied).
    bins_to_select : int
        Rows kept on each side of `motif_bin` (only used with `tracks_to_select`).
    apply_threshold : bool
        When truthy, the loaded data is passed through `threshold_tracks`.

    Returns
    -------
    tuple
        `(individuals_dict, track_headers)` where
        `individuals_dict[ind][motif]` holds the loaded (and optionally
        subset / thresholded) predictions.
    """

    TF, cell_line = TF_info[0], TF_info[1]

    if tracks_to_select is not None:
        track_headers = [track_headers[index] for index in tracks_to_select]

    individuals_dict = {}
    for ind in individuals:
        motif_dict = {}
        for m in motifs:
            pickle_path = f'{personalized_predictions_path}/{ind}/{TF}_{cell_line}_{m}_personalized_predictions.pkl'
            with open(pickle_path, 'rb') as input_obj:
                # GC is switched off only while unpickling; try/finally makes
                # sure it is switched back on even when the load fails.
                # NOTE: unpickling can execute arbitrary code - only load
                # files that this project produced itself.
                gc.disable()
                try:
                    temp = cPickle.load(input_obj)
                finally:
                    gc.enable()

            # just select what you need
            if tracks_to_select is not None:
                temp = select_tracks(temp, select_tracks=tracks_to_select, select_bins=bins_to_select, motif_bin=motif_bin)

            if apply_threshold:
                temp = threshold_tracks(temp)

            motif_dict[m] = temp
        individuals_dict[ind] = motif_dict

    return (individuals_dict, track_headers)

def load_reference_predictions_pickle(TF, motifs, motif_bin, tracks_to_select=None, bins_to_select=8, apply_threshold=False):
    """Load reference-genome prediction pickles for a set of motifs.

    Files are read from
    `{reference_predictions_path}/{TF}/{TF}_reference_{m}_predictions_2022-07-19.pkl`,
    where `reference_predictions_path` is a module-level variable that must be
    defined before this function is called.

    Parameters
    ----------
    TF : str
        TF name used in the directory and file names.
    motifs : iterable of str
        Motif names used to build the file names.
    motif_bin : int
        Center row passed on to `select_tracks`.
    tracks_to_select : sequence of int, optional
        Track indices to keep. None keeps the unpickled object as is (no track
        or bin selection is applied).
    bins_to_select : int
        Rows kept on each side of `motif_bin` (only used with `tracks_to_select`).
    apply_threshold : bool
        When truthy, the loaded data is passed through `threshold_tracks`.

    Returns
    -------
    dict
        Maps each motif name to its loaded (and optionally subset /
        thresholded) predictions.
    """

    motif_dict = {}

    for m in motifs:
        pickle_path = f'{reference_predictions_path}/{TF}/{TF}_reference_{m}_predictions_2022-07-19.pkl'
        with open(pickle_path, 'rb') as input_obj:
            # GC is switched off only while unpickling; try/finally makes sure
            # it is switched back on even when the load fails.
            # NOTE: unpickling can execute arbitrary code - only load files
            # that this project produced itself.
            gc.disable()
            try:
                temp = cPickle.load(input_obj)
            finally:
                gc.enable()

        # just select what you need
        if tracks_to_select is not None:
            temp = select_tracks(temp, select_tracks=tracks_to_select, select_bins=bins_to_select, motif_bin=motif_bin)

        if apply_threshold:
            temp = threshold_tracks(temp)

        motif_dict[m] = temp

    return motif_dict   

# Reference genome predictions

In [5]:
region_data = pd.read_csv('/projects/covid-ct/imlab/users/temi/projects/TFXcan/log/logging_predictions.csv')
region_data.head()

NameError: name 'pd' is not defined

In [99]:
# header_names = ['chr', 'motif_start', 'motif_end', 'id', 'score', 'strand', 'start', 'end', 'motif_name']
# dtypes_names = {'chr':str, 'motif_start':int, 'motif_end':int, 'id':str, 'score':str, 'strand':str, 'start':int, 'end':int, 'motif_name':str}

# region_data = pd.read_table(f'/projects/covid-ct/imlab/users/temi/projects/TFXcan/processed-data/{TF}/{TF}_motif_regions.txt', sep=' ', names=dtypes_names.keys(), dtype=dtypes_names)
# region_data.head(5)

In [62]:
motif_y = ['TP' if m.startswith('TP') else 'TN' for m in region_data.motif.values]
region_data['motif_y'] = motif_y

In [63]:
collections.Counter(motif_y)

Counter({'TP': 1214, 'TN': 14652})

I should randomly select about 800 TPs and 800 TNs for training,
and about 400 of each for testing.

In [64]:
# Global NumPy seed; the sampling calls below are additionally seeded through random_state
np.random.seed(2022)

train_n = 800 # per class, i.e. 1600 training regions in total
test_n = 400 # per class, i.e. 800 test regions in total

# Draw train_n rows from each motif_y class
train_info = region_data.groupby('motif_y').sample(n=train_n, random_state=2022)
# Draw test_n rows per class from the rows whose index was not selected for training
test_info = region_data[~region_data.index.isin(train_info.index.values)].groupby('motif_y').sample(n=test_n, random_state=2022)

In [65]:
train_info.shape, test_info.shape

((1600, 3), (800, 3))

In [15]:
# define some motifs
train_motifs = train_info.motif.values   #['TP1', 'TN14722']#region_data.motif.values #['TP1', 'TN14722']
test_motifs = test_info.motif.values

AttributeError: 'DataFrame' object has no attribute 'motif'

In [72]:
train_motifs

array(['TN14558', 'TN5968', 'TN5896', ..., 'TP339', 'TP879', 'TP1021'],
      dtype=object)

In [73]:
# this step is slow - needs to be optimized, but it is reading a lot of data, I guess

train_enformer_predictions = load_reference_predictions_pickle(TF=TF, motifs=train_motifs, motif_bin=448, tracks_to_select=tracks_to_select, bins_to_select=8)

In [75]:
train_enformer_predictions['TN10044'].shape

(17, 5310)

In [76]:
test_enformer_predictions = load_reference_predictions_pickle(TF=TF, motifs=test_motifs, motif_bin=448, tracks_to_select=tracks_to_select, bins_to_select=8)

In [77]:
test_enformer_predictions['TN10041'].shape

(17, 5310)

Aggregating by the mean

In [93]:
# Column 0: class label (1 when the motif name starts with 'TP', otherwise 0)
y = np.expand_dims(
    np.array([1 if name.startswith('TP') else 0 for name in train_enformer_predictions]),
    axis=1,
)

# One row per motif: the mean of every track taken over axis 0
X = [np.expand_dims(tracks.mean(axis=0), axis=1).T for tracks in train_enformer_predictions.values()]

train_data = np.hstack((y, np.vstack(X)))

In [95]:
train_data.shape

(1600, 5311)

In [96]:
# Column 0: class label (1 when the motif name starts with 'TP', otherwise 0)
y = np.expand_dims(
    np.array([1 if name.startswith('TP') else 0 for name in test_enformer_predictions]),
    axis=1,
)

# One row per motif: the mean of every track taken over axis 0
X = [np.expand_dims(tracks.mean(axis=0), axis=1).T for tracks in test_enformer_predictions.values()]

test_data = np.hstack((y, np.vstack(X)))

In [97]:
test_data.shape

(800, 5311)

In [98]:
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

train_data.to_csv('/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data/train_aggByMean.csv')
test_data.to_csv('/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data/test_aggByMean.csv')

Aggregating by 3 standard deviations from the mean

In [110]:
# Worked example on a single region ('TN10044'): per-column mean and a
# +/- 3 standard deviation band, both computed over axis 0 (the rows).
three_SD_from_mean = 3 * train_enformer_predictions['TN10044'].std(axis=0)
mean_tracks = train_enformer_predictions['TN10044'].mean(axis=0)
plus_threshold = mean_tracks + three_SD_from_mean 
minus_threshold = mean_tracks - three_SD_from_mean

In [143]:
# 1 where a value lies outside the mean +/- 3 SD band, 0 otherwise.
# (Testing "< plus OR > minus" is true for every value and marks nothing.)
np.where((train_enformer_predictions['TN10044'] > plus_threshold) | (train_enformer_predictions['TN10044'] < minus_threshold), 1, 0)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [145]:
def agg_by_SD(selected_tracks, n_sd=3):
    """Flag values lying more than `n_sd` standard deviations from the column mean.

    Parameters
    ----------
    selected_tracks : numpy.ndarray
        Array whose mean and standard deviation are taken over axis 0.
    n_sd : int or float
        Half-width of the accepted band in standard deviations (default 3).

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as `selected_tracks`: 1 where the
        value is above mean + n_sd * SD or below mean - n_sd * SD, else 0.
        Note that nothing is collapsed: there is still one output row per
        input row.
    """

    mean_tracks = selected_tracks.mean(axis=0)
    band = n_sd * selected_tracks.std(axis=0)
    plus_threshold = mean_tracks + band
    minus_threshold = mean_tracks - band

    # A value is an outlier when it falls outside the band on either side;
    # values inside the band (including constant columns, where SD == 0) are 0.
    return np.where((selected_tracks > plus_threshold) | (selected_tracks < minus_threshold), 1, 0)

In [152]:
# Build labels and per-motif SD masks for the test regions.
y = []
X = []

for k, v in test_enformer_predictions.items():
    # label: 1 for motif names starting with 'TP', 0 otherwise
    y.append(1) if k.startswith('TP') else y.append(0)

    # agg_by_SD returns an array with the same shape as its input, so each
    # entry of X keeps one row per input row rather than a single row per motif
    v = agg_by_SD(v)
    #v = np.expand_dims(v, axis=1).T
    X.append(v)

y = np.expand_dims(np.array(y), axis=1)
# NOTE(review): the hstack below is disabled; the recorded ValueError (800 vs
# 13600 rows) is consistent with vstack(X) stacking several rows per motif
# while y has one row per motif - the masks would need to be reduced to one
# row per motif first.
#sd_test_data = np.hstack((y, np.vstack(X)))


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 800 and the array at index 1 has size 13600

In [156]:
X[0].sum(axis=0)

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
TP1_region_data = region_data.loc[region_data.motif_name == 'TP1']

motif_center = TP1_region_data.motif_start.values[0] + 1

# presumably 128-bp output bins with (768 + 320) bins skipped before the first
# predicted bin - TODO confirm against the prediction pipeline
bin_width = 128
skipped_bins = 768 + 320
bin_start = TP1_region_data.start.values[0] + (skipped_bins * bin_width)
#print(f'motif center: {motif_center} | bins start: {bin_start}')

# 0-based index of the bin that contains the motif center. Floor division is
# exact for any offset; stepping bin by bin and adding one after overshooting
# returns the true index + 2 whenever the center is not on a bin boundary.
# Clamped at 0 for a center that lies before the first bin.
motif_bin = max((motif_center - bin_start) // bin_width, 0)

print(motif_bin)

448


In [11]:
# train_enformer_predictions_temp = {k: np.transpose(v) for k, v in train_enformer_predictions.items()}
# X_train = np.array([v for v in train_enformer_predictions_temp.values()])
# X_train.shape
# X_train = np.vstack(X_train)
# X_train
# y_train = np.hstack(np.array([[1] * 3 if k.startswith('TP') else [0]*3 for k in train_enformer_predictions_temp.keys()]))
# y_train
# dt = pd.DataFrame(np.c_[y_train, X_train])
# dt.to_csv('/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data/data.csv')
# test_enformer_predictions_temp = {k: np.transpose(v) for k, v in test_enformer_predictions.items()}
# X_test = np.array([v for v in test_enformer_predictions_temp.values()])
# X_test = np.vstack(X_test)
# y_test = np.hstack(np.array([[1] * 3 if k.startswith('TP') else [0]*3 for k in test_enformer_predictions_temp.keys()]))
# X_test.shape, y_test.shape
# dt_test = pd.DataFrame(np.c_[y_test, X_test])
# dt_test.to_csv('/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data/test-data.csv')


# # Basic LogisticRegression algorithm
# logistic_regression_classifier = LogisticRegressionCV(cv=3, max_iter=1000)
# # SAGA should be considered more advanced and used over SAG. For more information, see: https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-defintions
# # Note, you should probably tune this, these values are arbitrary
# elastic_net_classifier = LogisticRegressionCV(cv=3, penalty='elasticnet', l1_ratios=[0.1, 0.5, 0.9], solver='saga', max_iter=1000)

# # Train the models
# logistic_regression_classifier.fit(X_train, y_train)
# elastic_net_classifier.fit(X_train, y_train)
# # Test the models
# print("Logistic Regression: {} || Elasticnet: {}".format(logistic_regression_classifier.score(X_test, y_test), elastic_net_classifier.score(X_test, y_test)))
# # Print out some more metrics
# print("Logistic Regression")
# print(classification_report(y_test, logistic_regression_classifier.predict(X_test)))
# print("Elastic Net")
# print(classification_report(y_test, elastic_net_classifier.predict(X_test)))
# elastic_net_classifier.coef_[0]

# Personalized predictions

In [6]:
#import pickle as pkl
import numpy as np
import pandas as pd, collections
import _pickle as cPickle
import gc

In [7]:
personalized_predictions_path = '/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/personalized_predictions'

Gata3_first = 983 # CHIP:GATA3:T47D treated with 0.02% dimethyl sulfoxide for 1 hour
Gata3_second = 1417 # CHIP:GATA3:SH-SY5Y
Gata3_third = 2740 # CHIP:GATA3:MCF-7

individuals = ['HG00479', 'HG00590']
TF = 'GATA3'
cell_line = 'T47D'
TF_info = [TF, cell_line]

tracks_to_remove = [Gata3_first, Gata3_second, Gata3_third] # OR FALSE
tracks_to_select = [t for t in list(range(0, 5313)) if t not in tracks_to_remove] # these are the tracks I want to retain
# tracks_to_select

In [8]:
# predictions_log = pd.read_csv('/projects/covid-ct/imlab/users/temi/projects/TFXcan/log/logging_predictions_personalized.csv')
# predictions_log.head(), predictions_log.shape

dtypes_names = {'chr':str, 'motif_center_start':int, 'motif_center_end':int, 'id':str, 'score':str, 'strand':str, 'start':int, 'end':int, 'motif_name':str}

# region_data = pd.read_table(f'/projects/covid-ct/imlab/users/temi/projects/TFXcan/processed-data/{TF}/{TF}_motif_regions.txt', sep=' ', names=dtypes_names.keys(), dtype=dtypes_names)

# for now, I will use the motif regions file
predictions_log = pd.read_csv(f'/projects/covid-ct/imlab/users/temi/projects/TFXcan/processed-data/{TF}/{cell_line}/{TF}_{cell_line}_motif_regions_3000_subset.txt', sep=' ', names=dtypes_names.keys(), dtype=dtypes_names)
predictions_log.head(), predictions_log.shape

(     chr  motif_center_start  motif_center_end          id            score  \
 0   chr9            89487723          89487725           0                0   
 1  chr19            49712430          49712432  GATA3_T47D  1.4208161552449   
 2  chr12            39544520          39544522           0                0   
 3   chr1           160978778         160978780           0                0   
 4   chr6            99544116          99544118           0                0   
 
   strand      start        end motif_name  
 0      +   89291116   89684332    TP10276  
 1      +   49515823   49909039    TN14186  
 2      +   39347913   39741129    TP13022  
 3      +  160782171  161175387      TP277  
 4      +   99347509   99740725     TP7604  ,
 (3000, 9))

In [9]:
motif_y = ['TP' if m.startswith('TP') else 'TN' for m in predictions_log.motif_name.values]
predictions_log['motif_y'] = motif_y
collections.Counter(motif_y)

Counter({'TP': 1500, 'TN': 1500})

In [10]:
motifs = predictions_log.motif_name.values #['TP1', 'TN7964']
motifs

array(['TP10276', 'TN14186', 'TP13022', ..., 'TP15744', 'TP3097',
       'TP8947'], dtype=object)

In [11]:
# Global NumPy seed; the sampling calls below are additionally seeded through random_state
np.random.seed(2022)

train_n = 1000 # per class, i.e. 2000 training regions in total
test_n = 500 # per class, i.e. 1000 test regions in total

# Draw train_n rows from each motif_y class
train_info = predictions_log.groupby('motif_y').sample(n=train_n, random_state=2022)
# Draw test_n rows per class from the rows whose index was not selected for training
test_info = predictions_log[~predictions_log.index.isin(train_info.index.values)].groupby('motif_y').sample(n=test_n, random_state=2022)


In [12]:
train_info.shape, test_info.shape

((2000, 10), (1000, 10))

In [16]:
train_motifs = train_info.motif_name.values   #['TP1', 'TN14722']#region_data.motif.values #['TP1', 'TN14722']
test_motifs = test_info.motif_name.values

I need to define the motif bin, i.e. the bin within which the motif falls.

In [13]:
# just one example should suffice

TF_region_data = predictions_log.loc[predictions_log.motif_name == motifs[0]]

motif_center = TF_region_data.motif_center_start.values[0] + 1

# presumably 128-bp output bins with (768 + 320) bins skipped before the first
# predicted bin - TODO confirm against the prediction pipeline
bin_width = 128
skipped_bins = 768 + 320
bin_start = TF_region_data.start.values[0] + (skipped_bins * bin_width)
#print(f'motif center: {motif_center} | bins start: {bin_start}')

# 0-based index of the bin that contains the motif center. Floor division is
# exact for any offset; stepping bin by bin and adding one after overshooting
# returns the true index + 2 whenever the center is not on a bin boundary.
# Clamped at 0 for a center that lies before the first bin.
motif_bin = max((motif_center - bin_start) // bin_width, 0)

print(motif_bin)

448


Then I can load the data, select tracks and bins as appropriate, and naively threshold using the mean of the values

In [17]:
TF, train_motifs

('GATA3',
 array(['TN9240', 'TN11833', 'TN6303', ..., 'TP123', 'TP18508', 'TP17690'],
       dtype=object))

In [19]:
# this step is slow - needs to be optimized, but it is reading a lot of data, I guess

train_enformer_predictions, train_headers = load_personalized_predictions_pickle(individuals=[individuals[0]], TF_info=TF_info, track_headers=tracks_colnames, motifs=train_motifs, motif_bin=448, tracks_to_select=tracks_to_select, bins_to_select=8)

In [20]:
test_enformer_predictions, test_headers = load_personalized_predictions_pickle(individuals=[individuals[0]], TF_info=TF_info, track_headers=tracks_colnames, motifs=test_motifs, motif_bin=448, tracks_to_select=tracks_to_select, bins_to_select=8)

Here I select only GATA3 tracks

In [21]:
GATA3_train_enformer_predictions, GATA3_train_headers = load_personalized_predictions_pickle(individuals=[individuals[0]], TF_info=TF_info, track_headers=tracks_colnames, motifs=train_motifs, motif_bin=448, tracks_to_select=GATA3_tracks, bins_to_select=8)

In [36]:
GATA3_test_enformer_predictions, GATA3_test_headers = load_personalized_predictions_pickle(individuals=[individuals[0]], TF_info=TF_info, track_headers=tracks_colnames, motifs=test_motifs, motif_bin=448, tracks_to_select=GATA3_tracks, bins_to_select=8)

In [21]:
train_headers, test_headers = ['class'] + train_headers, ['class'] + test_headers

In [37]:
GATA3_train_headers, GATA3_test_headers = ['class'] + GATA3_train_headers, ['class'] + GATA3_test_headers

1. The motif bin
2. Mean
3. 2 or 3 standard deviations away from the mean
4. Use the average across the bins

In [23]:
def agg_by_mean(pred_tracks, use_bins=None):
    """Collapse each motif's predictions to one row of per-track means.

    Parameters
    ----------
    pred_tracks : dict
        Maps motif name -> indexable whose first element (`v[0]`) is a 2-D
        array indexed as [row, column]. Only that first element is used
        (presumably the first haplotype - TODO confirm).
    use_bins : int or sequence of int, optional
        Row indices to average over. None averages over all rows. Lists,
        tuples, ranges, arrays and single ints are all accepted.

    Returns
    -------
    numpy.ndarray
        One row per motif; column 0 is the class label (1 when the motif name
        starts with 'TP', else 0) and the remaining columns are the means.
    """
    y = []
    X = []

    for k, v in pred_tracks.items():
        y.append(1 if k.startswith('TP') else 0)

        tracks = v[0]
        if use_bins is not None:
            # Any index-like value is honoured; a non-list value must not fall
            # through and leave the un-aggregated data in place.
            bins = use_bins if isinstance(use_bins, list) else np.atleast_1d(use_bins)
            tracks = tracks[bins, :]

        X.append(np.expand_dims(tracks.mean(axis=0), axis=1).T)

    y = np.expand_dims(np.array(y), axis=1)
    dt = np.hstack((y, np.vstack(X)))

    return dt

def agg_by_center(pred_tracks, center=8):
    """Build a label + features matrix from a single row of each motif's predictions.

    For every `name -> value` pair in `pred_tracks`, row `center` of
    `value[0]` is taken as the feature row. Column 0 of the result is the
    class label: 1 when the name starts with 'TP', otherwise 0.
    """
    labels = [1 if name.startswith('TP') else 0 for name in pred_tracks]
    feature_rows = [
        np.expand_dims(tracks[0][center, :], axis=1).T
        for tracks in pred_tracks.values()
    ]

    label_column = np.expand_dims(np.array(labels), axis=1)
    return np.hstack((label_column, np.vstack(feature_rows)))


In [24]:
# these are the bins
upstream = list(range(0, 8))
center = [8]
downstream = list(range(9, 17))

In [28]:
# 1. Agg by mean of everything
train_aggbymean = {k: agg_by_mean(v) for k, v in train_enformer_predictions.items()}
test_aggbymean = {k: agg_by_mean(v) for k, v in test_enformer_predictions.items()}

In [25]:
train_aggbymean_GATA3 = {k: agg_by_mean(v) for k, v in GATA3_train_enformer_predictions.items()}
test_aggbymean_GATA3 = {k: agg_by_mean(v) for k, v in GATA3_test_enformer_predictions.items()}

In [29]:
# 2. Agg by mean of upstream only
train_aggbymean_upstream = {k: agg_by_mean(v, use_bins=upstream) for k, v in train_enformer_predictions.items()}
test_aggbymean_upstream = {k: agg_by_mean(v, use_bins=upstream) for k, v in test_enformer_predictions.items()}


In [26]:
train_aggbymean_upstream_GATA3 = {k: agg_by_mean(v, use_bins=upstream) for k, v in GATA3_train_enformer_predictions.items()}
# the test split gets its own name so it cannot overwrite the train aggregate;
# any cell that writes the test CSV must read test_aggbymean_upstream_GATA3
test_aggbymean_upstream_GATA3 = {k: agg_by_mean(v, use_bins=upstream) for k, v in GATA3_test_enformer_predictions.items()}

In [30]:
# 3. Agg by mean of downstream only
train_aggbymean_downstream = {k: agg_by_mean(v, use_bins=downstream) for k, v in train_enformer_predictions.items()}
test_aggbymean_downstream = {k: agg_by_mean(v, use_bins=downstream) for k, v in test_enformer_predictions.items()}

In [27]:
train_aggbymean_downstream_GATA3 = {k: agg_by_mean(v, use_bins=downstream) for k, v in GATA3_train_enformer_predictions.items()}
# the test split gets its own name so it cannot overwrite the train aggregate;
# any cell that writes the test CSV must read test_aggbymean_downstream_GATA3
test_aggbymean_downstream_GATA3 = {k: agg_by_mean(v, use_bins=downstream) for k, v in GATA3_test_enformer_predictions.items()}

In [31]:
# 4. Agg by mean of upstream and downstream only
train_aggbymean_upstream_downstream = {k: agg_by_mean(v, use_bins=upstream + downstream) for k, v in train_enformer_predictions.items()}
test_aggbymean_upstream_downstream = {k: agg_by_mean(v, use_bins=upstream + downstream) for k, v in test_enformer_predictions.items()}

In [28]:
train_aggbymean_upstream_downstream_GATA3 = {k: agg_by_mean(v, use_bins=upstream + downstream) for k, v in GATA3_train_enformer_predictions.items()}
test_aggbymean_upstream_downstream_GATA3 = {k: agg_by_mean(v, use_bins=upstream + downstream) for k, v in GATA3_test_enformer_predictions.items()}

In [32]:
# 5. Agg by the center only
train_aggbycenter = {k: agg_by_center(v) for k, v in train_enformer_predictions.items()}
test_aggbycenter = {k: agg_by_center(v) for k, v in test_enformer_predictions.items()}

In [29]:
train_aggbycenter_GATA3 = {k: agg_by_center(v) for k, v in GATA3_train_enformer_predictions.items()}
test_aggbycenter_GATA3 = {k: agg_by_center(v) for k, v in GATA3_test_enformer_predictions.items()}

In [30]:
train_aggbymean_upstream_downstream['HG00479'].shape, train_aggbycenter['HG00479'].shape, train_aggbycenter['HG00479']

NameError: name 'train_aggbymean_upstream_downstream' is not defined

In [31]:
train_test_data_path = '/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data'

Save the tracks

In [36]:
ind_name = list(train_aggbycenter.keys())[0]
pd.DataFrame(train_aggbycenter['HG00479']).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByCenter_{ind_name}.csv', index=False, header=train_headers)
pd.DataFrame(test_aggbycenter['HG00479']).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByCenter_{ind_name}.csv', index=False, header=test_headers)

In [35]:
ind_name = list(train_aggbymean.keys())[0]
pd.DataFrame(train_aggbymean['HG00479']).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByMean_{ind_name}.csv', index=False, header=train_headers)
pd.DataFrame(test_aggbymean['HG00479']).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByMean_{ind_name}.csv', index=False, header=test_headers)

ValueError: Writing 4 cols but got 3 aliases

In [38]:
# All-track upstream aggregates: train CSV from the train split, test CSV from the test split
ind_name = list(train_aggbymean_upstream.keys())[0]
pd.DataFrame(train_aggbymean_upstream[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByMeanUpstream_{ind_name}.csv', index=False, header=train_headers)
pd.DataFrame(test_aggbymean_upstream[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByMeanUpstream_{ind_name}.csv', index=False, header=test_headers)

In [39]:
# All-track downstream aggregates: train CSV from the train split, test CSV from the test split
ind_name = list(train_aggbymean_downstream.keys())[0]
pd.DataFrame(train_aggbymean_downstream[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByMeanDownstream_{ind_name}.csv', index=False, header=train_headers)
pd.DataFrame(test_aggbymean_downstream[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByMeanDownstream_{ind_name}.csv', index=False, header=test_headers)

In [40]:
# All-track upstream+downstream aggregates: train CSV from the train split, test CSV from the test split
ind_name = list(train_aggbymean_upstream_downstream.keys())[0]
pd.DataFrame(train_aggbymean_upstream_downstream[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByMeanUpstreamDownstream_{ind_name}.csv', index=False, header=train_headers)
pd.DataFrame(test_aggbymean_upstream_downstream[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByMeanUpstreamDownstream_{ind_name}.csv', index=False, header=test_headers)

Save the GATA3 only tracks too

In [38]:
ind_name = list(train_aggbycenter_GATA3.keys())[0]
pd.DataFrame(train_aggbycenter_GATA3['HG00479']).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByCenter_{ind_name}_GATA3.csv', index=False, header=GATA3_train_headers)
pd.DataFrame(test_aggbycenter_GATA3['HG00479']).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByCenter_{ind_name}_GATA3.csv', index=False, header=GATA3_test_headers)

In [39]:
ind_name = list(train_aggbymean_GATA3.keys())[0]
pd.DataFrame(train_aggbymean_GATA3['HG00479']).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByMean_{ind_name}_GATA3.csv', index=False, header=GATA3_train_headers)
pd.DataFrame(test_aggbymean_GATA3['HG00479']).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByMean_{ind_name}_GATA3.csv', index=False, header=GATA3_test_headers)

In [40]:
# Aggregate straight from the raw GATA3 predictions in this cell, so the train
# CSV is always built from the train split and the test CSV from the test split.
train_aggbymean_upstream_GATA3 = {k: agg_by_mean(v, use_bins=upstream) for k, v in GATA3_train_enformer_predictions.items()}
test_aggbymean_upstream_GATA3 = {k: agg_by_mean(v, use_bins=upstream) for k, v in GATA3_test_enformer_predictions.items()}

ind_name = list(train_aggbymean_upstream_GATA3.keys())[0]
pd.DataFrame(train_aggbymean_upstream_GATA3[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByMeanUpstream_{ind_name}_GATA3.csv', index=False, header=GATA3_train_headers)
pd.DataFrame(test_aggbymean_upstream_GATA3[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByMeanUpstream_{ind_name}_GATA3.csv', index=False, header=GATA3_test_headers)

In [41]:
# Aggregate straight from the raw GATA3 predictions in this cell, so the train
# CSV is always built from the train split and the test CSV from the test split.
train_aggbymean_downstream_GATA3 = {k: agg_by_mean(v, use_bins=downstream) for k, v in GATA3_train_enformer_predictions.items()}
test_aggbymean_downstream_GATA3 = {k: agg_by_mean(v, use_bins=downstream) for k, v in GATA3_test_enformer_predictions.items()}

ind_name = list(train_aggbymean_downstream_GATA3.keys())[0]
pd.DataFrame(train_aggbymean_downstream_GATA3[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByMeanDownstream_{ind_name}_GATA3.csv', index=False, header=GATA3_train_headers)
pd.DataFrame(test_aggbymean_downstream_GATA3[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByMeanDownstream_{ind_name}_GATA3.csv', index=False, header=GATA3_test_headers)

In [42]:
# GATA3-only upstream+downstream aggregates: train CSV from the train split, test CSV from the test split
ind_name = list(train_aggbymean_upstream_downstream_GATA3.keys())[0]
pd.DataFrame(train_aggbymean_upstream_downstream_GATA3[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/train_aggByMeanUpstreamDownstream_{ind_name}_GATA3.csv', index=False, header=GATA3_train_headers)
pd.DataFrame(test_aggbymean_upstream_downstream_GATA3[ind_name]).to_csv(path_or_buf=f'{train_test_data_path}/test_aggByMeanUpstreamDownstream_{ind_name}_GATA3.csv', index=False, header=GATA3_test_headers)

# KAWAKAMI Mouse

In [2]:
kawakami_dir = '../data/kawakami/wPGSA_ChIPpeak_detailed_info.txt'

In [3]:
kawakami_info = pd.read_table(kawakami_dir)
kawakami_info.shape, kawakami_info.head(5)

((55293968, 13),
     TF               peakfile space     start       end  width strand  \
 0  EED  mm9_Eed_mm8_peaks.bed  chr1   5008937   5009772    836      +   
 1  EED  mm9_Eed_mm8_peaks.bed  chr1   5010331   5011166    836      +   
 2  EED  mm9_Eed_mm8_peaks.bed  chr1   9535883   9536723    841      +   
 3  EED  mm9_Eed_mm8_peaks.bed  chr1  17086414  17087547   1134      +   
 4  EED  mm9_Eed_mm8_peaks.bed  chr1  19093850  19094932   1083      +   
 
               feature  start_position  end_position feature_strand  \
 0  ENSMUSG00000002459       4899658.0     5060366.0              -   
 1  ENSMUSG00000002459       4899658.0     5060366.0              -   
 2  ENSMUSG00000061024       9535540.0     9537531.0              +   
 3  ENSMUSG00000042686      16987446.0    17087970.0              -   
 4  ENSMUSG00000042596      19093102.0    19156406.0              +   
 
   insideFeature  distancetoFeature  
 0        inside            51429.0  
 1        inside            50035

How many TFs?

In [9]:
len(set(kawakami_info.TF.values))

463

In [10]:
len(set(kawakami_info.peakfile.values))

3146

In [16]:
kawakami_info.peakfile[~kawakami_info.peakfile.astype(str).str.startswith('mm9')]

300701                                  MYBL1_peaks.bed
300702                                  MYBL1_peaks.bed
300703                                  MYBL1_peaks.bed
300704                                  MYBL1_peaks.bed
300705                                  MYBL1_peaks.bed
                               ...                     
55293963    wgEncodeSydhTfbsMelGata1IggratPk.narrowPeak
55293964    wgEncodeSydhTfbsMelGata1IggratPk.narrowPeak
55293965    wgEncodeSydhTfbsMelGata1IggratPk.narrowPeak
55293966    wgEncodeSydhTfbsMelGata1IggratPk.narrowPeak
55293967    wgEncodeSydhTfbsMelGata1IggratPk.narrowPeak
Name: peakfile, Length: 51196267, dtype: object

In [14]:
kawakami_info.peakfile.astype(str).str.startswith('mm9')

0            True
1            True
2            True
3            True
4            True
            ...  
55293963    False
55293964    False
55293965    False
55293966    False
55293967    False
Name: peakfile, Length: 55293968, dtype: bool

In [None]:
'FOXA1' in kawakami_info.TF.tolist()

In [22]:
k_FOXA1 = kawakami_info[kawakami_info.TF == 'FOXA1']
k_FOXA1.shape

(146568, 13)

In [23]:
k_FOXA1

Unnamed: 0,TF,peakfile,space,start,end,width,strand,feature,start_position,end_position,feature_strand,insideFeature,distancetoFeature
3672621,FOXA1,GSM427090_peaks.narrowPeak,chr1,9621702,9621846,145,+,ENSMUSG00000079671,9550914.0,9621173.0,-,upstream,-529.0
3672622,FOXA1,GSM427090_peaks.narrowPeak,chr1,21115223,21115376,154,+,ENSMUSG00000041779,20991479.0,21069306.0,-,upstream,-45917.0
3672623,FOXA1,GSM427090_peaks.narrowPeak,chr1,21221816,21222011,196,+,ENSMUSG00000025934,21230689.0,21255291.0,+,upstream,-8873.0
3672624,FOXA1,GSM427090_peaks.narrowPeak,chr1,21316849,21317045,197,+,ENSMUSG00000067750,21339996.0,21342280.0,+,upstream,-23147.0
3672625,FOXA1,GSM427090_peaks.narrowPeak,chr1,22594890,22595049,160,+,ENSMUSG00000041670,22276333.0,22812563.0,-,inside,217673.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36634468,FOXA1,GSM1146490_peaks.narrowPeak,chrX,166442492,166442701,210,+,ENSMUSG00000072844,166412976.0,166416849.0,-,upstream,-25643.0
36634469,FOXA1,GSM1146490_peaks.narrowPeak,chrX,166443236,166443581,346,+,ENSMUSG00000072844,166412976.0,166416849.0,-,upstream,-26387.0
36634470,FOXA1,GSM1146490_peaks.narrowPeak,chrX,166445149,166445367,219,+,ENSMUSG00000072844,166412976.0,166416849.0,-,upstream,-28300.0
36634471,FOXA1,GSM1146490_peaks.narrowPeak,chrX,166445843,166446053,211,+,ENSMUSG00000072844,166412976.0,166416849.0,-,upstream,-28994.0


Are they all mouse data?

In [24]:
# True only if every `feature` value (as a string) starts with 'ENSMUS'
bool(kawakami_info['feature'].astype(str).str.startswith('ENSMUS').all())

False

Note that the chromosome names in the `space` column look unusual for some rows (e.g. `13_random`, `Y_random` in the output below); we might want to filter those rows out.

Maybe not, since the TFs for these rows can still be seen in the `peakfile` column.

In [26]:
# Rows whose `feature` value does NOT start with 'ENSMUS'
kawakami_info.loc[lambda df: ~df['feature'].astype(str).str.startswith('ENSMUS')]

Unnamed: 0,TF,peakfile,space,start,end,width,strand,feature,start_position,end_position,feature_strand,insideFeature,distancetoFeature
49688,ESR1,mm9_ESR1_mm8_peaks.bed,13_random,106236,106457,222,+,,,,,,
49689,ESR1,mm9_ESR1_mm8_peaks.bed,13_random,109202,109709,508,+,,,,,,
53236,ESR1,mm9_ESR1_mm8_peaks.bed,8_random,736701,736959,259,+,,,,,,
77604,NANOG,mm9_GSM286118_bioNanog.bed,Y_random,19717367,19718734,1368,+,,,,,,
77605,NANOG,mm9_GSM286118_bioNanog.bed,Y_random,10229537,10230708,1172,+,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51440828,RAD21,mm9_GSM1618336_male_liver_RAD21_peaks.bed,9_random,425756,425966,211,+,,,,,,
51440829,RAD21,mm9_GSM1618336_male_liver_RAD21_peaks.bed,Un_random,42466,43250,785,+,,,,,,
51440830,RAD21,mm9_GSM1618336_male_liver_RAD21_peaks.bed,Un_random,43660,44270,611,+,,,,,,
51444867,RAD21,mm9_GSM1618336_male_liver_RAD21_peaks.bed,Y_random,7708260,7708764,505,+,,,,,,


In [30]:
# Rows whose `feature` value starts with 'ENSMUS'
kawakami_info.loc[lambda df: df['feature'].astype(str).str.startswith('ENSMUS')]

Unnamed: 0,TF,peakfile,space,start,end,width,strand,feature,start_position,end_position,feature_strand,insideFeature,distancetoFeature
0,EED,mm9_Eed_mm8_peaks.bed,chr1,5008937,5009772,836,+,ENSMUSG00000002459,4899658.0,5060366.0,-,inside,51429.0
1,EED,mm9_Eed_mm8_peaks.bed,chr1,5010331,5011166,836,+,ENSMUSG00000002459,4899658.0,5060366.0,-,inside,50035.0
2,EED,mm9_Eed_mm8_peaks.bed,chr1,9535883,9536723,841,+,ENSMUSG00000061024,9535540.0,9537531.0,+,inside,343.0
3,EED,mm9_Eed_mm8_peaks.bed,chr1,17086414,17087547,1134,+,ENSMUSG00000042686,16987446.0,17087970.0,-,inside,1556.0
4,EED,mm9_Eed_mm8_peaks.bed,chr1,19093850,19094932,1083,+,ENSMUSG00000042596,19093102.0,19156406.0,+,inside,748.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
55293963,GATA1,wgEncodeSydhTfbsMelGata1IggratPk.narrowPeak,chrY,227553,227578,26,+,ENSMUSG00000056673,234232.0,280254.0,+,upstream,-6679.0
55293964,GATA1,wgEncodeSydhTfbsMelGata1IggratPk.narrowPeak,chrY,235502,235702,201,+,ENSMUSG00000056673,234232.0,280254.0,+,inside,1270.0
55293965,GATA1,wgEncodeSydhTfbsMelGata1IggratPk.narrowPeak,chrY,387886,388201,316,+,ENSMUSG00000069049,347056.0,365035.0,+,downstream,40830.0
55293966,GATA1,wgEncodeSydhTfbsMelGata1IggratPk.narrowPeak,chrY,474114,474323,210,+,ENSMUSG00000068457,433588.0,582181.0,-,inside,108067.0


In [29]:
# (number of distinct TFs, array of the distinct TF labels)
(kawakami_info['TF'].nunique(), kawakami_info['TF'].unique())

(463,
 array(['EED', 'SUZ12', 'PHC1', 'RING2', 'GLI1', 'ESR1', 'KLF2', 'KLF4',
        'KLF5', 'P53', 'NANOG', 'NR0B1', 'NACC1', 'SOX2', 'ZN281', 'MYC',
        'ZFP42', 'PO5F1', 'TFE2', 'SALL4', 'STAT3', 'CHD7', 'TIF1B',
        'CNOT3', 'STAT4', 'CCND1', 'SOX17', 'AP2C', 'SMCA4', 'EOMES',
        'ETS2', 'GATA3', 'WT1', 'HXB4', 'ZIC3', 'IRF8', 'MYBA', 'SMAD1',
        'E2F1', 'TF2L1', 'CTCF', 'ZFX', 'ERR2', 'MYCN', 'EP300', 'SETB1',
        'GATA1', 'JARD2', 'EZH2', 'KDM5A', 'IRF4', 'TBX3', 'MTF2', 'PPARG',
        'SPI1', 'CEBPB', 'KLF1', 'TAL1', 'CRX', 'CREB1', 'CREM', 'PRD14',
        'GATA4', 'MEF2A', 'NKX25', 'SRF', 'TBX5', 'KDM5B', 'TET1', 'SRBP2',
        'SIN3A', 'RAD21', 'REST', 'RCOR1', 'RCOR2', 'RCOR3', 'SIN3B',
        'HXA2', 'SUH', 'TCF7', 'RUNX1', 'TEAD4', 'FOXO1', 'CEBPD', 'STA5A',
        'STA5B', 'TAF7L', 'TBP', 'CEBPA', 'EGR1', 'DMRT1', 'BMI1', 'THB',
        'TYY1', 'ASXL1', 'ZN322', 'NFIB', 'NUCKS', 'EP400', 'GLI3', 'E2F4',
        'EZH1', 'SIR1', 'AHR', 'SMAD5',

# KAWAKAMI Human

In [18]:
# Load one example table from the human detailed_info directory
kawakami_dir = '../data/kawakami-human/detailed_info/'
which_bed = 'SUMO1_GSM1035433_detailed_info.txt'  # an example

print(f'{kawakami_dir}{which_bed}')
kawakami_info = pd.read_table(f'{kawakami_dir}{which_bed}')

# (shape, first five rows)
(kawakami_info.shape, kawakami_info.head(5))

../data/kawakami-human/detailed_info/SUMO1_GSM1035433_detailed_info.txt


((1735, 13),
   space    start      end  width strand          feature  start_position  \
 0  chr1     9891    10466    576      +  ENSG00000219789          1874.0   
 1  chr1  1310509  1310713    205      +  ENSG00000218550       1303908.0   
 2  chr1  1624147  1624781    635      +  ENSG00000189339       1582803.0   
 3  chr1  1677622  1677996    375      +  ENSG00000215790       1646138.0   
 4  chr1  6614507  6614773    267      +  ENSG00000041988       6607514.0   
 
    end_position feature_strand insideFeature  distancetoFeature  \
 0        3533.0              +    downstream             8017.0   
 1     1304275.0              +    downstream             6601.0   
 2     1614027.0              -      upstream           -10120.0   
 3     1667291.0              -      upstream           -10331.0   
 4     6618219.0              +        inside             6993.0   
 
    shortestDistance fromOverlappingOrNearest  
 0            6358.0          NearestLocation  
 1            623

In [19]:
# The TF label is the part of each file name before the first underscore
kawakami_human_tfs = [fname.partition('_')[0] for fname in os.listdir(kawakami_dir)]
# Number of distinct TF labels in the directory
len({*kawakami_human_tfs})

439

In [52]:
def read_kawakami_data(TF, kawakami_dir='../data/kawakami-human/detailed_info/', read_data=True):
    """Find, and optionally load, the files in `kawakami_dir` whose names start with `TF`.

    Parameters
    ----------
    TF : str
        Prefix to look for at the start of each file name. It is interpolated
        unescaped into the regular expression ``^(TF).+``, so it behaves as a
        prefix/regex: e.g. 'GATA' also selects 'GATA2_...' files.
    kawakami_dir : str
        Directory to search. A trailing path separator is optional.
    read_data : bool
        If true, read every matching file with `pd.read_table`; otherwise
        only return the matching file names.

    Returns
    -------
    dict
        When `read_data` is true: maps the first two '_'-separated tokens of
        each file name (e.g. 'GATA2_GSM467648') to its DataFrame. Files that
        share the same two leading tokens overwrite one another.
    list of str
        When `read_data` is false: the matching file names.
    str
        The message '<TF> information not available.' when no file matches.
    """
    # List the directory once so the selection cannot drift between two listings.
    matcher = re.compile(f'^({TF}).+')
    tf_files = [fname for fname in os.listdir(kawakami_dir) if matcher.match(fname)]

    # Nothing matched: keep the original string return for existing callers.
    if not tf_files:
        return f'{TF} information not available.'

    if not read_data:
        return tf_files

    # os.path.join works whether or not kawakami_dir ends with a separator.
    frames = [pd.read_table(os.path.join(kawakami_dir, fname)) for fname in tf_files]
    labels = ['_'.join(fname.split('_')[0:2]) for fname in tf_files]
    return dict(zip(labels, frames))

In [53]:
# 'GATA' is matched as a file-name prefix; read_data defaults to True
read_kawakami_data(TF='GATA')

{'GATA2_GSM467648':       space     start       end  width strand          feature  \
 0      chr1      9968     10218    251      +  ENSG00000219789   
 1      chr1     32381     32948    568      +  ENSG00000222027   
 2      chr1     32381     32948    568      +  ENSG00000222003   
 3      chr1     34390     34596    207      +  ENSG00000197490   
 4      chr1     38613     38818    206      +  ENSG00000197490   
 ...     ...       ...       ...    ...    ...              ...   
 29456  chrY  58994193  58994308    116      +  ENSG00000217042   
 29457  chrY  58996573  58997108    536      +  ENSG00000217042   
 29458  chrY  58997235  58997370    136      +  ENSG00000217042   
 29459  chrY  59334312  59334588    277      +  ENSG00000217042   
 29460  chrY  59363169  59363351    183      +  ENSG00000217042   
 
        start_position  end_position feature_strand insideFeature  \
 0              1874.0        3533.0              +    downstream   
 1             24418.0       25944.0 

In [51]:
# Dictionary key = first two underscore-separated tokens of the file name
'_'.join('GATA2_GSM467648_detailed_info.txt'.split('_', 2)[:2])

'GATA2_GSM467648'

How many TFs?

In [None]:

# NOTE(review): these statements use the `TF` and `peakfile` columns of
# `kawakami_info`. The example human table loaded into `kawakami_info` in this
# section does not show those columns, so confirm which table this cell is
# meant to run against before executing it.
# The markdown sentences that were pasted into this cell are kept as comments
# so the cell is valid Python. Only the last expression is displayed.
len(set(kawakami_info.TF.values))
len(set(kawakami_info.peakfile.values))
kawakami_info.peakfile[~kawakami_info.peakfile.astype(str).str.startswith('mm9')]
kawakami_info.peakfile.astype(str).str.startswith('mm9')
'FOXA1' in kawakami_info.TF.tolist()
k_FOXA1 = kawakami_info[kawakami_info.TF == 'FOXA1']
k_FOXA1.shape
k_FOXA1
# Are they all mouse data?
all(kawakami_info.feature.astype(str).str.startswith('ENSMUS'))
# Notice that the chromosomes, column `space` here are sort of weird; might want to take them out
#
# Maybe not - since I can see the TFs in the `peakfile` column
kawakami_info[~kawakami_info.feature.astype(str).str.startswith('ENSMUS')]
kawakami_info[kawakami_info.feature.astype(str).str.startswith('ENSMUS')]
kawakami_info.TF.nunique(), kawakami_info.TF.unique()

In [None]:
# Columns of the human detailed_info table: space, start, end, width, strand, feature, start_position, end_position, feature_strand, insideFeature, distancetoFeature, shortestDistance, fromOverlappingOrNearest