In [22]:
import sys
import pandas as pd
# Tab-separated, header-less table: column 0 holds a feature label and
# column 1 the path of the TSV file with that feature's values.
paths_table='/bin/app/lncrna/paths.txt'
# Tab-separated, header-less BED file; it is read as six columns
# (chr, start, end, name, score, strand).
bed_path='/bin/app/genome/genome.lncrna.transOnly.bed'
# Directory that receives the final feature table (X_train_lncrna_2s.csv).
outpath='/bin/app/lncrna'

In [24]:
# Column layouts applied to the per-feature tables when they are loaded
# (presumably bigWig-summary and bigBed-summary outputs -- confirm against
# the tool that produced the TSV files).
bw_cols = ["name", "size", "covered_bases", "sum", "mean0", "mean", "min", "max"]
bb_cols = ["name", "covered_percent", "mean", "min", "max"]

# Lookup table of feature label -> TSV path (the file has no header row)
readin = pd.read_csv(paths_table, sep='\t', header=None)

# Transcript annotation, labelled with the six BED column names
bed = pd.read_csv(bed_path, sep='\t', header=None)
bed.columns = ["chr", "start", "end", "name", "score", "strand"]

readin.head(20)

Unnamed: 0,0,1
0,CAGE_neg,/bin/app/lncrna/CAGE.sort.tsv
1,CAGE_neg_whole_trans,/bin/app/lncrna/CAGE_neg_whole_trans.tsv
2,CAGE_pos,/bin/app/lncrna/CAGE.sort.tsv
3,CAGE_pos_whole_trans,/bin/app/lncrna/CAGE_pos_whole_trans.tsv
4,GCcont,/bin/app/lncrna/GCcont.tsv
5,H3K4me3_S2,/bin/app/lncrna/H3K4me3_S2.tsv
6,JASPAR_TF,/bin/app/lncrna/JASPAR_TF.tsv
7,phastCons27,/bin/app/lncrna/phastCons27.tsv
8,phyloP124,/bin/app/lncrna/phyloP124.tsv
9,phyloP27,/bin/app/lncrna/phyloP27.tsv


In [25]:
# Create dataframes from extracted features.
# Each label of the paths table maps to (column names, extra read_csv options).
feature_specs = {
    'GCcont': (bw_cols, {}),
    'phastCons27': (bw_cols, {}),
    'phyloP27': (bw_cols, {}),
    'phyloP124': (bw_cols, {}),
    'Pol2_S2': (bw_cols, {}),
    'H3K4me3_S2': (bw_cols, {}),
    'CAGE_pos_whole_trans': (bw_cols, {}),
    'CAGE_neg_whole_trans': (bw_cols, {}),
    'ReMap': (bb_cols, {}),
    'JASPAR_TF': (bb_cols, {}),
    'CAGE_pos': (["name", "startPosTSS"], {}),
    # For the next two tables a value of 0 is read as missing (NaN).
    'CAGE_neg': (["name", "endNegTSS"], {'na_values': 0}),
    'RNAfold': (["name", "mfe"], {'na_values': 0}),
}

tables = {}
for label, tsv_path in zip(readin.iloc[:, 0], readin.iloc[:, 1]):
    if label not in feature_specs:
        continue  # labels without a spec are ignored
    columns, read_options = feature_specs[label]
    table = pd.read_csv(tsv_path, delimiter='\t', header=None, **read_options)
    table.columns = columns
    tables[label] = table  # a repeated label keeps the last file read

# Every table below is consumed by the feature-building cell. Fail here with
# the list of absent labels instead of leaving a df_* name undefined (or stale
# from an earlier kernel run) and failing later with a NameError.
missing = sorted(set(feature_specs) - set(tables))
if missing:
    raise KeyError('paths table has no entry for: ' + ', '.join(missing))

df_gc = tables['GCcont']
df_pc27 = tables['phastCons27']
df_pp27 = tables['phyloP27']
df_pp124 = tables['phyloP124']
df_pol2 = tables['Pol2_S2']
df_me3 = tables['H3K4me3_S2']
df_posTSS_wt = tables['CAGE_pos_whole_trans']
df_negTSS_wt = tables['CAGE_neg_whole_trans']
df_re = tables['ReMap']
df_tf = tables['JASPAR_TF']
df_posTSS = tables['CAGE_pos']
df_negTSS = tables['CAGE_neg']
df_2s = tables['RNAfold']

# Function to append values of df with different row numbers
def append_columns(ref_df, query_df, ref_col, append_col, **kwargs):
    """Left-join one column of ``query_df`` onto ``ref_df``.

    Parameters
    ----------
    ref_df : DataFrame or named Series
        Table to extend; it must contain (or be named) ``ref_col``.
    query_df : DataFrame
        Table providing the values; it must contain ``ref_col`` and
        ``append_col``.
    ref_col : str
        Key column shared by both tables.
    append_col : str
        Column of ``query_df`` to bring over.
    new_col_name : str, optional (keyword only, via ``**kwargs``)
        Name of the added column; defaults to ``append_col``.

    Returns
    -------
    DataFrame
        ``ref_df`` with one extra column. Rows and their order are those of
        ``ref_df``; keys absent from ``query_df`` get NaN.

    Notes
    -----
    When a key occurs several times in ``query_df`` the value of its last
    row is used. Rows of ``query_df`` whose key is missing are ignored.
    """
    ref_col = str(ref_col)
    append_col = str(append_col)
    new_col_name = kwargs.get('new_col_name')
    if new_col_name is None:
        new_col_name = append_col

    # One row per key (the last one), built in a single vectorised pass
    # instead of growing a frame group by group with the removed
    # DataFrame.append / Series.iteritems APIs.
    keyed = (
        query_df[query_df[ref_col].notna()]
        .drop_duplicates(subset=ref_col, keep='last')
        .reset_index(drop=True)
    )
    lookup = pd.DataFrame({ref_col: keyed[ref_col], new_col_name: keyed[append_col]})

    # Keys are unique in `lookup`, so the left merge cannot duplicate rows.
    return pd.merge(ref_df, lookup, how='left', on=ref_col)

def append_all_cols(ref_df, query_df, ref_col):
    """Left-join every non-key column of ``query_df`` onto ``ref_df``.

    Parameters
    ----------
    ref_df : DataFrame or named Series
        Table to extend.
    query_df : DataFrame
        Table whose columns (all except ``ref_col``) are appended, in their
        original order, one ``append_columns`` call per column.
    ref_col : str
        Key column shared by both tables.

    Returns
    -------
    DataFrame or Series
        The extended table. When ``query_df`` has no column besides the key,
        ``ref_df`` is returned unchanged.

    Raises
    ------
    KeyError
        If ``query_df`` has no column named ``ref_col``.
    """
    ref_col = str(ref_col)
    if ref_col not in query_df.columns:
        raise KeyError('query_df has no key column ' + repr(ref_col))

    # Start from ref_df so the result is defined even when nothing is appended.
    features = ref_df
    for append_col in query_df.columns:
        if append_col != ref_col:
            features = append_columns(features, query_df, ref_col, append_col)
    return features

In [30]:
# Preview the table loaded for the 'RNAfold' label (columns: name, mfe).
df_2s.head()

Unnamed: 0,name,mfe
0,FBtr0309810.1,-376.24
1,FBtr0309810,-853.8
2,FBtr0309810.2,-443.85
3,FBtr0347585,-270.9
4,FBtr0347585.1,-270.9


In [31]:
## Create the features table

# Start from the transcript names, attach both coordinates, derive the length
features = bed["name"]
for coord in ('start', 'end'):
    features = append_columns(features, bed, 'name', coord)
features["length"] = features["end"] - features["start"]

# Incorporate secondary structure values
features = append_all_cols(features, df_2s, 'name')

### Drop every row with a missing value so far, i.e. no computed RNA fold
features = features.dropna(axis=0, how='any')

# Incorporate the TSS peak columns (startPosTSS, endNegTSS)
for tss_table in (df_posTSS, df_negTSS):
    features = append_all_cols(features, tss_table, 'name')

# Best TSS: the larger absolute value of the positive- and negative-strand peaks
tss_peak = features[["startPosTSS", "endNegTSS"]].abs().max(axis=1)
features["bestTSS"] = tss_peak
features = features.drop(columns=["startPosTSS", "endNegTSS"])

# Incorporate TSS peaks inside the whole transcript
features = append_columns(features, df_posTSS_wt, 'name', 'max', new_col_name='PosTSS_inside')
features = append_columns(features, df_negTSS_wt, 'name', 'min', new_col_name='NegTSS_inside')

# Best TSS inside the transcript itself, same rule as above
tss_peak_ins = features[["PosTSS_inside", "NegTSS_inside"]].abs().max(axis=1)
features["bestTSS_inside"] = tss_peak_ins
features = features.drop(columns=["PosTSS_inside", "NegTSS_inside"])

# One metric per queried bigWig or bigBed table:
# (source table, source column, feature column), appended in this order
metric_sources = [
    (df_gc, 'mean', 'mean_gc'),
    (df_re, 'mean', 'mean_remap'),
    (df_me3, 'covered_bases', 'cov_me3'),
    (df_tf, 'covered_percent', 'cov_tfbs'),
    (df_pol2, 'covered_bases', 'cov_pol2'),
    (df_pc27, 'mean0', 'mean_pcons27'),
    (df_pp27, 'mean0', 'mean_pPcons27'),
    (df_pp124, 'mean0', 'mean_pPcons124'),
]
for table, source_col, feature_col in metric_sources:
    features = append_columns(features, table, 'name', source_col, new_col_name=feature_col)

# Turn covered-base counts into a fraction of the transcript length
for covered_col in ("cov_me3", "cov_pol2"):
    features[covered_col] = features[covered_col] / features["length"]

features.head(25)

Unnamed: 0,name,start,end,length,mfe,bestTSS,bestTSS_inside,mean_gc,mean_remap,cov_me3,cov_tfbs,cov_pol2,mean_pcons27,mean_pPcons27,mean_pPcons124
0,FBtr0309810.1,21951,22941,990,-376.24,77.0,58.0,46.7475,490.322,1.0,0.942424,0.743434,0.789939,1.54466,5.06269
1,FBtr0309810,21951,24237,2286,-853.8,77.0,58.0,45.1619,337.694,1.0,0.951006,0.615048,0.829356,1.7182,6.30211
2,FBtr0309810.2,22997,24237,1240,-443.85,3.0,28.0,44.3226,229.102,1.0,0.949194,0.540323,0.888432,1.91445,7.49766
3,FBtr0347585,54816,55767,951,-270.9,214.0,214.0,40.1893,232.264,1.0,0.963197,0.76551,0.436751,0.681945,0.582797
4,FBtr0347585.1,54816,55767,951,-270.9,214.0,214.0,40.1893,232.264,1.0,0.963197,0.76551,0.436751,0.681945,0.582797
5,FBtr0345732,65998,66242,244,-53.76,3.0,6.0,26.9672,140.885,0.340164,0.991803,1.0,0.211381,0.39618,0.073516
6,FBtr0345732.1,65998,66242,244,-53.76,3.0,6.0,26.9672,140.885,0.340164,0.991803,1.0,0.211381,0.39618,0.073516
7,FBtr0345733,66317,66524,207,-44.94,21.0,9017.0,36.3285,362.14,1.0,0.966184,1.0,0.407377,0.690961,0.561546
8,FBtr0345733.1,66317,66524,207,-44.94,21.0,9017.0,36.3285,362.14,1.0,0.966184,1.0,0.407377,0.690961,0.561546
9,FBtr0344053.2,71038,73642,2604,-858.84,70.0,28874.0,37.9339,310.358,1.0,0.981567,1.0,0.384323,0.712149,0.660117


In [32]:
# Row count before the final NA filter; used for the drop report.
total=len(features)

In [33]:
# Keep only rows where every feature column has a value.
final_features=features.dropna(axis=0, how='any')


In [34]:
# Report how many rows the NA filter removed
after = len(final_features)
droped = total - after
print('Droped {} rows with NA values, from a total of {}'.format(droped, total))

Droped 243 rows with NA values, from a total of 7907


In [35]:
# Write the NA-free feature table as CSV into outpath, without the row index.
final_features.to_csv(outpath + '/X_train_lncrna_2s.csv', index=False)