# 02__aggregate_features

In this notebook, I aggregate all of the features examined (in order to make the plots for Fig 5 and to run the cluster analysis in Fig 6).

tables in this notebook:
- Table S5: features for all lncRNAs in the screen

In [1]:
import warnings
warnings.filterwarnings('ignore')

import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import re
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
import sys
import time

# import utils
sys.path.append("../../../utils")
from plotting_utils import *
from classify_utils import *

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.autolayout'] = False

In [2]:
# apply the shared plotting style. PAPER_PRESET and PAPER_FONTSIZE are not defined in this
# notebook; presumably they come from the star import of plotting_utils -- confirm
sns.set(**PAPER_PRESET)
fontsize = PAPER_FONTSIZE

In [3]:
# fix the seed of NumPy's global random number generator
np.random.seed(2019)

## functions

In [4]:
def min_biotype(row):
    """Collapse a detailed biotype label into one of two classes.

    Parameters
    ----------
    row : object exposing a ``cleaner_gene_biotype`` attribute
        For example a DataFrame row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        "mRNA" when the biotype equals "protein_coding" exactly; "lncRNA" for
        every other value (a missing biotype therefore also maps to "lncRNA").
    """
    is_coding = row.cleaner_gene_biotype == "protein_coding"
    return "mRNA" if is_coding else "lncRNA"

## variables

In [5]:
# features
# every feature table below is read from feature_dir; one path variable per feature
feature_dir = "../../../misc/09__model_features"
splicing_f = "%s/gene_splicing_efficiency.with_DIGIT.txt" % feature_dir
gc_f = "%s/gc_content.with_DIGIT.txt" % feature_dir
n_tss_f = "%s/n_tss_within_100bp.with_DIGIT.txt" % feature_dir
n_enh_f = "%s/n_enhancers_within_1mb.with_DIGIT.txt" % feature_dir
closest_enh_tss_f = "%s/closest_enh_to_TSS.with_DIGIT.fixed.txt" % feature_dir
closest_enh_tran_f = "%s/closest_enh_to_transcript.with_DIGIT.txt" % feature_dir
prom_cons_f = "%s/promoter_conservation.500buff.with_DIGIT.txt" % feature_dir
exon_cons_f = "%s/exon_conservation.with_DIGIT.txt" % feature_dir
dna_len_f = "%s/transcript_length.with_DIGIT.txt" % feature_dir
rna_len_f = "%s/transcript_length_RNA.with_DIGIT.txt" % feature_dir
n_exons_f = "%s/n_exons_per_transcript.with_DIGIT.txt" % feature_dir
# same two enhancer-distance features, restricted to "DE" enhancers
# (presumably differentially expressed -- confirm)
closest_DE_enh_tss_f = "%s/closest_DE_enh_to_TSS.with_DIGIT.fixed.txt" % feature_dir
closest_DE_enh_tran_f = "%s/closest_DE_enh_to_transcript.with_DIGIT.txt" % feature_dir

In [6]:
# gwas file
# closest SNP for each transcript; read below as `closest_endo`
# (the file name suggests endoderm-related cancer SNPs -- confirm)
gwas_dir = "../../../misc/06__gwas"
closest_endo_f = "%s/transcript_coords.closest_endo_cancer_snp.with_DIGIT.bed" % gwas_dir

In [7]:
# two-column gene_id / transcript_id lookup table (read below as `gene_map`)
gene_map_f = "../../../misc/00__gene_metadata/gencode.v25lift37.GENE_ID_TRANSCRIPT_ID_MAP.with_DIGIT.fixed.txt"

In [8]:
# screen results table (read below as `hits`); supplies is_hit and the hESC/endo expression columns
hits_f = "../../../data/02__screen/02__enrichment_data/enrichment_values.with_rna_seq.UPDATED.txt"

## 1. import data

In [9]:
# read with the pandas default header handling (first row = column names);
# this frame is the starting point of the merged `data` table built in section 4
splicing = pd.read_table(splicing_f)

In [10]:
# headerless two-column file, so the column names are assigned by hand
gc = pd.read_table(gc_f, header=None)
gc.columns = ["transcript_id", "gc"]

In [11]:
# headerless, whitespace-separated file with the count first and the transcript id second.
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True
# (deprecated in pandas 2.2; the warning is hidden by the global warnings filter above).
n_tss = pd.read_table(n_tss_f, sep=r"\s+", header=None)
n_tss.columns = ["n_tss", "transcript_id"]

In [12]:
# headerless, whitespace-separated file with the count first and the transcript id second.
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True
# (deprecated in pandas 2.2; the warning is hidden by the global warnings filter above).
n_enh = pd.read_table(n_enh_f, sep=r"\s+", header=None)
n_enh.columns = ["n_enh", "transcript_id"]

In [13]:
# headerless table: transcript coordinate columns followed by the columns of its closest
# enhancer, with the distance in the last column
# (layout looks like `bedtools closest` output -- confirm)
closest_enh_tss = pd.read_table(closest_enh_tss_f, header=None)
closest_enh_tss.columns = ["chr", "start", "end", "transcript_id", "score", "strand", "enh_chr", "enh_start", 
                           "enh_end", "closest_enh_id", "enh_len", "enh_strand", "enh_tss1", "enh_tss2",
                           "enh_blocks", "enh_nblocks", "enh_distblocks", "enh_endblocks", "enh_tss_dist"]
# only the transcript id and the distance are needed downstream
closest_enh_tss = closest_enh_tss[["transcript_id", "enh_tss_dist"]]

In [14]:
# same layout as the TSS-based table but without the score/strand columns on the transcript side;
# the distance is again the last column
closest_enh_tran = pd.read_table(closest_enh_tran_f, header=None)
closest_enh_tran.columns = ["chr", "start", "end", "transcript_id", "enh_chr", "enh_start", 
                           "enh_end", "closest_enh_id", "enh_len", "enh_strand", "enh_tss1", "enh_tss2",
                           "enh_blocks", "enh_nblocks", "enh_distblocks", "enh_endblocks", "enh_tran_dist"]
# only the transcript id and the distance are needed downstream
closest_enh_tran = closest_enh_tran[["transcript_id", "enh_tran_dist"]]

In [15]:
# file has a header row; its `name` (transcript id) and `median` columns are used below
prom_cons = pd.read_table(prom_cons_f)

In [16]:
# file has a header row; its `name` (transcript id) and `median` columns are used below.
# A transcript can appear on several rows (presumably one per exon -- confirm)
exon_cons = pd.read_table(exon_cons_f)

In [17]:
# headerless two-column file, so the column names are assigned by hand
dna_len = pd.read_table(dna_len_f, header=None)
dna_len.columns = ["transcript_id", "dna_len"]

In [18]:
# headerless two-column file, so the column names are assigned by hand
rna_len = pd.read_table(rna_len_f, header=None)
rna_len.columns = ["transcript_id", "rna_len"]

In [19]:
# headerless three-column file with the exon count first; it already carries gene_id,
# so unlike the other transcript-level tables it is not joined to gene_map in section 2
n_exons = pd.read_table(n_exons_f, header=None)
n_exons.columns = ["n_exons", "gene_id", "transcript_id"]

In [20]:
# headerless table: transcript coordinates, then the closest SNP's coordinates, id,
# disease label and distance
closest_endo = pd.read_table(closest_endo_f, sep="\t", header=None)
closest_endo.columns = ["chr", "start", "end", "transcript_id", "snp_chr", "snp_start", "snp_end",
                        "closest_endo_snp_id", "closest_endo_snp_disease", "closest_endo_snp_distance"]

In [21]:
# headerless table: transcript columns, then the closest DE enhancer's coordinates and the distance
closest_DE_enh_tss = pd.read_table(closest_DE_enh_tss_f, header=None)
closest_DE_enh_tss.columns = ["chr", "start", "end", "transcript_id", "score", "strand", "enh_chr", "enh_start", 
                           "enh_end", "DE_enh_tss_dist"]
# only the transcript id and the distance are needed downstream
closest_DE_enh_tss = closest_DE_enh_tss[["transcript_id", "DE_enh_tss_dist"]]

In [22]:
# headerless table: transcript columns, then the closest DE enhancer's coordinates and the distance
closest_DE_enh_tran = pd.read_table(closest_DE_enh_tran_f, header=None)
closest_DE_enh_tran.columns = ["chr", "start", "end", "transcript_id", "enh_chr", "enh_start", 
                           "enh_end", "DE_enh_tran_dist"]
# only the transcript id and the distance are needed downstream
closest_DE_enh_tran = closest_DE_enh_tran[["transcript_id", "DE_enh_tran_dist"]]

In [23]:
# headerless lookup table used in section 2 to attach a gene_id to every transcript-level row
gene_map = pd.read_table(gene_map_f, header=None)
gene_map.columns = ["gene_id", "transcript_id"]

In [24]:
# screen results (file has a header row); a gene_id can appear on more than one row
hits = pd.read_table(hits_f)

## 2. join transcript-level data w/ gene id

In [25]:
# attach gene_id to every transcript row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(gc))
gc = pd.merge(gc, gene_map, how="inner", on="transcript_id")
print(len(gc))

200140
200140


In [26]:
# attach gene_id to every transcript row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(n_tss))
n_tss = pd.merge(n_tss, gene_map, how="inner", on="transcript_id")
print(len(n_tss))

94934
94934


In [27]:
# attach gene_id to every transcript row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(n_enh))
n_enh = pd.merge(n_enh, gene_map, how="inner", on="transcript_id")
print(len(n_enh))

199413
199413


In [28]:
# attach gene_id to every transcript row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(closest_enh_tss))
closest_enh_tss = pd.merge(closest_enh_tss, gene_map, how="inner", on="transcript_id")
print(len(closest_enh_tss))

200140
200140


In [29]:
# attach gene_id to every transcript row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(closest_enh_tran))
closest_enh_tran = pd.merge(closest_enh_tran, gene_map, how="inner", on="transcript_id")
print(len(closest_enh_tran))

200140
200140


In [30]:
# attach gene_id; in this table the transcript id is stored in the `name` column,
# so the join keys differ between the two sides (inner join)
print(len(prom_cons))
prom_cons = pd.merge(prom_cons, gene_map, how="inner",
                     left_on="name", right_on="transcript_id")
print(len(prom_cons))

200140
200140


In [31]:
# attach gene_id; in this table the transcript id is stored in the `name` column,
# so the join keys differ between the two sides (inner join)
print(len(exon_cons))
exon_cons = pd.merge(exon_cons, gene_map, how="inner",
                     left_on="name", right_on="transcript_id")
print(len(exon_cons))

1185571
1185571


In [32]:
# attach gene_id to every transcript row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(dna_len))
dna_len = pd.merge(dna_len, gene_map, how="inner", on="transcript_id")
print(len(dna_len))

200140
200140


In [33]:
# attach gene_id to every transcript row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(rna_len))
rna_len = pd.merge(rna_len, gene_map, how="inner", on="transcript_id")
print(len(rna_len))

200140
200140


In [34]:
# attach gene_id to every transcript/SNP row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(closest_endo))
closest_endo = pd.merge(closest_endo, gene_map, how="inner", on="transcript_id")
print(len(closest_endo))

255868
255868


In [35]:
# attach gene_id to every transcript row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(closest_DE_enh_tss))
closest_DE_enh_tss = pd.merge(closest_DE_enh_tss, gene_map, how="inner", on="transcript_id")
print(len(closest_DE_enh_tss))

200140
200140


In [36]:
# attach gene_id to every transcript row (inner join on transcript_id);
# the row count is printed before and after to show whether the join changed it
print(len(closest_DE_enh_tran))
closest_DE_enh_tran = pd.merge(closest_DE_enh_tran, gene_map, how="inner", on="transcript_id")
print(len(closest_DE_enh_tran))

200140
200140


## 3. aggregate features to gene level

In [37]:
# gene-level GC content: average over the gene's transcripts
gc_gene = (
    gc.groupby("gene_id")["gc"]
    .mean()
    .reset_index()
)
print(len(gc_gene))

60253


In [38]:
# gene-level TSS count: largest value among the gene's transcripts
n_tss_gene = (
    n_tss.groupby("gene_id")["n_tss"]
    .max()
    .reset_index()
)
print(len(n_tss_gene))

23065


In [39]:
# gene-level enhancer count: largest value among the gene's transcripts
n_enh_gene = (
    n_enh.groupby("gene_id")["n_enh"]
    .max()
    .reset_index()
)
print(len(n_enh_gene))

59781


In [40]:
# gene-level enhancer-to-TSS distance: smallest distance among the gene's transcripts
closest_enh_tss_gene = (
    closest_enh_tss.groupby("gene_id")["enh_tss_dist"]
    .min()
    .reset_index()
)
print(len(closest_enh_tss_gene))

60253


In [41]:
# gene-level enhancer-to-transcript distance: smallest distance among the gene's transcripts
closest_enh_tran_gene = (
    closest_enh_tran.groupby("gene_id")["enh_tran_dist"]
    .min()
    .reset_index()
)
print(len(closest_enh_tran_gene))

60253


In [42]:
# gene-level DE-enhancer-to-TSS distance: smallest distance among the gene's transcripts
closest_DE_enh_tss_gene = (
    closest_DE_enh_tss.groupby("gene_id")["DE_enh_tss_dist"]
    .min()
    .reset_index()
)
print(len(closest_DE_enh_tss_gene))

60253


In [43]:
# gene-level DE-enhancer-to-transcript distance: smallest distance among the gene's transcripts
closest_DE_enh_tran_gene = (
    closest_DE_enh_tran.groupby("gene_id")["DE_enh_tran_dist"]
    .min()
    .reset_index()
)
print(len(closest_DE_enh_tran_gene))

60253


In [44]:
# gene-level promoter conservation: highest `median` value among the gene's transcripts,
# then rename the value column to prom_cons
prom_cons_gene = (
    prom_cons.groupby("gene_id")["median"]
    .max()
    .reset_index()
)
prom_cons_gene.columns = ["gene_id", "prom_cons"]
print(len(prom_cons_gene))

60253


In [45]:
# two-stage aggregation:
#   1. per transcript (`name`): average the `median` values over that transcript's rows
#   2. per gene: keep the highest transcript-level average
exon_cons_tx = (
    exon_cons.groupby(["name", "gene_id"])["median"]
    .mean()
    .reset_index()
)
exon_cons_gene = (
    exon_cons_tx.groupby("gene_id")["median"]
    .max()
    .reset_index()
)
exon_cons_gene.columns = ["gene_id", "exon_cons"]
print(len(exon_cons_gene))

60253


In [46]:
# gene-level DNA length: longest value among the gene's transcripts
dna_len_gene = (
    dna_len.groupby("gene_id")["dna_len"]
    .max()
    .reset_index()
)
print(len(dna_len_gene))

60253


In [47]:
# gene-level RNA length: longest value among the gene's transcripts
rna_len_gene = (
    rna_len.groupby("gene_id")["rna_len"]
    .max()
    .reset_index()
)
print(len(rna_len_gene))

60253


In [48]:
# gene-level exon count: largest value among the gene's transcripts
n_exons_gene = (
    n_exons.groupby("gene_id")["n_exons"]
    .max()
    .reset_index()
)
print(len(n_exons_gene))

60253


In [49]:
# same idea for the RNA-seq data: gene-level expression is the sum of hESC_mean / endo_mean
# over all rows of `hits` that share a gene_id
endo_exp = (
    hits.groupby("gene_id")[["hESC_mean", "endo_mean"]]
    .sum()
    .reset_index()
)
print(len(endo_exp))

7650


In [50]:
# gene-level fold change: the largest endo_hESC_abslog2fc among the gene's rows in `hits`
endo_fc = (
    hits.groupby("gene_id")["endo_hESC_abslog2fc"]
    .max()
    .reset_index()
)
print(len(endo_fc))

7650


In [51]:
# gwas needs the same gene-level treatment: take each gene's smallest SNP distance, join back
# to recover the id and disease of the SNP at that distance, then keep one row per gene
# (when several rows tie at the minimum, the first one after the join is kept)
closest_endo_gene = (
    closest_endo.groupby("gene_id")["closest_endo_snp_distance"]
    .min()
    .reset_index()
    .merge(
        closest_endo[["gene_id", "closest_endo_snp_distance",
                      "closest_endo_snp_id", "closest_endo_snp_disease"]],
        on=["gene_id", "closest_endo_snp_distance"],
    )
    .drop_duplicates(subset=["gene_id", "closest_endo_snp_distance"])
)
print(len(closest_endo_gene))

60253


## 4. merge all genomic features into 1 dataframe

In [52]:
# start the merged table from the splicing features; the outer join keeps genes found in
# either frame, so a gene missing from `splicing` gets NaN in the splicing columns
data = pd.merge(splicing, gc_gene, how="outer", on="gene_id")
print(len(data))

60253


In [53]:
# add the TSS and enhancer counts; left joins keep every gene already in `data`
data = (
    data
    .merge(n_tss_gene, on="gene_id", how="left")
    .merge(n_enh_gene, on="gene_id", how="left")
)
print(len(data))

60253


In [54]:
# add the closest-enhancer distances (to the TSS and to the transcript)
data = (
    data
    .merge(closest_enh_tss_gene, on="gene_id", how="left")
    .merge(closest_enh_tran_gene, on="gene_id", how="left")
)
print(len(data))

60253


In [55]:
# add the closest-DE-enhancer distances (to the TSS and to the transcript)
data = (
    data
    .merge(closest_DE_enh_tss_gene, on="gene_id", how="left")
    .merge(closest_DE_enh_tran_gene, on="gene_id", how="left")
)
print(len(data))

60253


In [56]:
# add promoter conservation
data = pd.merge(data, prom_cons_gene, how="left", on="gene_id")
print(len(data))

60253


In [57]:
# add exon conservation and DNA length
data = (
    data
    .merge(exon_cons_gene, on="gene_id", how="left")
    .merge(dna_len_gene, on="gene_id", how="left")
)
print(len(data))

60253


In [58]:
# add RNA length and exon count
data = (
    data
    .merge(rna_len_gene, on="gene_id", how="left")
    .merge(n_exons_gene, on="gene_id", how="left")
)
print(len(data))

60253


In [59]:
# add the closest-SNP columns (distance, SNP id, disease label)
gwas_cols = ["gene_id", "closest_endo_snp_distance", "closest_endo_snp_id",
             "closest_endo_snp_disease"]
data = pd.merge(data, closest_endo_gene[gwas_cols], how="left", on="gene_id")
print(len(data))

60253


In [60]:
# for the two count features a missing value means "none found", not "no data",
# so NAs in n_tss and n_enh are replaced with 0
data[["n_tss", "n_enh"]] = data[["n_tss", "n_enh"]].fillna(0)

In [61]:
# gene ids look like ENSG00000243485.4_2: drop the "_N" suffix first, then the ".N" version
data["short_gene_id"] = data["gene_id"].str.split("_", expand=True)[0]
data["shorter_gene_id"] = data["short_gene_id"].str.split(".", expand=True)[0]
# two-class biotype, computed column-wise instead of with a row-by-row DataFrame.apply;
# same rule as min_biotype: exactly "protein_coding" -> "mRNA", anything else
# (including a missing biotype) -> "lncRNA"
data["minimal_biotype"] = np.where(data["cleaner_gene_biotype"] == "protein_coding",
                                   "mRNA", "lncRNA")
data.minimal_biotype.value_counts()

lncRNA    40121
mRNA      20132
Name: minimal_biotype, dtype: int64

In [62]:
# keep only genes that have a gene_name: biotypes we don't care about (like pseudogenes)
# have a null gene_name and are dropped here
data_filt = data[data["gene_name"].notna()]
len(data_filt)

33856

## 5. create df including only genes in screen + include endo RNAseq features

In [63]:
# distinct gene ids present in the screen results
genes_in_screen = hits["gene_id"].unique()
len(genes_in_screen)

7650

In [64]:
# row-level (not gene-level) tally of the hit labels in `hits`
hits.is_hit.value_counts()

no hit              11320
stringent no hit      357
hit                    99
Name: is_hit, dtype: int64

In [65]:
# collapse `hits` to one row per gene. is_hit is sorted as a plain string, and alphabetically
# "hit" < "no hit" < "stringent no hit", so keep="first" retains a "hit" row whenever the gene
# has one, otherwise a "no hit" row, otherwise "stringent no hit".
# NOTE(review): this priority depends on the labels' alphabetical order -- renaming a label
# would silently change which row is kept
gene_hit_status = hits[["gene_id", "gene_name", "ctrl_status", "cleaner_gene_biotype",
                        "is_hit"]].sort_values(by=["gene_id", "is_hit"]).drop_duplicates(subset="gene_id", 
                                                                                           keep="first")
gene_hit_status.head()

Unnamed: 0,gene_id,gene_name,ctrl_status,cleaner_gene_biotype,is_hit
30,DIGIT,DIGIT,experimental,promoter_overlap,hit
875,ENSG00000005206.12,SPPL2B,experimental,promoter_overlap,no hit
3375,ENSG00000008311.14_1,AASS,mRNA,protein_coding,no hit
377,ENSG00000010278.12_2,CD9,mRNA,protein_coding,stringent no hit
8019,ENSG00000013297.10_2,CLDN11,mRNA,protein_coding,no hit


In [66]:
# restrict the feature table to screened genes: inner join that must match on gene id,
# gene name and biotype; adds ctrl_status and is_hit
print(len(data))
df_screen = pd.merge(data, gene_hit_status, how="inner",
                     on=["gene_id", "gene_name", "cleaner_gene_biotype"])
print(len(df_screen))
df_screen.head()

60253
7650


Unnamed: 0,gene_id,SK-N-SH_eff_mean,SK-N-SH_exp_mean,SK-N-SH_eff_ratio,NCI-H460_eff_mean,NCI-H460_exp_mean,NCI-H460_eff_ratio,K562_eff_mean,K562_exp_mean,K562_eff_ratio,...,rna_len,n_exons,closest_endo_snp_distance,closest_endo_snp_id,closest_endo_snp_disease,short_gene_id,shorter_gene_id,minimal_biotype,ctrl_status,is_hit
0,ENSG00000243485.4_2,0.208126,0.319514,0.06649922,0.0,3.094915e-08,0.0,0.03873,0.115228,0.004463,...,712,3,863464,rs13303010,pancreatic carcinoma,ENSG00000243485.4,ENSG00000243485,lncRNA,experimental,no hit
1,ENSG00000237613.2_1,5e-06,3.2e-05,1.65436e-10,0.0,0.0,0.0,0.014606,0.032696,0.000478,...,1187,3,858492,rs13303010,pancreatic carcinoma,ENSG00000237613.2,ENSG00000237613,lncRNA,experimental,no hit
2,ENSG00000238009.6_2,0.226354,0.307654,0.06963862,0.018496,0.04327625,0.0008,0.688701,2.415708,1.663702,...,2748,4,760850,rs13303010,pancreatic carcinoma,ENSG00000238009.6,ENSG00000238009,lncRNA,experimental,no hit
3,ENSG00000239945.1_2,0.0,1.790425,0.0,0.0,2.987223,0.0,0.002306,1.325207,0.003055,...,1319,2,803468,rs13303010,pancreatic carcinoma,ENSG00000239945.1,ENSG00000239945,lncRNA,experimental,no hit
4,ENSG00000239906.1_1,0.061595,2.201202,0.1355827,0.0,0.0,0.0,0.623117,2.832431,1.764937,...,323,2,754234,rs13303010,pancreatic carcinoma,ENSG00000239906.1,ENSG00000239906,lncRNA,experimental,no hit


In [67]:
# add the gene-level expression sums and the gene-level max fold change (inner joins)
df_screen = (
    df_screen
    .merge(endo_exp, how="inner", on="gene_id")
    .merge(endo_fc, how="inner", on="gene_id")
)
len(df_screen)

7650

In [68]:
# gene-level hits, split by two-class biotype
df_screen[df_screen["is_hit"] == "hit"].minimal_biotype.value_counts()

lncRNA    65
mRNA      11
Name: minimal_biotype, dtype: int64

In [69]:
# hit genes only, with the columns needed to cross-check biotype against control status
tmp = df_screen[df_screen["is_hit"] == "hit"][["gene_name", "minimal_biotype", "ctrl_status", "cleaner_gene_biotype"]]
tmp

Unnamed: 0,gene_name,minimal_biotype,ctrl_status,cleaner_gene_biotype
81,RP4-680D5.8,lncRNA,experimental,transcript_overlap
197,MKNK1-AS1,lncRNA,experimental,transcript_overlap
221,RP11-67L3.4,lncRNA,experimental,promoter_overlap
243,FOXD3-AS1,lncRNA,experimental,transcript_overlap
311,RP11-421L21.3,lncRNA,experimental,promoter_overlap
...,...,...,...,...
7287,KB-1440D3.14,lncRNA,experimental,gene_nearby
7374,RP3-508I15.9,lncRNA,experimental,promoter_overlap
7438,RP11-120D5.1,lncRNA,experimental,promoter_overlap
7600,LINC00623,lncRNA,experimental,transcript_overlap


In [70]:
# number of hit genes for each (minimal_biotype, ctrl_status) combination
tmp.groupby(["minimal_biotype", "ctrl_status"])["gene_name"].agg("count")

minimal_biotype  ctrl_status 
lncRNA           experimental    65
mRNA             control          5
                 mRNA             6
Name: gene_name, dtype: int64

In [71]:
# sanity check: list any hit gene that is labelled mRNA but has ctrl_status "experimental"
tmp[(tmp["minimal_biotype"] == "mRNA") & (tmp["ctrl_status"] == "experimental")]

Unnamed: 0,gene_name,minimal_biotype,ctrl_status,cleaner_gene_biotype


## 6. write final files

### general features for all genes

In [72]:
# columns written to the all-genes table: identifiers/annotations first, then numeric features
meta_cols = ["gene_id", "gene_name", "csf", "cleaner_gene_biotype", "minimal_biotype"]
sub_feature_cols = ['max_eff', 'max_exp', 'gc', 'n_tss', 'n_enh', 'enh_tss_dist', 'enh_tran_dist', 'prom_cons',
                    'exon_cons', 'dna_len', 'rna_len', 'n_exons']

In [73]:
# final column order: metadata columns followed by feature columns
all_cols = meta_cols + sub_feature_cols

In [74]:
# all genes with a gene_name (data_filt), restricted to the selected columns
supp = data_filt[all_cols]
supp.head()

Unnamed: 0,gene_id,gene_name,csf,cleaner_gene_biotype,minimal_biotype,max_eff,max_exp,gc,n_tss,n_enh,enh_tss_dist,enh_tran_dist,prom_cons,exon_cons,dna_len,rna_len,n_exons
2,ENSG00000243485.4_2,MIR1302-2,lncRNA_good_csf,intergenic,lncRNA,0.208126,0.559674,0.545147,0.0,18.0,809376,808633,0.0,0.0,1543,712,3
3,ENSG00000237613.2_1,FAM138A,lncRNA_good_csf,intergenic,lncRNA,0.154032,0.222609,0.465565,0.0,18.0,803660,803661,0.0,0.006667,1527,1187,3
6,ENSG00000186092.4_1,OR4F5,protein_coding,protein_coding,mRNA,0.023544,0.050019,0.571895,0.0,20.0,770651,769734,0.0,0.0,917,918,1
7,ENSG00000238009.6_2,RP11-34P13.7,lncRNA_good_csf,transcript_overlap,lncRNA,0.688701,2.415708,0.537288,0.0,26.0,706018,706019,0.38,0.505,36987,2748,4
8,ENSG00000239945.1_2,RP11-34P13.8,lncRNA_good_csf,transcript_overlap,lncRNA,0.027553,2.987223,0.525398,0.0,21.0,748636,748637,0.11,0.14,1554,1319,2


In [75]:
# spot check: the DIGIT row is present in the table
supp[supp["gene_name"] == "DIGIT"]

Unnamed: 0,gene_id,gene_name,csf,cleaner_gene_biotype,minimal_biotype,max_eff,max_exp,gc,n_tss,n_enh,enh_tss_dist,enh_tran_dist,prom_cons,exon_cons,dna_len,rna_len,n_exons
60203,DIGIT,DIGIT,lncRNA_good_csf,promoter_overlap,lncRNA,0.224504,0.639554,0.480683,0.0,72.0,2518,0,0.1,0.0,2966,2226,2


In [76]:
# write the all-genes feature table as tab-separated text, without the index column
supp.to_csv("../../../data/03__features/SuppTable_AllFeatures.UPDATED.txt", sep="\t", index=False)

### + endo features for screen genes

In [77]:
# columns for the screen-only table; this rebinds meta_cols / sub_feature_cols from the
# previous section, adding is_hit plus the RNA-seq, GWAS and DE-enhancer features
meta_cols = ["gene_id", "gene_name", "csf", "cleaner_gene_biotype", "minimal_biotype", "is_hit"]
sub_feature_cols = ['max_eff', 'max_exp', 'gc', 'n_tss', 'n_enh', 'enh_tss_dist', 'enh_tran_dist', 'prom_cons',
                    'exon_cons', 'dna_len', 'rna_len', 'n_exons', 'hESC_mean', 'endo_mean', 'endo_hESC_abslog2fc',
                    'closest_endo_snp_distance', 'closest_endo_snp_id', 'closest_endo_snp_disease', 'DE_enh_tss_dist',
                    'DE_enh_tran_dist']

In [78]:
# final column order: metadata columns followed by feature columns
all_cols = meta_cols + sub_feature_cols

In [79]:
# screened genes only (df_screen), restricted to the selected columns; rebinds `supp`
supp = df_screen[all_cols]
supp.head()

Unnamed: 0,gene_id,gene_name,csf,cleaner_gene_biotype,minimal_biotype,is_hit,max_eff,max_exp,gc,n_tss,...,rna_len,n_exons,hESC_mean,endo_mean,endo_hESC_abslog2fc,closest_endo_snp_distance,closest_endo_snp_id,closest_endo_snp_disease,DE_enh_tss_dist,DE_enh_tran_dist
0,ENSG00000243485.4_2,MIR1302-2,lncRNA_good_csf,intergenic,lncRNA,no hit,0.208126,0.559674,0.545147,0.0,...,712,3,0.08846,0.0,0.122288,863464,rs13303010,pancreatic carcinoma,909431,908688
1,ENSG00000237613.2_1,FAM138A,lncRNA_good_csf,intergenic,lncRNA,no hit,0.154032,0.222609,0.465565,0.0,...,1187,3,0.0,0.480302,0.565892,858492,rs13303010,pancreatic carcinoma,903715,903716
2,ENSG00000238009.6_2,RP11-34P13.7,lncRNA_good_csf,transcript_overlap,lncRNA,no hit,0.688701,2.415708,0.537288,0.0,...,2748,4,0.280988,0.111266,0.205054,760850,rs13303010,pancreatic carcinoma,806073,806074
3,ENSG00000239945.1_2,RP11-34P13.8,lncRNA_good_csf,transcript_overlap,lncRNA,no hit,0.027553,2.987223,0.525398,0.0,...,1319,2,0.20794,0.30715,0.113875,803468,rs13303010,pancreatic carcinoma,848691,848692
4,ENSG00000239906.1_1,RP11-34P13.14,lncRNA_good_csf,promoter_overlap,lncRNA,no hit,0.950739,19.2999,0.4613,0.0,...,323,2,13.451185,15.657979,0.205026,754234,rs13303010,pancreatic carcinoma,799457,799458


In [80]:
# write Table S5 (features for genes in the screen) as tab-separated text, without the index column
supp.to_csv("../../../data/03__features/SuppTable_S5.locus_features.txt", sep="\t", index=False)

In [81]:
### look at GWAS hits
# lncRNA hit genes, ordered from the closest SNP outwards
tmp = supp[(supp["minimal_biotype"] == "lncRNA") & (supp["is_hit"] == "hit")]
snp_cols = ["gene_name", "closest_endo_snp_distance",
            "closest_endo_snp_id", "closest_endo_snp_disease"]
tmp.sort_values(by="closest_endo_snp_distance")[snp_cols]

Unnamed: 0,gene_name,closest_endo_snp_distance,closest_endo_snp_id,closest_endo_snp_disease
4055,RP11-867G2.8,0,rs12279741,small cell lung carcinoma
2211,RP11-541P9.3,0,rs12187751,"colorectal cancer, microsatellite instability ..."
3246,VLDLR-AS1,0,rs148802107,"smoking status measurement, lung carcinoma"
6859,AC011523.2,252,rs2659124,prostate carcinoma
3789,MRPL23-AS1,10343,rs78276179,squamous cell lung carcinoma
...,...,...,...,...
7242,LINC01424,2443598,rs1547374,pancreatic carcinoma
3581,RP11-124O11.1,2713477,rs76934034,prostate carcinoma
197,MKNK1-AS1,2852458,rs113688544,squamous cell lung carcinoma
7600,LINC00623,6316531,rs17599629,prostate carcinoma


In [82]:
# inspect the full feature row of one hit gene (first matching row)
supp[supp["gene_name"] == "FOXD3-AS1"].iloc[0]

gene_id                       ENSG00000230798.5_1
gene_name                               FOXD3-AS1
csf                               lncRNA_good_csf
cleaner_gene_biotype           transcript_overlap
minimal_biotype                            lncRNA
is_hit                                        hit
max_eff                                  0.976705
max_exp                                   339.448
gc                                       0.454614
n_tss                                           2
n_enh                                          24
enh_tss_dist                                 1751
enh_tran_dist                                1106
prom_cons                                    0.98
exon_cons                                0.333333
dna_len                                      3557
rna_len                                       941
n_exons                                         4
hESC_mean                                 134.634
endo_mean                               0.0651309


In [83]:
# inspect the full feature row of one hit gene (first matching row)
supp[supp["gene_name"] == "LINC00623"].iloc[0]

gene_id                       ENSG00000235398.4
gene_name                             LINC00623
csf                             lncRNA_good_csf
cleaner_gene_biotype         transcript_overlap
minimal_biotype                          lncRNA
is_hit                                      hit
max_eff                                0.952244
max_exp                                 21.1824
gc                                      0.52109
n_tss                                         0
n_enh                                        40
enh_tss_dist                             191871
enh_tran_dist                            191872
prom_cons                                  0.98
exon_cons                              0.666667
dna_len                                   40358
rna_len                                    1322
n_exons                                       7
hESC_mean                               45.0987
endo_mean                                47.915
endo_hESC_abslog2fc                    0