# Generate the lncRNA/mRNA matrix based on a log2FoldChange cut-off

In [87]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import FileLink, Markdown, display
from ipywidgets import widgets

def printmd(string):
    """Show ``string`` in the cell output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)
    
# Ask for the log2 fold-change cut-off. The raw text is stored in `name`;
# the analysis cell converts it to a float.
printmd("**Type the log2fold change that you want to use?**")
name = input()

# Let the user pick which transcript class to analyse.
printmd("**Select the transcript type**")
transcript_choices = ['mRNA', 'lncRNA', 'lncRNA + mRNA']
transcript = widgets.Dropdown(options=transcript_choices)
display(transcript)


**Type the log2fold change that you want to use?**

-0.58


**Select the transcript type**

Dropdown(options=('mRNA', 'lncRNA', 'lncRNA + mRNA'), value='mRNA')

In [103]:
# Convert the cut-off typed in the input cell to a number.
new_variable = float(name)
printmd("**Cut-off:**")
print(new_variable)

# Glob pattern of the fold-change tables for each dropdown choice.
file_patterns = {
    'mRNA': 'data/*mRNA_change.txt',
    'lncRNA': 'data/*lncRNA_change.txt',
    'lncRNA + mRNA': 'data/*_change.txt',
}
selected_pattern = file_patterns[transcript.value]

# glob returns paths in arbitrary order; sorting keeps the column order
# (and therefore the order inside each label list) reproducible.
all_files = sorted(glob.glob(selected_pattern))
if not all_files:
    # Fail early with a clear message instead of an opaque pd.concat error.
    raise FileNotFoundError("No files matching %r were found" % selected_pattern)

# Build one data frame per file, indexed by transcript. The listed statistics
# columns are dropped and the log2FoldChange column is renamed to the text
# before the first underscore of the file name (the cell line, judging by the
# displayed output).
# NOTE(review): with 'lncRNA + mRNA' the pattern matches both files of a cell
# line, so the combined matrix carries two columns with the same name --
# confirm this is intended.
list_of_dfs = []
for file_path in all_files:
    column_name = os.path.basename(file_path).split('_')[0]
    fold_changes = (
        pd.read_csv(file_path, sep="\t")
        .drop(columns=['Gene', 'baseMean', 'lfcSE', 'stat', 'pvalue', 'padj'])
        .set_index('Transcript')
        .rename(columns={'log2FoldChange': column_name})
    )
    list_of_dfs.append(fold_changes)

# Combine all the data frames side by side, aligned on the transcript index.
combined_df = pd.concat(list_of_dfs, axis=1, sort=True)

# For every transcript, list the columns whose fold change is below the
# cut-off. Missing values compare as False, so they never enter a label.
below_cutoff = combined_df < new_variable
value_columns = below_cutoff.columns
combined_df['label'] = pd.Series(
    [value_columns[row_mask].tolist() for row_mask in below_cutoff.values],
    index=combined_df.index,
)

# Number of columns in each intersection (0 when nothing is below the cut-off).
combined_df['Count Overlap'] = combined_df['label'].map(len)

# Sort by the number of intersections, largest first.
combined_df_sorted = combined_df.sort_values(by=['Count Overlap'], ascending=False)

# Remove transcripts with an empty intersection and show missing fold changes as 0.
has_intersection = combined_df_sorted['Count Overlap'] > 0
combined_df_sorted_clean = combined_df_sorted[has_intersection].fillna(0)
combined_df_sorted_clean

    

**Cut-off:**

-0.58


Unnamed: 0,CL40,CL40.1,COLO320,COLO320.1,GTG7,GTG7.1,HT55,HT55.1,HUTU80,HUTU80.1,...,LS180,LS180.1,LS411,LS411.1,MDST8,MDST8.1,SW1463,SW1463.1,label,Count Overlap
NM_002165,0.000000,-1.181748,0.000000,0.344808,0.000000,-0.730505,0.000000,-1.497162,0.000000,-1.164779,...,0.000000,-2.316883,0.000000,-0.714432,0.000000,-1.417058,0.000000,0.575379,"[CL40, GTG7, HT55, HUTU80, LOVO, LS180, LS411,...",8
NM_002513,0.000000,1.020166,0.000000,-1.138803,0.000000,-1.072349,0.000000,-2.902673,0.000000,-1.190006,...,0.000000,-0.406431,0.000000,-1.791223,0.000000,-1.933246,0.000000,-2.029388,"[COLO320, GTG7, HT55, HUTU80, LOVO, LS411, MDS...",8
ENSG00000279259.1_5,-1.831048,0.000000,-1.139510,0.000000,-1.319975,0.000000,0.000000,0.000000,0.000000,0.000000,...,-2.313718,0.000000,-1.273999,0.000000,-1.183875,0.000000,-1.732485,0.000000,"[CL40, COLO320, GTG7, LOVO, LS180, LS411, MDST...",8
NM_021103,0.000000,-3.376571,0.000000,-2.940238,0.000000,-1.757715,0.000000,-1.995440,0.000000,0.337592,...,0.000000,-1.008932,0.000000,0.010219,0.000000,-0.867050,0.000000,-1.167274,"[CL40, COLO320, GTG7, HT55, LOVO, LS180, MDST8...",8
NM_001039842,0.000000,3.299277,0.000000,-0.724057,0.000000,-0.758224,0.000000,-0.250184,0.000000,-1.358259,...,0.000000,-0.665963,0.000000,-1.062795,0.000000,-2.541358,0.000000,-1.150794,"[COLO320, GTG7, HUTU80, LOVO, LS180, LS411, MD...",8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000272917.1_6,0.000000,0.000000,0.265758,0.000000,0.000000,0.000000,0.000000,0.000000,0.049285,0.000000,...,0.657373,0.000000,0.000000,0.000000,-0.820840,0.000000,0.000000,0.000000,[MDST8],1
ENSG00000272884.1_6,-0.064524,0.000000,1.096731,0.000000,-0.075413,0.000000,1.210698,0.000000,-0.147358,0.000000,...,0.452481,0.000000,-0.589699,0.000000,0.433788,0.000000,2.206086,0.000000,[LS411],1
ENSG00000273018.6_7,0.000000,0.000000,-1.164833,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[COLO320],1
ENSG00000272977.1_6,-0.061181,0.000000,-0.312941,0.000000,-0.350081,0.000000,0.000000,0.000000,-1.789893,0.000000,...,-0.068745,0.000000,-0.442929,0.000000,0.905885,0.000000,0.000000,0.000000,[HUTU80],1


## Save the matrix as a tab-separated file (can be opened in Excel):

In [18]:
def csv_download_link(df, csv_file_name, delete_prompt=True, sep='\t'):
    """Write ``df`` to ``csv_file_name`` and display a download link to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export.
    csv_file_name : str
        Path of the file to create.
    delete_prompt : bool, optional
        Currently unused; kept so existing calls keep working.
    sep : str, optional
        Field delimiter passed to ``DataFrame.to_csv``. Defaults to a tab,
        so the file is tab-separated whatever its extension says.

    Returns
    -------
    None
    """
    df.to_csv(csv_file_name, sep=sep)
    display(FileLink(csv_file_name))
# Export the filtered matrix and show a link to download it.
csv_download_link(
    df=combined_df_sorted_clean,
    csv_file_name='matrix_TEAD_lncRNA.csv',
)

## Expression of a specific lncRNA

In [1]:
# Row of the filtered matrix for one transcript of interest.
transcript_id = "ENSG00000163597.14_2"
combined_df_sorted_clean.loc[transcript_id]

NameError: name 'combined_df_sorted_clean' is not defined