In [1]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
from os import system
from IPython.display import clear_output
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import phik 
import seaborn as sns
from phik import resources, report
from phik.report import plot_correlation_matrix
from itertools import combinations
import random
from tabulate import tabulate
from IPython.display import display, HTML

# load functions 
# load the datasets, set cutoff values, correlation method (these are done individually so the system saves them globallly)

In [2]:
gtex_gct_dataset = 'gene_tpm_2017-06-05_v8_breast_mammary_tissue.gct_2'
correlation_method ='pearson'
interest_gene='ITGA2'
correlation_cutoff = 0.6
tcga_hiseq_dataset = '/home/hshadman/integrins_expression/saghar_TCGA_Breast/TCGA_BRCA_HiSeqV2' 
tcga_hiseq_corresponding_sample_info = '/home/hshadman/integrins_expression/saghar_TCGA_Breast/TCGA.BRCA.sampleMap_BRCA_clinicalMatrix'
opposing_corr_cutoff = 0.1


In [3]:
#functions for data processing
def gtex_data_processing(gtex_gct_dataset):
    unwanted_integrins = ['ITGB3BP','ITGB1BP1','ITGB5-AS1','ITGA9-AS1','ITGB1P1','ITGB2-AS1', 'ITGB1BP2','ITGB1BP3']
    gtex_breast_data_all=pd.read_csv(gtex_gct_dataset,sep='\t')
    gtex_breast_data_all=gtex_breast_data_all.T.copy()
    gtex_breast_data_all.columns=gtex_breast_data_all.iloc[2]
    df_int=gtex_breast_data_all.loc[(gtex_breast_data_all.index!='id')&(gtex_breast_data_all.index!='Name')&(gtex_breast_data_all.index!='Description')].astype('float').copy()
    #drop unwanted integrins    
    df_int=df_int.drop([undesired_integrin for undesired_integrin in df_int.columns if undesired_integrin in unwanted_integrins],axis=1)
    #drop duplicate gene names (if any) 
    df_int=df_int.loc[:,~df_int.columns.duplicated()]    
    return df_int

def tcga_data_processing(tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info):
    global tcga_brca_df, tcga_brca_sample_info
    unwanted_integrins = ['ITGB3BP','ITGB1BP1','ITGB5-AS1','ITGA9-AS1','ITGB1P1','ITGB2-AS1', 'ITGB1BP2','ITGB1BP3']    
    tcga_brca_df = pd.read_csv(tcga_hiseq_dataset, sep = '\t', index_col=0)
    #drop unwanted integrins    
    tcga_brca_df=tcga_brca_df.loc[~tcga_brca_df.index.isin(unwanted_integrins)]
    #drop duplicate gene names (if any)
    tcga_brca_df=tcga_brca_df.loc[~tcga_brca_df.index.duplicated()]    
    tcga_brca_sample_info = pd.read_csv(tcga_hiseq_corresponding_sample_info,sep="\t",index_col=0)
    #drop metastatic from main sample type data frame
    tcga_brca_sample_info = tcga_brca_sample_info[~(tcga_brca_sample_info.sample_type=='Metastatic')]

    #separate normal and primtum
    tcga_brca_normal_patients = tcga_brca_sample_info[tcga_brca_sample_info.sample_type=='Solid Tissue Normal']
    tcga_brca_primtum_patients = tcga_brca_sample_info[tcga_brca_sample_info.sample_type=='Primary Tumor']

    #drop index of samples not in tcga_brca_df
    tcga_brca_normal_patients=  tcga_brca_normal_patients.drop(non_match for non_match in tcga_brca_normal_patients.index.values if non_match not in tcga_brca_df.columns)
    tcga_brca_primtum_patients =tcga_brca_primtum_patients.drop(non_match for non_match in tcga_brca_primtum_patients.index.values if non_match not in tcga_brca_df.columns) 

    #make sure both normal and tumor patients match, and drop patients not in tcga_brca_df
    matched_patient_index_list=[]
    for patient in tcga_brca_primtum_patients.patient_id.values:
        if patient in tcga_brca_normal_patients.patient_id.values and patient in [patient_id.split('-')[2] for patient_id in tcga_brca_df.columns]:
            matched_patient_index_list.append(tcga_brca_sample_info[tcga_brca_sample_info.patient_id==patient].index.values)
    matched_patient_list=tcga_brca_sample_info.loc[[x for l in matched_patient_index_list for x in l]].copy()


    #separate normal and primary tumor
    normal_matched_patient_list=matched_patient_list[matched_patient_list.sample_type=='Solid Tissue Normal']
    primtum_matched_patient_list = matched_patient_list[matched_patient_list.sample_type=='Primary Tumor']

    #matched normal and primary tumor samples
    tcga_brca_df_normal_matched=tcga_brca_df[tcga_brca_df.columns[tcga_brca_df.columns.isin(normal_matched_patient_list.index.values)]]
    tcga_brca_df_primtum_matched=tcga_brca_df[tcga_brca_df.columns[tcga_brca_df.columns.isin(primtum_matched_patient_list.index.values)]]

    #test to see if normal and primary tumor patient IDs are a match
    for i in [patient_id.split('-')[2] for patient_id in tcga_brca_df_normal_matched.columns]:
        if i not in [patient_id.split('-')[2] for patient_id in tcga_brca_df_primtum_matched.columns]:
            print('trouble')
            
    #TCGA brca normal ALL
    tcga_brca_df_normal_all = tcga_brca_df.T.loc[tcga_brca_normal_patients.index.values].copy()
    # dataframe with all TCGA primary tumor samples (not necessarily matched)
    tcga_brca_df_primtum_all= tcga_brca_df.T.loc[tcga_brca_primtum_patients.index.values].copy()
    
    return {'tcga_brca_df_normal_matched':tcga_brca_df_normal_matched,
           'tcga_brca_df_primtum_matched':tcga_brca_df_primtum_matched,
           'tcga_brca_df_normal_all':tcga_brca_df_normal_all,
           'tcga_brca_df_primtum_all':tcga_brca_df_primtum_all}

def correlation_selected_gene(provided_dataset,interest_gene,correlation_method):
    dataset_corr = provided_dataset.corrwith(provided_dataset[interest_gene],method=correlation_method).drop(interest_gene)
    dataset_corr = dataset_corr.dropna(axis=0,how='any')
    idx = dataset_corr.index.drop_duplicates(keep=False)
    dataset_corr = dataset_corr.loc[idx]
    return dataset_corr
def pairwise_pearson_correlation(provided_dataset):
    #columns have to be variables and rows have to be observations
    pairwise_corr=pd.DataFrame(np.corrcoef(provided_dataset,rowvar=False),
                               columns=provided_dataset.columns,index=provided_dataset.columns)



In [4]:
#functions for counting
def count_genes_3_no_overlap(interest_gene,correlation_method,gtex_dataset,dataset2,dataset3):
#make sure to preprocess dataset
#dictionary only takes unique values, careful about this
    print('no overlap between datasets')
    dict_empty={}
    first_dataset_name = 'gtex_all'
    second_dataset_name= 'tcga_normal_all'
    third_dataset_name= 'tcga_primtum_all'
    names = [first_dataset_name,second_dataset_name,third_dataset_name]
    j=0
    for dataset in [gtex_dataset,dataset2,dataset3]:
        dataset=correlation_selected_gene(dataset,interest_gene,correlation_method)
        dict_empty[names[j]]=dataset.shape[0]
        j+=1
    dict_pd_df = pd.DataFrame(dict_empty,index=[f'corr_with_{interest_gene}'])
    return display(HTML(dict_pd_df.to_html()))    
def count_genes_same_3(interest_gene,correlation_method,gtex_dataset,dataset2,dataset3):
#make sure to preprocess dataset
#dictionary only takes unique values, careful about this
    gtex_dataset=correlation_selected_gene(gtex_dataset,interest_gene,correlation_method)
    dataset2=correlation_selected_gene(dataset2,interest_gene,correlation_method)
    dataset3=correlation_selected_gene(dataset3,interest_gene,correlation_method)
    common_gene_list= gtex_dataset.index.intersection(dataset2.index).intersection(dataset3.index)
    global new_dataset
    new_dataset = pd.concat([gtex_dataset[common_gene_list],
                             dataset2[common_gene_list],
                             dataset3[common_gene_list]],axis=1)
    first_dataset_name = 'gtex_all'
    second_dataset_name= 'tcga_normal_all'
    third_dataset_name= 'tcga_primtum_all'
    new_dataset.columns=[first_dataset_name,second_dataset_name,third_dataset_name]
    print("1st: common genes identified for all 3 datasets\n2nd: they were counted")
    dict_pd_df = pd.DataFrame(data=[[new_dataset.shape[0],new_dataset.shape[0],new_dataset.shape[0]]],
                             index=[f'corr_with_{interest_gene}_all_same_genes'],
                             columns=[first_dataset_name,second_dataset_name,third_dataset_name])
    return display(HTML(dict_pd_df.to_html()))
def count_high_corr_genes_no_overlap(interest_gene,correlation_method,correlation_cutoff,*processed_datasets):
#make sure to preprocess dataset
#dictionary only takes unique values, careful about this
    print('no overlap between datasets')
    dict_empty={}
    j=0
    for dataset in processed_datasets:
        dataset=correlation_selected_gene(dataset,interest_gene,correlation_method)
        dict_empty[input(f'insert name for {j+1}th dataset')]=dataset[dataset>=correlation_cutoff].shape[0]
        
        j+=1
    
    dict_pd_df = pd.DataFrame(dict_empty,index=[f'HIGHLY_correlated_with_{interest_gene}'])
    #return display(HTML(dict_pd_df.to_html()))
    return dict_pd_df
def cutoff_applied_togtex_2_high_both(interest_gene,correlation_method,correlation_cutoff,gtex_dataset,other_dataset):
#make sure to preprocess dataset
#dictionary only takes unique values, careful about this
    gtex_dataset=correlation_selected_gene(gtex_dataset,interest_gene,correlation_method)
    gtex_dataset = gtex_dataset[gtex_dataset>=correlation_cutoff]
    dataset2=correlation_selected_gene(other_dataset,interest_gene,correlation_method)
    common_gene_list= gtex_dataset.index.intersection(dataset2.index)
    global new_dataset    
    new_dataset = pd.concat([gtex_dataset[common_gene_list],dataset2[common_gene_list]],axis=1)
    first_dataset_name = 'gtex_cutoffmet'
    other_dataset_name=input('insert name for other dataset')
    new_dataset.columns=[first_dataset_name,other_dataset_name]
    print("1st: high cutoff applied to gtex dataset\n2nd: common genes identified with other dataset\n3rd: both datasets combined\n4th: High cutoff applied to non-gtex dataset")
    
    new_dataset = new_dataset[new_dataset[other_dataset_name]>=correlation_cutoff]
    dict_pd_df = pd.DataFrame(data=[[new_dataset.shape[0],new_dataset.shape[0]]],
                             index=[f'high_corr_with_{interest_gene}_both_same_genes'],
                             columns=[first_dataset_name,other_dataset_name])
    return dict_pd_df
def cutoff_applied_togtex_2_high_low(interest_gene,correlation_method,correlation_cutoff,opposing_corr_cutoff,gtex_dataset,other_dataset):
#make sure to preprocess dataset
#dictionary only takes unique values, careful about this
    gtex_dataset=correlation_selected_gene(gtex_dataset,interest_gene,correlation_method)
    gtex_dataset = gtex_dataset[gtex_dataset>=correlation_cutoff]
    dataset2=correlation_selected_gene(other_dataset,interest_gene,correlation_method)
    common_gene_list= gtex_dataset.index.intersection(dataset2.index)
    global new_dataset    
    new_dataset = pd.concat([gtex_dataset[common_gene_list],dataset2[common_gene_list]],axis=1)
    first_dataset_name = 'gtex_cutoffmet'
    other_dataset_name=input('insert name for other dataset')
    new_dataset.columns=[first_dataset_name,other_dataset_name]
    print("1st: high cutoff applied to gtex dataset\n2nd: common genes identified with other dataset\n3rd: both datasets combined\n4th: low cutoff applied to non-gtex dataset")
    
    new_dataset = new_dataset[new_dataset[other_dataset_name]<=opposing_corr_cutoff]
    dict_pd_df = pd.DataFrame(data=[[new_dataset.shape[0],new_dataset.shape[0]]],
                             index=[f'corr_with_{interest_gene}_highgtex_low_other'],
                             columns=[first_dataset_name,other_dataset_name])
    return dict_pd_df
def cutoff_applied_totcga_2_high_both(interest_gene,correlation_method,correlation_cutoff,gtex_dataset,other_dataset):
#make sure to preprocess dataset
#dictionary only takes unique values, careful about this
    gtex_dataset=correlation_selected_gene(gtex_dataset,interest_gene,correlation_method)
    dataset2=correlation_selected_gene(other_dataset,interest_gene,correlation_method)
    dataset2 = dataset2[dataset2>=correlation_cutoff]
    common_gene_list= gtex_dataset.index.intersection(dataset2.index)
    global new_dataset    
    new_dataset = pd.concat([gtex_dataset[common_gene_list],dataset2[common_gene_list]],axis=1)
    first_dataset_name = 'gtex_all'
    other_dataset_name='tcga_primtum_all_cutoffmet'
    new_dataset.columns=[first_dataset_name,other_dataset_name]
    print("1st: high cutoff applied to tcga dataset\n2nd: common genes identified with gtex dataset\n3rd: both datasets combined\n4th: High cutoff applied to gtex dataset")

    new_dataset = new_dataset[new_dataset[first_dataset_name]>=correlation_cutoff]
    dict_pd_df = pd.DataFrame(data=[[new_dataset.shape[0],new_dataset.shape[0]]],
                             index=[f'high_corr_with_{interest_gene}_both_same_genes'],
                             columns=[first_dataset_name,other_dataset_name])
    return display(HTML(dict_pd_df.to_html()))
def cutoff_applied_totcga_2_high_low(interest_gene,correlation_method,correlation_cutoff,opposing_corr_cutoff,gtex_dataset,other_dataset):
#make sure to preprocess dataset
#dictionary only takes unique values, careful about this
    gtex_dataset=correlation_selected_gene(gtex_dataset,interest_gene,correlation_method)
    dataset2=correlation_selected_gene(other_dataset,interest_gene,correlation_method)
    dataset2 = dataset2[dataset2>=correlation_cutoff]
    common_gene_list= gtex_dataset.index.intersection(dataset2.index)
    global new_dataset    
    new_dataset = pd.concat([gtex_dataset[common_gene_list],dataset2[common_gene_list]],axis=1)
    first_dataset_name = 'gtex_all'
    other_dataset_name='tcga_primtum_all_cutoffmet'
    new_dataset.columns=[first_dataset_name,other_dataset_name]
    print("1st: high cutoff applied to tcga dataset\n2nd: common genes identified with gtex dataset\n3rd: both datasets combined\n4th: low cutoff applied to gtex dataset")

    new_dataset = new_dataset[new_dataset[first_dataset_name]<=opposing_corr_cutoff]
    dict_pd_df = pd.DataFrame(data=[[new_dataset.shape[0],new_dataset.shape[0]]],
                             index=[f'corr_with_{interest_gene}_both_same_genes'],
                             columns=[first_dataset_name,other_dataset_name])
    return display(HTML(dict_pd_df.to_html()))

def cutoff_applied_togtex_3_high_all(interest_gene,correlation_method,correlation_cutoff,gtex_dataset,other_dataset,
                                    dataset3):
#dataset3 MUST BE tcga primtum all
#make sure to preprocess dataset
#dictionary only takes unique values, careful about this
    gtex_dataset=correlation_selected_gene(gtex_dataset,interest_gene,correlation_method)
    gtex_dataset = gtex_dataset[gtex_dataset>=correlation_cutoff]
    dataset2=correlation_selected_gene(other_dataset,interest_gene,correlation_method)
    dataset3=correlation_selected_gene(dataset3,interest_gene,correlation_method)
    common_gene_list= (gtex_dataset.index.intersection(dataset2.index)).intersection(dataset3.index)
    global new_dataset    
    new_dataset = pd.concat([gtex_dataset[common_gene_list],
                             dataset2[common_gene_list],
                            dataset3[common_gene_list]],axis=1)
    first_dataset_name = 'gtex_cutoffmet'
    other_dataset_name='tcga_normal_all'
    dataset3_name = 'tcga_primtum_all'
    new_dataset.columns=[first_dataset_name,other_dataset_name,dataset3_name]
    print("1st: high cutoff applied to gtex dataset\n2nd: common genes identified with other datasets\n3rd: all datasets combined\n4th: High cutoff applied to non-gtex datasets")
    
    new_dataset = new_dataset[(new_dataset[other_dataset_name]>=correlation_cutoff) & (new_dataset[dataset3_name]>=correlation_cutoff)]
    dict_pd_df = pd.DataFrame(data=[[new_dataset.shape[0],new_dataset.shape[0],new_dataset.shape[0]]],
                             index=[f'high_corr_with_{interest_gene}_all_same_genes'],
                             columns=[first_dataset_name,other_dataset_name,dataset3_name])
    return dict_pd_df



In [5]:
#organize as a table (original way -- before Jesse finalized the presentation but i used this for table 2)
# careful of the #ered data (e.g. #3 is #3 in both functions but presentation might be different)
def make_data_presentable(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                         tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info):
    df_int=gtex_data_processing(gtex_gct_dataset)
    tcga_data=tcga_data_processing(tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)
    tcga_brca_df_normal_all=tcga_data['tcga_brca_df_normal_all']
    tcga_brca_df_primtum_all=tcga_data['tcga_brca_df_primtum_all']
    #1
    count_genes_3_no_overlap(interest_gene,correlation_method,df_int,tcga_brca_df_normal_all,tcga_brca_df_primtum_all)
    #2
    count_genes_same_3(interest_gene,correlation_method,df_int,tcga_brca_df_normal_all,tcga_brca_df_primtum_all)
    #3
    print('for 1th dataset ENTER: gtex_all')
    print('for 2th dataset ENTER: tcga_normal_all')
    count_high_corr_genes_no_overlap(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all)
    #4
    #print('for 1th dataset ENTER: gtex_all')
    #print('for 2th dataset ENTER: tcga_normal_all')    
    #4 is not very meaningful
    #count_high_corr_genes_with_overlap(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all)
    #5
    print('for dataset ENTER: tcga_normal_all')    
    cutoff_applied_togtex_2_high_both(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all)
    #6
    print('for dataset ENTER: tcga_normal_all')        
    cutoff_applied_togtex_2_high_low(interest_gene,correlation_method,correlation_cutoff,opposing_corr_cutoff,df_int,tcga_brca_df_normal_all)
    #7
    print('for dataset ENTER: tcga_primtum_all')            
    cutoff_applied_togtex_2_high_both(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_primtum_all)
    #8
    print('for dataset ENTER: tcga_primtum_all')                
    cutoff_applied_togtex_2_high_low(interest_gene,correlation_method,correlation_cutoff,opposing_corr_cutoff,df_int,tcga_brca_df_primtum_all)
    #9
    cutoff_applied_totcga_2_high_both(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_primtum_all)
    #10
    cutoff_applied_totcga_2_high_low(interest_gene,correlation_method,correlation_cutoff,opposing_corr_cutoff,df_int,tcga_brca_df_primtum_all)
    return

In [6]:
#table 1
def organize_gtex_tcga_table1(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                         tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info):
    df_int=gtex_data_processing(gtex_gct_dataset)
    tcga_data=tcga_data_processing(tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)
    tcga_brca_df_normal_all=tcga_data['tcga_brca_df_normal_all']
    tcga_brca_df_primtum_all=tcga_data['tcga_brca_df_primtum_all']
    #3
    print('for 1th dataset ENTER: gtex_all')
    print('for 2th dataset ENTER: tcga_normal_all')
    number_3=count_high_corr_genes_no_overlap(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all)
    #4
    #print('for 1th dataset ENTER: gtex_all')
    #print('for 2th dataset ENTER: tcga_normal_all')    
    #4 is not very meaningful
    #count_high_corr_genes_with_overlap(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all)
    #5
    print('for dataset ENTER: tcga_normal_all')    
    number_5=cutoff_applied_togtex_2_high_both(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all)
    #7
    print('for dataset ENTER: tcga_primtum_all')            
    nnumber_7=cutoff_applied_togtex_2_high_both(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_primtum_all)
    #10
    number_10=cutoff_applied_togtex_3_high_all(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all,
                                    tcga_brca_df_primtum_all)
    first_column_name='Integrin' #index
    second_column_name='GTEX_highly_correlated'
    third_column_name='%also_highly_correlated_in_TCGA_normal'
    fourth_column_name='%also_highly_correlated_in_TCGA_primtum'
    fifth_column_name='%also_highly_correlated_in_all_three'
    #ignoring first column name. setting index as integrin name (first column name)
    dict_pd_df = pd.DataFrame(data=[[number_3.iloc[0,0],
                                     str(format((number_5.iloc[0,0]/number_3.iloc[0,0])*100,'2f'))+f'({str(number_5.iloc[0,0])})',
                                     str(format((nnumber_7.iloc[0,0]/number_3.iloc[0,0])*100,'2f'))+f'({str(nnumber_7.iloc[0,0])})',
                                     str(format((number_10.iloc[0,0]/number_3.iloc[0,0])*100,'2f'))+f'({str(number_10.iloc[0,0])})',
                                    ]],
                              columns=[second_column_name,third_column_name,
                                      fourth_column_name,fifth_column_name],
                              index=[interest_gene])
    return display(HTML(dict_pd_df.to_html()))

def organize_gtex_tcga_table2(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                         tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info):
    df_int=gtex_data_processing(gtex_gct_dataset)
    tcga_data=tcga_data_processing(tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)
    tcga_brca_df_normal_all=tcga_data['tcga_brca_df_normal_all']
    tcga_brca_df_primtum_all=tcga_data['tcga_brca_df_primtum_all']
    #3
    print('for 1th dataset ENTER: gtex_all')
    print('for 2th dataset ENTER: tcga_normal_all')
    number_3=count_high_corr_genes_no_overlap(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all)
    #4
    #print('for 1th dataset ENTER: gtex_all')
    #print('for 2th dataset ENTER: tcga_normal_all')    
    #4 is not very meaningful
    #count_high_corr_genes_with_overlap(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all)
    #6
    print('for dataset ENTER: tcga_normal_all')        
    number_6=cutoff_applied_togtex_2_high_low(interest_gene,correlation_method,correlation_cutoff,opposing_corr_cutoff,df_int,tcga_brca_df_normal_all)
    #8
    print('for dataset ENTER: tcga_primtum_all')                
    number_8=cutoff_applied_togtex_2_high_low(interest_gene,correlation_method,correlation_cutoff,opposing_corr_cutoff,df_int,tcga_brca_df_primtum_all)
    first_column_name='Integrin' #index
    second_column_name='GTEX_highly_correlated'
    third_column_name='%lowly_correlated_in_TCGA_normal'
    fourth_column_name='%lowly_correlated_in_TCGA_primtum'
    #fifth_column_name='%also_highly_correlated_in_all_three'
    #ignoring first column name. setting index as integrin name (first column name)
    dict_pd_df = pd.DataFrame(data=[[number_3.iloc[0,0],
                                     str(format((number_6.iloc[0,0]/number_3.iloc[0,0])*100,'2f'))+f'({str(number_6.iloc[0,0])})',
                                     str(format((number_8.iloc[0,0]/number_3.iloc[0,0])*100,'2f'))+f'({str(number_8.iloc[0,0])})'
                                    ]],
                              columns=[second_column_name,third_column_name,
                                      fourth_column_name],
                              index=[interest_gene])
    return display(HTML(dict_pd_df.to_html()))

#table 3
def organize_no_overlap_table3(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                         tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info):
    df_int=gtex_data_processing(gtex_gct_dataset)
    tcga_data=tcga_data_processing(tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)
    tcga_brca_df_normal_all=tcga_data['tcga_brca_df_normal_all']
    tcga_brca_df_primtum_all=tcga_data['tcga_brca_df_primtum_all']
    
    gtex_expression_mean = df_int[interest_gene].mean()
    gtex_expression_stdev= df_int[interest_gene].std()
    
    #3
    print('for 1th dataset ENTER: gtex_all')
    print('for 2th dataset ENTER: tcga_normal_all')
    number_3_tcga_normal=count_high_corr_genes_no_overlap(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_normal_all)
    #3
    print('for 1th dataset ENTER: gtex_all')
    print('for 2th dataset ENTER: tcga_primtum_all')
    number_3_tcga_primtum=count_high_corr_genes_no_overlap(interest_gene,correlation_method,correlation_cutoff,df_int,tcga_brca_df_primtum_all)
    

    first_column_name='Integrin' #index
    second_column_name='GTEX_expression_mean_&_stdev'
    third_column_name='GTEX_highly_correlated'
    fourth_column_name='highly_correlated_in_TCGA_normal'
    fifth_column_name='highly_corelated_in_TCGA_primtum'
    #ignoring first column name. setting index as integrin name (first column name)
    dict_pd_df = pd.DataFrame(data=[[f'{format(gtex_expression_mean,"0.2f")} + {format(gtex_expression_stdev,"0.2f")}',
                                     str(number_3_tcga_normal.iloc[0,0]),
                                     str(number_3_tcga_normal.iloc[0,1]),
                                     str(number_3_tcga_primtum.iloc[0,1])]],
                              columns=[second_column_name,third_column_name,
                                      fourth_column_name,fifth_column_name],
                              index=[interest_gene])
    return display(HTML(dict_pd_df.to_html()))
    
    
    


In [7]:
#test
df_int=gtex_data_processing(gtex_gct_dataset)
tcga_data=tcga_data_processing(tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)
tcga_brca_df_normal_all=tcga_data['tcga_brca_df_normal_all']
tcga_brca_df_primtum_all=tcga_data['tcga_brca_df_primtum_all']


In [8]:
#test
test_df_int=correlation_selected_gene(df_int,interest_gene,correlation_method)
test_tcga_brca_df_normal_all=correlation_selected_gene(tcga_brca_df_normal_all,interest_gene,correlation_method)
test_tcga_brca_df_primtum_all=correlation_selected_gene(tcga_brca_df_primtum_all,interest_gene,correlation_method)

In [9]:
test_df_int[test_df_int>=correlation_cutoff]

Description
PLEKHN1          0.737729
AGRN             0.832636
RP11-465B22.3    0.614809
C1orf159         0.626508
INTS11           0.667310
                   ...   
MTM1             0.617390
MTMR1            0.785413
ZNF185           0.803545
BRCC3            0.616336
RAB39B           0.667438
Length: 4047, dtype: float64

In [None]:
#dataset_corr = dataset_corr.dropna(axis=0,how='any')
    #idx = dataset_corr.index.drop_duplicates(keep=False)
    #dataset_corr = dataset_corr.loc[idx]

# calculate pairwise correlation PEARSON


In [10]:
#df_int is gtex dataframe. CAUTION multiple instances of dataframe variable in below codes per line
gtex_pairwise_pearson_corr = pd.DataFrame(np.corrcoef(df_int,rowvar=False),
                                          columns=df_int.columns,index=df_int.columns)
#tcga_brca_df_normal_all is tcga normal dataframe
tcga_normal_all_pearson_corr = pd.DataFrame(np.corrcoef(tcga_brca_df_normal_all,rowvar=False),
                                          columns=tcga_brca_df_normal_all.columns,index=tcga_brca_df_normal_all.columns)
#tcga_brca_df_primtum_all is tcga tumor dataframe

tcga_primtum_all_pearson_corr = pd.DataFrame(np.corrcoef(tcga_brca_df_primtum_all,rowvar=False),
                                          columns=tcga_brca_df_primtum_all.columns,index=tcga_brca_df_primtum_all.columns)




MemoryError: Unable to allocate 22.2 GiB for an array with shape (54585, 54585) and data type float64

In [10]:
#df_int is gtex dataframe. CAUTION multiple instances of dataframe variable in below codes per line
gtex_pairwise_pearson_corr = np.corrcoef(df_int,rowvar=False)
                                          
#tcga_brca_df_normal_all is tcga normal dataframe
tcga_normal_all_pearson_corr = np.corrcoef(tcga_brca_df_normal_all,rowvar=False)
#tcga_brca_df_primtum_all is tcga tumor dataframe

tcga_primtum_all_pearson_corr = np.corrcoef(tcga_brca_df_primtum_all,rowvar=False)
                                          




  c /= stddev[:, None]
  c /= stddev[None, :]


In [36]:
list_gtex_corr=[]
list_gtex_oppcorr=[]
list_tcga_normal_all_corr=[]
list_tcga_normal_all_oppcorr=[]
list_tcga_primtum_all_corr=[]
list_tcga_primtum_all_oppcorr=[]


for col in np.arange(0,gtex_pairwise_pearson_corr.shape[1]):
    list_gtex_corr.append((gtex_pairwise_pearson_corr[:,col]>=correlation_cutoff).sum())
    list_gtex_oppcorr.append((gtex_pairwise_pearson_corr[:,col]<=opposing_corr_cutoff).sum())


for col in np.arange(0,tcga_normal_all_pearson_corr.shape[1]):
    list_tcga_normal_all_corr.append((tcga_normal_all_pearson_corr[:,col]>=correlation_cutoff).sum())
    list_tcga_normal_all_oppcorr.append((tcga_normal_all_pearson_corr[:,col]<=opposing_corr_cutoff).sum())
    
for col in np.arange(0,tcga_primtum_all_pearson_corr.shape[1]):
    list_tcga_primtum_all_corr.append((tcga_primtum_all_pearson_corr[:,col]>=correlation_cutoff).sum())
    list_tcga_primtum_all_oppcorr.append((tcga_primtum_all_pearson_corr[:,col]<=opposing_corr_cutoff).sum())


In [40]:
chunk_size=1000

temp_gtex_pairwise_pearson_corr
gtex_pairwise_pearson_corr[gtex_pairwise_pearson_corr[gtex_pairwise_pearson_corr.columns]>=correlation_cutoff].count().max()

MemoryError: Unable to allocate 2.77 GiB for an array with shape (54585, 54585) and data type bool

In [111]:

def chunkify(df: pd.DataFrame, chunk_size: int):
    global chunk_list
    chunk_list=[] 
    start = 0
    length = df.shape[0]

    # If DF is smaller than the chunk, return the DF
    if length <= chunk_size:
        chunk_list.append(df[:])
        yield df[:]
        return

    # Yield individual chunks
    while start + chunk_size <= length:
        chunk_list.append(df[start:chunk_size + start])
        yield df[start:chunk_size + start]
        start = start + chunk_size

    # Yield the remainder chunk, if needed
    if start < length:
        chunk_list.append(df[start:])
        yield df[start:]
    return chunk_list



<generator object chunkify at 0x2ab975d0a660>

In [None]:
#df is gtex data
df=gtex_pairwise_pearson_corr
chunkify(df,1000)
max_list_gtex_corr=[]
min_list_gtex_corr=[]
max_list_gtex_oppcorr=[]
min_list_gtex_oppcorr=[]
for chunk in chunk_list:
    max_list_gtex_corr.append(chunk[chunk[chunk.index]>=correlation_cutoff].count().max())
    min_list_gtex_corr.append(chunk[chunk[chunk.index]>=correlation_cutoff].count().min())
    max_list_gtex_oppcorr.append(chunk[chunk[chunk.index]<=opposing_corr_cutoff].count().max())
    min_list_gtex_oppcorr.append(chunk[chunk[chunk.index]<=opposing_corr_cutoff].count().min())    
del df

In [None]:
#df is tcga_normal_all data
df=tcga_normal_all_pearson_corr
chunkify(df,1000)
max_list_tcga_normal_all_corr=[]
min_list_tcga_normal_all_corr=[]
max_list_tcga_normal_all_oppcorr=[]
min_list_tcga_normal_all_oppcorr=[]

for chunk in chunk_list:
    max_list_tcga_normal_all_corr.append(chunk[chunk[chunk.index]>=correlation_cutoff].count().max())
    min_list_tcga_normal_all_corr.append(chunk[chunk[chunk.index]>=correlation_cutoff].count().min())
    max_list_tcga_normal_all_oppcorr.append(chunk[chunk[chunk.index]<=opposing_corr_cutoff].count().max())
    min_list_tcga_normal_all_oppcorr.append(chunk[chunk[chunk.index]<=opposing_corr_cutoff].count().min())    
del df

In [None]:
#df is tcga_primtum_all data
df=tcga_primtum_all_pearson_corr
chunkify(df,1000)
max_list_tcga_primtum_all_corr=[]
min_list_tcga_primtum_all_corr=[]
max_list_tcga_primtum_all_oppcorr=[]
min_list_tcga_primtum_all_oppcorr=[]
for chunk in chunk_list:
    max_list_tcga_primtum_all_corr.append(chunk[chunk[chunk.index]>=correlation_cutoff].count().max())
    min_list_tcga_primtum_all_corr.append(chunk[chunk[chunk.index]>=correlation_cutoff].count().min())
    max_list_tcga_primtum_all_oppcorr.append(chunk[chunk[chunk.index]<=opposing_corr_cutoff].count().max())
    min_list_tcga_primtum_all_oppcorr.append(chunk[chunk[chunk.index]<=opposing_corr_cutoff].count().min())    
del df

# formally do all integrins, tables 1, 2 and 3 using the functions

In [7]:
#table 1
integrins_list=['ITGA7','ITGB8','ITGB6','ITGA10','ITGA6','ITGA4','ITGAV','ITGA9','ITGB5','ITGA1','ITGA2','ITGA8',
'ITGB1','ITGB7','ITGA5','ITGBL1','ITGA11','ITGAL','ITGAM','ITGAX','ITGAD','ITGAE','ITGA2B','ITGB3','ITGA3',
                'ITGB4','ITGB2']

gtex_gct_dataset = 'gene_tpm_2017-06-05_v8_breast_mammary_tissue.gct_2'
correlation_method ='pearson'
correlation_cutoff = 0.6
tcga_hiseq_dataset = '/home/hshadman/integrins_expression/saghar_TCGA_Breast/TCGA_BRCA_HiSeqV2' 
tcga_hiseq_corresponding_sample_info = '/home/hshadman/integrins_expression/saghar_TCGA_Breast/TCGA.BRCA.sampleMap_BRCA_clinicalMatrix'
opposing_corr_cutoff = 0.1

# make_data_presentable(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
#                          tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)

for interest_gene in integrins_list:
    organize_gtex_tcga_table1(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                             tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)



for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA7,317,77.287066(245),7.886435(25),7.886435(25)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGB8,2784,40.696839(1133),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGB6,1612,40.198511(648),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA10,378,38.095238(144),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA6,43,16.279070(7),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA4,115,37.391304(43),20.000000(23),17.391304(20)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGAV,161,73.291925(118),1.863354(3),1.863354(3)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA9,3,33.333333(1),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGB5,22,68.181818(15),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA1,67,71.641791(48),34.328358(23),34.328358(23)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA2,4047,37.385718(1513),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA8,31,6.451613(2),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGB1,70,91.428571(64),2.857143(2),2.857143(2)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGB7,83,33.734940(28),48.192771(40),31.325301(26)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA5,59,18.644068(11),3.389831(2),1.694915(1)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGBL1,7,14.285714(1),57.142857(4),14.285714(1)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA11,27,18.518519(5),14.814815(4),3.703704(1)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGAL,101,77.227723(78),75.247525(76),66.336634(67)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGAM,314,43.630573(137),24.203822(76),21.337580(67)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGAX,257,40.077821(103),36.964981(95),30.350195(78)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGAD,4,0.000000(0),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGAE,24,8.333333(2),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA2B,5,0.000000(0),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGB3,57,10.526316(6),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGA3,1430,48.881119(699),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGB4,1617,47.619048(770),0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGB2,293,45.733788(134),40.614334(119),32.764505(96)


In [11]:
#table 2
integrins_list=['ITGA7','ITGB8','ITGB6','ITGA10','ITGA6','ITGA4','ITGAV','ITGA9','ITGB5','ITGA1','ITGA2','ITGA8',
'ITGB1','ITGB7','ITGA5','ITGBL1','ITGA11','ITGAL','ITGAM','ITGAX','ITGAD','ITGAE','ITGA2B','ITGB3','ITGA3',
                'ITGB4','ITGB2']

gtex_gct_dataset = 'gene_tpm_2017-06-05_v8_breast_mammary_tissue.gct_2'
correlation_method ='pearson'
correlation_cutoff = 0.6
tcga_hiseq_dataset = '/home/hshadman/integrins_expression/saghar_TCGA_Breast/TCGA_BRCA_HiSeqV2' 
tcga_hiseq_corresponding_sample_info = '/home/hshadman/integrins_expression/saghar_TCGA_Breast/TCGA.BRCA.sampleMap_BRCA_clinicalMatrix'
opposing_corr_cutoff = 0.1

# make_data_presentable(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
#                          tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)

for interest_gene in integrins_list:
    organize_gtex_tcga_table2(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                         tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)



for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA7,317,0.000000(0),21.766562(69)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGB8,2784,3.268678(91),39.152299(1090)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGB6,1612,1.116625(18),44.044665(710)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA10,378,0.793651(3),40.476190(153)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA6,43,16.279070(7),11.627907(5)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA4,115,0.869565(1),0.869565(1)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGAV,161,0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA9,3,0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGB5,22,0.000000(0),22.727273(5)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA1,67,0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA2,4047,3.187546(129),35.211268(1425)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA8,31,32.258065(10),32.258065(10)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGB1,70,0.000000(0),7.142857(5)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGB7,83,1.204819(1),7.228916(6)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA5,59,11.864407(7),3.389831(2)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGBL1,7,0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA11,27,0.000000(0),11.111111(3)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGAL,101,0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGAM,314,3.184713(10),7.324841(23)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGAX,257,1.556420(4),8.560311(22)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGAD,4,0.000000(0),0.000000(0)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGAE,24,0.000000(0),4.166667(1)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA2B,5,0.000000(0),60.000000(3)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGB3,57,14.035088(8),14.035088(8)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA3,1430,4.195804(60),53.356643(763)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGB4,1617,3.710575(60),49.350649(798)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGB2,293,3.071672(9),5.119454(15)


In [13]:
#table 3
integrins_list=['ITGA7','ITGB8','ITGB6','ITGA10','ITGA6','ITGA4','ITGAV','ITGA9','ITGB5','ITGA1','ITGA2','ITGA8',
'ITGB1','ITGB7','ITGA5','ITGBL1','ITGA11','ITGAL','ITGAM','ITGAX','ITGAD','ITGAE','ITGA2B','ITGB3','ITGA3',
                'ITGB4','ITGB2']

gtex_gct_dataset = 'gene_tpm_2017-06-05_v8_breast_mammary_tissue.gct_2'
correlation_method ='pearson'
correlation_cutoff = 0.6
tcga_hiseq_dataset = '/home/hshadman/integrins_expression/saghar_TCGA_Breast/TCGA_BRCA_HiSeqV2' 
tcga_hiseq_corresponding_sample_info = '/home/hshadman/integrins_expression/saghar_TCGA_Breast/TCGA.BRCA.sampleMap_BRCA_clinicalMatrix'
opposing_corr_cutoff = 0.1

# make_data_presentable(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
#                          tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)

for interest_gene in integrins_list:
    organize_no_overlap_table3(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                         tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)



for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA7,152.55 + 102.06,317,1467,118


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGB8,8.35 + 8.82,2784,2772,0


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGB6,8.01 + 10.91,1612,1905,0


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA10,18.64 + 21.48,378,2255,0


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA6,74.20 + 27.93,43,945,0


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA4,2.54 + 1.81,115,415,186


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGAV,33.55 + 10.38,161,942,81


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA9,10.79 + 4.00,3,666,9


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGB5,105.48 + 37.79,22,527,3


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA1,25.24 + 11.05,67,1192,176


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA2,6.58 + 6.18,4047,2959,0


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA8,5.36 + 10.32,31,233,93


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGB1,165.20 + 60.53,70,1363,58


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGB7,1.77 + 1.30,83,123,369


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA5,101.97 + 69.01,59,771,122


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGBL1,12.98 + 10.20,7,1,190


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA11,4.76 + 4.56,27,12,187


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGAL,4.35 + 3.46,101,284,279


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGAM,6.86 + 8.57,314,497,124


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGAX,8.94 + 23.17,257,196,342


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGAD,0.19 + 0.25,4,16,9


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGAE,8.23 + 2.10,24,140,0


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA2B,1.35 + 0.72,5,0,0


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGB3,9.45 + 7.85,57,550,33


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGA3,42.66 + 23.39,1430,2738,0


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGB4,79.55 + 62.63,1617,2731,0


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGB2,22.76 + 42.35,293,350,449


# scratch work

In [77]:
organize_gtex_tcga_table1(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                         tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)

for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: High cutoff applied to non-gtex dataset
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other datasets
3rd: all datasets combined
4th: High cutoff applied to non-gtex datasets


Unnamed: 0,GTEX_highly_correlated,%also_highly_correlated_in_TCGA_normal,%also_highly_correlated_in_TCGA_primtum,%also_highly_correlated_in_all_three
ITGAX,257,40.077821(103),36.964981(95),30.350195(78)


In [32]:
organize_gtex_tcga_table2(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                         tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)

for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for dataset ENTER: tcga_normal_all
insert name for other datasettcga_normal_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset
for dataset ENTER: tcga_primtum_all
insert name for other datasettcga_primtum_all
1st: high cutoff applied to gtex dataset
2nd: common genes identified with other dataset
3rd: both datasets combined
4th: low cutoff applied to non-gtex dataset


Unnamed: 0,GTEX_highly_correlated,%lowly_correlated_in_TCGA_normal,%lowly_correlated_in_TCGA_primtum
ITGA7,317,0.000000(0),21.766562(69)


In [78]:
organize_no_overlap_table3(interest_gene,correlation_method, correlation_cutoff,opposing_corr_cutoff,gtex_gct_dataset,
                         tcga_hiseq_dataset,tcga_hiseq_corresponding_sample_info)


for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_normal_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_normal_all
for 1th dataset ENTER: gtex_all
for 2th dataset ENTER: tcga_primtum_all
no overlap between datasets
insert name for 1th datasetgtex_all
insert name for 2th datasettcga_primtum_all


Unnamed: 0,GTEX_expression_mean_&_stdev,GTEX_highly_correlated,highly_correlated_in_TCGA_normal,highly_corelated_in_TCGA_primtum
ITGAX,8.94 + 23.17,257,196,342


In [75]:
gtex_test[gtex_test.columns[gtex_test.columns.str.contains("ITG")]]

Description,ITGA10,ITGB6,ITGA6,ITGA4,ITGAV,ITGA9,ITGB5,ITGA1,ITGA2,ITGB8,...,ITGAL,ITGAM,ITGAX,ITGAD,ITGAE,ITGA2B,ITGB3,ITGA3,ITGB4,ITGB2
GTEX-1117F-2826-SM-5GZXL,26.660,4.0630,64.53,1.0940,30.85,9.045,161.50,24.31,4.8800,6.5180,...,2.856,2.9380,6.4620,0.0619,6.577,2.0960,17.700,44.29,48.63,5.644
GTEX-111YS-1926-SM-5GICC,40.580,8.6670,80.77,2.3370,41.52,11.730,84.21,26.15,8.6830,12.4000,...,3.977,1.6610,3.3690,0.0219,5.686,0.8640,6.213,38.93,77.83,7.989
GTEX-1122O-1226-SM-5H113,25.150,4.4230,62.86,4.9140,35.61,11.150,90.79,22.51,4.1400,8.1060,...,2.816,10.6300,3.1200,0.3853,8.283,0.8111,9.594,33.19,96.58,27.590
GTEX-117XS-1926-SM-5GICO,3.131,0.0799,83.15,2.3110,30.58,6.013,118.90,39.59,2.2420,0.8251,...,6.755,9.2130,17.1400,0.3599,10.170,1.8370,6.471,28.82,70.39,48.720
GTEX-117YX-1426-SM-5H12H,4.192,0.0649,28.96,1.3680,24.32,8.683,68.26,10.97,0.6376,1.3710,...,1.849,15.4200,2.1920,0.0731,5.879,0.6185,6.386,10.60,19.44,43.780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZYFC-0826-SM-5E44K,2.070,0.5010,100.90,1.9830,29.83,4.940,118.00,54.48,1.1100,0.8023,...,5.253,6.1100,7.0290,0.2819,9.798,2.0110,10.490,30.41,31.50,11.210
GTEX-ZYT6-0126-SM-5E45J,3.779,0.1647,98.94,1.4730,21.16,10.800,96.98,31.49,1.6630,0.6112,...,5.823,4.1250,2.5420,0.1390,9.708,2.2600,6.521,21.96,40.66,8.177
GTEX-ZYW4-0826-SM-5GIDG,8.663,0.2067,22.88,1.0020,26.08,6.079,122.30,21.21,0.8370,2.5160,...,2.145,4.7770,3.7860,0.1476,5.169,0.9999,13.310,27.05,33.45,11.200
GTEX-ZZ64-1226-SM-5E43R,3.342,0.0537,31.63,0.2916,26.81,15.360,98.44,24.32,0.4011,0.2316,...,0.262,0.9996,0.8714,0.0000,5.824,1.7490,2.966,19.83,16.93,3.777
