In [1]:
import pandas as pd
import numpy as np

In [2]:
# Pin NumPy's global random state so this part of the notebook is reproducible.
# Seed source: https://www.random.org/ (2023-08-09)
SEED = 3866
np.random.seed(SEED)

In [3]:
# Load the tab-separated TCGA-LUAD-MinMaxNorm table. The first column becomes
# the index, which later cells compare against gene identifiers.
minmax = pd.read_csv(
    "../../../data/expression/processed/TCGA-LUAD-MinMaxNorm-2023-08-28.tsv",
    sep='\t',
    index_col=0,
)

In [11]:
# Number of top-ranked ratios kept per ratio cluster. Every kept ratio adds one
# "up" row (its numerator) and one "down" row (its denominator), so a full
# cluster yields 2 * TOP_N_RATIOS rows (the 100s in the printed counts).
# TODO: this cutoff is still being explored.
TOP_N_RATIOS = 50

# Identifiers present in the expression table. Hoisted out of the loop because
# it does not change between clusters.
minmax_genes = set(minmax.index.values)

for cluster_name in ['Cluster-0', 'Cluster-1', 'Cluster-2']:

    # Headerless table: columns 0 and 1 are the two genes of a ratio and column 2
    # is the score ranked below (presumably a tau statistic given the ".taux"
    # extension -- TODO confirm). Lines starting with '#' are skipped.
    ratios = pd.read_csv(f"../../../data/expression/processed/response-subtype-train/V4/IO-Atlas-NSCLC-NSCLC-Response-{cluster_name}-TPM-MinMaxNorm-TRAIN-2023-08-10-V4-R2.taux",
                         sep='\t',
                         header=None,
                         comment='#')

    # Same "<numerator>:::<denominator>" key that indexes the ratio-cluster table.
    ratios["ratio"] = ratios[0] + ":::" + ratios[1]

    ratio_clusters = pd.read_csv(f"../../../data/ratios/IO-Atlas-NSCLC-NSCLC-Response-{cluster_name}-TPM-MinMaxNorm-TRAIN-2023-08-10-V4-ratios.tsv",
                                 sep='\t',
                                 index_col=0)

    # Split every index label once and assign whole columns, instead of writing
    # cell by cell through .loc inside iterrows(). This also creates the columns
    # when the table is empty (the row-wise version never created them and the
    # lookups below then raised KeyError). A label that does not split into
    # exactly two parts still raises ValueError on unpacking.
    split_labels = [label.split(":::") for label in ratio_clusters.index]
    ratio_clusters["numerator"] = [n for n, _ in split_labels]
    ratio_clusters["denominator"] = [d for _, d in split_labels]

    # Keep only ratios whose numerator AND denominator are both present in the
    # expression table.
    numerator_genes = set(ratio_clusters["numerator"].values) & minmax_genes
    denominator_genes = set(ratio_clusters["denominator"].values) & minmax_genes
    gene_universe = list(numerator_genes | denominator_genes)

    ratio_clusters = ratio_clusters[ratio_clusters["numerator"].isin(gene_universe) & ratio_clusters["denominator"].isin(gene_universe)]

    cluster_gs = []

    for cluster, rows in ratio_clusters.groupby("cluster"):

        # Rank this cluster's ratios by |column 2| (ascending) and keep the last
        # TOP_N_RATIOS, i.e. the largest absolute scores.
        in_cluster = ratios["ratio"].isin(rows.index.values)
        top_labels = ratios.loc[in_cluster, 2].abs().sort_values().tail(TOP_N_RATIOS).index.values
        top_ratios = ratios.reindex(top_labels)

        # All numerators first ("up"), then all denominators ("down"). An empty
        # selection simply adds nothing.
        # NOTE(review): a gene that occurs in several kept ratios is listed once
        # per occurrence; duplicates are not removed here.
        gene_pairs = [label.split(":::") for label in top_ratios["ratio"].values]
        cluster_gs.extend((cluster, "up", n) for n, _ in gene_pairs)
        cluster_gs.extend((cluster, "down", d) for _, d in gene_pairs)

    cluster_gs = pd.DataFrame(cluster_gs, columns=["gs", "direction", "gene"])
    # Row counts per gene set, as a quick check of how many genes were kept.
    print(cluster_gs.groupby("gs").count())
    cluster_gs.to_csv(f"../../../data/gs/TCGA-LUAD-tuned-IO-Atlas-NSCLC-NSCLC-Response-{cluster_name}-TPM-MinMaxNorm-TRAIN-2023-08-10-V4-gene-sets.tsv",
                      sep='\t')

                 direction  gene
gs                              
ratio-cluster-0         38    38
ratio-cluster-1        100   100
                 direction  gene
gs                              
ratio-cluster-0        100   100
ratio-cluster-1         36    36
ratio-cluster-2        100   100
                 direction  gene
gs                              
ratio-cluster-0        100   100
ratio-cluster-1         22    22


In [5]:
# Inspection: rows whose numerator gene is in gene_universe. Uses the
# ratio_clusters / gene_universe values left behind by the gene-set loop cell.
numerator_in_universe = ratio_clusters["numerator"].isin(gene_universe)
ratio_clusters.loc[numerator_in_universe]

Unnamed: 0,cluster,numerator,denominator
ENSG00000028116:::ENSG00000004864,ratio-cluster-0,ENSG00000028116,ENSG00000004864
ENSG00000064012:::ENSG00000032444,ratio-cluster-0,ENSG00000064012,ENSG00000032444
ENSG00000089012:::ENSG00000005075,ratio-cluster-0,ENSG00000089012,ENSG00000005075
ENSG00000089012:::ENSG00000011485,ratio-cluster-0,ENSG00000089012,ENSG00000011485
ENSG00000089012:::ENSG00000013523,ratio-cluster-0,ENSG00000089012,ENSG00000013523
...,...,...,...
ENSG00000280137:::ENSG00000183665,ratio-cluster-0,ENSG00000280137,ENSG00000183665
ENSG00000280137:::ENSG00000204390,ratio-cluster-0,ENSG00000280137,ENSG00000204390
ENSG00000280137:::ENSG00000223508,ratio-cluster-0,ENSG00000280137,ENSG00000223508
ENSG00000280137:::ENSG00000266964,ratio-cluster-1,ENSG00000280137,ENSG00000266964


In [6]:
# Inspection: rows whose denominator gene is in gene_universe. Uses the
# ratio_clusters / gene_universe values left behind by the gene-set loop cell.
denominator_in_universe = ratio_clusters["denominator"].isin(gene_universe)
ratio_clusters.loc[denominator_in_universe]

Unnamed: 0,cluster,numerator,denominator
ENSG00000028116:::ENSG00000004864,ratio-cluster-0,ENSG00000028116,ENSG00000004864
ENSG00000064012:::ENSG00000032444,ratio-cluster-0,ENSG00000064012,ENSG00000032444
ENSG00000089012:::ENSG00000005075,ratio-cluster-0,ENSG00000089012,ENSG00000005075
ENSG00000089012:::ENSG00000011485,ratio-cluster-0,ENSG00000089012,ENSG00000011485
ENSG00000089012:::ENSG00000013523,ratio-cluster-0,ENSG00000089012,ENSG00000013523
...,...,...,...
ENSG00000280137:::ENSG00000183665,ratio-cluster-0,ENSG00000280137,ENSG00000183665
ENSG00000280137:::ENSG00000204390,ratio-cluster-0,ENSG00000280137,ENSG00000204390
ENSG00000280137:::ENSG00000223508,ratio-cluster-0,ENSG00000280137,ENSG00000223508
ENSG00000280137:::ENSG00000266964,ratio-cluster-1,ENSG00000280137,ENSG00000266964


In [7]:
# Inspection: drop every row that has at least one missing value
# (defaults spelled out; the result is a new frame, ratio_clusters is unchanged).
ratio_clusters.dropna(axis=0, how="any")

Unnamed: 0,cluster,numerator,denominator
ENSG00000028116:::ENSG00000004864,ratio-cluster-0,ENSG00000028116,ENSG00000004864
ENSG00000064012:::ENSG00000032444,ratio-cluster-0,ENSG00000064012,ENSG00000032444
ENSG00000089012:::ENSG00000005075,ratio-cluster-0,ENSG00000089012,ENSG00000005075
ENSG00000089012:::ENSG00000011485,ratio-cluster-0,ENSG00000089012,ENSG00000011485
ENSG00000089012:::ENSG00000013523,ratio-cluster-0,ENSG00000089012,ENSG00000013523
...,...,...,...
ENSG00000280137:::ENSG00000183665,ratio-cluster-0,ENSG00000280137,ENSG00000183665
ENSG00000280137:::ENSG00000204390,ratio-cluster-0,ENSG00000280137,ENSG00000204390
ENSG00000280137:::ENSG00000223508,ratio-cluster-0,ENSG00000280137,ENSG00000223508
ENSG00000280137:::ENSG00000266964,ratio-cluster-1,ENSG00000280137,ENSG00000266964


In [8]:
# Inspection: rows whose numerator gene appears in the index of the minmax table.
numerator_in_minmax = ratio_clusters["numerator"].isin(minmax.index)
ratio_clusters.loc[numerator_in_minmax]

Unnamed: 0,cluster,numerator,denominator
ENSG00000028116:::ENSG00000004864,ratio-cluster-0,ENSG00000028116,ENSG00000004864
ENSG00000064012:::ENSG00000032444,ratio-cluster-0,ENSG00000064012,ENSG00000032444
ENSG00000089012:::ENSG00000005075,ratio-cluster-0,ENSG00000089012,ENSG00000005075
ENSG00000089012:::ENSG00000011485,ratio-cluster-0,ENSG00000089012,ENSG00000011485
ENSG00000089012:::ENSG00000013523,ratio-cluster-0,ENSG00000089012,ENSG00000013523
...,...,...,...
ENSG00000280137:::ENSG00000183665,ratio-cluster-0,ENSG00000280137,ENSG00000183665
ENSG00000280137:::ENSG00000204390,ratio-cluster-0,ENSG00000280137,ENSG00000204390
ENSG00000280137:::ENSG00000223508,ratio-cluster-0,ENSG00000280137,ENSG00000223508
ENSG00000280137:::ENSG00000266964,ratio-cluster-1,ENSG00000280137,ENSG00000266964


In [9]:
# Inspection: rows whose denominator gene appears in the index of the minmax table.
denominator_in_minmax = ratio_clusters["denominator"].isin(minmax.index)
ratio_clusters.loc[denominator_in_minmax]

Unnamed: 0,cluster,numerator,denominator
ENSG00000028116:::ENSG00000004864,ratio-cluster-0,ENSG00000028116,ENSG00000004864
ENSG00000064012:::ENSG00000032444,ratio-cluster-0,ENSG00000064012,ENSG00000032444
ENSG00000089012:::ENSG00000005075,ratio-cluster-0,ENSG00000089012,ENSG00000005075
ENSG00000089012:::ENSG00000011485,ratio-cluster-0,ENSG00000089012,ENSG00000011485
ENSG00000089012:::ENSG00000013523,ratio-cluster-0,ENSG00000089012,ENSG00000013523
...,...,...,...
ENSG00000280137:::ENSG00000183665,ratio-cluster-0,ENSG00000280137,ENSG00000183665
ENSG00000280137:::ENSG00000204390,ratio-cluster-0,ENSG00000280137,ENSG00000204390
ENSG00000280137:::ENSG00000223508,ratio-cluster-0,ENSG00000280137,ENSG00000223508
ENSG00000280137:::ENSG00000266964,ratio-cluster-1,ENSG00000280137,ENSG00000266964


In [10]:
# Inspection: denominator genes that are also present in the minmax index.
set(ratio_clusters["denominator"].values).intersection(minmax.index.values)

{'ENSG00000004059',
 'ENSG00000004864',
 'ENSG00000005075',
 'ENSG00000005812',
 'ENSG00000011114',
 'ENSG00000011485',
 'ENSG00000013288',
 'ENSG00000013523',
 'ENSG00000028528',
 'ENSG00000029364',
 'ENSG00000032444',
 'ENSG00000040487',
 'ENSG00000040531',
 'ENSG00000042317',
 'ENSG00000059691',
 'ENSG00000066135',
 'ENSG00000072071',
 'ENSG00000076321',
 'ENSG00000077312',
 'ENSG00000078699',
 'ENSG00000079999',
 'ENSG00000081377',
 'ENSG00000083290',
 'ENSG00000083817',
 'ENSG00000083844',
 'ENSG00000085377',
 'ENSG00000085415',
 'ENSG00000085832',
 'ENSG00000087253',
 'ENSG00000091732',
 'ENSG00000100023',
 'ENSG00000100058',
 'ENSG00000100395',
 'ENSG00000100413',
 'ENSG00000100461',
 'ENSG00000100564',
 'ENSG00000100865',
 'ENSG00000100982',
 'ENSG00000101138',
 'ENSG00000101290',
 'ENSG00000101365',
 'ENSG00000101439',
 'ENSG00000102221',
 'ENSG00000102805',
 'ENSG00000102977',
 'ENSG00000103111',
 'ENSG00000104164',
 'ENSG00000104903',
 'ENSG00000105088',
 'ENSG00000105248',
