In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

In [None]:
def parse_gmt(gmt_file_path):
    """Parse a tab-delimited GMT gene-set file into a dict.

    Every usable line is split on tabs: field 0 becomes the dictionary key,
    field 1 is skipped, and every non-empty field from index 2 onward is kept
    as a gene of that set.

    Parameters
    ----------
    gmt_file_path : str or path-like
        Location of the GMT file to read.

    Returns
    -------
    dict
        Maps each set name to the list of its genes, in file order. A row that
        has a name but no gene fields maps to an empty list. If a name occurs
        on several rows, the last row wins.

    Notes
    -----
    Blank lines and rows whose name field is empty are skipped instead of
    producing an entry keyed by ``''``. Empty gene fields (doubled or trailing
    tabs) are dropped.
    """
    pathway_dict = {}
    with open(gmt_file_path, 'r') as f:
        for line in f:
            # Remove only the line terminator. Stripping all whitespace would
            # also eat a leading tab and shift the description into the name
            # slot whenever the name field is empty.
            parts = line.rstrip('\r\n').split('\t')
            pathway_name = parts[0].strip()
            if not pathway_name:
                # Blank line or row without a set name: nothing to record.
                continue
            genes = [token.strip() for token in parts[2:] if token.strip()]
            pathway_dict[pathway_name] = genes
    return pathway_dict

In [3]:
# Gene sets keyed by pathway name, read from the GMT reference file.
pathways = parse_gmt('../reference/c2.cp.v2025.1.Hs.symbols.gmt')

# Feature table; the first CSV column becomes the row index.
# NOTE(review): presumably rows are samples and columns are genes - confirm.
mutation_matrix = pd.read_csv(
    "../data/gene_panel_first_filtered_train.csv.gz",
    index_col=0,
)

# Single-column label table indexed by sample barcode, sorted by that index.
subtype = (
    pd.read_csv(
        "../data/train.csv",
        usecols=['Tumor_Sample_Barcode', "subtype_PAM50"],
        index_col='Tumor_Sample_Barcode',
    )
    .sort_index()
)


In [4]:
# Feature matrix: one row per entry of mutation_matrix.index, in that order.
X = mutation_matrix.values

# Pair labels with the rows of X by sample barcode, not by position.
# `subtype` was sorted by its index when loaded while `mutation_matrix` keeps
# its file order, so a purely positional pairing would silently attach labels
# to the wrong samples whenever the two orders differ.
subtype_aligned = subtype.reindex(mutation_matrix.index)
if subtype_aligned.isna().to_numpy().any():
    unlabeled = subtype_aligned.index[subtype_aligned.isna().any(axis=1)]
    raise ValueError(
        f"{len(unlabeled)} row(s) of mutation_matrix have no subtype label, "
        f"e.g. {list(unlabeled[:5])}"
    )

# Flatten the single label column into a 1-D vector aligned with X.
y = subtype_aligned.values.flatten()

In [5]:
# Mutual information between every column of X and the labels y.
# discrete_features=True tells scikit-learn to treat all columns as discrete.
all_mi_scores = mutual_info_classif(
    X=X,
    y=y,
    discrete_features=True,
)

# Label each score with the matching column name of mutation_matrix.
mi_series = pd.Series(data=all_mi_scores, index=mutation_matrix.columns)

In [6]:
# Only pathways with this many genes present in mutation_matrix are considered.
MIN_PATHWAY_GENES = 10
MAX_PATHWAY_GENES = 200

selected_genes_info = []
# Genes already chosen; a set keeps the duplicate check O(1) per pathway
# instead of rebuilding a list of all previous picks on every iteration.
seen_genes = set()

for path_name, genes in pathways.items():
    # Keep only pathway members that exist as columns of the feature table.
    valid_genes = [g for g in genes if g in mutation_matrix.columns]

    if not MIN_PATHWAY_GENES <= len(valid_genes) <= MAX_PATHWAY_GENES:
        continue

    # Representative of the pathway: its member with the highest MI score.
    best_gene = mi_series[valid_genes].idxmax()
    best_mi_score = mi_series[best_gene]

    # First pathway to nominate a gene keeps it; later nominations of the
    # same gene are ignored.
    if best_gene in seen_genes:
        continue
    seen_genes.add(best_gene)
    selected_genes_info.append({
        'gene': best_gene,
        'pathway': path_name,
        'mi_score': best_mi_score
    })

# Explicit columns keep the frame well-formed even when nothing was selected;
# otherwise an empty list yields a column-less frame and `selected_df['gene']`
# raises KeyError downstream.
selected_df = pd.DataFrame(
    selected_genes_info,
    columns=['gene', 'pathway', 'mi_score'],
)

In [7]:
# Persist the per-pathway representative genes, without the row index.
selected_df.to_csv(
    "../results/feature_gene.csv",
    index=False,
)

In [8]:
# Long-form table of every scored column: its name ('gene'), its MI score
# ('mi'), and whether it appears among the selected genes ('selected').
mi_df = (
    mi_series
    .rename_axis('gene')
    .reset_index(name='mi')
    .assign(selected=lambda frame: frame['gene'].isin(selected_df['gene']))
)

In [9]:
# Persist the full MI table (with the selection flag), without the row index.
mi_df.to_csv(
    "../results/gene_filtered_mi.csv",
    index=False,
)