In [1]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [2]:
def load(clinical_path, gene_expression_path):
    clinical = pd.read_parquet(clinical_path)
    gene_expression = pd.read_parquet(gene_expression_path)
    
    clinical.set_index("sampleID", inplace=True)
    gene_expression.set_index("caseID", inplace=True)
    
    gene_expression.index = gene_expression.index.str.split("-").str[:4].str.join("-")

    if not clinical.index.is_unique:
        raise ValueError
    
    if not gene_expression.index.is_unique:
        raise ValueError

    common_case = clinical.index.intersection(gene_expression.index)

    clinical = clinical.loc[common_case]
    gene_expression = gene_expression.loc[common_case]

    data = pd.concat({"clinical": clinical, "gene_expression": gene_expression}, axis=1)
    data.index.name = "caseID"

    return data

In [3]:
data = load("../data/label.parquet", "../data/mRNA.omics.parquet")

data = data.loc[
    data["clinical", "patient"].notnull()
    & data["clinical", "cancer_type"].notnull()
    & data["gene_expression"].notnull().all(axis=1)
    & (data["clinical", "cancer_type"] != "Normal")
    & (data["clinical", "sample_type"] == "Tumour")
]

In [4]:
X = data["gene_expression"]
y = data["clinical", "cancer_type"]
g = data["clinical", "patient"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(  # TODO use groups
    X, y, train_size=930, test_size=1000, stratify=y, shuffle=True, random_state=0,
)

In [8]:
non_zero_variance = VarianceThreshold()

select_k_best = SelectKBest(f_classif, k=50000)

scaler = StandardScaler()

classifier = LogisticRegression(
    solver="lbfgs",
    multi_class="multinomial",
    max_iter=2000,
    tol=1e-2,
    class_weight="balanced",
    n_jobs=8,
    random_state=0,
)

model = make_pipeline(non_zero_variance, select_k_best, scaler, classifier)

model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)

report = metrics.classification_report(y_test, y_test_pred)
print(report)
print(f"\nAccuracy: {metrics.accuracy_score(y_test, y_test_pred)}")

              precision    recall  f1-score   support

   TCGA-BLCA       0.86      0.91      0.89        47
   TCGA-BRCA       1.00      0.94      0.97       127
   TCGA-CESC       0.88      0.80      0.84        35
   TCGA-COAD       1.00      0.98      0.99        53
   TCGA-HNSC       0.84      0.97      0.90        60
   TCGA-KIRC       0.95      0.97      0.96        62
   TCGA-KIRP       0.97      0.88      0.92        34
    TCGA-LGG       0.95      1.00      0.98        62
   TCGA-LIHC       0.96      1.00      0.98        43
   TCGA-LUAD       0.87      0.92      0.89        60
   TCGA-LUSC       0.96      0.79      0.87        58
     TCGA-OV       0.96      1.00      0.98        49
   TCGA-PRAD       0.98      1.00      0.99        57
   TCGA-SARC       0.82      0.93      0.88        30
   TCGA-SKCM       1.00      0.98      0.99        54
   TCGA-STAD       0.96      1.00      0.98        47
   TCGA-THCA       1.00      1.00      1.00        59
   TCGA-UCEC       0.98    