In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

from ssrl_rnaseq.split import pretrain_downstream_split, GroupKShotsFold
from ssrl_rnaseq.data import load_tcga

In [None]:
# Load the dataset through the project loader from the label and mRNA parquet files.
data = load_tcga("../../data/label.parquet", "../../data/mRNA.omics.parquet")

# Row filters: a patient identifier is present, the sample type is one of the
# two classes of interest, and no gene-expression column is missing.
has_patient = data["clinical", "patient"].notnull()
has_known_type = data["clinical", "sample_type"].isin(["Normal", "Tumour"])
has_full_expression = data["gene_expression"].notnull().all(axis=1)

data = data.loc[has_patient & has_known_type & has_full_expression]

In [None]:
# Keep every Normal sample but only the first 669 Tumour samples (in row order).
# NOTE(review): 669 is presumably meant to balance the two classes — confirm it
# matches the number of Normal samples, and that taking the *first* rows is intended.
is_normal = data["clinical", "sample_type"] == "Normal"
is_tumour = data["clinical", "sample_type"] == "Tumour"

data = pd.concat(
    [data.loc[is_normal], data.loc[is_tumour][:669]],
    axis=0,
)

In [None]:
# Features: the gene-expression columns.
X = data["gene_expression"]
# Target: True where the sample type is "Tumour", False otherwise.
y = data["clinical", "sample_type"].eq("Tumour")
# Patient identifier of each sample; passed as the grouping key to the splitters below.
g = data["clinical", "patient"]

In [None]:
# NOTE(review): this call unpacks four values, whereas a later cell calls the
# same function with identical arguments and unpacks six (X_pretrain,
# X_downstream, _, y_downstream, g_pretrain, g_downstream). Unless the function
# changes between the two calls (autoreload is enabled), one of the two cells
# will fail on unpacking. Presumably this is a leftover from debugging the
# function's internals — confirm and remove if so.
unique_groups, unique_classes, groups, stratify = pretrain_downstream_split(
    X, y, g, pretrain_size=1000, downstream_size=295, groups=g, stratify=y, random_state=0,
)

In [None]:
# Build an integer matrix of shape (unique_groups, unique_classes) and add 1 at
# each (groups[i], 0 * stratify[i]) position, then show its largest entry.
# NOTE(review): `0 * stratify` zeroes the column index (assuming `stratify` is a
# numeric array), so every increment lands in column 0 and the class dimension
# is ignored — this looks like a temporary debugging tweak; confirm.
s = np.zeros((unique_groups, unique_classes), dtype=np.int64)
np.add.at(s, (groups, 0 * stratify), 1)
s.max()

In [None]:
# Display the current contents of `s`.
s

In [None]:
# Toy check of np.add.at: it adds 1 at every listed (row, col) pair, so the
# repeated pair (0, 0) is incremented twice.
row_idx = [0, 1, 1, 0]
col_idx = [0, 1, 2, 0]

s = np.zeros(shape=(2, 3), dtype=np.int64)
np.add.at(s, (row_idx, col_idx), 1)
s

In [None]:
# Display the largest entry of `s` as it currently stands.
s.max()

In [None]:
# NOTE(review): `inverse` is not assigned in any visible cell of this notebook,
# so this cell presumably raises NameError on a fresh Restart & Run All.
# Likely a leftover from an interactive debugging session — confirm and remove.
inverse

In [None]:
# Display `stratify`, the fourth value unpacked from the four-value
# pretrain_downstream_split call earlier in the notebook.
stratify

In [None]:
# Split the samples into a pretraining part and a downstream part, grouping by
# patient and stratifying on the target. The third returned value (presumably
# the pretraining labels — confirm) is discarded.
(
    X_pretrain,
    X_downstream,
    _,
    y_downstream,
    g_pretrain,
    g_downstream,
) = pretrain_downstream_split(
    X,
    y,
    g,
    pretrain_size=1000,
    downstream_size=295,
    groups=g,
    stratify=y,
    random_state=0,
)

In [None]:
# Patients present in both splits; presumably expected to be an empty set — confirm.
set(g_pretrain).intersection(g_downstream)

## Without pretrained PCA

In [None]:
# Baseline: the classifier is cross-validated directly on the downstream
# expression features; nothing is fitted on the pretraining set.
# GroupKShotsFold is a project-specific splitter (presumably 10 group-aware
# folds with k=1 shot — confirm against its definition).
cv = GroupKShotsFold(10, k=1, random_state=0)

# Preprocessing: VarianceThreshold() with its default threshold, then standardisation.
non_zero_variance = VarianceThreshold()
scaler = StandardScaler()

# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# dropping it may change how this binary problem is fitted — revisit on upgrade.
classifier = LogisticRegression(
    class_weight="balanced",
    solver="lbfgs",
    multi_class="multinomial",
    tol=1e-2,
    max_iter=2000,
    n_jobs=8,
    random_state=0,
)

model = make_pipeline(non_zero_variance, scaler, classifier)

# Out-of-fold class probabilities; only column 1 is kept as the score
# (presumably the Tumour/True class, given sorted class labels — confirm).
y_downstream_score = cross_val_predict(
    model,
    X_downstream,
    y_downstream,
    groups=g_downstream,
    cv=cv,
    method="predict_proba",
)[:, 1]
# Hard predictions at a 0.5 probability threshold.
y_downstream_pred = y_downstream_score > 0.5

report = metrics.classification_report(y_downstream, y_downstream_pred, zero_division=np.nan)
print(report)

accuracy = metrics.accuracy_score(y_downstream, y_downstream_pred)
print(f"\nAccuracy: {100 * accuracy:.2f}%")

roc_auc = metrics.roc_auc_score(y_downstream, y_downstream_score)
print(f"ROC AUC: {100 * roc_auc:.2f}%")

## With pretrained PCA

In [None]:
# Encoder fitted on the pretraining set only:
# variance filter -> standardisation -> PCA with 1000 components.
non_zero_variance = VarianceThreshold()
scaler = StandardScaler()
pca = PCA(random_state=0, n_components=1000)

# Pipeline.fit returns the pipeline itself, so `encoder` is the fitted object.
encoder = make_pipeline(non_zero_variance, scaler, pca).fit(X_pretrain)
encoder

In [None]:
# Same evaluation protocol as the baseline section, but the classifier is
# trained on the downstream samples projected through the pretrained encoder.
# GroupKShotsFold is a project-specific splitter (presumably 10 group-aware
# folds with k=1 shot — confirm against its definition).
cv = GroupKShotsFold(10, k=1, random_state=0)

# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# dropping it may change how this binary problem is fitted — revisit on upgrade.
model = LogisticRegression(
    class_weight="balanced",
    solver="lbfgs",
    multi_class="multinomial",
    tol=1e-2,
    max_iter=2000,
    n_jobs=8,
    random_state=0,
)

# Embed the downstream samples with the encoder fitted on the pretraining set.
e_downstream = encoder.transform(X_downstream)

# Out-of-fold class probabilities; only column 1 is kept as the score
# (presumably the Tumour/True class, given sorted class labels — confirm).
y_downstream_score = cross_val_predict(
    model,
    e_downstream,
    y_downstream,
    groups=g_downstream,
    cv=cv,
    method="predict_proba",
)[:, 1]
# Hard predictions at a 0.5 probability threshold.
y_downstream_pred = y_downstream_score > 0.5

report = metrics.classification_report(y_downstream, y_downstream_pred, zero_division=np.nan)
print(report)

accuracy = metrics.accuracy_score(y_downstream, y_downstream_pred)
print(f"\nAccuracy: {100 * accuracy:.2f}%")

roc_auc = metrics.roc_auc_score(y_downstream, y_downstream_score)
print(f"ROC AUC: {100 * roc_auc:.2f}%")