In [1]:
import pandas as pd
import optuna
import numpy as np
import logging
import copy
import os
import timeout_decorator

from IPython.display import display

import matplotlib.pyplot as plt

from sklearn.metrics import RocCurveDisplay
from scipy.stats import ttest_ind

from lightgbm import LGBMClassifier
from sklearn import linear_model
from sklearn.svm import SVC

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

# Handle on the root logger (no name is passed to getLogger).
logger = logging.getLogger()

# Restrict Optuna's own logging to errors.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Ask Python to ignore UserWarning via the PYTHONWARNINGS environment variable.
# NOTE(review): this variable is normally read at interpreter start-up, so it
# presumably targets child processes rather than this kernel -- confirm.
os.environ.update(PYTHONWARNINGS="ignore::UserWarning")

In [2]:
# Date stamp that is interpolated into the input directory and into every
# synthetic train/test file name read and written below.
date = "2024-01-09"

In [3]:
def min_max_norm(log2tpm1, feature_range=(1, 100)):
    """Clip per-gene outliers and min-max scale each gene across samples.

    The input is converted back to linear scale with ``2**x - 1`` (the name
    suggests the values are log2(TPM + 1)). Each row is then processed
    independently:

    1. Tukey fences are computed from the row's quartiles
       (``q1 - 1.5 * IQR`` and ``q3 + 1.5 * IQR``) and values outside the
       fences are set to the fence value.
    2. The clipped row is linearly rescaled so that its minimum maps to
       ``feature_range[0]`` and its maximum to ``feature_range[1]``. A row
       whose clipped values are all equal maps to ``feature_range[0]``.

    All rows are handled in one vectorised pass, so rows are addressed by
    position and duplicated index labels are normalised like any other row.

    Parameters
    ----------
    log2tpm1 : pandas.DataFrame
        Numeric matrix with one row per gene and one column per sample.
    feature_range : tuple of (low, high), default (1, 100)
        Target range of the scaled values.

    Returns
    -------
    pandas.DataFrame
        ``float64`` frame with the same index and columns as ``log2tpm1``.
        The input frame is not modified.

    Notes
    -----
    A row containing NaN gets NaN fences, so it is not clipped; its NaN
    entries stay NaN and the remaining entries are scaled using the row's
    non-NaN minimum and maximum.
    """
    low, high = feature_range

    tpm = (2 ** log2tpm1) - 1.0
    values = tpm.to_numpy(dtype=float)

    n_genes, n_samples = values.shape
    if n_genes == 0 or n_samples == 0:
        # Nothing to scale; percentiles of an empty axis are undefined.
        return pd.DataFrame(index=tpm.index, columns=log2tpm1.columns, dtype=float)

    # Quartiles of every gene at once (axis=1 runs across samples).
    q1, q3 = np.percentile(values, [25, 75], axis=1)
    iqr = q3 - q1
    upper_outlier = (q3 + 1.5 * iqr)[:, np.newaxis]
    lower_outlier = (q1 - 1.5 * iqr)[:, np.newaxis]

    # Explicit comparisons (rather than np.clip) so that NaN fences leave the
    # row untouched instead of turning every entry into NaN.
    clipped = np.where(values > upper_outlier, upper_outlier, values)
    clipped = np.where(clipped < lower_outlier, lower_outlier, clipped)

    # fmin/fmax skip NaN entries and return NaN only for all-NaN rows.
    row_min = np.fmin.reduce(clipped, axis=1)[:, np.newaxis]
    row_max = np.fmax.reduce(clipped, axis=1)[:, np.newaxis]

    # Constant rows have zero span; use 1 so they map to `low` instead of
    # dividing by zero.
    span = row_max - row_min
    span = np.where(span < 10 * np.finfo(float).eps, 1.0, span)

    # Same arithmetic as a min-max scaler: x * scale + offset.
    scale = (high - low) / span
    offset = low - row_min * scale
    scaled = clipped * scale + offset

    return pd.DataFrame(scaled, index=tpm.index, columns=log2tpm1.columns)

In [4]:
# Normalise every synthetic train/test pair in the parameter grid.
#
# For each (effect, ndiff, corr, frac) combination the raw train and test
# matrices are read, normalised together with min_max_norm, split back into
# their original column blocks and written next to the inputs, together with a
# headerless two-column label file (sample id, 0/1).
input_dir = f"../../data/test/test_input/{date}"


def _response_labels(sample_ids):
    """Return a Series indexed by sample id: 0 if the id starts with "nonresponders", else 1."""
    return pd.Series(index=sample_ids,
                     data=[0 if x.startswith("nonresponders") else 1 for x in sample_ids])


for effect in [3.0, 2.0, 1.0, 0.75, 0.5, 0.25, 0.10, 0.05, 0.01]:
    for ndiff in [50, 25, 15, 10, 5]:
        for corr in [0.25, 0.5, 0.75, 0.9]:
            for frac in [0.05, 0.10, 0.15, 0.2, 0.25]:
                print(f"Effect Size: {effect} N DEGs: {ndiff} Response Fraction: {frac} Correlation thresh: {corr}")

                # Suffix shared by every file belonging to this grid point.
                tag = f"eff-{effect}-ndiff-{ndiff}-frac-{frac}-corr-{corr}-{date}"

                train_path = f"{input_dir}/synthetic-train-{tag}.tsv"
                test_path = f"{input_dir}/synthetic-test-{tag}.tsv"

                # Check both inputs before reading or writing anything, with
                # an explicit error that survives `python -O`.
                for required_path in (train_path, test_path):
                    if not os.path.exists(required_path):
                        raise FileNotFoundError(f"Missing synthetic input: {required_path}")

                trainX = pd.read_csv(train_path, sep='\t', index_col=0)
                trainY = _response_labels(trainX.columns)

                testX = pd.read_csv(test_path, sep='\t', index_col=0)
                testY = _response_labels(testX.columns)

                # NOTE(review): train and test are normalised as one matrix, so
                # test samples influence the quartiles and min/max applied to
                # the training columns -- confirm this is intended.
                mergedX = pd.concat([trainX, testX], axis=1)
                merged_norm = min_max_norm(mergedX)

                # Columns keep the concat order: train block first, then test.
                n_train = len(trainX.columns)
                trainN = merged_norm.iloc[:, :n_train]
                testN = merged_norm.iloc[:, n_train:]

                norm_train_path = f"{input_dir}/synthetic-train-minmax-{tag}.tsv"
                trainN.to_csv(norm_train_path, sep='\t', index=True)
                trainY.to_csv(f"{input_dir}/synthetic-train-label-{tag}.tsv",
                              sep='\t',
                              index=True,
                              header=False)

                norm_test_path = f"{input_dir}/synthetic-test-minmax-{tag}.tsv"
                testN.to_csv(norm_test_path, sep='\t', index=True)
                testY.to_csv(f"{input_dir}/synthetic-test-label-{tag}.tsv",
                             sep='\t',
                             index=True,
                             header=False)

Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.05 Correlation thresh: 0.25
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.1 Correlation thresh: 0.25
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.15 Correlation thresh: 0.25
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.2 Correlation thresh: 0.25
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.25 Correlation thresh: 0.25
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.05 Correlation thresh: 0.5
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.1 Correlation thresh: 0.5
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.15 Correlation thresh: 0.5
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.2 Correlation thresh: 0.5
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.25 Correlation thresh: 0.5
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.05 Correlation thresh: 0.75
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.1 Correlation thresh: 0.75
Effect Size: 3.0 N DEGs: 50 Response Fraction: 0.15 Correlation thresh: 0.75
Effect Si