In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from bioinfokit.analys import norm
from datetime import datetime, timezone

In [2]:
# Seed NumPy's global random state so results that depend on it are repeatable.
# The seed value was drawn from https://www.random.org/ on 2023-08-09.
SEED = 3866
np.random.seed(SEED)

In [3]:
# Load the tab-separated gene-data table; read_table uses '\t' as its
# default separator, so no explicit `sep` is needed.
pth = "../../data/reference/gene-data.tsv"
gene_data = pd.read_table(pth)

In [4]:
def minmax_norm(tpm, feature_range=(1, 100), whisker=1.5):
    """Clip per-row outliers to the IQR fences, then min-max scale each row.

    Each row of ``tpm`` (one gene) is processed independently across its
    columns:

    1. The 25th and 75th percentiles (``q1``, ``q3``) of the row are computed.
    2. Values above ``q3 + whisker * (q3 - q1)`` are replaced by that upper
       bound; values below ``q1 - whisker * (q3 - q1)`` are replaced by that
       lower bound.
    3. The clipped row is rescaled linearly so that its minimum maps to
       ``feature_range[0]`` and its maximum to ``feature_range[1]``. A row
       that is constant after clipping maps entirely to ``feature_range[0]``.

    The computation runs on a private float copy, so ``tpm`` is never
    modified, and rows are handled by position, so repeated index labels are
    normalized row by row.

    Parameters
    ----------
    tpm : pandas.DataFrame
        Numeric matrix with one gene per row.
    feature_range : tuple of (low, high), default (1, 100)
        Target range of the scaled values; ``low`` must be smaller than
        ``high``.
    whisker : float, default 1.5
        Multiple of the inter-quartile range used to place the clip bounds.

    Returns
    -------
    pandas.DataFrame
        float64 frame with the same index and columns as ``tpm``.

    Raises
    ------
    ValueError
        If ``feature_range`` is not strictly increasing, or if ``tpm`` holds
        values that cannot be converted to float.
    """
    lo, hi = feature_range
    if lo >= hi:
        raise ValueError(
            "Minimum of desired feature range must be smaller than maximum. "
            "Got %s." % str(feature_range)
        )

    # Private float copy: the clipping below must never leak into the caller's
    # frame (a `.values` view of a single-dtype frame can share its memory).
    values = tpm.to_numpy(dtype=float, copy=True)
    if values.size == 0:
        return pd.DataFrame(values, index=tpm.index, columns=tpm.columns)

    # Row-wise quartiles; keepdims gives (n_rows, 1) columns that broadcast
    # against `values`.
    q1, q3 = np.percentile(values, [25, 75], axis=1, keepdims=True)
    upper = q3 + whisker * (q3 - q1)
    lower = q1 - whisker * (q3 - q1)

    # np.where (not np.clip) so that NaN comparisons evaluate False and leave
    # the value untouched, as an element-wise `vec > bound` test would.
    clipped = np.where(values > upper, upper, values)
    clipped = np.where(clipped < lower, lower, clipped)

    # NaN-aware extrema so a missing value does not poison its whole row.
    row_min = np.nanmin(clipped, axis=1, keepdims=True)
    row_max = np.nanmax(clipped, axis=1, keepdims=True)

    # Rows that are (numerically) constant get a unit span so they map to
    # `lo` instead of dividing by ~0.
    span = row_max - row_min
    span[span < 10 * np.finfo(span.dtype).eps] = 1.0

    # Same arithmetic form as a min-max scaler: X * scale + offset.
    scale = (hi - lo) / span
    scaled = clipped * scale + (lo - row_min * scale)

    return pd.DataFrame(scaled, index=tpm.index, columns=tpm.columns)

In [11]:
def generate_tcga_luad_data(
    tpm_path="../../data/tcga/TCGA-LUAD-TPM-recount3-2023-02-21.tsv",
    out_path="../../data/expression/processed/TCGA-LUAD-MinMaxNorm-2023-08-28.tsv",
    min_mean_log2_tpm=1.0,
):
    """Filter a TPM table, normalize it with ``minmax_norm`` and save it.

    Steps:

    1. Read the tab-separated table at ``tpm_path``, using its first column
       as the row index.
    2. Truncate every index label at its first ``"."``.
    3. Keep only rows whose mean ``log2(TPM + 1)`` across columns is greater
       than ``min_mean_log2_tpm``.
    4. Normalize the remaining rows with ``minmax_norm``, print the shape of
       the result and write it, tab-separated, to ``out_path``.

    Parameters
    ----------
    tpm_path : str
        Path of the input TPM table.
    out_path : str
        Path of the normalized output table.
    min_mean_log2_tpm : float, default 1.0
        Rows with mean ``log2(TPM + 1)`` at or below this value are dropped.

    Returns
    -------
    None
        The result is written to ``out_path``, not returned.
    """
    tcga_luad_tpm = pd.read_csv(tpm_path, sep='\t', index_col=0)

    # Keep only the text before the first "." of each row label.
    # NOTE(review): presumably this strips a version suffix from gene IDs; two
    # labels sharing a stem would become duplicates -- TODO confirm the index
    # is still unique after this step.
    tcga_luad_tpm.index = [x.split(".")[0] for x in tcga_luad_tpm.index.values]

    # The filter is evaluated on log2(TPM + 1); the rows that are kept retain
    # their original TPM values.
    mean_log2_tpm1 = np.log2(tcga_luad_tpm + 1.0).mean(axis=1)
    tcga_luad_tpm = tcga_luad_tpm[mean_log2_tpm1 > min_mean_log2_tpm]

    tcga_luad_minmax = minmax_norm(tcga_luad_tpm)

    print(tcga_luad_minmax.shape)

    tcga_luad_minmax.to_csv(out_path, sep='\t')


generate_tcga_luad_data()

(21135, 601)
