In [1]:
import itertools
import random

import numpy as np
import pandas as pd
from combat.pycombat import pycombat
from scipy.stats import pearsonr
from sklearn import feature_selection as fs
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from tmaR import FeatureProcessing
from umap import UMAP

In [2]:
def pairwise_pearsonr(df, threshold=0.95, p_val=0.05, random_state=None):
    """Build a boolean keep-mask that drops one column of each highly correlated pair.

    Column pairs are visited in ``itertools.combinations`` order. A pair is
    tested only while both of its columns are still kept; when its Pearson
    correlation is above ``threshold`` with a p-value below ``p_val``, one of
    the two columns (picked at random) is marked for removal.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are compared pairwise. Columns are addressed by
        position, so duplicate column labels are handled correctly.
    threshold : float, default 0.95
        A pair is redundant when its Pearson r is strictly greater than this
        value. Only positive correlations are considered.
    p_val : float, default 0.05
        The pair's p-value must be strictly below this value.
    random_state : int or None, default None
        Seed for a private random generator, making the result reproducible.
        ``None`` draws from the global ``random`` module state (the previous
        behaviour), so ``random.seed(...)`` still controls the outcome.

    Returns
    -------
    numpy.ndarray of bool
        One entry per column of ``df``: ``True`` means keep the column,
        ``False`` means it was dropped as redundant.
    """
    # Pick the source of randomness once; the global module function keeps the
    # default behaviour identical to the original implementation.
    pick = random.choice if random_state is None else random.Random(random_state).choice

    n_cols = df.shape[1]
    keep = np.ones(n_cols, dtype=bool)

    # Extract every column once, by position, instead of looking labels up
    # again for each pair.
    series = [df.iloc[:, k] for k in range(n_cols)]

    for i, j in itertools.combinations(range(n_cols), 2):
        # A column that has already been dropped no longer eliminates others.
        if not (keep[i] and keep[j]):
            continue

        r, p = pearsonr(series[i], series[j])
        # If either statistic is NaN both comparisons are False, so the pair
        # is left untouched.
        if r > threshold and p < p_val:
            keep[pick([i, j])] = False

    return keep

In [3]:
%store -r radiomics
%store -r meta_data
%store -r ICC
%store -r ICC_features

# Extracting batch labels required for combat
batch = pd.factorize(meta_data['TMA'].astype(str) + meta_data['Grid'])[0]
batch = pd.Series(batch)

# Define threshold and p-value criteria
threshold = 0.75
p_value_threshold = 0.05

# Row of each feature's ICC table used for the reliability decision.
# NOTE(review): presumably row 2 selects one specific ICC variant — confirm
# against the code that built `ICC`.
ICC_ROW = 2

# Split the features into reliable / unreliable in a single pass, so each
# feature is evaluated once instead of re-scanning the reliable list for every
# feature. A feature is reliable when its ICC exceeds the threshold and the
# matching p-value is below the significance level.
reliable_features = []
unreliable_features = []
for feature in ICC_features:
    icc_table = ICC[feature]
    is_reliable = (
        icc_table['ICC'][ICC_ROW] > threshold
        and icc_table['pval'][ICC_ROW] < p_value_threshold
    )
    if is_reliable:
        reliable_features.append(feature)
    else:
        unreliable_features.append(feature)

# Output results
num_reliable = len(reliable_features)
num_unreliable = len(unreliable_features)

print(f"Threshold: {threshold}")
print(f"Reliable features: {num_reliable}")
print(f"Unreliable features: {num_unreliable}")

Threshold: 0.75
Reliable features: 242
Unreliable features: 1045


In [4]:
# Keep only the columns listed in reliable_features
radiomics = radiomics.loc[:, reliable_features]

In [5]:
# Drop zero-variance features, then rebuild a DataFrame that keeps the
# original row index and the surviving column names.
selector = VarianceThreshold(threshold=0.0)
temp = selector.fit_transform(radiomics)
selected_features = radiomics.columns[selector.get_support()]
radiomics = pd.DataFrame(data=temp, columns=selected_features, index=radiomics.index)

# Signed log transform: sign(x) * log(1 + |x|)
radiomics = np.log1p(radiomics.abs()) * np.sign(radiomics)

# pycombat is called with features as rows and samples as columns, so the
# table is transposed on the way in and transposed back afterwards.
radiomics_transpose = pycombat(radiomics.T, batch)
radiomics = radiomics_transpose.T

Found 6 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data


In [6]:
# pairwise_pearsonr uses random.choice to decide which member of a correlated
# pair is dropped, so seed the global RNG to get the same feature set on
# every run.
random.seed(42)

# Despite its name, the returned mask is True for the columns to KEEP.
redundant_features = pairwise_pearsonr(radiomics, threshold=0.95, p_val=0.05)

  correlation = pearsonr(df[col1],df[col2])


In [11]:
# Keep the columns whose mask entry is True (True = keep, despite the name
# `redundant_features`).
radiomics = radiomics.loc[:, redundant_features]

# Show which columns survived the redundancy filter
surviving_columns = radiomics.columns.tolist()
print("Selected Columns:", surviving_columns)

Selected Columns: ['original_firstorder_Kurtosis', 'original_firstorder_Skewness', 'original_gldm_SmallDependenceHighGrayLevelEmphasis', 'original_glcm_Imc2', 'original_glcm_SumAverage', 'original_glszm_GrayLevelVariance', 'original_glszm_ZoneEntropy', 'logarithm_glcm_Correlation', 'logarithm_glcm_MCC', 'gradient_gldm_DependenceEntropy', 'gradient_gldm_LargeDependenceHighGrayLevelEmphasis', 'gradient_glcm_ClusterTendency', 'gradient_glcm_Correlation', 'gradient_glrlm_GrayLevelNonUniformityNormalized', 'gradient_glrlm_RunEntropy', 'gradient_glszm_GrayLevelVariance', 'gradient_glszm_ZoneEntropy', 'squareroot_firstorder_Maximum', 'squareroot_firstorder_Range', 'squareroot_glcm_Imc1', 'squareroot_glcm_Imc2', 'squareroot_glszm_ZoneEntropy', 'exponential_firstorder_10Percentile', 'exponential_firstorder_RobustMeanAbsoluteDeviation', 'exponential_firstorder_Variance', 'exponential_glrlm_GrayLevelNonUniformityNormalized', 'exponential_glrlm_ShortRunEmphasis', 'exponential_glrlm_ShortRunHighGra

In [12]:
# Standardize the features with StandardScaler, then wrap the scaled array
# back into a DataFrame with the original row index and column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(radiomics)
radiomics = pd.DataFrame(scaled_values, index=radiomics.index, columns=radiomics.columns)


In [13]:
# Save the processed `radiomics` DataFrame to IPython's %store database so it
# can be restored in another session with `%store -r radiomics`.
%store radiomics

Stored 'radiomics' (DataFrame)
