In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import pingouin as pg
import tmaR

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn import feature_selection as fs, preprocessing
from combat.pycombat import pycombat
from pingouin import intraclass_corr
from glob import glob
from tqdm import tqdm

In [2]:
def icc(raters, scale=False, feature_info=False):
    """
    Computes the Intraclass Correlation Coefficient (ICC) of every feature across raters.

    Rows are paired across raters purely by position: row ``i`` of every rater's
    dataframe receives the target ID ``i + 1`` in a 'TMA' column. All dataframes
    are therefore expected to list the same subjects in the same order.

    Features that contain a missing value for any rater, or that are absent from
    at least one rater's dataframe, are dropped before the ICC is computed.

    Parameters:
    - raters (dict): Dictionary of dataframes where keys are rater IDs and values are
      dataframes (rows = subjects, columns = features).
    - scale (bool): If True, standardises each rater's dataframe independently
      using StandardScaler.
    - feature_info (bool): If True, prints, for each rater, which of its features
      were excluded from the analysis.

    Returns:
    - ICCs (dict): Dictionary with feature names as keys and the result returned by
      `intraclass_corr` for that feature as values.
    - cols (list): List of feature names that were analysed.

    Raises:
    - ValueError: If `raters` is empty.
    """
    if not raters:
        raise ValueError("`raters` must contain at least one dataframe.")

    frames = []

    for rater_id, df in raters.items():
        temp = df.copy()

        if scale:
            # fit_transform returns a bare array, so the feature names are re-attached.
            scaled = preprocessing.StandardScaler().fit_transform(temp)
            temp = pd.DataFrame(scaled, columns=df.columns.tolist())

        # Identify the rater and give every row a positional target ID (1..n).
        temp.insert(0, 'rater', rater_id, allow_duplicates=True)
        temp.insert(1, 'TMA', np.arange(1, temp.shape[0] + 1), allow_duplicates=True)

        frames.append(temp)

    # Long format: one row per (rater, target). Any column with a missing value
    # (including columns that one rater lacks entirely) is removed here.
    ratings_long = pd.concat(frames, ignore_index=True).dropna(axis='columns')

    # Report exclusions against the FINAL frame; comparing against the per-rater
    # frames (as done previously) could never report anything.
    if feature_info:
        kept = set(ratings_long.columns)
        for rater_id, df in raters.items():
            print(f"From {rater_id}, excluded features are:")
            excluded_features = [name for name in df.columns if name not in kept]
            print(excluded_features, "\n------------------")

    print(f"Final dataframe shape: {ratings_long.shape}")

    # 'rater' and 'TMA' were inserted at positions 0 and 1; the rest are features.
    cols = ratings_long.columns.tolist()[2:]

    ICCs = {}
    for col in tqdm(cols, desc="Computing ICC"):
        ICCs[col] = intraclass_corr(
            data=ratings_long,
            targets='TMA',
            raters='rater',
            ratings=col,
            nan_policy='omit',
        )

    return ICCs, cols


In [3]:
# Location of the extracted radiomics table (machine-specific path; adjust when
# running on another machine).
radiomics_csv = "J:\\TMAs\\Feature Reproducibility V2\\radiomics.csv"
df = pd.read_csv(radiomics_csv)

# Remove extraction metadata: every column whose name matches 'diagnos' or 'Unnamed'.
df = df.drop(columns=df.filter(regex='diagnos|Unnamed').columns)

# Batch labels required for combat: one integer code per (TMA, Grid) combination.
# Both parts are cast to str and joined with a separator so that a numeric 'Grid'
# column cannot break the concatenation and distinct pairs cannot collapse into
# the same label (e.g. "H6"+"41" vs "H64"+"1").
batch = pd.factorize(df['TMA'].astype(str) + '_' + df['Grid'].astype(str))[0]
batch = pd.Series(batch)

# Split the table into identifying metadata and the radiomic feature matrix.
meta_cols = ['TMA', 'Grid', 'x', 'y']
meta_data = df[meta_cols]
radiomics = df.drop(columns=meta_cols)

In [None]:
# Remove constant features: with threshold=0.0 only features whose variance is
# non-zero survive the selector.
selector = VarianceThreshold(threshold=0.0)
temp = selector.fit_transform(radiomics)

# fit_transform returns a bare array, so the surviving column names are
# recovered from the selector's boolean support mask.
selected_features = radiomics.columns[selector.get_support()]
radiomics = pd.DataFrame(temp, columns=selected_features, index=radiomics.index)

# Signed log transform: log1p of the magnitude with the original sign restored.
radiomics = np.log1p(radiomics.abs()) * np.sign(radiomics)

# pycombat is given features as rows and samples as columns, hence the
# transpose before the call and the transpose back afterwards.
radiomics_transpose = pycombat(radiomics.T, batch)
radiomics = radiomics_transpose.T

# Split the harmonised features by the value of the 'TMA' metadata column.
is_h64 = meta_data['TMA'] == 'H64'
is_v64 = meta_data['TMA'] == 'V64'
df_dict = {'H': radiomics.loc[is_h64], 'V': radiomics.loc[is_v64]}

print('Number of included observations =', df_dict['H'].shape[0])
print('Number of features =', df_dict['V'].shape[1])

In [None]:
# The two TMA subsets play the role of the two "raters" in the ICC analysis.
r = dict(
    rater1=df_dict.get('H'),
    rater2=df_dict.get('V'),
)
ICC, ICC_features = icc(r, scale=True, feature_info=True)

In [None]:
# Define threshold and p-value criteria
threshold = 0.75
p_value_threshold = 0.05

# Determine reliable and unreliable features
reliable_features = [
    feature for feature in ICC_features
    if (ICC[feature]['ICC'][2] > threshold) and (ICC[feature]['pval'][2] < p_value_threshold)
]

unreliable_features = [feature for feature in ICC_features if feature not in reliable_features]

# Output results
num_reliable = len(reliable_features)
num_unreliable = len(unreliable_features)

%store ICC
%store ICC_features

print(f"Threshold: {threshold}")
print(f"Reliable features: {num_reliable}")
print(f"Unreliable features: {num_unreliable}")