In [45]:
# PREPROCESS_identify_cytokines_from_significant_proteins
#
# Purpose: Identify differentially abundant cytokines from a list of differentially abundant proteins.
# Cytokine definitions are based on ImmPort (https://www.immport.org/shared/genelists).

import os

import pandas as pd

In [46]:
# Helper function to read a single-column file and return a list of features
def get_list_from_file(data_file):
    """Return the first column of a header-less, tab-separated file as a list.

    Parameters
    ----------
    data_file : str or path-like
        File handed to ``pd.read_csv`` (tab separator, no header row).

    Returns
    -------
    list
        The values of the first column, in file order.
    """
    data_df = pd.read_csv(data_file, sep="\t", header=None)
    # Pull the whole column in one vectorized step instead of materializing
    # a row Series for every line of the file.
    return data_df.iloc[:, 0].tolist()

# Legacy helper (disabled, kept for reference): read cytokine symbols from the second column of a file with a header row
# def get_cytokine_list_from_file(data_file):
#     data_df = pd.read_csv(data_file, sep="\t")
#     feature_list = [data_df.iloc[i][1] for i in range(data_df.shape[0])]
#     return feature_list

def get_cytokine_list_from_file(data_file):
    """Read cytokine symbols from the first column of a header-less TSV file.

    Parameters
    ----------
    data_file : str or path-like
        File handed to ``pd.read_csv`` (tab separator, no header row).

    Returns
    -------
    list
        The values of the first column, in file order.
    """
    symbol_table = pd.read_csv(data_file, header=None, sep="\t")
    # With header=None the columns are auto-numbered, so the first label
    # always addresses the first column.
    first_column = symbol_table[symbol_table.columns[0]]
    return first_column.tolist()


# Main function to identify cytokine features from significant protein list
def find_significant_cytokine(data_dict, cytokine_list):
    """Collect significant protein features that are cytokines or complement factors.

    The text before the first underscore of each feature name is treated as
    the protein symbol. A feature is kept when that symbol appears in
    ``cytokine_list`` or in the manually curated complement-factor list below.
    Symbols matched through ``cytokine_list`` are also printed.

    Parameters
    ----------
    data_dict : dict
        Maps a label (e.g. condition + direction) to an iterable of feature
        names.
    cytokine_list : iterable of str
        Cytokine gene symbols to match against.

    Returns
    -------
    list of str
        Unique matching feature names (full name, including the part after
        the underscore), in the order they were first encountered.
    """
    # IMPORTANT_NOTE: manually added complement factor list. These are not in the ImmPort cytokine list.
    # These were found by using DAVID, pathway mapping: Complement and coagulation cascades (from KEGG).
    # If the given data_dict is not from P < 0.01, abs(Cohen's D) > 0.5, we may need to re-evaluate this list.
    manual_cf_symbols = frozenset(
        ["A2M", "F9", "C5", "C9", "CFB", "CFD", "CFHR5", "CFI", "NAMPT"]
    )
    # Set lookup keeps the membership test cheap inside the nested loop.
    cytokine_symbols = set(cytokine_list)

    matched_features = []
    for significant_protein_list in data_dict.values():
        for protein_symbol_with_barcode in significant_protein_list:
            protein_symbol = protein_symbol_with_barcode.split('_')[0]  # Get protein name prefix
            is_cytokine = protein_symbol in cytokine_symbols
            if is_cytokine:
                print (protein_symbol)
            if is_cytokine or protein_symbol in manual_cf_symbols:
                matched_features.append(protein_symbol_with_barcode)

    # Remove duplicates while keeping first-seen order. A plain set() would
    # return the features in hash order, which differs between kernel
    # sessions and makes downstream row order non-reproducible.
    return list(dict.fromkeys(matched_features))

In [47]:
# Load cytokine symbol list
# cytokine_file = '../../etc/Cytokines.txt'
# cytokine_list = get_cytokine_list_from_file(cytokine_file)

cytokine_file = '../../etc/Cytokines_immport_registary.txt'
cytokine_list = get_cytokine_list_from_file(cytokine_file)

# Define data location and filtering parameters
data_dir = '../../analysis/statistics/linear_regression/proteomics'
condition_list = ["cVSneg", "cVSpos"]
abundance_type_list = ["up", "down"]
COHEN_D_CUTOFF = 0.5    # minimum absolute value required in the "cohen_d" column
ADJ_PVAL_CUTOFF = 0.01  # upper bound applied to the "adj_pval" column

# Read each condition's linear-regression result file once; the same table
# serves both the "up" and the "down" filter below.
regression_df_by_condition = {}
for condition in condition_list:
    data_file = '%s/linear_regression.%s.proteomics.tsv' % (data_dir, condition)
    regression_df_by_condition[condition] = pd.read_csv(data_file, sep="\t")

# Dictionary to hold filtered protein features
data_dict = {}

# Loop order (direction outer, condition inner) fixes the key order of
# data_dict and therefore the order in which matches are reported.
for abundance_type in abundance_type_list:
    for condition in condition_list:
        data_df = regression_df_by_condition[condition]
        passes_pval = data_df["adj_pval"] < ADJ_PVAL_CUTOFF

        # Filter for up/down-regulated proteins using effect size and adjusted p-value threshold
        if abundance_type == "up":
            filtered_df = data_df[(data_df["cohen_d"] > COHEN_D_CUTOFF) & passes_pval]
        elif abundance_type == "down":
            filtered_df = data_df[(data_df["cohen_d"] < -COHEN_D_CUTOFF) & passes_pval]
        else:
            # Fail loudly instead of silently reusing the previous iteration's filter.
            raise ValueError("Unknown abundance type: %s" % abundance_type)

        # Replace '.' with '-' to match identifiers in downstream analysis
        feature_list = filtered_df["feature"].str.replace(".", '-', regex=False)

        # Store features under a condition+direction key
        data_type = "%s_%s" % (condition, abundance_type)
        data_dict[data_type] = feature_list

# Identify cytokines from significant features
all_cytokine_list = find_significant_cytokine(data_dict, cytokine_list)




CXCL13
IL1RN
CXCL13
TNFRSF17


In [48]:
# Load full proteomics matrix
proteomic_file = '../../preprocessed_data/proteomics/somascan_anml.T.v2.tsv'
protein_df = pd.read_csv(proteomic_file, sep="\t", index_col=0)

# Subset the cytokine-related protein rows.
# NOTE: .loc with a list of labels raises KeyError if any selected feature
# is absent from the matrix index.
subset_protein_df = protein_df.loc[all_cytokine_list]

# Load patient metadata and extract the row labelled "acpa"
patient_info_file = '../../preprocessed_data/meta/patient_info.ML_ready.tsv'
patient_df = pd.read_csv(patient_info_file, sep="\t", index_col=0)
acpa_info = patient_df.loc["acpa"]

# Add ACPA info as the first row to the protein matrix
subset_protein_df = pd.concat([acpa_info.to_frame().T, subset_protein_df], axis=0)

# Save cytokine expression matrix with ACPA annotation.
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (no-op when it is already there).
output_file = '../../analysis/cytokines/all_cytokine_df.tsv'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
subset_protein_df.to_csv(output_file, sep="\t")