In [39]:
import pandas as pd
import os

In [40]:
def merge_protein_csvs(folder_path):
    """Build a per-position marker presence table from a folder of CSV exports.

    The CSV whose file name contains "dapi" (any case) supplies the base set of
    unique (POSITION X, POSITION Y) coordinate pairs. Every other CSV adds one
    0/1 column that is 1 where a base coordinate pair also occurs in that file.
    Coordinates are matched by exact equality of both values.

    Each file is read with its first three lines skipped. Header names are
    stripped of surrounding whitespace and trailing commas, then upper-cased.

    The marker column name is the upper-cased part of the file name (extension
    removed) before the first underscore. Files that map to the same marker
    name are OR-ed into a single column.

    Args:
        folder_path: Directory containing the DAPI CSV and the marker CSVs.

    Returns:
        pandas.DataFrame with columns POSITION X, POSITION Y and one integer
        0/1 column per marker, one row per unique DAPI coordinate pair.
        Marker columns appear in sorted file-name order.

    Raises:
        ValueError: If no CSV with "dapi" in its name exists in the folder.
        KeyError: If a CSV lacks a POSITION X or POSITION Y column.
    """
    key_cols = ['POSITION X', 'POSITION Y']

    def load_positions(filename):
        # Read one export and return its unique coordinate pairs.
        frame = pd.read_csv(os.path.join(folder_path, filename), skiprows=3)
        frame.columns = [col.strip().rstrip(',').upper() for col in frame.columns]
        missing = [col for col in key_cols if col not in frame.columns]
        if missing:
            raise KeyError(
                "{} is missing required column(s): {}".format(filename, missing)
            )
        return frame[key_cols].drop_duplicates().reset_index(drop=True)

    # os.listdir returns names in arbitrary order; sorting keeps both the DAPI
    # choice and the marker column order stable from run to run.
    csv_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith('.csv')
    )

    dapi_file = next((f for f in csv_files if 'dapi' in f.lower()), None)
    if not dapi_file:
        raise ValueError("No DAPI file found in folder.")

    base = load_positions(dapi_file)

    # Marker name -> boolean Series aligned with base's rows.
    presence = {}
    for file in csv_files:
        if file == dapi_file:
            continue

        # Drop the extension first so "CD4.csv" becomes "CD4", not "CD4.CSV".
        protein_name = os.path.splitext(file)[0].split('_')[0].upper()
        positions = load_positions(file)

        if base.empty or positions.empty:
            # A header-only file has object-typed key columns; skip the merge
            # and record "not present" for every base row.
            found = pd.Series(False, index=base.index)
        else:
            # A left merge keeps one output row per base row, in base order,
            # because `positions` holds no duplicate coordinate pairs.
            matched = base.merge(positions, on=key_cols, how='left', indicator=True)
            found = matched['_merge'] == 'both'

        if protein_name in presence:
            # Several files for one marker: present if found in any of them.
            found = presence[protein_name] | found
        presence[protein_name] = found

    for protein_name, found in presence.items():
        base[protein_name] = found.astype(int)

    return base

# Example usage: build the presence table from the CSV exports found in
# ./data/ (a path relative to the notebook's working directory).
folder_path = './data/'
# test1 is read again by later cells, which add a LABEL column and export it.
test1 = merge_protein_csvs(folder_path)
# Bare last expression: the table is shown as this cell's output.
test1


Unnamed: 0,POSITION X,POSITION Y,CD3,CD4,FOXP3,NK1.1
0,55878.777,41778.797,0,1,0,0
1,57359.824,40297.508,0,1,0,0
2,56290.895,41362.340,0,1,0,0
3,56432.293,41224.496,0,1,0,0
4,55830.648,41824.309,0,1,0,0
...,...,...,...,...,...,...
114264,53369.691,38341.012,0,1,0,0
114265,53274.793,38437.641,0,0,0,0
114266,53579.414,38130.742,0,1,0,0
114267,53456.289,38254.949,0,1,0,0


In [45]:
def row_labeler(row):
    """Assign a cell-type label to one row of marker presence flags.

    A marker counts as positive when its name is present in ``row`` with the
    value 1, and as negative when it is present with the value 0. A marker
    that is missing from ``row`` is neither positive nor negative.

    Decision order:
      1. NK1.1 positive and CD3 negative: NK lineage, refined by CD69
         (activation) and GRANZYME (cytotoxicity); plain 'NK Cell' otherwise.
      2. CD3 positive: 'NKT Cell' if NK1.1 is positive, else
         'Cytotoxic T Cell' if CD8 is positive, else (CD4 positive) 'Th1 Cell'
         with T-BET, 'Treg Cell' with FOXP3, or 'Helper T Cell'.
      3. Everything else: 'Other'.

    Args:
        row: Object supporting ``name in row`` and ``row[name]`` for marker
            names, e.g. a row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The label string for the row.
    """
    def marker_is(name, state):
        # True only when the marker is present AND holds the requested state.
        return name in row and row[name] == state

    # --- NK lineage: NK1.1 positive with CD3 explicitly negative ---
    if marker_is('NK1.1', 1) and marker_is('CD3', 0):
        # (CD69 state, GRANZYME state) -> label, tried in this order.
        nk_subtypes = (
            ((1, 1), 'Cytotoxic Activated NK Cell'),
            ((0, 1), 'Cytotoxic NK Cell'),
            ((1, 0), 'Activated NK Cell'),
        )
        for (cd69_state, granzyme_state), label in nk_subtypes:
            if marker_is('CD69', cd69_state) and marker_is('GRANZYME', granzyme_state):
                return label
        return 'NK Cell'

    # --- T lineage: requires CD3 positive ---
    if not marker_is('CD3', 1):
        return 'Other'

    if marker_is('NK1.1', 1):
        return 'NKT Cell'
    if marker_is('CD8', 1):
        return 'Cytotoxic T Cell'

    # CD3 positive with neither CD8 nor CD4 has no label of its own.
    if not marker_is('CD4', 1):
        return 'Other'

    if marker_is('T-BET', 1):
        return 'Th1 Cell'
    # TODO: might need to check for CD25 too before calling a cell Treg.
    if marker_is('FOXP3', 1):
        return 'Treg Cell'
    return 'Helper T Cell'

In [None]:
# Label every row of the merged presence table, then export it as a CSV.

from IPython.display import FileLink

# Run row_labeler on each row (axis=1) and store the returned label in a new
# LABEL column. This modifies test1 in place.
test1['LABEL'] = test1.apply(row_labeler, axis=1)

# Save CSV: written to final_output.csv relative to the working directory;
# index=False leaves the DataFrame index out of the file.
test1.to_csv('final_output.csv', index=False)

# Display a download link in the notebook (the FileLink is the cell's last
# expression, so it is rendered as the cell output).
FileLink('final_output.csv')


In [None]:
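# Legacy cell, kept for reference: an older merge_protein_csvs that keeps the
# original case of column headers and protein names instead of upper-casing them.
# Running this cell redefines merge_protein_csvs and overwrites test1.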
import pandas as pd
import os

def merge_protein_csvs(folder_path):
    """Case-preserving variant: build a marker presence table from CSV exports.

    NOTE: this definition reuses the name of the upper-casing
    ``merge_protein_csvs`` defined earlier in the notebook. Running this cell
    replaces that function.

    The CSV whose file name contains "dapi" (any case) supplies the base set of
    unique (Position X, Position Y) coordinate pairs. Every other CSV adds one
    0/1 column that is 1 where a base coordinate pair also occurs in that file.
    Coordinates are matched by exact equality of both values.

    Each file is read with its first three lines skipped. Header names are
    stripped of surrounding whitespace and trailing commas; their case is kept.

    The marker column name is the part of the file name (extension removed)
    before the first underscore, in its original case. Files that map to the
    same marker name are OR-ed into a single column.

    Args:
        folder_path: Directory containing the DAPI CSV and the marker CSVs.

    Returns:
        pandas.DataFrame with columns Position X, Position Y and one integer
        0/1 column per marker, one row per unique DAPI coordinate pair.
        Marker columns appear in sorted file-name order.

    Raises:
        ValueError: If no CSV with "dapi" in its name exists in the folder.
        KeyError: If a CSV lacks a Position X or Position Y column.
    """
    key_cols = ['Position X', 'Position Y']

    def load_positions(filename):
        # Read one export and return its unique coordinate pairs.
        frame = pd.read_csv(os.path.join(folder_path, filename), skiprows=3)
        frame.columns = [col.strip().rstrip(',') for col in frame.columns]
        missing = [col for col in key_cols if col not in frame.columns]
        if missing:
            raise KeyError(
                "{} is missing required column(s): {}".format(filename, missing)
            )
        return frame[key_cols].drop_duplicates().reset_index(drop=True)

    # os.listdir returns names in arbitrary order; sorting keeps both the DAPI
    # choice and the marker column order stable from run to run.
    csv_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith('.csv')
    )

    dapi_file = next((f for f in csv_files if 'dapi' in f.lower()), None)
    if not dapi_file:
        raise ValueError("No DAPI file found in folder.")

    base = load_positions(dapi_file)

    # Marker name -> boolean Series aligned with base's rows.
    presence = {}
    for file in csv_files:
        if file == dapi_file:
            continue

        # Drop the extension first so "CD4.csv" becomes "CD4", not "CD4.csv".
        protein_name = os.path.splitext(file)[0].split('_')[0]
        positions = load_positions(file)

        if base.empty or positions.empty:
            # A header-only file has object-typed key columns; skip the merge
            # and record "not present" for every base row.
            found = pd.Series(False, index=base.index)
        else:
            # A left merge keeps one output row per base row, in base order,
            # because `positions` holds no duplicate coordinate pairs.
            matched = base.merge(positions, on=key_cols, how='left', indicator=True)
            found = matched['_merge'] == 'both'

        if protein_name in presence:
            # Several files for one marker: present if found in any of them.
            found = presence[protein_name] | found
        presence[protein_name] = found

    for protein_name, found in presence.items():
        base[protein_name] = found.astype(int)

    return base

# Rebinds folder_path and test1 using the case-preserving function defined in
# this cell. The resulting table has 'Position X'/'Position Y' columns and
# marker columns in file-name case.
# NOTE(review): row_labeler looks up upper-case marker names such as
# 'GRANZYME' and 'T-BET', so differently-cased columns would not match —
# confirm before labelling this table.
folder_path = './data/'
test1 = merge_protein_csvs(folder_path)
# Bare last expression: the table is shown as this cell's output.
test1