In [1]:
import os
import pandas as pd
import numpy as np

# Directory containing TSV files
ts_directory = "/mnt/shared/Sayedali/Kolling/Uniti_correct/84-85"  # Change this to your folder path

# List of features to extract
features = ["CADD_PHRED", "gene_score", "Phenotype", "PH", "DCs_score", "Adjusted AF for CRI"]

# Aggregation function
def aggregate_patient_data(directory):
    """Aggregate features for each patient."""
    aggregated_data = []

    for filename in os.listdir(directory):
        if filename.endswith(".tsv"):
            filepath = os.path.join(directory, filename)
            df = pd.read_csv(filepath, sep='\t')  # Read TSV file

            # Ensure all required columns exist
            if not set(features).issubset(df.columns):
                print(f"Skipping {filename}: Missing required columns.")
                continue

            # Replace 'inf' values in gene_score with 0
            df["gene_score"] = df["gene_score"].replace([np.inf, -np.inf], 0)

            # Aggregate data for the patient
            aggregated_row = {
                "Patient_ID": filename,  # Use the filename as the patient ID
                "CADD_PHRED": df["CADD_PHRED"].max(),  # Max pathogenicity
                "gene_score": df["gene_score"].max(),  # Max gene score
                "Phenotype": df["Phenotype"].eq(1).sum() / len(df),  # Proportion of '1's in Phenotype
                "PH": df["PH"].mean(),  # Mean for continuous property
                # "DCs_score": df["DCs_score"].max(),  # Max disease-causing score
                 "DCs_score": (df["DCs_score"] == df["DCs_score"].max()).sum() / len(df),  #Proportion of rows with max DCs_score
                "Adjusted AF for CRI": df["Adjusted AF for CRI"].mean()  # Mean allele frequency
            }
            aggregated_data.append(aggregated_row)

    return pd.DataFrame(aggregated_data)

# Aggregate the data
aggregated_data = aggregate_patient_data(ts_directory)

# Save aggregated data to a CSV file
output_file = "aggregated_patient_data_caddphredd_fixed_Uniti_84_85.csv"
aggregated_data.to_csv(output_file, index=False)
print(f"Aggregated data saved to {output_file}")




  df = pd.read_csv(filepath, sep='\t')  # Read TSV file
  df = pd.read_csv(filepath, sep='\t')  # Read TSV file


Aggregated data saved to aggregated_patient_data_caddphredd_fixed_Uniti_84_85.csv


### Aggregated data with the new feature Compound_Het_Score

In [2]:
# Family data aggregation
# -------------------------------------

import pandas as pd
import numpy as np
import os


# Load the existing aggregated patient-level data
aggregated_df_path = "aggregated_patient_data_caddphredd_fixed_Uniti_84_85.csv"
aggregated_data = pd.read_csv(aggregated_df_path)
# ‚ûï New column name
new_column_name = "Compound_Het_Score"

# üìÅ Point to the updated .tsv files (with Rare_AF_Pair_Count)
tsv_dir = "/mnt/shared/Sayedali/Kolling/Uniti_correct/84-85"  # Change this to your folder path

# üßÆ Function to calculate compound het score
def compute_compound_het_score(tsv_path):
    try:
        df = pd.read_csv(tsv_path, sep='\t')

        # Only keep rows with Rare_AF_Pair_Count > 0
        rare_df = df[df["Rare_AF_Pair_Count"] > 0]

        # Unique candidate genes
        num_candidate_genes = rare_df["SYMBOL"].nunique()

        # Total rare AF pairs across all candidate genes
        total_pair_count = rare_df["Rare_AF_Pair_Count"].sum()

        return num_candidate_genes * total_pair_count

    except Exception as e:
        print(f"Error reading {tsv_path}: {e}")
        return None

# üîÅ Loop through each patient and calculate Compound_Het_Score
compound_scores = []

for patient_id in aggregated_data["Patient_ID"]:
    patient_file = os.path.join(tsv_dir, patient_id)
    score = compute_compound_het_score(patient_file)
    compound_scores.append(score)

# Add the new feature to the existing dataframe
aggregated_data[new_column_name] = compound_scores

# üíæ Save the updated dataframe
updated_output_file = "aggregated_patientl_data_with_compound_het_score_Uniti_84_85.csv"
aggregated_data.to_csv(updated_output_file, index=False)

print(f"‚úÖ Updated file saved: {updated_output_file}")


  df = pd.read_csv(tsv_path, sep='\t')
  df = pd.read_csv(tsv_path, sep='\t')


‚úÖ Updated file saved: aggregated_patientl_data_with_compound_het_score_Uniti_84_85.csv
