# Integrated Analysis of OncoKids Tables

This notebook integrates all DNA, RNA, and CMA tables extracted from OncoKids PDF reports.

## Overview of the Process

1. **DNA Variant Integration**
   - Reads CSV files with DNA variant data
   - Standardizes column names and formats
   - Creates a unified DNA variant table

2. **RNA Fusion Integration**
   - Processes files containing RNA fusion data
   - Maps columns to standard names
   - Creates a unified RNA fusion table

3. **CMA (Chromosomal Microarray) Integration**
   - Identifies and processes CMA data tables
   - Standardizes formats
   - Creates a unified CMA results table

In [1]:
import os
import re
import sys
from datetime import datetime

import pandas as pd
from IPython.display import Markdown, display

# Add the src directory to the path so we can import our modules
sys.path.append('..')

# Import our custom modules
from src.data_integration import (
    standardize_and_merge_dna_data,
    standardize_and_merge_rna_data,
    integrate_cma_data
)

## 1. Set Global Parameters

In [2]:
# Shared locations. Everything is kept as a plain "/"-separated string and is
# relative to the directory this notebook runs from.
WORK_DIR = ".."
INPUT_DIR = f"{WORK_DIR}/table/extracted_tables/CoPath_OncoKids_All"

# One output sub-directory per data type, each nested under INPUT_DIR.
DNA_OUTPUT_DIR = f"{INPUT_DIR}/integrated_DNA_table"
RNA_OUTPUT_DIR = f"{INPUT_DIR}/integrated_RNA_table"
CMA_OUTPUT_DIR = f"{INPUT_DIR}/integrated_CMA_table"

# File name of the integrated CSV for each data type.
DNA_OUTPUT_FILENAME = "integrated_DNA_table.csv"
RNA_OUTPUT_FILENAME = "integrated_RNA_table.csv"
CMA_OUTPUT_FILENAME = "integrated_CMA_table.csv"

# Table-type labels passed to the integration helpers as `table_type`.
DNA_TABLE_TYPE = "DNA_variant"
RNA_TABLE_TYPE = "RNA_variant"
CMA_TABLE_TYPE = "CMA_variant"

## 2. Process DNA Variant Tables

In [3]:
# Process DNA data
#
# Bind `dna_df` to an empty frame first: if the integration call below raises,
# later cells then see "no DNA data" rather than an undefined name (fresh
# kernel) or a stale frame left over from an earlier run of this cell.
dna_df = pd.DataFrame()

try:
    # Project helper imported from src.data_integration, called with the
    # locations and table-type label defined in the configuration cell.
    dna_df = standardize_and_merge_dna_data(
        input_directory=INPUT_DIR,
        output_directory=DNA_OUTPUT_DIR,
        output_filename=DNA_OUTPUT_FILENAME,
        table_type=DNA_TABLE_TYPE
    )

    # Display results if data exists
    if not dna_df.empty:
        # Headings are wrapped in Markdown so they render as headings;
        # display() on a bare str would show its quoted repr instead.
        display(Markdown("#### DNA Data Processing Summary"))
        summary_df = pd.DataFrame({
            'Metric': ['Total Files Processed', 'Total Rows', 'Total Columns'],
            'Value': [
                # Number of distinct values in the 'report_name' column.
                len(dna_df['report_name'].unique()),
                len(dna_df),
                len(dna_df.columns)
            ]
        })
        display(summary_df)

        # Per-column overview: non-null count, distinct-value count and the
        # first two distinct values rendered as text.
        display(Markdown("#### DNA Data Column Information"))
        column_info = pd.DataFrame({
            'Column Name': dna_df.columns,
            'Non-null Count': dna_df.count(),
            'Unique Values': [dna_df[col].nunique() for col in dna_df.columns],
            'Sample Values': [', '.join(dna_df[col].unique()[:2].astype(str))
                              for col in dna_df.columns]
        })
        display(column_info)

        # Display sample of processed data
        display(Markdown("#### DNA Data Sample (First 5 Rows)"))
        display(dna_df.head())

        # Most frequent values for the key columns that are actually present.
        display(Markdown("#### DNA Data Key Column Statistics"))
        key_columns = ['Classification', 'Gene Name', 'Variant Allele Frequency']
        for col in key_columns:
            if col in dna_df.columns:
                display(Markdown(f"\nValue counts for {col}:"))
                display(dna_df[col].value_counts().head())

except Exception as e:
    # Deliberately best-effort so the remaining cells can still run. The
    # message is printed as plain text so exception details are shown verbatim
    # (not quoted, and not interpreted as Markdown).
    print(f"⚠️ Error processing DNA data: {str(e)}")

## 3. Process RNA Fusion Tables

In [4]:
# Process RNA data
#
# Bind `rna_df` to an empty frame first: if the integration call below raises,
# later cells then see "no RNA data" rather than an undefined name (fresh
# kernel) or a stale frame left over from an earlier run of this cell.
rna_df = pd.DataFrame()

try:
    # Project helper imported from src.data_integration, called with the
    # locations and table-type label defined in the configuration cell.
    rna_df = standardize_and_merge_rna_data(
        input_directory=INPUT_DIR,
        output_directory=RNA_OUTPUT_DIR,
        output_filename=RNA_OUTPUT_FILENAME,
        table_type=RNA_TABLE_TYPE
    )

    # Display results if data exists
    if not rna_df.empty:
        # Headings are wrapped in Markdown so they render as headings;
        # display() on a bare str would show its quoted repr instead.
        display(Markdown("#### RNA Data Processing Summary"))
        summary_df = pd.DataFrame({
            'Metric': ['Total Files Processed', 'Total Rows', 'Total Columns'],
            'Value': [
                # Number of distinct values in the 'report_name' column.
                len(rna_df['report_name'].unique()),
                len(rna_df),
                len(rna_df.columns)
            ]
        })
        display(summary_df)

        # Per-column overview: non-null count, distinct-value count and the
        # first two distinct values rendered as text.
        display(Markdown("#### RNA Data Column Information"))
        column_info = pd.DataFrame({
            'Column Name': rna_df.columns,
            'Non-null Count': rna_df.count(),
            'Unique Values': [rna_df[col].nunique() for col in rna_df.columns],
            'Sample Values': [', '.join(rna_df[col].unique()[:2].astype(str))
                              for col in rna_df.columns]
        })
        display(column_info)

        # Display sample of processed data
        display(Markdown("#### RNA Data Sample (First 5 Rows)"))
        display(rna_df.head())

        # Most frequent values of 'Classification', when that column exists.
        display(Markdown("#### RNA Data Key Column Statistics"))
        if 'Classification' in rna_df.columns:
            display(Markdown("\nValue counts for Classification:"))
            display(rna_df['Classification'].value_counts().head())

except Exception as e:
    # Deliberately best-effort so the remaining cells can still run. The
    # message is printed as plain text so exception details are shown verbatim
    # (not quoted, and not interpreted as Markdown).
    print(f"⚠️ Error processing RNA data: {str(e)}")

## 4. Process CMA Tables

In [5]:
# Process CMA data
#
# Bind `cma_df` to an empty frame first: if the integration call below raises,
# later cells then see "no CMA data" rather than an undefined name (fresh
# kernel) or a stale frame left over from an earlier run of this cell.
cma_df = pd.DataFrame()

try:
    # Project helper imported from src.data_integration, called with the
    # locations and table-type label defined in the configuration cell.
    cma_df = integrate_cma_data(
        input_directory=INPUT_DIR,
        output_directory=CMA_OUTPUT_DIR,
        output_filename=CMA_OUTPUT_FILENAME,
        table_type=CMA_TABLE_TYPE
    )

    # Display results if data exists
    if not cma_df.empty:
        # Headings are wrapped in Markdown so they render as headings;
        # display() on a bare str would show its quoted repr instead.
        display(Markdown("#### CMA Data Processing Summary"))
        summary_df = pd.DataFrame({
            'Metric': ['Total Files Processed', 'Total Rows', 'Total Columns'],
            'Value': [
                # Number of distinct values in the 'report_name' column.
                len(cma_df['report_name'].unique()),
                len(cma_df),
                len(cma_df.columns)
            ]
        })
        display(summary_df)

        # Per-column overview: non-null count, distinct-value count and the
        # first two distinct values rendered as text.
        display(Markdown("#### CMA Data Column Information"))
        column_info = pd.DataFrame({
            'Column Name': cma_df.columns,
            'Non-null Count': cma_df.count(),
            'Unique Values': [cma_df[col].nunique() for col in cma_df.columns],
            'Sample Values': [', '.join(cma_df[col].unique()[:2].astype(str))
                              for col in cma_df.columns]
        })
        display(column_info)

        # Display sample of processed data
        display(Markdown("#### CMA Data Sample (First 5 Rows)"))
        display(cma_df.head())

        # Most frequent values for the optional columns that are present.
        if 'Chromosome' in cma_df.columns:
            display(Markdown("\nValue counts for Chromosome:"))
            display(cma_df['Chromosome'].value_counts().head())

        if 'CNV Type' in cma_df.columns:
            display(Markdown("\nValue counts for CNV Type:"))
            display(cma_df['CNV Type'].value_counts().head())

except Exception as e:
    # Deliberately best-effort so the remaining cells can still run. The
    # message is printed as plain text so exception details are shown verbatim
    # (not quoted, and not interpreted as Markdown).
    print(f"⚠️ Error processing CMA data: {str(e)}")

## 5. Summary and Comparison

In [6]:
# Create a summary of all processed data
# (Markdown so the heading renders; display() on a bare str shows its repr.)
display(Markdown("#### Overall Data Processing Summary"))

# Start with zero counts for every data type; rows are ordered DNA, RNA, CMA.
summary_data = {
    'Data Type': ['DNA Variants', 'RNA Fusions', 'CMA Results'],
    'Files Processed': [0, 0, 0],
    'Rows': [0, 0, 0],
    'Columns': [0, 0, 0],
    'Output Path': [os.path.join(DNA_OUTPUT_DIR, DNA_OUTPUT_FILENAME),
                    os.path.join(RNA_OUTPUT_DIR, RNA_OUTPUT_FILENAME),
                    os.path.join(CMA_OUTPUT_DIR, CMA_OUTPUT_FILENAME)]
}

# Update with actual data if available. Frames are looked up by name so a
# processing cell that failed or was skipped leaves its row at zero instead of
# raising here; the same applies when the name is bound to something that is
# not a DataFrame (e.g. None).
for position, frame_name in enumerate(['dna_df', 'rna_df', 'cma_df']):
    frame = globals().get(frame_name)
    if not isinstance(frame, pd.DataFrame) or frame.empty:
        continue
    # 'report_name' is what the file count is derived from; without that
    # column the file count stays 0 but rows/columns are still reported.
    if 'report_name' in frame.columns:
        summary_data['Files Processed'][position] = len(frame['report_name'].unique())
    summary_data['Rows'][position] = len(frame)
    summary_data['Columns'][position] = len(frame.columns)

summary_table = pd.DataFrame(summary_data)
display(summary_table)

# Create a timestamp for this run (local time, second resolution).
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Printed rather than display()-ed so it is shown as plain text, not a quoted repr.
print(f"Integration completed at: {timestamp}")