In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from typing import Optional, Dict, Any


In [15]:

# =============================================================
# Data Processing Class 
# =============================================================

class DataProcessor:
    """
    Bundle the molecular, clinical and target tables of a cohort and provide
    helpers to inspect, merge, preprocess and plot them.

    Attributes:
        molecular_data: Molecular table passed to the constructor.
        clinical_data: Clinical table passed to the constructor.
        target_data: Target table passed to the constructor.
        merged_table: Result of the last ``mergedatabis`` call, or ``None``
            when no merge has been performed yet.
    """

    def __init__(self, molecular_data: pd.DataFrame, clinical_data: pd.DataFrame, target_data: pd.DataFrame) -> None:
        """
        Initialize the DataProcessor with molecular, clinical, and target data.

        Args:
            molecular_data: Molecular table (first positional argument).
            clinical_data: Clinical table (second positional argument).
            target_data: Target table (third positional argument).
        """
        self.molecular_data: pd.DataFrame = molecular_data
        self.clinical_data: pd.DataFrame = clinical_data
        self.target_data: pd.DataFrame = target_data
        # Filled in by mergedatabis(); None means "not merged yet" (see show()).
        self.merged_table: Optional[pd.DataFrame] = None

    def get_data_shapes(self) -> tuple:
        """
        Returns the shapes of the molecular, clinical, and target datasets.
        """
        return (self.molecular_data.shape, self.clinical_data.shape, self.target_data.shape)

    def value_types(self) -> dict:
        """
        Returns the data types of each column in the datasets.
        """
        return {
            'molecular_data_types': self.molecular_data.dtypes.to_dict(),
            'clinical_data_types': self.clinical_data.dtypes.to_dict(),
            'target_data_types': self.target_data.dtypes.to_dict()
        }

    def show(self) -> None:
        """
        Print the first rows of the data in its current form: the three
        separate tables before a merge, the merged table afterwards.
        """
        if self.merged_table is None:
            print("Molecular Data :")
            print(self.molecular_data.head())
            print("\nClinical Data :")
            print(self.clinical_data.head())
            print("\nTarget Data :")
            print(self.target_data.head())
        else:
            print("Merged Data table : ")
            print(self.merged_table.head())

    def mergedatabis(self) -> pd.DataFrame:
        """
        Merges Molecular, clinical and target data on ID.
        - Clinical data is the left table of both left joins, so every clinical
          row is kept even without a molecular or target match.
        - Keeps multiple rows per patient. These rows are treated in different ways by another method

        Returns:
            The merged table, which is also stored in ``merged_table``.
        """
        merged: pd.DataFrame = (
            self.clinical_data
            .merge(self.molecular_data, on="ID", how="left")
            .merge(self.target_data, on="ID", how="left")
        )
        self.merged_table = merged
        return merged

    def preprocess(self) -> tuple:
        """
        Mean-impute, then standardize, the numeric columns of the molecular
        and clinical tables.

        The first column of each table is skipped, as is every non-numeric
        column: mean imputation and scaling are undefined for strings and
        would raise. Numeric columns without a single observed value are
        skipped too, because they have no mean to impute with.

        The processed tables are new DataFrames bound to ``molecular_data``
        and ``clinical_data``; the frames passed to the constructor are not
        modified. ``target_data`` and ``merged_table`` are left as they are,
        so call ``mergedatabis`` again to merge the processed tables.

        Returns:
            ``(molecular_data, clinical_data, target_data)`` after processing.
        """
        self.molecular_data = self._impute_and_scale(self.molecular_data)
        self.clinical_data = self._impute_and_scale(self.clinical_data)
        return self.molecular_data, self.clinical_data, self.target_data

    @staticmethod
    def _impute_and_scale(frame: pd.DataFrame) -> pd.DataFrame:
        """Return ``frame`` with its usable numeric columns mean-imputed and standardized."""
        # Skip the first column (as the positional slice always did) and keep numeric dtypes only.
        candidates = frame.iloc[:, 1:].select_dtypes(include=[np.number])
        # A column made only of missing values cannot be mean-imputed; leave it alone.
        usable_cols = [
            col
            for col, has_value in zip(candidates.columns, candidates.notna().any(axis=0))
            if has_value
        ]
        if not usable_cols:
            # Nothing to transform: hand back the frame untouched.
            return frame

        imputed = SimpleImputer(strategy='mean').fit_transform(frame[usable_cols])
        scaled = StandardScaler().fit_transform(imputed)

        # Work on a copy so the caller's original table is never overwritten.
        processed = frame.copy()
        processed[usable_cols] = scaled
        return processed

    def plot_quant_var(self) -> None:
        """
        Plots quantitative variables from molecular and clinical datasets.

        One histogram grid is drawn per table; a table without any numeric
        column is reported and skipped instead of raising.
        """
        self._plot_numeric_histograms(self.molecular_data, "Molecular Data Distributions")
        self._plot_numeric_histograms(self.clinical_data, "Clinical Data Distributions")

    @staticmethod
    def _plot_numeric_histograms(frame: pd.DataFrame, title: str) -> None:
        """Draw one histogram per numeric column of ``frame`` under the given figure title."""
        quant_cols = frame.select_dtypes(include=[np.number]).columns.tolist()
        if not quant_cols:
            # DataFrame.hist raises when there is nothing numeric to plot.
            print(f"{title}: no quantitative columns to plot")
            return

        frame[quant_cols].hist(bins=30, figsize=(15, 10))
        plt.suptitle(title)
        plt.show()



In [16]:

# Relative locations of the three training CSV files.
Xtrain_molecular: str = r"./data/molecular_train.csv"
Xtrain_clinical: str = r"./data/clinical_train.csv"
target_train: str = r"./data/target_train.csv"

# Read the files in the order molecular, clinical, target; one DataFrame each.
data_train_mol: pd.DataFrame
data_train_cli: pd.DataFrame
target_df: pd.DataFrame
data_train_mol, data_train_cli, target_df = map(
    pd.read_csv,
    (Xtrain_molecular, Xtrain_clinical, target_train),
)

In [17]:
# Preview the molecular table; the rendered output is truncated to the first and last rows.
data_train_mol

Unnamed: 0,ID,CHR,START,END,REF,ALT,GENE,PROTEIN_CHANGE,EFFECT,VAF,DEPTH
0,P100000,11,119149248.0,119149248.0,G,A,CBL,p.C419Y,non_synonymous_codon,0.0830,1308.0
1,P100000,5,131822301.0,131822301.0,G,T,IRF1,p.Y164*,stop_gained,0.0220,532.0
2,P100000,3,77694060.0,77694060.0,G,C,ROBO2,p.?,splice_site_variant,0.4100,876.0
3,P100000,4,106164917.0,106164917.0,G,T,TET2,p.R1262L,non_synonymous_codon,0.4300,826.0
4,P100000,2,25468147.0,25468163.0,ACGAAGAGGGGGTGTTC,A,DNMT3A,p.E505fs*141,frameshift_variant,0.0898,942.0
...,...,...,...,...,...,...,...,...,...,...,...
10930,P131472,,,,,,MLL,MLL_PTD,PTD,,
10931,P131505,,,,,,MLL,MLL_PTD,PTD,,
10932,P131816,,,,,,MLL,MLL_PTD,PTD,,
10933,P132717,,,,,,MLL,MLL_PTD,PTD,,


In [18]:
# Preview the clinical table; the rendered output is truncated to the first and last rows.
data_train_cli

Unnamed: 0,ID,CENTER,BM_BLAST,WBC,ANC,MONOCYTES,HB,PLT,CYTOGENETICS
0,P132697,MSK,14.0,2.80,0.20,0.70,7.6,119.0,"46,xy,del(20)(q12)[2]/46,xy[18]"
1,P132698,MSK,1.0,7.40,2.40,0.10,11.6,42.0,"46,xx"
2,P116889,MSK,15.0,3.70,2.10,0.10,14.2,81.0,"46,xy,t(3;3)(q25;q27)[8]/46,xy[12]"
3,P132699,MSK,1.0,3.90,1.90,0.10,8.9,77.0,"46,xy,del(3)(q26q27)[15]/46,xy[5]"
4,P132700,MSK,6.0,128.00,9.70,0.90,11.1,195.0,"46,xx,t(3;9)(p13;q22)[10]/46,xx[10]"
...,...,...,...,...,...,...,...,...,...
3318,P121828,VU,1.0,3.70,2.53,0.53,8.9,499.0,"46,xy[20]"
3319,P121829,VU,0.0,4.20,2.40,0.22,10.6,49.0,"46,xy,del(13)(q12q14)[1]/45,x,-y,del(13)(q12q1..."
3320,P121830,VU,0.0,1.80,0.55,0.29,9.4,86.0,"46,xy,del(20)(q11.2q13.1)[4]/45,xy,idem,-7[16]"
3321,P121853,VU,5.0,1.37,0.37,0.11,11.4,102.0,"46,xx,del(1)(p34)[5]/45,xx,sl,-18[12]/46,xx,sd..."


In [19]:
# Preview the target table; note that some rows have no OS_YEARS / OS_STATUS value.
target_df

Unnamed: 0,ID,OS_YEARS,OS_STATUS
0,P132697,1.115068,1.0
1,P132698,4.928767,0.0
2,P116889,2.043836,0.0
3,P132699,2.476712,1.0
4,P132700,3.145205,0.0
...,...,...,...
3318,P121828,,
3319,P121829,,
3320,P121830,1.997260,0.0
3321,P121853,0.095890,1.0


In [20]:

# Convert 'OS_YEARS' to numeric; values that cannot be parsed become NaN.
target_df['OS_YEARS'] = pd.to_numeric(target_df['OS_YEARS'], errors='coerce')

# Store 'OS_STATUS' as a nullable boolean. A plain astype(bool) turns the
# missing statuses (NaN) into True, i.e. it silently records an event for
# patients whose outcome is unknown; the "boolean" dtype keeps them as <NA>.
# Drop the <NA> rows and call .astype(bool) later if a plain NumPy bool is needed.
target_df['OS_STATUS'] = target_df['OS_STATUS'].astype("boolean")
print(f" Shape des données moléculaires {data_train_mol.shape} \n shape des données cliniques {data_train_cli.shape} \n shape des données cibles {target_df.shape}")

# DataProcessor expects (molecular, clinical, target). Passing by keyword
# prevents the clinical and molecular tables from being swapped.
b = DataProcessor(
    molecular_data=data_train_mol,
    clinical_data=data_train_cli,
    target_data=target_df,
)


 Shape des données moléculaires (10935, 11) 
 shape des données cliniques (3323, 9) 
 shape des données cibles (3323, 3)
