# Test File for datasets.py #

This notebook contains the code used to test the `H5Dataset` class imported from `dataset_creation/datasets.py`.

In [2]:
from dataset_creation.datasets import *

In [7]:
# Input files shared by every dataset built in this cell.
h5_path = 'data/ASP_dataset_slices/all_256mfpdnarnaprot.h5'
data_path = 'data/ASP_dataset_slices/drug_comboscore_hsa_zip.csv'

# Keyword groups reused below so each construction only spells out what it
# varies. The groups are unpacked in the same keyword order on every call.
_paths = dict(h5_path=h5_path, data_path=data_path)
_balanced_comboscore = dict(
    **_paths,
    target_column='COMBOSCORE',
    binary_classification=True,
    balance_classes=True,
)
_regression = dict(binary_classification=False, balance_classes=False)
_mfp_only = dict(use_mfp=True, use_dna=False, use_rna=False, use_prot=False)

# Balanced binary-classification COMBOSCORE dataset, mfp features only,
# no cancer-type or drug-class restriction.
mfp_bc_bal_dataset = H5Dataset(
    **_balanced_comboscore,
    cancer_type='all_cancer',
    drug_class='all_drugs',
    **_mfp_only,
)
print('Y values (should be 0 or 1s):', mfp_bc_bal_dataset.y[:10])

# Same configuration restricted to cancer_type='breast'.
mfp_bc_bal_breast_dataset = H5Dataset(
    **_balanced_comboscore,
    cancer_type='breast',
    drug_class='all_drugs',
    **_mfp_only,
)

# Same configuration restricted to drug_class='other_other'.
mfp_bc_bal_other_dataset = H5Dataset(
    **_balanced_comboscore,
    cancer_type='all_cancer',
    drug_class='other_other',
    **_mfp_only,
)

# Unbalanced regression on the ZIP column for melanoma, using the mfp and
# prot feature groups together.
mfp_prot_csreg_melanoma_dataset = H5Dataset(
    **_paths,
    target_column='ZIP',
    **_regression,
    cancer_type='melanoma',
    drug_class='all_drugs',
    use_mfp=True,
    use_dna=False,
    use_rna=False,
    use_prot=True,
)
print('Y values (should be continuous):', mfp_prot_csreg_melanoma_dataset.y[:10])

# Unbalanced regression on the HSA column for colon, using the mfp and rna
# feature groups together.
mfp_rna_hsa_colon_dataset = H5Dataset(
    **_paths,
    target_column='HSA',
    **_regression,
    cancer_type='colon',
    drug_class='all_drugs',
    use_mfp=True,
    use_dna=False,
    use_rna=True,
    use_prot=False,
)
print('Y values (should be continuous):', mfp_rna_hsa_colon_dataset.y[:10])

Initial dataset size: 538920
Balancing classes
Total feature dimensions: 512
Created feature matrix of shape torch.Size([331676, 512])
Total feature dimensions: 512
Y values (should be 0 or 1s): tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]])
Initial dataset size: 31378
Balancing classes
Total feature dimensions: 512
Created feature matrix of shape torch.Size([20668, 512])
Total feature dimensions: 512
Initial dataset size: 15790
Balancing classes
Total feature dimensions: 512
Created feature matrix of shape torch.Size([6272, 512])
Total feature dimensions: 512
Total feature dimensions: 1298
Created feature matrix of shape torch.Size([79984, 1298])
Total feature dimensions: 1298
Y values (should be continuous): tensor([[ -6.0537],
        [-14.7642],
        [ -9.0054],
        [ -5.5900],
        [ -8.1201],
        [ -4.6181],
        [ -1.7048],
        [ -4.7410],
        [ -1.7563],
      

In [9]:
pg_data_path = 'data/ASP_dataset_slices/drug_percent_growth.csv'

# Negative tests: both configurations below are expected to be rejected by
# H5Dataset with a ValueError. Each `try` has an `else` branch that fails
# loudly, because without it a constructor that stopped raising would let the
# test pass silently.

# Test percent growth throws an error if using binary classification
try:
    mfp_pg_bal_dataset = H5Dataset(
        h5_path=h5_path,
        data_path=pg_data_path,
        target_column='PERCENTGROWTH',
        binary_classification=True,
        balance_classes=True,
        cancer_type='all_cancer',
        drug_class='all_drugs',
        use_mfp=True,
        use_dna=False,
        use_rna=False,
        use_prot=False
    )
except ValueError as e:
    print('Expected error:', e)
else:
    # Reached only when construction succeeded, i.e. the validation is missing.
    raise AssertionError(
        'H5Dataset accepted binary_classification=True with '
        'target_column=PERCENTGROWTH; expected ValueError'
    )

# Test percent growth with balanced dataset throws an error if not using binary classification
try:
    mfp_pg_bal_dataset = H5Dataset(
        h5_path=h5_path,
        data_path=pg_data_path,
        target_column='PERCENTGROWTH',
        binary_classification=False,
        balance_classes=True,
        cancer_type='all_cancer',
        drug_class='all_drugs',
        use_mfp=True,
        use_dna=False,
        use_rna=False,
        use_prot=False
    )
except ValueError as e:
    print('Expected error:', e)
else:
    # Reached only when construction succeeded, i.e. the validation is missing.
    raise AssertionError(
        'H5Dataset accepted balance_classes=True with '
        'binary_classification=False; expected ValueError'
    )

# Settings common to both PERCENTGROWTH regression datasets below: same input
# files, same target column, no binarisation and no class balancing.
_pg_regression = dict(
    h5_path=h5_path,
    data_path=pg_data_path,
    target_column='PERCENTGROWTH',
    binary_classification=False,
    balance_classes=False,
)

# Percent growth with no cancer-type or drug-class restriction, mfp features only.
mfp_pg_dataset = H5Dataset(
    **_pg_regression,
    cancer_type='all_cancer',
    drug_class='all_drugs',
    use_mfp=True,
    use_dna=False,
    use_rna=False,
    use_prot=False,
)
print('Y values (should be floats):', mfp_pg_dataset.y[:10])

# Percent growth restricted to cancer_type='prostate', using the mfp and dna
# feature groups together.
mfp_dna_pg_prostate_dataset = H5Dataset(
    **_pg_regression,
    cancer_type='prostate',
    drug_class='all_drugs',
    use_mfp=True,
    use_dna=True,
    use_rna=False,
    use_prot=False,
)
print('Y values (should be floats):', mfp_dna_pg_prostate_dataset.y[:10])

Expected error: Cannot use binary classification on PERCENTGROWTH
Expected error: Cannot balance classes if not binary classification
Total feature dimensions: 514
Created feature matrix of shape torch.Size([4968384, 514])
Total feature dimensions: 514
Y values (should be floats): tensor([[ 85.9790],
        [100.9030],
        [ 14.1470],
        [ 71.2680],
        [ 89.2780],
        [  7.0070],
        [ 22.2880],
        [ 35.0960],
        [ 10.1170],
        [ 89.4840]])
Total feature dimensions: 1682
Created feature matrix of shape torch.Size([194382, 1682])
Total feature dimensions: 1682
Y values (should be floats): tensor([[95.5520],
        [86.0640],
        [56.1770],
        [51.5630],
        [56.1630],
        [52.3500],
        [20.7780],
        [23.1540],
        [24.6610],
        [90.4480]])
