In [1]:
import numpy as np
import pandas as pd
from scipy import io

In [2]:
# Load data
# The four CSV tables (labels and dense features, train and test) are read in
# a fixed order, each with its first column as the index.
y_tr, y_te, x_tr_dense, x_te_dense = (
    pd.read_csv(csv_path, index_col=0, compression="gzip")
    for csv_path in (
        "tox21_labels_train.csv.gz",
        "tox21_labels_test.csv.gz",
        "tox21_dense_train.csv.gz",
        "tox21_dense_test.csv.gz",
    )
)
# The sparse features are stored as Matrix Market files and converted to CSC.
x_tr_sparse, x_te_sparse = (
    io.mmread(mtx_path).tocsc()
    for mtx_path in ("tox21_sparse_train.mtx.gz", "tox21_sparse_test.mtx.gz")
)

In [3]:
# Function to print dataset info
def print_dataset_info(name, dataset):
    """Print a plain-text summary of one dataset to stdout.

    The name, ``dataset.shape`` and ``type(dataset)`` are always printed.
    What follows depends on the kind of object:

    * ``pd.DataFrame``: the column index, ``head()`` and the dtypes.
    * ``np.ndarray``: a corner of the array, at most 5 entries along each of
      the first two axes.
    * any other object exposing ``toarray`` (e.g. a SciPy sparse matrix):
      the ``[:5, :5]`` block, converted to dense form.

    A "Missing values" section closes the summary: per-column null counts
    for DataFrames, a fixed "N/A" message for everything else.

    Parameters
    ----------
    name : str
        Heading printed above the summary.
    dataset : object
        Object to describe; must expose a ``shape`` attribute.

    Returns
    -------
    None
    """
    print(f"\n{name}:")
    print(f"Shape: {dataset.shape}")
    print(f"Type: {type(dataset)}")
    if isinstance(dataset, pd.DataFrame):
        print("Columns:")
        print(dataset.columns)
        print("\nSample data:")
        print(dataset.head())
        print("\nData types:")
        print(dataset.dtypes)
    elif isinstance(dataset, np.ndarray):
        print("Sample data:")
        # Slice only as many leading axes as exist (up to two) so that 0-D and
        # 1-D arrays no longer raise IndexError; for arrays with two or more
        # axes this is exactly dataset[:5, :5].
        corner = (slice(5),) * min(dataset.ndim, 2)
        print(dataset[corner])
    elif hasattr(dataset, "toarray"):
        print("Sample data (dense form):")
        # Slice before densifying so only the 5x5 block is materialised.
        print(dataset[:5, :5].toarray())
    print("\nMissing values:")
    if isinstance(dataset, pd.DataFrame):
        print(dataset.isnull().sum())
    else:
        print("N/A for non-DataFrame objects")


# Print info for each dataset
# Labels first, then dense features, then sparse features; train before test.
for _label, _dataset in (
    ("Training Labels (y_tr)", y_tr),
    ("Test Labels (y_te)", y_te),
    ("Dense Training Features (x_tr_dense)", x_tr_dense),
    ("Dense Test Features (x_te_dense)", x_te_dense),
    ("Sparse Training Features (x_tr_sparse)", x_tr_sparse),
    ("Sparse Test Features (x_te_sparse)", x_te_sparse),
):
    print_dataset_info(_label, _dataset)


Training Labels (y_tr):
Shape: (12060, 12)
Type: <class 'pandas.core.frame.DataFrame'>
Columns:
Index(['NR.AhR', 'NR.AR', 'NR.AR.LBD', 'NR.Aromatase', 'NR.ER', 'NR.ER.LBD',
       'NR.PPAR.gamma', 'SR.ARE', 'SR.ATAD5', 'SR.HSE', 'SR.MMP', 'SR.p53'],
      dtype='object')

Sample data:
                 NR.AhR  NR.AR  NR.AR.LBD  NR.Aromatase  NR.ER  NR.ER.LBD  \
NCGC00178831-03     NaN    NaN        NaN           NaN    NaN        NaN   
NCGC00166114-03     NaN    NaN        NaN           NaN    NaN        NaN   
NCGC00263563-01     NaN    NaN        NaN           NaN    NaN        NaN   
NCGC00013058-02     NaN    NaN        NaN           NaN    NaN        NaN   
NCGC00167516-01     NaN    0.0        NaN           NaN    NaN        NaN   

                 NR.PPAR.gamma  SR.ARE  SR.ATAD5  SR.HSE  SR.MMP  SR.p53  
NCGC00178831-03            NaN     NaN       NaN     0.0     NaN     NaN  
NCGC00166114-03            NaN     NaN       NaN     0.0     NaN     NaN  
NCGC00263563-01          

In [4]:
# Additional analysis
# unique() keeps NaN, so unlabeled entries show up alongside 0.0 and 1.0.
print("\nUnique values in target variables:")
for col in y_tr.columns:
    print(f"{col}: {y_tr[col].unique()}")

# value_counts() drops NaN by default, so each proportion below is computed
# over the labeled rows of that column only.
print("\nClass distribution in training set:")
for col in y_tr.columns:
    value_counts = y_tr[col].value_counts(normalize=True)
    print(f"{col}:")
    print(value_counts)
    print()

# Calculate and print the number of features after filtering
# A sparse column is kept when it is positive in more than 5% of the training
# rows. np.asarray(...) replaces the np.matrix-only `.A` attribute: it yields
# the same 1-D boolean mask when mean(0) returns an np.matrix, and also works
# when it returns a plain ndarray (SciPy sparse arrays).
sparse_col_idx = np.asarray((x_tr_sparse > 0).mean(0) > 0.05).ravel()
num_dense_features = x_tr_dense.shape[1]
num_sparse_features = sparse_col_idx.sum()
total_features = num_dense_features + num_sparse_features

print(f"\nNumber of dense features: {num_dense_features}")
print(f"Number of sparse features (after filtering): {num_sparse_features}")
print(f"Total number of features: {total_features}")


Unique values in target variables:
NR.AhR: [nan  0.  1.]
NR.AR: [nan  0.  1.]
NR.AR.LBD: [nan  0.  1.]
NR.Aromatase: [nan  1.  0.]
NR.ER: [nan  1.  0.]
NR.ER.LBD: [nan  1.  0.]
NR.PPAR.gamma: [nan  0.  1.]
SR.ARE: [nan  0.  1.]
SR.ATAD5: [nan  0.  1.]
SR.HSE: [ 0.  1. nan]
SR.MMP: [nan  1.  0.]
SR.p53: [nan  1.  0.]

Class distribution in training set:
NR.AhR:
NR.AhR
0.0    0.883782
1.0    0.116218
Name: proportion, dtype: float64

NR.AR:
NR.AR
0.0    0.960327
1.0    0.039673
Name: proportion, dtype: float64

NR.AR.LBD:
NR.AR.LBD
0.0    0.965319
1.0    0.034681
Name: proportion, dtype: float64

NR.Aromatase:
NR.Aromatase
0.0    0.949194
1.0    0.050806
Name: proportion, dtype: float64

NR.ER:
NR.ER
0.0    0.878925
1.0    0.121075
Name: proportion, dtype: float64

NR.ER.LBD:
NR.ER.LBD
0.0    0.949558
1.0    0.050442
Name: proportion, dtype: float64

NR.PPAR.gamma:
NR.PPAR.gamma
0.0    0.971956
1.0    0.028044
Name: proportion, dtype: float64

SR.ARE:
SR.ARE
0.0    0.845156
1.0    0.154