In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from IPython.display import clear_output

In [8]:
# Load the data
X = np.load('../gabriel_data/X.npy')
y = np.load('../gabriel_data/y.npy')
dataset = pd.read_csv('../gabriel_data/clinical_data.csv')

print(X.shape)
print(y.shape)
print(dataset.shape)

(20589, 960, 2)
(20589,)
(20589, 7)


In [11]:
# Merge X, y, and dataset into a single DataFrame
X_list = [x for x in X]
y_list = y.tolist()
dataset['input_signals'] = X_list
dataset['label'] = y_list

In [12]:
# Define the columns of interest
columns_of_interest = ['OMAGE', 'OPAR1', 'OPAR2', 'RecGest', 'Diff', 'BMI_M', 'OSEX']

# Function to print stats for a given dataset
def print_stats(df, label):
    print(f"-------------------- Label {label} --------------------")
    df = df[df['label'] == label]
    count = len(df)
    print(f"Count: {count}")
    
    for col in columns_of_interest:
        if df[col].dtype == 'object' or len(df[col].unique()) <= 10:
            unique_values = df[col].unique()
            print(f"{col}: {len(unique_values)} unique values ({', '.join(map(str, unique_values))})")
        else:
            min_val = df[col].min()
            max_val = df[col].max()
            mean_val = df[col].mean()
            print(f"{col}: {min_val:.2f} - {max_val:.2f} (Mean: {mean_val:.2f})")
    
    print("\n")

print_stats(dataset, 0)
print_stats(dataset, 1) 

-------------------- Label 0 --------------------
Count: 10890
OMAGE: 18.00 - 49.00 (Mean: 30.61)
OPAR1: 7 unique values (1.0, 0.0, 4.0, 2.0, 3.0, 6.0, 5.0)
OPAR2: 10 unique values (0.0, 1.0, 2.0, 5.0, 3.0, 7.0, 4.0, 6.0, 14.0, 8.0)
RecGest: 27.00 - 41.00 (Mean: 34.85)
Diff: 1.00 - 2618.00 (Mean: 682.64)
BMI_M: 15.55 - 40.00 (Mean: 25.71)
OSEX: 2 unique values (1.0, 0.0)


-------------------- Label 1 --------------------
Count: 9699
OMAGE: 15.00 - 49.00 (Mean: 30.70)
OPAR1: 7 unique values (4.0, 1.0, 2.0, 0.0, 3.0, 5.0, 6.0)
OPAR2: 0.00 - 9.00 (Mean: 0.48)
RecGest: 27.00 - 41.00 (Mean: 34.76)
Diff: 0.00 - 168.00 (Mean: 62.35)
BMI_M: 15.42 - 39.82 (Mean: 26.03)
OSEX: 2 unique values (1.0, 0.0)




In [13]:
np.unique(y, return_counts=True)

(array([0, 1]), array([10890,  9699]))

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create TensorFlow datasets
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(16)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(16)