# import libraries


In [1]:
# Standard library and third-party imports
import os
import sys
from pathlib import Path
import pandas as pd
from google.oauth2 import service_account
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
from scipy import stats, ndimage, interpolate
from sklearn.preprocessing import StandardScaler
import google.auth
warnings.filterwarnings('ignore')  # Suppresses all warnings

# Add parent directory to Python path for local imports
sys.path.append('..')

# Local application imports
from src.mimicdf import MIMICDF
# NOTE(review): the recorded output of this cell reports
# "ModuleNotFoundError: No module named 'src.DatePreprocessor'". The module
# name looks like a typo for 'DataPreprocessor' (matching the imported class
# name) — confirm against the actual file name under src/ and fix the import.
from src.DatePreprocessor import DataPreprocessor

def setup_mimic_connection(project_id: str = 'copper-actor-403003'):
    """
    Initialize MIMIC database connection with GCP credentials.

    Exports the project ID through the ``GCP_PROJECT_ID`` environment
    variable, obtains Application Default Credentials with
    ``google.auth.default()`` and builds a ``MIMICDF`` with ``source='gcp'``.

    Args:
        project_id: GCP project ID written to ``GCP_PROJECT_ID``. Defaults to
            the project this notebook was originally written against, so
            calling the function without arguments behaves as before.

    Returns:
        The ``MIMICDF`` instance constructed with the resolved credentials.

    Raises:
        google.auth.exceptions.DefaultCredentialsError: Raised by
            ``google.auth.default()`` when no application default
            credentials can be found.
    """
    # Presumably read by MIMICDF (or the GCP client it creates) — confirm.
    os.environ['GCP_PROJECT_ID'] = project_id

    # Use application default credentials. google.auth.default() also returns
    # the project it detected, which is intentionally not used here.
    credentials, _ = google.auth.default()

    return MIMICDF(source='gcp', credentials=credentials)

# Initialize MIMIC database connection.
# The resulting object is kept at notebook level and handed to
# DataPreprocessor in the preprocessing cell.
mimicdf = setup_mimic_connection()

ModuleNotFoundError: No module named 'src.DatePreprocessor'

# Data Preprocessing

In [2]:
# Wrap the shared MIMIC connection in the project's preprocessor.
data_preprocessor = DataPreprocessor(mimicdf)
# prepare_data() is defined in the project module; presumably it returns the
# cleaned DataFrame that the following cells analyse — confirm.
df_clean = data_preprocessor.prepare_data()
# Preview the first rows as the cell output.
df_clean.head()


NameError: name 'DataPreprocessor' is not defined

In [None]:
# Summary statistics of df_clean, transposed so that each described column
# becomes one row of the displayed table.
df_clean.describe().T

In [None]:
# --- Feature selection ---------------------------------------------------
# Columns of df_clean that are transformed and plotted in this cell.
features = [
    'age_at_ed', 'los_minutes', 'heartrate', 'sbp', 'dbp',
    'o2sat', 'resprate', 'temperature', 'pain',
]
# Work on a copy so the transforms below do not modify df_clean.
df_prep = df_clean[features].copy()

# --- Box-Cox transform of los_minutes ------------------------------------
# Without an explicit lambda, stats.boxcox returns both the transformed
# values and the lambda it fitted.
los_boxcox, lambda_param = stats.boxcox(df_prep['los_minutes'])
df_prep['los_minutes'] = los_boxcox
print(f"Box-Cox lambda parameter for los_minutes: {lambda_param:.3f}")

# --- Standard scaling of every remaining feature -------------------------
other_features = [name for name in features if name != 'los_minutes']
scaler = StandardScaler()
scaler.fit(df_prep[other_features])
df_prep[other_features] = scaler.transform(df_prep[other_features])

# --- Histograms: one subplot per feature, three per row ------------------
n_features = len(features)
n_cols = 3
n_rows = -(-n_features // n_cols)  # ceiling division

fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows))
axes = axes.flatten()  # 1-D sequence of Axes, in row-major order

for ax, feature in zip(axes, features):
    ax.hist(df_prep[feature], bins=50)
    ax.set(
        title=f'Distribution of {feature}',
        xlabel=feature,
        ylabel='Frequency',
    )

# Drop any grid cells left over after the last feature.
for spare_ax in axes[n_features:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

# --- Summary of the transformed data -------------------------------------
print("\nBasic statistics of transformed features:")
print(df_prep.describe().round(3))

# Show the first rows of the prepared DataFrame as the cell output.
df_prep.head()