In [1]:
# Mount Google Drive (optional, if you want to save outputs to Drive)
# from google.colab import drive
# drive.mount('/content/drive')

# Set your project path in Google Drive (adjust as needed)
# PROJECT_DRIVE_PATH = '/content/drive/MyDrive/your_colab_projects/dynamic_dnn_trainer'
# If not using Drive, you can clone directly into Colab's temporary storage.

In [2]:
# Only run this cell if you haven't cloned the repo in your Colab environment yet
# or if you want to pull the latest changes.

# !git clone <https://github.com/leiyese/dynamic_dnn_trainer> dynamic_dnn_trainer_colab
# %cd dynamic_dnn_trainer_colab

# If you mounted Drive and your project is there:
# %cd $PROJECT_DRIVE_PATH
# !git pull # To get latest changes

In [3]:
# Install requirements
# Make sure your requirements.txt is up-to-date in your repo
# !pip install -q -r requirements.txt

# Add src to Python path to import our modules
import sys
import os

# Adjust the path depending on where your notebook is relative to the 'src' directory
# If notebook is in 'dynamic_dnn_trainer/notebooks/' and src is 'dynamic_dnn_trainer/src/'
# And your current working directory is 'dynamic_dnn_trainer_colab' (or your project root)
if 'src' not in sys.path:
    sys.path.append('src')
# Or, more robustly if you know the project root:
# project_root = os.path.abspath(os.path.join(os.getcwd())) # Assumes CWD is project root
# src_path = os.path.join(project_root, 'src')
# if src_path not in sys.path:
#    sys.path.append(src_path)

print(f"Current working directory: {os.getcwd()}")
print(f"System path: {sys.path}")

Current working directory: /Users/leiye/dev/jensens/6_AI/excercises/6-CNN/dynamic_dnn_trainer/notebooks
System path: ['/Users/leiye/dev/jensens/6_AI/excercises/6-CNN/dynamic_dnn_trainer/notebooks', '/Users/leiye/anaconda3/lib/python312.zip', '/Users/leiye/anaconda3/lib/python3.12', '/Users/leiye/anaconda3/lib/python3.12/lib-dynload', '', '/Users/leiye/anaconda3/lib/python3.12/site-packages', '/Users/leiye/anaconda3/lib/python3.12/site-packages/aeosa', '/Users/leiye/anaconda3/lib/python3.12/site-packages/setuptools/_vendor', 'src']


In [7]:
import pandas as pd
import numpy as np

# Import from our custom modules in src
from src import config
from src.data_ingestion import loader
from src.eda import exploratory_analysis as eda
from src.preprocessing import transformers
from src.utils import helpers

print("Successfully imported all modules.")

ModuleNotFoundError: No module named 'src'

In [None]:
# You might want to define specific output paths for notebook experiments
# For now, we'll use the paths from config.py
NOTEBOOK_PLOTS_DIR = config.PLOTS_OUTPUT_DIR / "notebook_eda"
NOTEBOOK_PROCESSED_DATA_DIR = config.PROCESSED_DATA_DIR / "notebook_processed"
NOTEBOOK_PREPROCESSOR_PATH = NOTEBOOK_PROCESSED_DATA_DIR / "notebook_preprocessor.pkl"
NOTEBOOK_TARGET_ENCODER_PATH = NOTEBOOK_PROCESSED_DATA_DIR / "notebook_target_encoder.pkl"

# Ensure these directories exist
NOTEBOOK_PLOTS_DIR.mkdir(parents=True, exist_ok=True)
NOTEBOOK_PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"Notebook plots will be saved to: {NOTEBOOK_PLOTS_DIR}")
print(f"Notebook processed data artifacts will be saved to: {NOTEBOOK_PROCESSED_DATA_DIR}")

In [None]:
print("--- 1. Loading Raw Data ---")
if config.RAW_DATA_FILE.exists():
    raw_df = loader.load_csv_data(file_path=config.RAW_DATA_FILE)
    print("\nRaw data loaded successfully. First 5 rows:")
    print(raw_df.head())
else:
    print(f"ERROR: Raw data file not found at {config.RAW_DATA_FILE}")
    raw_df = None # Or raise an error

In [None]:
if raw_df is not None:
    print("\n--- 2. Performing Exploratory Data Analysis (EDA) ---")
    _ = eda.generate_descriptive_stats(raw_df)
    _ = eda.get_null_counts(raw_df)
    eda.plot_histograms_for_numerical_features(
        df=raw_df,
        numerical_features=config.NUMERICAL_FEATURES,
        save_dir=NOTEBOOK_PLOTS_DIR
    )
    eda.plot_correlation_matrix(
        df=raw_df,
        numerical_features=config.NUMERICAL_FEATURES,
        save_path=NOTEBOOK_PLOTS_DIR / "correlation_matrix_notebook.png"
    )
    print("\nEDA plots saved. Check the output directory.")

In [None]:
if raw_df is not None:
    print("\n--- 3. Splitting Data ---")
    X_train_raw, X_test_raw, y_train_raw, y_test_raw = transformers.split_data(
        df=raw_df,
        target_column=config.TARGET_COLUMN,
        test_size=config.TEST_SPLIT_SIZE,
        random_state=config.RANDOM_STATE,
        stratify_col=raw_df[config.TARGET_COLUMN]
    )
    print("Data splitting complete.")
    print(f"X_train_raw shape: {X_train_raw.shape}")

In [None]:
if 'X_train_raw' in locals() and X_train_raw is not None: # Check if previous step ran
    print("\n--- 4. Preprocessing Features ---")
    X_train_proc, X_test_proc, fitted_preprocessor = transformers.preprocess_features(
        X_train=X_train_raw.copy(),
        X_test=X_test_raw.copy(),
        numerical_features=config.NUMERICAL_FEATURES,
        categorical_features=config.CATEGORICAL_FEATURES,
        preprocessor_save_path=NOTEBOOK_PREPROCESSOR_PATH,
        fit_preprocessor=True # Fit a new one for this notebook run
    )
    print("Feature preprocessing complete.")
    print(f"X_train_proc head (first 5 rows of processed data):\n{X_train_proc.head()}")

In [None]:
if 'y_train_raw' in locals() and y_train_raw is not None: # Check if previous step ran
    print("\n--- 5. Encoding Target Variable ---")
    y_train_enc, y_test_enc, fitted_target_encoder = transformers.encode_target(
        y_train=y_train_raw.copy(),
        y_test=y_test_raw.copy(),
        encoder_save_path=NOTEBOOK_TARGET_ENCODER_PATH,
        fit_encoder=True # Fit a new one for this notebook run
    )
    print("Target encoding complete.")
    print(f"y_train_enc head:\n{y_train_enc.head()}")
    print(f"Target classes: {fitted_target_encoder.classes_}")

In [None]:
if 'X_train_proc' in locals() and X_train_proc is not None:
    print("\n--- Processed Data Overview ---")
    print("X_train_proc (processed features) head:")
    display(X_train_proc.head()) # Use display for better DataFrame rendering in notebooks
    print("\ny_train_enc (encoded target) head:")
    display(y_train_enc.head())