# Kedro Project Explorer

This notebook helps you explore the Kedro project structure, source code, and data files.

## 1. Setup and Import

In [1]:
import os
import sys
from pathlib import Path
import pandas as pd

# Resolve the project root. This assumes the notebook's working directory sits
# one level below the root (e.g. a `notebooks/` folder).
project_path = Path.cwd().parent

# Make the project's `src` directory importable. The membership check keeps the
# cell idempotent: re-running it must not append the same entry again.
src_dir = str(project_path / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

print(f"Project path: {project_path}")

Project path: /home/user/projects/kedro_project/account-tax


## 2. Explore Project Structure

In [None]:
# Top-level folders to look for under the project root
main_dirs = ['src', 'data', 'conf', 'notebooks', 'tests']

print("Project Structure:")
print("="*50)
for dir_name in main_dirs:
    dir_path = project_path / dir_name
    if not dir_path.exists():
        # Folders that are absent are silently skipped
        continue

    print(f"\n📁 {dir_name}/")
    # Show at most the first 10 entries (in sorted order) of each folder
    first_entries = sorted(dir_path.iterdir())[:10]
    for item in first_entries:
        entry_line = f"   📂 {item.name}/" if item.is_dir() else f"   📄 {item.name}"
        print(entry_line)

## 3. Explore Source Code

In [None]:
# Explore src directory structure
# `src_path` is the root used by the directory walk and the line count below.
src_path = project_path / 'src'

def explore_directory(path, prefix="", max_depth=3, current_depth=0):
    """Recursively print a tree of sub-directories and Python files.

    Entries are visited in sorted order. Names starting with '.' and
    ``__pycache__`` directories are skipped, and only ``.py`` files are
    listed. A directory that is missing or cannot be read is reported on
    its own line instead of aborting the whole walk.

    Args:
        path: Directory to walk, as a ``pathlib.Path`` or a string.
        prefix: Text printed before every entry; grows by two spaces per level.
        max_depth: Number of directory levels to descend into.
        current_depth: Depth of ``path`` relative to the starting directory
            (used by the recursion; callers normally leave it at 0).

    Returns:
        None. The tree is written to standard output.
    """
    if current_depth >= max_depth:
        return

    # Accept plain strings as well as Path objects.
    path = Path(path)
    try:
        items = sorted(path.iterdir())
    except OSError as exc:
        # Covers a missing path, a non-directory and permission errors.
        print(f"{prefix}(cannot list {path}: {exc})")
        return

    for item in items:
        if item.name.startswith('.'):
            continue
        if item.is_dir() and item.name != '__pycache__':
            print(f"{prefix}📂 {item.name}/")
            explore_directory(item, prefix + "  ", max_depth, current_depth + 1)
        elif item.suffix == '.py':
            print(f"{prefix}📄 {item.name}")

# Print the banner, then walk the source tree with the default depth limit.
print("Source Code Structure:", "="*50, sep="\n")
explore_directory(src_path)

In [None]:
# Count Python files and their lines (every line counts, including blank
# lines and comments)
py_files = [*src_path.rglob('*.py')]

per_file_line_counts = []
for py_file in py_files:
    # Undecodable bytes are ignored so a single odd file cannot stop the count
    with py_file.open('r', encoding='utf-8', errors='ignore') as f:
        per_file_line_counts.append(sum(1 for _ in f))
total_lines = sum(per_file_line_counts)

print(f"Total Python files: {len(py_files)}")
print(f"Total lines of code: {total_lines:,}")

## 4. Explore Data Directory

In [None]:
# Explore data directory
data_path = project_path / 'data'

print("Data Directory Structure:")
print("="*50)

# Layer folder names checked under data/; layers that do not exist are skipped.
data_layers = ['01_raw', '02_intermediate', '03_primary', '04_feature', 
               '05_model_input', '06_models', '07_model_output', '08_reporting']

for layer in data_layers:
    layer_path = data_path / layer
    if layer_path.exists():
        # Keep regular files only, and filter BEFORE slicing, so that the
        # "N files" count and the listed entries agree even when the layer
        # also contains sub-directories.
        files = sorted(entry for entry in layer_path.glob('*') if entry.is_file())
        print(f"\n📁 {layer}/ ({len(files)} files)")
        for file in files[:5]:  # Show first 5 files
            size = file.stat().st_size / 1024  # Size in KB
            print(f"   📄 {file.name} ({size:.1f} KB)")

In [None]:
# Collect every CSV and Parquet file anywhere below the data directory
csv_files = list(data_path.rglob('*.csv'))
parquet_files = list(data_path.rglob('*.parquet'))

print(f"CSV files found: {len(csv_files)}")
print(f"Parquet files found: {len(parquet_files)}")

# One shared loop for both file kinds: a header, then up to five paths
# shown relative to the data directory. Empty groups print nothing.
for header, found_files in (("\nCSV files:", csv_files), ("\nParquet files:", parquet_files)):
    if not found_files:
        continue
    print(header)
    for data_file in found_files[:5]:
        relative_path = data_file.relative_to(data_path)
        print(f"  - {relative_path}")

## 5. Load and Preview Data Files

In [None]:
# Function to safely load and preview data
def preview_data_file(file_path, n_rows=5):
    """Load the first rows of a CSV or Parquet file and print a short summary.

    Args:
        file_path: Location of the file, as a ``pathlib.Path`` or a string.
            The extension is matched case-insensitively.
        n_rows: Maximum number of rows to keep in the preview.

    Returns:
        A DataFrame with at most ``n_rows`` rows, or ``None`` when the file
        type is unsupported or loading fails (the reason is printed).

    Note:
        The printed shape describes the preview, not the full file: CSV
        files are read with ``nrows=n_rows`` and Parquet files are cut with
        ``head(n_rows)`` after loading.
    """
    # Normalise first so that `.suffix` / `.name` also work for string paths,
    # including inside the error handler below.
    file_path = Path(file_path)
    suffix = file_path.suffix.lower()
    try:
        if suffix == '.csv':
            df = pd.read_csv(file_path, nrows=n_rows)
        elif suffix == '.parquet':
            df = pd.read_parquet(file_path).head(n_rows)
        else:
            print(f"Unsupported file type: {file_path.suffix}")
            return None
        
        print(f"\nFile: {file_path.name}")
        print(f"Shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        return df
    except Exception as e:
        # Best-effort helper: report the problem and let the notebook continue.
        print(f"Error loading {file_path.name}: {e}")
        return None

# Preview the first CSV file if any were found, otherwise the first Parquet file
preview_candidates = csv_files or parquet_files
if preview_candidates:
    df = preview_data_file(preview_candidates[0])
    if df is not None:
        display(df.head())

## 6. Kedro Context and Catalog

In [None]:
# Load Kedro context and list the datasets registered in the catalog.
# Kedro is imported inside the try block so that a missing or broken
# installation is reported as a message instead of stopping the notebook.
try:
    from kedro.framework.session import KedroSession
    from kedro.framework.startup import bootstrap_project
    
    # Bootstrap the project
    bootstrap_project(project_path)
    
    # Create a session
    with KedroSession.create(project_path=project_path) as session:
        context = session.load_context()
        catalog = context.catalog
        
        # List available datasets. Dict-like catalogs expose keys(); older
        # catalogs only provide list(), which is kept as a fallback.
        if hasattr(catalog, "keys"):
            datasets = list(catalog.keys())
        else:
            datasets = list(catalog.list())
        print(f"Available datasets in catalog: {len(datasets)}")
        print("\nFirst 10 datasets:")
        for dataset_name in sorted(datasets)[:10]:
            print(f"  - {dataset_name}")
            
except Exception as e:
    print(f"Could not load Kedro context: {e}")
    print("Make sure you're running from the project directory")

## 7. Pipeline Information

In [None]:
# List available pipelines.
# The pipelines are read from the project-level registry
# (`kedro.framework.project.pipelines`) rather than from the context object,
# because `KedroContext` does not provide a `pipelines` attribute in current
# Kedro releases.
try:
    from kedro.framework.project import pipelines as registered_pipelines
    from kedro.framework.startup import bootstrap_project
    
    # Configure the project here as well, so this cell does not depend on
    # another cell having bootstrapped it successfully.
    bootstrap_project(project_path)
    
    # Materialise the lazily loaded registry into a plain dict.
    pipelines = dict(registered_pipelines)
    
    print(f"Available pipelines: {list(pipelines.keys())}")
    
    # Show pipeline nodes
    for pipeline_name, pipeline in pipelines.items():
        print(f"\n{pipeline_name} pipeline:")
        print(f"  Nodes: {len(pipeline.nodes)}")
        if pipeline.nodes:
            print("  First 5 nodes:")
            for node in pipeline.nodes[:5]:
                print(f"    - {node.name}")
                
except Exception as e:
    # Best-effort cell: report the problem and let the notebook continue.
    print(f"Could not load pipelines: {e}")

## 8. Configuration Files

In [None]:
# Explore configuration files in conf/base
conf_path = project_path / 'conf' / 'base'

print("Configuration files:")
print("="*50)

# Both YAML extensions are collected; only the top level of conf/base is scanned.
yaml_files = list(conf_path.glob('*.yml')) + list(conf_path.glob('*.yaml'))
for yaml_file in sorted(yaml_files):
    size = yaml_file.stat().st_size / 1024  # Size in KB
    print(f"📄 {yaml_file.name} ({size:.1f} KB)")

# Read parameters if available
params_file = conf_path / 'parameters.yml'
if params_file.exists():
    import yaml
    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to be UTF-8 and would corrupt or reject non-ASCII values.
    with open(params_file, 'r', encoding='utf-8') as f:
        params = yaml.safe_load(f)
    # An empty file loads as None and a YAML root may be a list or scalar;
    # only a non-empty mapping has keys worth listing.
    has_keys = isinstance(params, dict) and bool(params)
    print(f"\nParameters keys: {list(params.keys()) if has_keys else 'None'}")

## 9. Quick Data Analysis Helper

In [None]:
# Helper function to analyze any data file
def analyze_data_file(file_path):
    """Print a quick profile of a CSV or Parquet file.

    The whole file is loaded into a DataFrame, then the function reports
    its shape, memory usage, dtype counts, columns with missing values, the
    first three rows and summary statistics for the numeric columns. Any
    exception is caught and reported as a one-line message, not raised.

    Args:
        file_path: Location of the file, as a ``pathlib.Path`` or a string.
            The extension is matched case-insensitively.

    Returns:
        None. Results are printed, and tables are shown with ``display``,
        which this notebook does not import (Jupyter provides it).
    """
    try:
        # Accept plain strings as well as Path objects.
        file_path = Path(file_path)
        print(f"Analyzing: {file_path}")
        print("="*50)
        
        # Load data
        suffix = file_path.suffix.lower()
        if suffix == '.csv':
            df = pd.read_csv(file_path)
        elif suffix == '.parquet':
            df = pd.read_parquet(file_path)
        else:
            print(f"Unsupported file type: {file_path.suffix}")
            return
        
        # Basic info
        print(f"Shape: {df.shape}")
        print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        print(f"\nColumn types:")
        print(df.dtypes.value_counts())
        print(f"\nMissing values:")
        missing = df.isnull().sum()
        if missing.any():
            print(missing[missing > 0])
        else:
            print("No missing values")
        
        # Show sample
        print(f"\nFirst 3 rows:")
        display(df.head(3))
        
        # Numeric columns statistics. 'number' selects every numeric dtype
        # (int32, float32, nullable integers, ...), not just int64/float64.
        numeric_cols = df.select_dtypes(include='number').columns
        if len(numeric_cols) > 0:
            print(f"\nNumeric columns statistics:")
            display(df[numeric_cols].describe())
            
    except Exception as e:
        # Best-effort helper: report the problem and let the notebook continue.
        print(f"Error analyzing file: {e}")

# Example usage (uncomment and modify path):
# analyze_data_file(project_path / 'data' / '01_raw' / 'your_file.csv')

## 10. Your Work Space

Use the cells below for your own exploration:

In [1]:
from load_intermediate_outputs import (
    get_catalog,
    load_prepared_datasets,
    load_text_datasets,
    summarise_dataframe,
)

In [2]:
# Load the intermediate outputs through the helpers imported from
# `load_intermediate_outputs`.
prepared = load_prepared_datasets()  # dict keyed by split name: {"train": DataFrame, "valid": ..., "test": ...}
textual = load_text_datasets()  # NOTE(review): presumably the same split-keyed layout — confirm

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# Preview the first rows of the training split.
# NOTE(review): in the recorded output the `date` column renders as
# 1970-01-01 00:00:00.0YYYYMMDD, which looks like YYYYMMDD integers parsed as
# epoch nanoseconds somewhere upstream — confirm the date parsing.
prepared["train"].head()

Unnamed: 0,date,vat_type,party_name,company_name,company_vat_id,item,supply_amount,vat_amount,total_amount,in_multi,...,business_item,vat_message_code,close_date,product,company_business_type,company_business_item,company_vat_message_code,acct_code,labels,split
0,1970-01-01 00:00:00.020240131,51,지엠아이(GMI),주식회사 코윈,4438102301,보호테이프 외,5121180,512118.0,5633298.0,0.0,...,전자상거래 소매업,standard,,"자동차부품,점착테이프",제조업,그 외 자동차용 신품 부품 제조업,standard,15300,1,train
1,1970-01-01 00:00:00.020230411,57,주식회사 씨엔씨식자재유통,소울푸드 주식회사,1878702515,,21891,2189.0,24080.0,,...,슈퍼마켓,standard,,식잡,도매 및 소매업,기타 음ㆍ식료품 위주 종합 소매업,standard,14600,10,train
2,1970-01-01 00:00:00.020191004,57,한밭골 사골 순대＆정다운 다,유니슨방음방진(주),3188107301,한밭골 사골 순대＆정다운 다락방,7273,727.0,8000.0,0.0,...,,standard,20220428.0,,제조업,산업용 그 외 비경화 고무제품 제조업,standard,81100,36,train
3,1970-01-01 00:00:00.020230731,51,네이버파이낸셜 주식회사,(주)팔도로지스,2338100176,네이버페이 수수료,284160,28416.0,312576.0,,...,그 외 기타 금융 지원 서비스업,standard,,온라인 정보 제공/전자지급결제대행,도매 및 소매업,"빵류, 과자류, 당류, 초콜릿 도매업",standard,83100,24,train
4,1970-01-01 00:00:00.020241228,51,용달화물,주식회사 큐앤비푸드,2468801205,운송료,250000,25000.0,275000.0,0.0,...,,standard,,,제조업,과자류 및 코코아 제품 제조업,standard,52400,64,train


In [4]:
# `prepared` is a dict of DataFrames keyed by split name, so summary
# statistics have to be requested from an individual split.
prepared["train"].describe()

In [5]:
# A dict has no `describe`; show the summary statistics of every split instead.
for split_name, split_frame in prepared.items():
    print(f"--- {split_name} ---")
    display(split_frame.describe())

In [6]:
# Check what kind of container load_prepared_datasets() returned.
type(prepared)

[1m<[0m[1;95mclass[0m[39m [0m[32m'dict'[0m[1m>[0m

In [7]:
# NOTE(review): evaluating the dict itself prints the repr of every split's
# DataFrame at once; a per-split `.head()` or `.shape` is easier to read.
prepared


[1m{[0m
    [32m'test'[0m:                                 date  vat_type   party_name     company_name  \
[1;36m0[0m      [1;36m1970[0m-[1;36m01[0m-[1;36m01[0m [1;92m00:00:00[0m.[1;36m020231030[0m        [1;36m57[0m         종합볼트          [1m([0m주[1m)[0m퀀텀테크   
[1;36m1[0m      [1;36m1970[0m-[1;36m01[0m-[1;36m01[0m [1;92m00:00:00[0m.[1;36m020230511[0m        [1;36m57[0m       서울명동찌개  [1;35m주식회사[0m[1m([0mYG기초[1m)[0m와이지기초   
[1;36m2[0m      [1;36m1970[0m-[1;36m01[0m-[1;36m01[0m [1;92m00:00:00[0m.[1;36m020230225[0m        [1;36m54[0m      [1;35m롯데렌탈[0m[1m([0m주[1m)[0m        [1;35m메이저씨엔엠[0m[1m([0m주[1m)[0m   
[1;36m3[0m      [1;36m1970[0m-[1;36m01[0m-[1;36m01[0m [1;92m00:00:00[0m.[1;36m020231016[0m        [1;36m70[0m       우정사업본부        주식회사 꿈의날개   
[1;36m4[0m      [1;36m1970[0m-[1;36m01[0m-[1;36m01[0m [1;92m00:00:00[0m.[1;36m020220601[0m        [1;36m51[0m       유병기부동산       주식회사 엔알시스템   
[33m.

In [8]:
# A dict has no `shape`; report the shape of each split's DataFrame instead.
{split_name: split_frame.shape for split_name, split_frame in prepared.items()}

In [9]:
# List the split names available in the `prepared` dict.
prepared.keys()

dict_keys(['test', 'train', 'valid'])

In [10]:
# A dict has no `head()`; preview the first rows of every split instead.
for split_name, split_frame in prepared.items():
    print(f"--- {split_name} ---")
    display(split_frame.head())

In [11]:
def print_dict_structure(mapping, indent=0):
    """Print the keys of a (possibly nested) dict with the type of each value.

    Every key is written as ``- key: TypeName``. Values that are themselves
    dicts are expanded recursively, indented two further spaces per level.

    Args:
        mapping: Object to describe; anything that is not a dict prints nothing.
        indent: Number of leading spaces for this nesting level.
    """
    if not isinstance(mapping, dict):
        # Reached a non-dict value: nothing further to descend into.
        return

    pad = " " * indent
    for name, entry in mapping.items():
        type_name = type(entry).__name__
        print(f"{pad}- {name}: {type_name}")
        print_dict_structure(entry, indent + 2)

# Show the layout of `prepared`: one line per key with the type of its value.
print_dict_structure(prepared)


- test: DataFrame
- train: DataFrame
- valid: DataFrame


In [12]:
# Summary statistics for every column of the training split; include="all"
# adds the non-numeric columns (count / unique / top / freq) to the table.
prepared["train"].describe(include="all")

Unnamed: 0,date,vat_type,party_name,company_name,company_vat_id,item,supply_amount,vat_amount,total_amount,in_multi,...,business_item,vat_message_code,close_date,product,company_business_type,company_business_item,company_vat_message_code,acct_code,labels,split
count,2642343,2642343.0,2641545,2642343,2642343.0,1204781,2642343.0,2583423.0,2642342.0,842277.0,...,2028263,2642343,317276.0,1124071,2554758,2576820,2642343,2642343.0,2642343.0,2642343
unique,,,226732,380,371.0,306663,,,,,...,1126,8,2318.0,12519,14,170,3,,,1
top,,,비씨카드 (주),주식회사 장원,3128132710.0,유지보수비,,,,,...,한식 일반 음식점업,standard,20241231.0,보험서비스/신용카드,제조업,택시 운송업,standard,,,train
freq,,,64945,73945,73945.0,65921,,,,,...,174225,2464637,9054.0,76242,1271832,140703,2620520,,,2642343
mean,1970-01-01 00:00:00.020220029,50.3825,,,,,2535555.0,228258.4,2758724.0,0.108092,...,,,,,,,,63559.57,39.27025,
min,1970-01-01 00:00:00.020190101,11.0,,,,,-8830405000000.0,-10000000000.0,-8830405000000.0,0.0,...,,,,,,,,10100.0,0.0,
25%,1970-01-01 00:00:00.020210204,51.0,,,,,11181.0,1090.0,12255.0,0.0,...,,,,,,,,42300.0,13.0,
50%,1970-01-01 00:00:00.020220901,57.0,,,,,45455.0,4545.0,50000.0,0.0,...,,,,,,,,81100.0,28.0,
75%,1970-01-01 00:00:00.020231130,57.0,,,,,256127.5,23200.0,280500.0,0.0,...,,,,,,,,82200.0,62.0,
max,1970-01-01 00:00:00.020250330,70.0,,,,,8830405000000.0,10000000000.0,8830405000000.0,1.0,...,,,,,,,,98000.0,259.0,
