# Project Initialisation

In [None]:
# Kedro + PySpark + EDA Notebook Bootstrap (Kedro 1.0)
#
# Creates the objects the rest of the notebook relies on:
# `project_path`, `session`, `context`, `catalog` and `spark`.
import os
import sys
from pathlib import Path

from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project
from pyspark.sql import SparkSession


def _find_kedro_root(start: Path) -> Path:
    """Return the closest directory at or above *start* that is a Kedro project.

    A directory qualifies when it holds a ``pyproject.toml`` containing a
    ``[tool.kedro]`` table.

    Args:
        start: Directory the search begins from (normally the kernel's cwd).

    Returns:
        The first qualifying directory found while walking upwards, or
        ``start.parent`` when none qualifies. The fallback keeps the previous
        behaviour of assuming the notebook sits one level below the root.
    """
    for candidate in (start, *start.parents):
        pyproject = candidate / "pyproject.toml"
        if not pyproject.is_file():
            continue
        # errors="ignore": only the marker is searched for, so a stray
        # undecodable byte elsewhere in the file must not abort the lookup.
        if "[tool.kedro]" in pyproject.read_text(encoding="utf-8", errors="ignore"):
            return candidate
    return start.parent


# Set Kedro project path (independent of how deep the notebook directory is)
project_path = _find_kedro_root(Path.cwd())

# Bootstrap Kedro
bootstrap_project(project_path)
session = KedroSession.create(project_path=project_path)
context = session.load_context()
catalog = context.catalog

# Add src/ to Python path; guarded so re-running this cell does not keep
# appending duplicate entries.
_src_dir = str(project_path / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# PySpark session (getOrCreate reuses an already running session, if any)
spark = SparkSession.builder.appName("EGT305_EDA").getOrCreate()

In [None]:
# Project helper modules are imported as whole modules (not individual names)
# so that they can be passed to importlib.reload below.
import importlib

import caie_nyp_batch3_mohammad_habib_410d.utils.etl as etl
import caie_nyp_batch3_mohammad_habib_410d.utils.viz as viz

# Re-execute both modules each time this cell runs; reload() hands back the
# refreshed module object, which is bound to the same name again.
etl = importlib.reload(etl)
viz = importlib.reload(viz)

# Apply the project's custom plot style so all figures look consistent
viz.set_plot_style()

# Data Exploration Using Spark

In [None]:
# Load both datasets through the Kedro catalog, in the order listed
df_employee, df_salaries = (
    catalog.load(dataset_name)
    for dataset_name in ("employee_dataset", "employee_salaries")
)

## Employee Dataset Exploration

In [None]:
# Quick profile of the employee dataset.
# NOTE(review): every call below is pandas-style API, so this assumes the
# "employee_dataset" catalog entry loads as a pandas DataFrame - confirm
# against the catalog configuration.

# Dimensions and a first look at the rows
print(f"Shape: {df_employee.shape}")
display(df_employee.head(5))

# Per-column summary
df_employee.info()

# Count of missing entries in each column
print("\nMissing values:")
display(df_employee.isna().sum())

# Number of rows flagged as duplicates of an earlier row
print(f"\nDuplicate rows: {df_employee.duplicated().sum()}")

# Distinct-value count for each column
print("\nUnique values per column:")
display(df_employee.nunique())

# Summary statistics (describe() with its default arguments)
display(df_employee.describe())

# Ten most frequent values for each key categorical feature
categorical_cols = ["companyId", "jobRole", "education", "major", "industry"]
for col in categorical_cols:
    print(f"\nValue counts for '{col}':")
    display(df_employee[col].value_counts().iloc[:10])
