# Project Initialisation

In [None]:
# Kedro bootstrap: open a session on the enclosing project and expose its data catalog
import os
import sys
from pathlib import Path

from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project

# The project root is taken to be the parent of the current working directory
# NOTE(review): this presumes the notebook is launched from a direct
# subdirectory of the project root - confirm against how the kernel is started
project_path = Path.cwd().parent

# Bootstrap the project first, then create a session and read the catalog
# from the session's context
bootstrap_project(project_path)
session = KedroSession.create(project_path=project_path)
context = session.load_context()
catalog = context.catalog

# Append the project's src/ directory to the Python import path
src_dir = project_path / "src"
sys.path.append(str(src_dir))

In [None]:
# The helper modules are imported as whole modules (not individual names)
# so that they can be passed to importlib.reload
import importlib

import egt305_job_market_analysis.utils.etl as etl
import egt305_job_market_analysis.utils.viz as viz

# Reload both helper modules
for _module in (etl, viz):
    importlib.reload(_module)

# Apply the project's custom plot style for consistency
viz.set_plot_style()

# Data Ingestion

In [None]:
# Load both raw tables from the Kedro data catalog, in this order
df_employee, df_salary = (
    catalog.load(dataset_name)
    for dataset_name in ("employee_dataset", "employee_salaries")
)

# EDA

## Employee Dataset

### Initial Inspection

In [None]:
import pandas as pd
from IPython.display import display

# First look at the employee dataset: size, sample rows, schema,
# summary statistics and missing values

# 1. Shape of the dataset
print(f"Dataset shape: {df_employee.shape}")

# 2. Preview first 5 rows
display(df_employee.head())

# 3. Column names and data types
df_employee.info()

# 4. Descriptive statistics for numerical and categorical features
display(df_employee.describe(include='all'))

# 5. Missing values per column, as a count and as a percentage of all rows
n_rows = len(df_employee)
missing_counts = df_employee.isnull().sum()
missing_perc = (missing_counts / n_rows * 100).round(2)
missing_df = pd.DataFrame(
    {
        'Missing Count': missing_counts,
        'Missing %': missing_perc,
    }
)

# Only show the columns that actually contain missing values
has_missing = missing_df['Missing Count'] > 0
display(missing_df[has_missing])


Initial issues or anomalies detected.

- Column names are not in a standard format: they mix upper and lower case
- Data entries are unnecessarily complex, e.g. COMP37 could simply be 37
- Columns do not have the correct dtype
- distanceFromCBD has a very large gap between the 75th percentile and the maximum, indicating high-value outliers
- Data is missing in multiple columns

### Fixing data structure errors

In [None]:
# Map the initial column names (from the data description) to lower-case
# snake_case names so that naming is consistent across the dataset.
# Entries that map a name to itself leave that column unchanged.
column_renames = dict(
    jobId='job_id',
    companyId='company_id',
    jobRole='job_role',
    education='education',
    major='major',
    Industry='industry',
    yearsExperience='years_experience',
    distanceFromCBD='distance_from_cbd',
)

# Rename on the existing DataFrame object (no new frame is created)
df_employee.rename(columns=column_renames, inplace=True)

df_employee.head(2)

Fixed column names to be more standardized

In [None]:
# Frequency of every company_id value; missing values are counted as their own entry
company_ids = df_employee['company_id']
company_counts = company_ids.value_counts(dropna=False)

display(company_counts)
print(f"Unique company_id count (including NaN): {company_ids.nunique(dropna=False)}")

Checking the unique entries and confirming that every one uses the COMP prefix. This also records the state of the column before the prefix is dropped.

In [None]:
# Remove the 'COMP' prefix from company_id and store the result as a nullable integer.
#
# The column is cast to the pandas 'string' dtype rather than Python str: that
# dtype keeps missing values as real <NA> entries, so they are never turned into
# literal text such as 'nan' or '<NA>' that would have to be mapped back to a
# missing value before the integer conversion (and would break it if missed).
# Re-running the cell is safe: an already-converted Int64 column passes through
# the same steps unchanged.
df_employee['company_id'] = (
    df_employee['company_id']
    .astype('string')  # missing-value-aware string dtype
    .str.replace('COMP', '', regex=False)  # literal text match, not a regex
    .astype('Int64')  # nullable integer dtype, missing values stay <NA>
)

In [None]:
# Re-check the company_id distribution (missing values included)
company_counts = df_employee['company_id'].value_counts(dropna=False)
n_unique_company_ids = df_employee['company_id'].nunique(dropna=False)

display(company_counts)
print(f"Unique company_id count (including NaN): {n_unique_company_ids}")

Converted the company_id column to a more model-friendly integer format while keeping every entry, including NA values (these will be dealt with when handling missing and duplicate data).

In [None]:
# Collect the columns with the 'string' dtype, leaving out job_id
string_cols = [
    name
    for name in df_employee.select_dtypes(include='string')
    if name != 'job_id'
]

# For each of those columns, print how many distinct values it has and how
# often each value occurs
for col in string_cols:
    # Cast to plain str first so every entry is shown and counted as text
    values_as_text = df_employee[col].astype(str)

    print(f"\n--- {col} ---")
    print(f"Unique values: {values_as_text.nunique(dropna=False)}")
    print(values_as_text.value_counts(dropna=False))


- The job_role column has a good spread of job roles, with the sole exception of the president role. All roles are valid and have no semantic overlap. Some data is missing; this will be handled later.

- The education column has a large amount of missing data, labelled as both NONE and NA; this will be handled in the missing-data section. Otherwise the data is well spread, with no semantic overlap.

- The major column has the same issue as the education column: missing data appears under 2 different labels. Aside from the missing data, the data is well spread across the categories, with no semantic overlap.

- The industry column has a similar issue to the job_role column: one category (government) has only a single entry. Otherwise, there is a good spread of data.

## 