In [2]:
import pandas as pd

In [4]:
# Read the five input tables used by this analysis.

# Candidate, position and recruitment tables are read directly from ../../data.
candidates = pd.read_csv('../../data/candidates.csv')
positions = pd.read_csv('../../data/positions.csv')
recruitments = pd.read_csv('../../data/recruitments.csv')

# Employee and employment tables are read from the *_cleaned.csv files under
# summary_tables (presumably written by an earlier cleaning step -- TODO confirm).
employees = pd.read_csv('../outputs/summary_tables/employees_cleaned.csv')
employments = pd.read_csv('../outputs/summary_tables/employments_cleaned.csv')

In [5]:
# Lower-case every column label, in place, on each loaded table so later
# cells can refer to columns with a single consistent spelling.
all_tables = (candidates, employees, employments, positions, recruitments)
for df in all_tables:
    df.columns = df.columns.str.lower()

In [6]:
# Show the column list of each table (after lower-casing) as a quick schema check.
for label, frame in (
    ("Recruitments columns:", recruitments),
    ("Candidates columns:", candidates),
    ("Employees columns:", employees),
    ("Employments columns:", employments),
    ("Positions columns:", positions),
):
    print(label, frame.columns.tolist())


Recruitments columns: ['id', 'candidate_id', 'position_id', 'stage', 'stage_start_date', 'recruitment_channel', 'recruitment_cost', 'notes']
Candidates columns: ['id', 'first_name', 'last_name', 'email', 'phone']
Employees columns: ['id', 'first_name', 'last_name', 'gender', 'birth_date', 'hire_date', 'termination_date', 'status', 'position_id', 'manager_id']
Employments columns: ['id', 'employee_id', 'position_id', 'start_date', 'salary', 'bonus', 'pension_contribution', 'vacation_allowance']
Positions columns: ['id', 'title', 'job_level', 'department']


In [7]:
# Attach candidate details to every recruitment row.
# Left join: recruitment rows are kept even when no candidate matches.
# Both tables have an 'id' column; the candidate one is suffixed to 'id_candidate'.
recruit_df = recruitments.merge(
    candidates,
    left_on='candidate_id',
    right_on='id',
    how='left',
    suffixes=('', '_candidate'),
)

print(recruit_df.columns.tolist())

['id', 'candidate_id', 'position_id', 'stage', 'stage_start_date', 'recruitment_channel', 'recruitment_cost', 'notes', 'id_candidate', 'first_name', 'last_name', 'email', 'phone']


In [8]:
# Add job details (title, job_level, department) from the positions table.
# The positions 'id' collides with the recruitment 'id', so it arrives as
# 'id_position'; it duplicates 'position_id' and is dropped straight away.
recruit_df = (
    recruit_df
    .merge(
        positions,
        left_on='position_id',
        right_on='id',
        how='left',
        suffixes=('', '_position'),
    )
    .drop(columns=['id_position'])
)

print(recruit_df.columns.tolist())


['id', 'candidate_id', 'position_id', 'stage', 'stage_start_date', 'recruitment_channel', 'recruitment_cost', 'notes', 'id_candidate', 'first_name', 'last_name', 'email', 'phone', 'title', 'job_level', 'department']


In [9]:
# Cast both key columns to str so the membership test below compares like with like.
# Note: this overwrites employees['id'] and recruit_df['candidate_id'] in place.
employees['id'] = employees['id'].astype(str)
recruit_df['candidate_id'] = recruit_df['candidate_id'].astype(str)

# 'hired' is True when the row's candidate_id also appears among the employee ids.
# NOTE(review): this assumes candidate ids and employee ids share one id space -- confirm.
recruit_df['hired'] = recruit_df['candidate_id'].isin(employees['id'])


Average Recruitment Cost by Department

In [11]:
# Mean recruitment cost per department, most expensive department first.
cost_by_department = recruit_df.groupby('department')['recruitment_cost'].mean()
avg_cost = cost_by_department.reset_index()
avg_cost = avg_cost.sort_values(by='recruitment_cost', ascending=False)

# Preview only the top rows.
display(avg_cost.head())


Unnamed: 0,department,recruitment_cost
4,Management,2342.642857
0,Business Intelligence,2231.163265
1,Cybersecurity,2229.481818
3,Human Resources,2205.280899
5,Marketing,2196.483582


In [None]:
# Save the department-level recruitment cost summary as CSV.
import os

# Derive the directory from the output path so the directory that gets created
# is always the one the file is written into. Previously the guard created
# '../../outputs/summary_tables' while the file was written under '../outputs/'.
cost_by_department_path = '../outputs/summary_tables/recruitment_cost_by_department.csv'
os.makedirs(os.path.dirname(cost_by_department_path), exist_ok=True)

# index=False: the row index carries no information, so it is left out of the file.
avg_cost.to_csv(cost_by_department_path, index=False)


In [None]:
# Parse the employee date columns into datetimes.
# errors='coerce' turns unparseable values into NaT instead of raising.
for date_col in ('hire_date', 'termination_date'):
    employees[date_col] = pd.to_datetime(employees[date_col], errors='coerce')


Employee Tenure