# Custom Data Creation

## Importing the libraries

In [34]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta


## Importing the dataset

In [35]:
# Load the source workbook into `df`; every later cell reads from this frame.
df = pd.read_excel(io="1_updated_data/1_updated_data.xlsx")


In [36]:
# Preview the first five rows. Ending the cell with the bare expression lets
# Jupyter render the frame as a rich table instead of the line-wrapped plain
# text that print() produces for wide frames.
df.head()

                Current_Role  Employee_ID  Years_Of_Service    Department  \
0   Senior Software Engineer           57               5.2   Engineering   
1   Associate Data Scientist           58               1.7  Data Science   
2  Associate Product Manager           59               1.3       Product   
3           Business Analyst           60               3.9       Finance   
4    Chief Operating Officer           61              15.2     Executive   

   Employee HR rate  # of Hours per week  \
0                60                   40   
1               117                   40   
2               147                   40   
3               153                   40   
4                79                   40   

   Calculated Column - Fully Loaded Cost   Monthly FLC  \
0                              163355.20  13612.933333   
1                               92926.13   7743.844167   
2                              160081.53  13340.127500   
3                              105775.38

In [37]:
# List the column labels; as the last expression of the cell it is displayed
# automatically, so no print() wrapper is needed.
df.columns

Index(['Current_Role', 'Employee_ID', 'Years_Of_Service', 'Department',
       'Employee HR rate', '# of Hours per week',
       'Calculated Column - Fully Loaded Cost', 'Monthly FLC',
       'Years_Since_Last_Promotion', 'age', 'left'],
      dtype='object')


## Custom Time Series Dataset

In [38]:
# Simulation settings. A fixed seed makes the synthetic history (and therefore
# the exported workbook) identical on every Restart & Run All.
SEED = 42
HISTORY_START_FRACTION = 0.7  # historical cost levels are centred on 70% of the current FLC
SEGMENT_NOISE_STD = 500       # standard deviation of each historical cost level
rng = np.random.default_rng(SEED)

# Month labels ("YYYY-MM"), oldest first, ending with the current month.
start_date = datetime.today().replace(day=1)
months_back = 72
date_range = [
    (start_date - relativedelta(months=offset)).strftime("%Y-%m")
    for offset in reversed(range(months_back))
]


def simulate_flc_history(
    current_flc,
    n_months,
    rng,
    start_fraction=HISTORY_START_FRACTION,
    noise_std=SEGMENT_NOISE_STD,
):
    """Simulate a step-shaped monthly cost history that ends at ``current_flc``.

    The ``n_months`` months are split into 3 to 7 contiguous flat segments at
    randomly chosen break points. Each segment gets one cost level drawn from
    a normal distribution centred on ``current_flc * start_fraction`` with
    standard deviation ``noise_std`` and rounded to a whole number. Levels are
    arranged in ascending order so the series only ever steps up over time,
    and the final segment is pinned to ``current_flc``.

    Parameters
    ----------
    current_flc : float
        Cost level of the most recent segment.
    n_months : int
        Length of the returned series; must be at least 7 so that up to six
        distinct interior break points can be drawn.
    rng : numpy.random.Generator
        Source of randomness.
    start_fraction : float, optional
        Fraction of ``current_flc`` used as the mean of the historical levels.
    noise_std : float, optional
        Standard deviation of the historical levels.

    Returns
    -------
    numpy.ndarray
        ``n_months`` cost values, oldest month first.
    """
    segments = int(rng.integers(3, 8))  # upper bound is exclusive: 3..7 segments

    # Interior break points are distinct month offsets in 1..n_months-1, so
    # every segment spans at least one month.
    inner_breaks = np.sort(
        rng.choice(np.arange(1, n_months), size=segments - 1, replace=False)
    )
    break_points = np.concatenate(([0], inner_breaks, [n_months]))

    levels = np.round(
        rng.normal(loc=current_flc * start_fraction, scale=noise_std, size=segments)
    )
    # Keep simulated levels within [0, current_flc]: without the bounds a draw
    # could be negative or exceed the current cost, which would turn the last
    # step into a pay cut instead of a hike.
    levels = np.sort(np.clip(levels, 0, current_flc))
    levels[-1] = current_flc  # the latest segment is the known current cost

    # Repeat each level for the number of months its segment covers.
    return np.repeat(levels, np.diff(break_points))


# Build one record per employee per month.
final_data = []
for emp_id, current_flc in zip(df["Employee_ID"], df["Monthly FLC"]):
    if pd.isna(current_flc):
        continue  # no current cost to anchor a history on

    monthly_costs = simulate_flc_history(current_flc, months_back, rng)
    final_data.extend(
        {
            "Employee_ID": emp_id,
            "year-month": ym,
            "Annual load cost per month": round(cost, 2),
        }
        for ym, cost in zip(date_range, monthly_costs)
    )

# Convert to DataFrame; naming the columns keeps the header even when no
# employee has a usable "Monthly FLC".
flc_time_series_df = pd.DataFrame(
    final_data,
    columns=["Employee_ID", "year-month", "Annual load cost per month"],
)

# Every employee with a known FLC contributes exactly `months_back` rows.
assert len(flc_time_series_df) == months_back * df["Monthly FLC"].notna().sum()


## Export Updated Excel

In [39]:
# Make sure the output directory is present before writing into it
os.makedirs(name="2_updated_data", exist_ok=True)

# Write the time series to an Excel workbook, leaving out the DataFrame index
flc_time_series_df.to_excel(excel_writer="2_updated_data/2_updated_data.xlsx", index=False)
