# Data Preprocessing Tools

## Importing the libraries

In [283]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [284]:
# Read the raw ServiceNow export into a DataFrame (the source file is a CSV).
df = pd.read_csv(filepath_or_buffer="0_service_now_raw_data/service_now_only.csv")

In [285]:
# Show the raw column labels so headers that need normalising can be spotted.
print(df.columns)

Index(['Asset Number', 'Incident_Number', 'Short_Description', 'Status',
       'Priority', 'Category', 'Subcategory', 'Assigned_To', 'Assigned_Group',
       'Created_Date'],
      dtype='object')


In [286]:
# Swap the space in the "Asset Number" header for an underscore so it matches
# the naming pattern of the other columns. `rename` returns a new frame, so
# `df` itself is left untouched.
df_rename = df.rename({"Asset Number": "Asset_Number"}, axis="columns")


In [287]:
# Confirm the rename took effect: the labels should now include 'Asset_Number'.
print(df_rename.columns)

Index(['Asset_Number', 'Incident_Number', 'Short_Description', 'Status',
       'Priority', 'Category', 'Subcategory', 'Assigned_To', 'Assigned_Group',
       'Created_Date'],
      dtype='object')


In [288]:
# Columns kept for the rest of the notebook: the asset identifier and its category.
req_cols = ["Asset_Number", "Category"]

# Narrow the renamed frame down to just those columns.
# NOTE(review): this rebinds `df`, so the frame read from the raw CSV is no
# longer reachable under that name once this cell has run.
df = df_rename[req_cols]


In [289]:
# Preview the first rows of the two-column frame.
print(df.head())

   Asset_Number    Category
0        100080    Security
1        100081  Facilities
2        300006    Hardware
3        300007    Hardware
4        300008     Network


## Custom Data Creation

In [290]:
# Trim stray whitespace from the column labels (mutates `df` in place).
df.columns = df.columns.str.strip()

# One row per distinct (Asset_Number, Category) pair, re-indexed from 0.
# NOTE(review): an asset that appears under several categories yields several
# rows here, and therefore several simulated histories -- confirm that is intended.
asset_df = (
    df[["Asset_Number", "Category"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Simulate time-varying data
def simulate_time_varying(asset_id, category, n_intervals=100, max_time=500,
                          n_events=5, tail_window=20, rng=None):
    """Simulate one asset's (start, stop] history with time-varying covariates.

    Each returned row is one interval. Stop times are distinct integers drawn
    from ``[10, max_time)`` and sorted; every interval starts where the
    previous one stopped, and the first interval starts at 0.

    Parameters
    ----------
    asset_id : scalar
        Value repeated in the ``Asset_Number`` column.
    category : scalar
        Value repeated in the ``Category`` column.
    n_intervals : int, default 100
        Number of intervals (rows) to generate. Must be at least 1.
    max_time : int, default 500
        Exclusive upper bound for stop times. Must leave at least
        ``n_intervals`` distinct integers in ``[10, max_time)``.
    n_events : int, default 5
        Number of rows flagged ``event == 1``. Clamped to ``n_intervals``.
    tail_window : int, default 20
        When ``n_events >= 1``, one event is guaranteed to fall within the
        last ``tail_window`` rows. Clamped to ``n_intervals``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. Pass ``np.random.default_rng(seed)`` for a
        reproducible result. When omitted, NumPy's global random state is
        used, so ``np.random.seed`` keeps working as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``Asset_Number``, ``start``, ``stop``, ``event``, ``Category``,
        ``usage``, ``temperature``, ``load`` and ``health_bar``. The four
        feature columns are clipped to the range 0-100.

    Raises
    ------
    ValueError
        If ``n_intervals < 1``, ``n_events < 0``, ``tail_window < 1`` or
        ``max_time`` is too small for ``n_intervals`` distinct stop times.
    """
    if n_intervals < 1:
        raise ValueError("n_intervals must be at least 1")
    if n_events < 0:
        raise ValueError("n_events must be non-negative")
    if tail_window < 1:
        raise ValueError("tail_window must be at least 1")
    if max_time - 10 < n_intervals:
        raise ValueError(
            "max_time is too small: need at least n_intervals distinct "
            "integers in [10, max_time)"
        )

    # The np.random module exposes the same `choice` / `normal` functions as a
    # Generator, so it can stand in as the default source of randomness.
    rng = np.random if rng is None else rng

    # Non-uniform interval boundaries: distinct, sorted stop times.
    stop_times = np.sort(rng.choice(np.arange(10, max_time), size=n_intervals, replace=False))
    start_times = np.insert(stop_times[:-1], 0, 0)

    # Features, each clipped to 0-100. Temperature is centred on 30 + 0.2 * usage;
    # health_bar falls linearly from 100 to 0 across the intervals, plus noise.
    usage = np.clip(rng.normal(loc=60, scale=10, size=n_intervals), 0, 100)
    temperature = np.clip(rng.normal(loc=30 + usage * 0.2, scale=5, size=n_intervals), 0, 100)
    load = np.clip(rng.normal(loc=70, scale=10, size=n_intervals), 0, 100)
    health_bar = np.clip(
        100.0 - np.linspace(0, 100, n_intervals) + rng.normal(0, 5, n_intervals),
        0,
        100,
    )

    # Clamp so short histories never produce negative (wrapping) indices and
    # never ask for more events than there are rows.
    n_events = min(n_events, n_intervals)
    tail = min(tail_window, n_intervals)

    event = np.zeros(n_intervals, dtype=np.int64)
    if n_events > 0:
        all_idx = np.arange(n_intervals)

        # Exactly one event is placed inside the trailing window ...
        tail_idx = rng.choice(all_idx[n_intervals - tail:], size=1, replace=False)
        event[tail_idx] = 1

        # ... and the rest go anywhere else, never on the same row.
        if n_events > 1:
            candidates = np.setdiff1d(all_idx, tail_idx)
            other_idx = rng.choice(candidates, size=n_events - 1, replace=False)
            event[other_idx] = 1

    return pd.DataFrame({
        "Asset_Number": [asset_id] * n_intervals,
        "start": start_times,
        "stop": stop_times,
        "event": event,
        "Category": [category] * n_intervals,
        "usage": usage,
        "temperature": temperature,
        "load": load,
        "health_bar": health_bar
    })

# Generate one simulated history per (asset, category) row of asset_df.
time_varying_data = []

for asset_id, category in zip(asset_df["Asset_Number"], asset_df["Category"]):
    time_varying_data.append(simulate_time_varying(asset_id, category))

# Stack the per-asset frames into a single dataset with a fresh 0..n-1 index.
tv_df = pd.concat(time_varying_data, ignore_index=True)


In [292]:
# Read the exported survival dataset back from disk (the file is a CSV).
# NOTE(review): the recorded execution counts show this cell ran after the
# export cell further down, so on a fresh top-to-bottom run the file may not
# exist yet. It also rebinds `df`, which no later visible cell uses.
df = pd.read_csv(filepath_or_buffer="1_service_now_survival_data/1_service_now_survival_data.csv")

In [293]:
# Count missing values per column of the simulated dataset.
print(tv_df.isna().sum())


Asset_Number    0
start           0
stop            0
event           0
Category        0
usage           0
temperature     0
load            0
health_bar      0
dtype: int64


## Export Updated CSV

In [291]:
# Make sure the output directory is present; an existing one is not an error.
os.makedirs(name="1_service_now_survival_data", exist_ok=True)

# Write the simulated dataset into that directory as CSV, without the row index.
tv_df.to_csv(
    path_or_buf="1_service_now_survival_data/1_service_now_survival_data.csv",
    index=False,
)