# Data Preprocessing Tools

## Importing the libraries

In [229]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [230]:
# Load the raw export from CSV (the path is relative to the current working directory)
df = pd.read_csv("0_service_now_raw_data/service_now_only.csv")

In [231]:
# List the column labels of the freshly loaded frame
print(df.columns)

Index(['Asset Number', 'Incident_Number', 'Short_Description', 'Status',
       'Priority', 'Category', 'Subcategory', 'Assigned_To', 'Assigned_Group',
       'Created_Date'],
      dtype='object')


In [232]:
# Swap the space in 'Asset Number' for an underscore; every other column keeps its label
df_rename = df.rename(columns={"Asset Number": "Asset_Number"})


In [233]:
# List the column labels of the renamed frame to confirm the rename took effect
print(df_rename.columns)

Index(['Asset_Number', 'Incident_Number', 'Short_Description', 'Status',
       'Priority', 'Category', 'Subcategory', 'Assigned_To', 'Assigned_Group',
       'Created_Date'],
      dtype='object')


In [234]:
# Columns kept for the rest of the notebook
req_cols = ['Asset_Number', 'Category']

# Keep only the required columns. This selects from the renamed frame (nothing is re-read
# from disk) and rebinds `df`, so the full frame is only reachable via `df_rename` afterwards.
df = df_rename[req_cols]


In [235]:
# Preview the first rows of the two-column frame
print(df.head())

   Asset_Number    Category
0        100080    Security
1        100081  Facilities
2        300006    Hardware
3        300007    Hardware
4        300008     Network


## Custom Data Creation

In [None]:
# Trim leading/trailing whitespace from the column labels
df.columns = df.columns.str.strip()

# One row per distinct (Asset_Number, Category) pair, re-indexed from zero
asset_df = (
    df[["Asset_Number", "Category"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Simulate time-varying data
def simulate_time_varying(asset_id, category, n_intervals=100, max_time=500, seed=None):
    """Simulate a start/stop interval history with synthetic covariates for one asset.

    Parameters
    ----------
    asset_id : scalar
        Identifier repeated in the ``Asset_Number`` column of every row.
    category : scalar
        Label repeated in the ``Category`` column of every row.
    n_intervals : int, default 100
        Number of rows (intervals) to generate. Must be at least 1 and at most
        ``max_time - 10``, because stop times are drawn without replacement
        from the integers ``10 .. max_time - 1``.
    max_time : int, default 500
        Exclusive upper bound for the stop times.
    seed : None, int, or numpy.random.Generator, default None
        ``None`` draws from the global ``np.random`` state (so results follow
        ``np.random.seed`` if the caller set it). Anything else is passed to
        ``np.random.default_rng``; an existing ``Generator`` is used as-is,
        which lets several calls share one reproducible stream.

    Returns
    -------
    pandas.DataFrame
        ``n_intervals`` rows with columns ``Asset_Number``, ``start``, ``stop``,
        ``event``, ``Category``, ``usage``, ``temperature``, ``load`` and
        ``health_bar``. Intervals are contiguous: the first starts at 0 and each
        later ``start`` equals the previous row's ``stop``.

    Raises
    ------
    ValueError
        If ``n_intervals`` is smaller than 1 or larger than ``max_time - 10``.
    """
    if n_intervals < 1:
        raise ValueError(f"n_intervals must be at least 1, got {n_intervals}")
    n_candidates = max_time - 10
    if n_intervals > n_candidates:
        raise ValueError(
            f"n_intervals={n_intervals} exceeds the {max(n_candidates, 0)} distinct "
            f"stop times available in [10, {max_time}); raise max_time or lower n_intervals"
        )

    # The np.random module and Generator both expose choice()/normal(), so one code
    # path serves the legacy global stream and an explicitly seeded generator.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Distinct, sorted stop times give non-uniform interval lengths
    stop_times = np.sort(rng.choice(np.arange(10, max_time), size=n_intervals, replace=False))
    # Each interval starts where the previous one stopped; the first starts at 0
    start_times = np.insert(stop_times[:-1], 0, 0)

    # Covariates: temperature is centred on 30 + 10 * usage, load is clipped to [0.3, 1.0]
    usage = rng.normal(loc=0.6, scale=0.1, size=n_intervals)
    temperature = rng.normal(loc=30 + usage * 10, scale=2, size=n_intervals)
    load = np.clip(rng.normal(loc=0.7, scale=0.1, size=n_intervals), 0.3, 1.0)

    # Linear decline from 1 to 0 across the rows, plus Gaussian noise, clipped to [0, 1]
    noise = rng.normal(0, 0.05, n_intervals)
    health_bar = np.clip(1.0 - np.linspace(0, 1, n_intervals) + noise, 0.0, 1.0)

    # Failures: two distinct rows from the central band of positions (skipped when the
    # band holds fewer than two rows), plus the last row, which is always a failure.
    event = np.zeros(n_intervals, dtype=np.int64)
    middle_indices = np.arange(n_intervals // 4, n_intervals - n_intervals // 4 - 1)
    if len(middle_indices) >= 2:
        event[rng.choice(middle_indices, size=2, replace=False)] = 1
    event[-1] = 1

    return pd.DataFrame({
        "Asset_Number": [asset_id] * n_intervals,
        "start": start_times,
        "stop": stop_times,
        "event": event,
        "Category": [category] * n_intervals,
        "usage": usage,
        "temperature": temperature,
        "load": load,
        "health_bar": health_bar
    })

# Simulate one history per (asset, category) pair, walking asset_df in row order
time_varying_data = []
for asset_id, category in zip(asset_df["Asset_Number"], asset_df["Category"]):
    time_varying_data.append(simulate_time_varying(asset_id, category))

# Stack the per-asset frames into one long table with a fresh 0..N-1 index
tv_df = pd.concat(time_varying_data, ignore_index=True)


## Export Updated CSV

In [237]:
# Create the output folder if it doesn't exist
os.makedirs("1_service_now_survival_data", exist_ok=True)

# Save the simulated dataset as a CSV file inside that folder, without the index column
tv_df.to_csv("1_service_now_survival_data/1_service_now_survival_data.csv", index=False)