# Data Preprocessing Tools

## Importing the libraries

In [104]:
import os
import zlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


## Importing the dataset

In [105]:
# Load the raw ServiceNow data from its CSV file (the source is CSV, not Excel)
df = pd.read_csv("0_service_now_raw_data/service_now_only.csv")

In [106]:
# List the column labels of `df` to check what was loaded
print(df.columns)

Index(['Asset Number', 'Incident_Number', 'Short_Description', 'Status',
       'Priority', 'Category', 'Subcategory', 'Assigned_To', 'Assigned_Group',
       'Created_Date'],
      dtype='object')


In [107]:
# Only the label containing a space is renamed; every other column keeps its name.
df_rename = df.rename(columns={"Asset Number": "Asset_Number"})


In [108]:
# Confirm the rename: 'Asset Number' should now appear as 'Asset_Number'
print(df_rename.columns)

Index(['Asset_Number', 'Incident_Number', 'Short_Description', 'Status',
       'Priority', 'Category', 'Subcategory', 'Assigned_To', 'Assigned_Group',
       'Created_Date'],
      dtype='object')


In [109]:
# Columns used by the rest of the notebook: the asset identifier and its category
req_cols = ['Asset_Number', 'Category']

# Keep only the required columns.
# NOTE: this rebinds `df`; the originally loaded frame is no longer bound to
# that name, while `df_rename` still holds all columns.
df = df_rename[req_cols]


In [110]:
# Preview the first rows of the reduced two-column frame
print(df.head())

   Asset_Number    Category
0        100080    Security
1        100081  Facilities
2        300006    Hardware
3        300007    Hardware
4        300008     Network


## Custom Data Creation

In [111]:
# Trim surrounding whitespace from the header labels (modifies `df` in place).
df.columns = df.columns.str.strip()

# Narrow to the asset/category pair, then collapse repeated pairs to one row each.
asset_df = df[["Asset_Number", "Category"]]
asset_df = asset_df.drop_duplicates()

def simulate_survival_data(asset_number, category, n=50, seed=None):
    """Simulate ``n`` synthetic survival observations for a single asset.

    Parameters
    ----------
    asset_number : int, float or str
        Asset identifier, copied into every output row. When ``seed`` is None
        it also determines the random seed, so a given asset always produces
        the same simulated rows, even across interpreter restarts.
    category : object
        Category label, copied into every output row.
    n : int, default 50
        Number of rows to simulate.
    seed : int or None, default None
        Explicit seed for the random generator. ``0`` is a valid seed and is
        honoured; only ``None`` triggers the asset-derived seed.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the columns ``Asset_Number``, ``lifetime_days``,
        ``broken``, ``Category``, ``usage``, ``temperature`` and ``age``.
    """
    if seed is None:
        # Derive a seed that is stable between runs. The built-in hash() of a
        # string is salted per process, so it cannot be used for reproducible
        # data. Whole numbers map to themselves modulo 2**32, which keeps the
        # seeds the original hash()-based code produced for ordinary
        # non-negative integer ids; anything else goes through CRC-32.
        is_whole_number = isinstance(asset_number, (int, np.integer)) or (
            isinstance(asset_number, (float, np.floating))
            and float(asset_number).is_integer()
        )
        if is_whole_number:
            seed = int(asset_number) % (2**32)
        else:
            seed = zlib.crc32(str(asset_number).encode("utf-8"))

    # A private generator leaves NumPy's global random state untouched.
    # RandomState yields the same stream that np.random.seed(seed) followed by
    # the module-level np.random.* calls would produce.
    rng = np.random.RandomState(seed)

    # Simulated covariates; temperature is centred on 30 + 10 * usage.
    usage = rng.normal(loc=0.6, scale=0.1, size=n)
    temperature = rng.normal(loc=30 + usage * 10, scale=3, size=n)
    age = rng.randint(1, 20, size=n)  # integers 1..19 (upper bound exclusive)

    # Right-skewed lifetimes: Gamma(shape=4, scale=20) has mean 80. Values are
    # clipped to [50, 200] and truncated to whole days, so every draw below 50
    # ends up at exactly 50.
    lifetime_days = rng.gamma(shape=4, scale=20, size=n)
    lifetime_days = np.clip(lifetime_days, 50, 200).astype(int)

    # Event indicator: 1 with probability 0.8 (failure observed), else 0 (censored).
    failure_event = rng.binomial(n=1, p=0.8, size=n)

    return pd.DataFrame({
        "Asset_Number": [asset_number] * n,
        "lifetime_days": lifetime_days,
        "broken": failure_event,
        "Category": [category] * n,
        "usage": usage,
        "temperature": temperature,
        "age": age,
    })

# Simulate 100 observations for every (asset, category) row of asset_df
all_assets_data = [
    simulate_survival_data(
        asset_number=record["Asset_Number"],
        category=record["Category"],
        n=100,
    )
    for _, record in asset_df.iterrows()
]

# Stack the per-asset frames into a single table with a fresh 0..N-1 index
final_df = pd.concat(all_assets_data, ignore_index=True)


## Export the Simulated Survival Data (CSV)

In [112]:
# Create the output folder if it doesn't exist
os.makedirs("1_service_now_survival_data", exist_ok=True)

# Write the simulated data as a CSV file (not Excel) inside that folder, omitting the index column
final_df.to_csv("1_service_now_survival_data/1_service_now_survival_data.csv", index=False)