# Data Preprocessing Tools

## Importing the libraries

In [11]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [12]:
# Load Excel files
df = pd.read_csv("0_service_now_raw_data/service_now_only.csv")

In [13]:
print(df.columns)

Index(['Asset Number', 'Incident_Number', 'Short_Description', 'Status',
       'Priority', 'Category', 'Subcategory', 'Assigned_To', 'Assigned_Group',
       'Created_Date'],
      dtype='object')


In [14]:
# Selective renaming for df
df_rename = df.rename(columns={
    'Asset Number': 'Asset_Number'
})


In [15]:
print(df_rename.columns)

Index(['Asset_Number', 'Incident_Number', 'Short_Description', 'Status',
       'Priority', 'Category', 'Subcategory', 'Assigned_To', 'Assigned_Group',
       'Created_Date'],
      dtype='object')


In [18]:
# Your selected columns
req_cols = ['Asset_Number', 'Created_Date']

# Load only required columns
df = df_rename[req_cols]


In [20]:
print(df.head())

   Asset_Number         Created_Date
0        100080  2024 04 09T15:58:22
1        100081  2024 04 17T08:05:22
2        300006  2024 04 10T08:33:47
3        300007  2024 04 05T14:22:33
4        300008  2024 04 11T11:45:33


## Custom Data Creation

In [23]:
from datetime import timedelta
import random

df["Created_Date"] = pd.to_datetime(df["Created_Date"], format="%Y %m %dT%H:%M:%S")

# Initialize list to hold simulated data
simulated_data = []

# For each unique asset
for _, row in df.iterrows():
    asset_id = row["Asset_Number"]
    created_date = row["Created_Date"]

    # Define start of 36-month window
    start_date = created_date - pd.DateOffset(months=36)

    # Generate random number of incidents (e.g., between 50 and 300 total per asset)
    num_incidents = random.randint(50, 300)

    # Generate random dates within the range [start_date, created_date]
    incident_dates = [
        start_date + timedelta(days=random.randint(0, (created_date - start_date).days))
        for _ in range(num_incidents)
    ]

    # Sort incident dates
    incident_dates.sort()

    # Append to the simulated data list
    for date in incident_dates:
        simulated_data.append({"Asset_Number": asset_id, "Incident_Date": date})

# Convert to DataFrame
incident_df = pd.DataFrame(simulated_data)

# Sort by Asset and Incident Date
incident_df = incident_df.sort_values(["Asset_Number", "Incident_Date"])

# Compute the gap in days between consecutive incidents per asset
incident_df["Gap_Days"] = incident_df.groupby("Asset_Number")["Incident_Date"].diff().dt.days

incident_df.reset_index(drop=True, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Created_Date"] = pd.to_datetime(df["Created_Date"], format="%Y %m %dT%H:%M:%S")


## Export Updated Excel

In [24]:
# Create the folder if it doesn't exist
os.makedirs("1_service_now_historic_time_series_updated_data", exist_ok=True)

# Save the Excel file inside the folder
incident_df.to_csv("1_service_now_historic_time_series_updated_data/1_service_now_historic_time_series_updated_data.csv", index=False)