# Data Preprocessing Tools

## Importing the libraries

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [2]:
# Read the historic ServiceNow incident time series from its CSV file
df = pd.read_csv(
    filepath_or_buffer="1_service_now_historic_time_series_data/1_service_now_historic_time_series_data.csv"
)

In [3]:
# Show the column labels of the loaded frame to confirm the expected schema
print(df.columns)

Index(['Asset_Number', 'Incident_Date', 'Gap_Days'], dtype='object')


In [4]:
# Preview the first rows (DataFrame.head defaults to five) as a quick sanity check
print(df.head())

   Asset_Number Incident_Date  Gap_Days
0        100080    2021-04-09       3.0
1        100080    2021-04-12       1.0
2        100080    2021-04-13       2.0
3        100080    2021-04-15       1.0
4        100080    2021-04-16       7.0


## Filling Missing Dates

In [5]:
import pandas as pd
from datetime import timedelta

# Parse the incident dates so they sort and step chronologically rather than as text
df["Incident_Date"] = pd.to_datetime(df["Incident_Date"])

# Order the incidents by asset, then chronologically within each asset
df = df.sort_values(by=["Asset_Number", "Incident_Date"]).reset_index(drop=True)


def fill_missing_dates(incidents):
    """Return one row per asset per day between its first and last incident.

    Parameters
    ----------
    incidents : pandas.DataFrame
        Must provide ``Asset_Number``, ``Incident_Date`` (datetime-like) and
        ``Gap_Days`` columns. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``Asset_Number``, ``Incident_Date`` and ``Gap_Days``, ordered by
        asset and then by date, with a fresh 0..n-1 index. For every asset the
        calendar steps one day at a time from its first incident up to its last
        one. Days that carry an incident keep the recorded ``Gap_Days`` (the
        first row wins when an asset has several rows for the same date); days
        without an incident get ``Gap_Days = 0``. An empty input yields an empty
        frame that still has the three columns.

    Notes
    -----
    Rows with a missing ``Incident_Date`` are ignored. A missing date cannot
    anchor a calendar, and treating it as the end of the range would discard
    every row of the affected asset.
    """
    output_columns = ["Asset_Number", "Incident_Date", "Gap_Days"]

    # Drop undated rows, then keep a single row per (asset, date) so the date
    # index used below is unique; keep="first" makes the choice deterministic.
    dated = incidents.dropna(subset=["Incident_Date"]).drop_duplicates(
        subset=["Asset_Number", "Incident_Date"], keep="first"
    )

    per_asset_frames = []
    # groupby iterates the assets in ascending key order, so the concatenated
    # result is already sorted by Asset_Number and then Incident_Date.
    for asset_id, group in dated.groupby("Asset_Number"):
        gaps = group.set_index("Incident_Date")["Gap_Days"].sort_index()

        # Daily grid anchored at the first incident and ending no later than the last
        calendar = pd.date_range(gaps.index[0], gaps.index[-1], freq="D")

        # fill_value only applies to the newly introduced days; a recorded
        # Gap_Days (even a missing one) is carried over unchanged.
        filled = (
            gaps.reindex(calendar, fill_value=0)
            .rename_axis("Incident_Date")
            .reset_index()
        )
        filled.insert(0, "Asset_Number", asset_id)
        per_asset_frames.append(filled)

    if not per_asset_frames:
        # pd.concat rejects an empty list; return the expected schema instead
        return pd.DataFrame(columns=output_columns)

    return pd.concat(per_asset_frames, ignore_index=True)[output_columns]


# Build the gap-filled daily series for every asset
df_filled = fill_missing_dates(df)


## Export Updated CSV

In [6]:
# Make sure the output directory is present before writing into it
os.makedirs(name="2_service_now_historic_time_series_updated_data", exist_ok=True)

# Write the gap-filled series as a CSV file, leaving out the DataFrame index
df_filled.to_csv(
    path_or_buf="2_service_now_historic_time_series_updated_data/2_service_now_historic_time_series_updated_data.csv",
    index=False,
)