# 1. Data Information

In [1]:
import pandas as pd
import numpy as np

# Dataset
file_path = 'data/IncidentTableData.csv'
df = pd.read_csv(file_path)

# shape
print('Shape of the dataset:', df.shape)

# column names
print('\nColumn names:', df.columns.tolist())

# data types
print('\nData types:')
print(df.dtypes)


Shape of the dataset: (2117, 25)

Column names: ['Ticket Class', 'Ticket Priority', 'Ticket Number', 'Ticket Status', 'Opened Date', 'Hostname', 'Ticket Summary', 'Queue ID', 'Ticket Resolved Date', 'Resolution Code', 'Resolution Text', 'Ticket Closed Date', 'Call code', 'Reported By', 'Executed Automata', 'Recommended Automata', 'Actionable', 'Assignment Queue', 'Autogenerated', 'Automation Engine', 'Business Application', 'Closure Code', 'Sub Category', 'Category', 'Alert Key']

Data types:
Ticket Class             object
Ticket Priority           int64
Ticket Number            object
Ticket Status            object
Opened Date              object
Hostname                 object
Ticket Summary           object
Queue ID                 object
Ticket Resolved Date     object
Resolution Code          object
Resolution Text          object
Ticket Closed Date       object
Call code                object
Reported By              object
Executed Automata       float64
Recommended Automata  

In [2]:
# missing values count
print('\nMissing values per column:')
print(df.isnull().sum())


Missing values per column:
Ticket Class               0
Ticket Priority            0
Ticket Number              0
Ticket Status              0
Opened Date                0
Hostname                1261
Ticket Summary             0
Queue ID                   0
Ticket Resolved Date     190
Resolution Code            0
Resolution Text          188
Ticket Closed Date       432
Call code                  0
Reported By                0
Executed Automata       2117
Recommended Automata     176
Actionable                 0
Assignment Queue           0
Autogenerated              0
Automation Engine          0
Business Application    1382
Closure Code            2117
Sub Category            2114
Category                   0
Alert Key               1290
dtype: int64


In [3]:
# basic stats
print('\nBasic statistics:')
print(df.describe(include='all'))


Basic statistics:
       Ticket Class  Ticket Priority Ticket Number Ticket Status  \
count          2117      2117.000000          2117          2117   
unique            1              NaN          2117             7   
top        INCIDENT              NaN    IN88094767        CLOSED   
freq           2117              NaN             1          1685   
mean            NaN         2.490316           NaN           NaN   
std             NaN         0.673931           NaN           NaN   
min             NaN         1.000000           NaN           NaN   
25%             NaN         2.000000           NaN           NaN   
50%             NaN         2.000000           NaN           NaN   
75%             NaN         3.000000           NaN           NaN   
max             NaN         5.000000           NaN           NaN   

                Opened Date     Hostname  \
count                  2117          856   
unique                 1335           70   
top     Oct 2, 2025 5:29 AM  fgc

# 2. Data Cleaning


In [4]:
# column name standardization : remove spaces and convert to lowercase
df.columns = [col.strip().replace(' ', '_').lower() for col in df.columns]

In [5]:
# remove duplicate rows
df.drop_duplicates(inplace=True)


In [6]:
# Correct data types for date columns
date_cols = ['opened_date', 'ticket_resolved_date', 'ticket_closed_date', 'target_finish_date']
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2117 entries, 0 to 2116
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   ticket_class          2117 non-null   object        
 1   ticket_priority       2117 non-null   int64         
 2   ticket_number         2117 non-null   object        
 3   ticket_status         2117 non-null   object        
 4   opened_date           2117 non-null   datetime64[ns]
 5   hostname              856 non-null    object        
 6   ticket_summary        2117 non-null   object        
 7   queue_id              2117 non-null   object        
 8   ticket_resolved_date  1927 non-null   datetime64[ns]
 9   resolution_code       2117 non-null   object        
 10  resolution_text       1929 non-null   object        
 11  ticket_closed_date    1685 non-null   datetime64[ns]
 12  call_code             2117 non-null   object        
 13  reported_by       

  df[col] = pd.to_datetime(df[col], errors='coerce')


# 3. Data Enhancement

In [7]:
# Extract date features from opened_date
if 'opened_date' in df.columns:
    df['opened_year'] = df['opened_date'].dt.year
    df['opened_month'] = df['opened_date'].dt.month
    df['opened_day'] = df['opened_date'].dt.day
    df['opened_hour'] = df['opened_date'].dt.hour


In [8]:
# Convert all object columns to string
df = df.astype({col: 'string' for col in df.select_dtypes(include='object').columns})

In [9]:
# Normalize categorical values: convert to lowercase
categorical_cols = ['ticket_class', 'ticket_priority', 'ticket_status', 'hostname', 'os_type', 'business_application']
for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].astype('string')
        df[col] = df[col].str.lower()

print(df.head())

  ticket_class ticket_priority ticket_number ticket_status  \
0     incident               2    IN88094767        closed   
1     incident               1    IN88160062      resolved   
2     incident               2    IN71843652        closed   
3     incident               2    IN71887046        queued   
4     incident               2    IN71887860        queued   

          opened_date  hostname  \
0 2025-11-16 03:20:00      <NA>   
1 2025-11-16 02:49:00      <NA>   
2 2025-11-16 01:00:00      <NA>   
3 2025-11-15 19:06:00  ngfed139   
4 2025-11-15 18:06:00  ngfed139   

                                      ticket_summary  \
0  acme#CRITICAL state: System acme_efbdnftwb31:N...   
1  acme#efbdnftwj78#VM VI agent can not access V ...   
2  [P2][BT WEB][PRD] eCommerce endpoint pl-0-eu-w...   
3  edd#ledcb795#SAP: PA1-sappa1as01_PA1_00:Ins:PA...   
4  edd#ledcb795#SAP: PA1-sappa1as01_PA1_00:Ins:PA...   

                          queue_id ticket_resolved_date resolution_code  ...  \

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2117 entries, 0 to 2116
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   ticket_class          2117 non-null   string        
 1   ticket_priority       2117 non-null   string        
 2   ticket_number         2117 non-null   string        
 3   ticket_status         2117 non-null   string        
 4   opened_date           2117 non-null   datetime64[ns]
 5   hostname              856 non-null    string        
 6   ticket_summary        2117 non-null   string        
 7   queue_id              2117 non-null   string        
 8   ticket_resolved_date  1927 non-null   datetime64[ns]
 9   resolution_code       2117 non-null   string        
 10  resolution_text       1929 non-null   string        
 11  ticket_closed_date    1685 non-null   datetime64[ns]
 12  call_code             2117 non-null   string        
 13  reported_by       

# 4. Derived Fields

In [11]:
# Resolution Time (hours)
if 'ticket_resolved_date' in df.columns and 'opened_date' in df.columns:
    df['resolution_time_hours'] = (df['ticket_resolved_date'] - df['opened_date']).dt.total_seconds() / 3600


In [12]:
# Ticket Age (days)
current_time = pd.Timestamp.now()
if 'opened_date' in df.columns:
    df['ticket_age_days'] = (current_time - df['opened_date']).dt.days


In [13]:
# Is Automated ('executed_automata' column)
if 'executed_automata' in df.columns:
    df['is_automated'] = df['executed_automata'].apply(lambda x: False if x == '' else True)


In [14]:
# Critical application flag (based on business_application column)
critical_apps = ['payment processing', 'internet banking', 'my web application']
if 'business_application' in df.columns:
    df['critical_application_flag'] = df['business_application'].apply(lambda x: True if isinstance(x, str) and x in critical_apps else False)


In [15]:
print(df[['resolution_time_hours', 'ticket_age_days', 'is_automated', 'critical_application_flag']].head())

   resolution_time_hours  ticket_age_days  is_automated  \
0               0.333333                0          True   
1               0.166667                0          True   
2               2.083333                0          True   
3                    NaN                0          True   
4                    NaN                1          True   

   critical_application_flag  
0                      False  
1                      False  
2                      False  
3                      False  
4                      False  


# 5. Cleaned Data Export

In [17]:
df.to_csv("data/IncidentTableData_cleaned.csv", index=False)