In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Set display options to show all columns
pd.set_option('display.max_columns', None)
np.random.seed(42)  # You can use any integer value as the seed

CWD = os.getcwd()
print(f'CWD: {CWD}')

CWD: /data_analysis/IT_support/src


In [26]:
cardinal_cols = ['number', 'caller_id', 'opened_by', 'sys_created_by', 'sys_updated_by', 'contact_type', 'location', 'u_symptom']
ordinal_cols = ['incident_state', 'reassignment_count', 'reopen_count', 'sys_mod_count', 'impact', 'urgency', 'priority']

boolean_cols = ['cmdb_ci', 'u_priority_confirmation',]

dt_cols = ['opened_at', 'sys_created_at', 'sys_updated_at']

# to_remove = ['made_sla', 'knowledge', 'notify', 'problem_id', 'rfc', 'vendor', 'caused_by', 'closed_code', 'resolved_by', 'resolved_at', 'closed_at',
#              'assignment_group', 'assigned_to', 'category', 'subcategory', 'active']

all_cols = cardinal_cols + ordinal_cols + boolean_cols + dt_cols
len(all_cols)

20

In [27]:
ROOT = os.path.dirname(CWD)
PATH = os.path.join(ROOT, 'data', 'incident_event_cleaned.csv')

df = pd.read_csv(PATH, low_memory=False, index_col=0)
# Convert the date columns in proper format so we can do calculations on them
for col in dt_cols:
    df[col] = pd.to_datetime(df[col])
df.sample(5)

Unnamed: 0,number,incident_state,active,reassignment_count,reopen_count,sys_mod_count,made_sla,caller_id,opened_by,opened_at,sys_created_by,sys_created_at,sys_updated_by,sys_updated_at,contact_type,location,category,subcategory,u_symptom,cmdb_ci,impact,urgency,priority,assignment_group,assigned_to,knowledge,u_priority_confirmation,notify,problem_id,rfc,vendor,caused_by,closed_code,resolved_by,resolved_at,update_to_closing_days,duration_days,update_duration_days
32731,INC0007320,Resolved,True,0.0,0.0,1.0,True,Caller 1177,Opened by 386,2016-03-15 13:54:00,,NaT,Updated by 723,2016-03-15 14:07:00,Phone,Location 55,Category 46,Subcategory 223,,,2 - Medium,2 - Medium,3 - Moderate,Group 70,Resolver 186,False,False,Do Not Notify,,,,,code 7,Resolved by 169,15/3/2016 14:07,9.204167,9.213194,0.009028
91383,INC0022001,Active,True,0.0,0.0,1.0,True,Caller 3912,Opened by 463,2016-04-25 00:40:00,,NaT,Updated by 846,2016-04-25 01:13:00,Phone,Location 161,Category 26,Subcategory 174,Symptom 491,,2 - Medium,2 - Medium,3 - Moderate,Group 70,Resolver 218,False,False,Do Not Notify,,,,,code 7,Resolved by 199,25/4/2016 01:37,5.0375,5.060417,0.022917
110358,INC0026712,Resolved,True,0.0,0.0,1.0,True,Caller 2535,Opened by 24,2016-05-06 09:21:00,,NaT,Updated by 60,2016-05-06 09:35:00,Phone,Location 44,Category 42,Subcategory 223,Symptom 534,,2 - Medium,2 - Medium,3 - Moderate,Group 70,Resolver 17,False,True,Do Not Notify,,,,,code 7,Resolved by 15,6/5/2016 09:35,5.022222,5.031944,0.009722
58225,INC0013455,Awaiting User Info,True,2.0,0.0,10.0,True,Caller 4976,,2016-03-31 10:57:00,,NaT,Updated by 421,2016-04-11 08:31:00,Phone,Location 108,Category 40,Subcategory 215,,,2 - Medium,2 - Medium,3 - Moderate,Group 70,,False,False,Do Not Notify,,,,,code 7,Resolved by 15,12/4/2016 16:12,6.358333,17.256944,0.0
36119,INC0008036,Awaiting User Info,True,0.0,0.0,8.0,True,Caller 1010,Opened by 443,2016-03-17 05:41:00,,NaT,Updated by 664,2016-03-18 16:36:00,Phone,Location 204,Category 57,Subcategory 170,,,1 - High,1 - High,1 - Critical,Group 65,Resolver 180,False,False,Do Not Notify,,,,,,Resolved by 163,23/3/2016 14:42,9.932639,11.3875,0.251389


## Feature Engineering
Create new features using Sklearn `FunctionTransformer` this way everything can be put together in the pipeline.

In [28]:
from sklearn.preprocessing import FunctionTransformer
def ticket_age(X):
    X['ticket_age_open'] = (X['sys_updated_at'] - X['opened_at']).dt.total_seconds() / (3600 * 24) # Days since ticket was opened
    X['ticket_age_create'] = (X['sys_updated_at'] - X['sys_created_at']).dt.total_seconds() / (3600 * 24) # Days since ticket was created in system
    X['days_before_sys_create'] = (X['opened_at'] - X['sys_created_at']).dt.total_seconds() / (3600 * 24) # Gap between opening and creation in system

    # Compute average ticket resolution time per priority
    avg_resolve_time = X[X['incident_state']=='Resolved'].groupby('priority')['ticket_age_create'].mean().reset_index()
    X = pd.merge(X, avg_resolve_time, on='priority', how='left', suffixes=('', '_mean'))

    # Compute average ticket age at each stage
    avg_age = X.groupby('incident_state')['ticket_age_create'].mean().reset_index()
    X = pd.merge(X, avg_age, on='incident_state', how='left', suffixes=('', '_mean'))

    return X

ticket_age_transform = FunctionTransformer(ticket_age)


In [29]:
# Models cannot handle datetime format by itself. Split datetime components to separate components.
def split_dt_parts(X):
    cols = X.select_dtypes(include=['datetime64']).columns.to_list()
    for col in cols:
        for component in ['year', 'month', 'day', 'hour', 'minute', 'second']:
            new_col_name = f'{col}_{component}'  # Create a dynamic column name
            X[new_col_name] = X[col].dt.__getattribute__(component).astype(float)
    return X

split_dt_parts_transform = FunctionTransformer(split_dt_parts)

# Drop the original datetime columns and the columns used for computations
def drop_columns(X):
    cols = X.select_dtypes(include=['datetime64']).columns.to_list()
    cols+=['number', 'incident_state','priority']
    X.drop(columns=cols, inplace=True)
    return X

drop_columns_transform = FunctionTransformer(drop_columns)

In [30]:
from sklearn.base import BaseEstimator, TransformerMixin
# Custom transformer for target encoding
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        cols = X.columns.to_list()
        temp_df = pd.concat([X, y], axis=1)
        category_means_ = []
        for col in cols:
            category_means_.append(temp_df.groupby(col)[y.name].mean().to_dict())
        self.category_means_ = category_means_
        return self

    def transform(self, X):
        X_encoded = X.copy()
        cols = X_encoded.columns.to_list()
        for i, col in enumerate(cols):
            X_encoded[col] = X[col].map(self.category_means_[i]).fillna(-1)
        return X_encoded

### Pipeline

In [31]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, chi2, SelectKBest
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso


from sklearn.metrics import mean_squared_error

In [32]:
# Create the column transformer

cardinal_cols = ['contact_type', 'location', 'u_symptom']
ordinal_cols = ['incident_state', 'sys_mod_count', 'priority',]
boolean_cols = ['cmdb_ci', 'u_priority_confirmation',]
dt_cols = ['opened_at', 'sys_created_at', 'sys_updated_at', 'number', 'incident_state', 'priority'] # Number & incident_state are here for feature engineering. will be dropped later.

categorical_features = cardinal_cols + ordinal_cols + boolean_cols
datetime_features = dt_cols

categorical_transformer = Pipeline(steps=[
    # ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1))
    # ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ('encoder', TargetEncoder()),
    ('imputer', SimpleImputer(strategy='most_frequent', add_indicator=True)),
])

datetime_transformer = Pipeline(steps=[
    ('compute_ticket_age', ticket_age_transform),
    # ('split_dt_parts', split_dt_parts_transform),
    ('drop_cols', drop_columns_transform),
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1.0, add_indicator=True)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('datetime', datetime_transformer, datetime_features),
    ]
)

# Create the final pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', XGBRegressor())
                           ])

pipeline

### Train Test Split

In [33]:
# Define the features and target
df.sort_values(['opened_at', 'number'])

# Define the groups based on the 'number' column
groups = df['number']

# Get unique group values
unique_groups = groups.unique()
split_idx = int(0.8 * len(unique_groups))

# train_num, test_num = train_test_split(unique_groups, test_size = 0.2, random_state=42)
train_num, test_num = unique_groups[:split_idx], unique_groups[split_idx:]
train = df[df['number'].isin(train_num)]
test = df[df['number'].isin(test_num)]

X_train, y_train = train.drop(columns=['update_to_closing_days',]), train['update_to_closing_days']
X_test, y_test = test.drop(columns=['update_to_closing_days',]), test['update_to_closing_days']

### Naive Predictions

In [34]:
# Compute the mean and standard deviation of y_train
mean_y_train = np.mean(y_train)
std_dev_y_train = np.std(y_train)

# Generate y_preds using random normal distribution
y_pred = np.random.normal(loc=mean_y_train, scale=std_dev_y_train, size=len(y_test))

# Calculate the mean squared error (MSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (MSE): {rmse}")

Root Mean Squared Error (MSE): 6.85105090751942


### Model Prediction

In [35]:
# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Calculate the mean squared error (MSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (MSE): {rmse}")

Root Mean Squared Error (MSE): 3.944794146652581


### Fine tuning

In [36]:
from sklearn.model_selection import RandomizedSearchCV, GroupKFold

# Define the number of folds for cross-validation
n_splits = 5  # You can adjust this as needed

# Create a GroupKFold cross-validator using the 'number' column as groups
group_kfold = GroupKFold(n_splits=n_splits)

# Define the hyperparameter grid for XGBoostRegressor
param_grid_xgb = {
    'model': [XGBRegressor()],
    # 'model__n_estimators': np.linspace(100, 500, 5, dtype=int),
    'model__learning_rate': np.logspace(-3, 0, 20),
    # 'model__max_depth': np.linspace(3, 10, 8, dtype=int),
    # 'model__min_child_weight': np.linspace(1, 5, 5, dtype=int),
    # 'model__subsample': np.linspace(0.8, 1.0, 3),
    # 'model__colsample_bytree': np.linspace(0.8, 1.0, 3),
    'model__gamma': np.linspace(0,5,50)
}

param_grid_linear = {
    'model': [LinearRegression()],
}

param_grid_rf = {
    'model': [RandomForestRegressor()],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}

param_grid_lasso = {
    'model': [Lasso()],
    'model__alpha': [0.01, 0.1, 1.0],
    'model__normalize': [False, True],
}

all_params = [param_grid_xgb, param_grid_rf, param_grid_linear, param_grid_lasso]

# Create the RandomizedSearchCV object with GroupKFold
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=all_params,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=group_kfold,  # Use GroupKFold for cross-validation
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Fit the RandomizedSearchCV object on your data
# random_search.fit(X_train, y_train, groups=X_train['number'])  # Pass the 'number' column as groups

# # Get the best hyperparameters and estimator from the search
# best_params = random_search.best_params_
# best_estimator = random_search.best_estimator_

# # Print the best hyperparameters
# print(f'Best Hyperparameters: {best_params}')

# # Make predictions on the test data
# y_pred = best_estimator.predict(X_test)

# # Calculate the mean squared error (MSE)
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# print(f"Root Mean Squared Error (MSE): {rmse}")