# ticket-closure

This notebook contains sample code to build a model which can predict how long it will take to resolve tickets in an IT support system. It uses the [Incident Management dataset from the UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/Incident+management+process+enriched+event+log).

In [None]:
!pip uninstall -y scipy numpy pandas joblib scikit-learn
!pip install scipy==1.4.1 numpy==1.18.1 pandas==0.24.1 joblib==0.14.1 scikit-learn==0.22.1 lightgbm==2.3.1

In [None]:
import lightgbm as lgb
import numpy as np
import os
import pandas as pd
import pickle
import sklearn
from datetime import datetime

from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

from typing import Dict

In [None]:
# Read the raw incident event log that the model is trained on.
# Source: https://archive.ics.uci.edu/ml/datasets/Incident+management+process+enriched+event+log
TRAIN_FILE = "incident_event_log.csv"


def _date_parser(x):
    """Convert a "%d/%m/%Y %H:%M" string to a datetime; the "?" marker becomes ``pd.NaT``."""
    if x == '?':
        return pd.NaT
    return datetime.strptime(x, "%d/%m/%Y %H:%M")


# Timestamp columns; each one is both listed in `parse_dates` and given the converter above.
_DATE_COLUMNS = [
    "opened_at",
    "resolved_at",
    "closed_at",
    "sys_created_at",
    "sys_updated_at"
]

train_df = pd.read_csv(
    TRAIN_FILE,
    parse_dates=list(_DATE_COLUMNS),
    infer_datetime_format=False,
    converters={date_col: _date_parser for date_col in _DATE_COLUMNS},
    na_values=['?']  # "?" is treated as a missing value
)

In [None]:
# Preview the frame before any columns are removed. A bare `train_df.head()`
# in the middle of a cell is evaluated and thrown away (only the last
# expression of a cell is rendered), so display it explicitly.
display(train_df.head())

# drop columns
#    * stuff you can only know after you've closed the ticket
#    * 'number', which is an incident identifier that is super high-cardinality
#    * high-cardinality columns that are basically obfuscated employee IDs (like 'created_by')
#    * 'caller_id': not creating stateful features like "number of previous tickets from this caller"
drop_cols = [
    "active",
    "assigned_to",
    "caller_id",
    "caused_by",
    "closed_code",
    "incident_state",
    "knowledge",
    "made_sla",
    "number",
    "opened_by",
    "resolved_at",
    "resolved_by",
    "rfc",
    "sys_created_by",
    "sys_updated_by",
    "vendor"
]
# errors="ignore" skips any listed name that is not present in the frame
# instead of raising, so the set of surviving columns is simply
# "everything not in drop_cols".
train_df = train_df.drop(columns=drop_cols, errors="ignore")
train_df.head()

In [None]:
# Regression target: elapsed time from 'opened_at' to 'closed_at', expressed in seconds.
TARGET_COL = 'time_to_close'
_open_to_close = train_df['closed_at'] - train_df['opened_at']
_one_second = np.timedelta64(1, 's')
# Dividing a timedelta by a one-second timedelta yields a plain number of seconds.
train_df[TARGET_COL] = _open_to_close / _one_second

In [None]:
from ticket_closure_lib.transformers import OrdinalConverter
from ticket_closure_lib.transformers import DateColTransformer
from ticket_closure_lib.transformers import FeatureRemover

# Mapping from the raw text labels of each ordered column to an integer rank.
# In every column the "1 - ..." label receives the largest integer.
# 'impact' and 'urgency' use the same three labels; each gets its own copy of the scale.
_three_level_scale = {
    "3 - Low": 1,
    "2 - Medium": 2,
    "1 - High": 3
}
feature_map = dict(
    impact=dict(_three_level_scale),
    priority={
        "4 - Low": 1,
        "3 - Moderate": 2,
        "2 - High": 3,
        "1 - Critical": 4
    },
    urgency=dict(_three_level_scale)
)
# `ordinal_transformer` is intentionally not defined here: the definition that the
# pipeline actually uses lives further down in this cell, and an extra definition
# at this point would only be overwritten before anything reads it.

# Column names grouped by dtype, taken from `train_df` as it exists when this cell runs.
# NOTE: if the target column has already been added to `train_df`, it is a float
# column and therefore appears in `numeric_features`.
numeric_features = list(
    train_df.select_dtypes(include=["int64", "float64"]).columns
)
categorical_features = list(
    train_df.select_dtypes(include=["object"]).columns
)

# LightGBM regressor that serves as the final step of the feature-engineering pipeline.
# The pipeline layout follows scikit-learn's mixed-types ColumnTransformer example:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
_lgb_params = dict(
    boosting_type='gbdt',
    max_depth=10,
    learning_rate=0.01,
    n_estimators=1000,
    num_leaves=30,
    objective='regression',
    n_jobs=4,
    silent=False
)
lgb_estimator = lgb.LGBMRegressor(**_lgb_params)

# Raw timestamp columns handed to the 'remove_intermediate' step for removal.
cols_to_drop = [
    "sys_created_at",
    "sys_updated_at",
    "opened_at",
    "closed_at"
]

# Categorical handling: replace missing values with the string "placeholder",
# then map every distinct value of each column to an integer code.
ordinal_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="placeholder")),
        ("encode", OrdinalEncoder())
    ]
)

# Columns that hold Python objects (strings) in the training frame.
categorical_cols = list(train_df.select_dtypes("O").columns)

pipeline = Pipeline(
    steps=[
        # NOTE(review): presumably derives features from the timestamp columns;
        # confirm against ticket_closure_lib.transformers.
        ('date_cols', DateColTransformer()),
        ('remove_intermediate', FeatureRemover(cols_to_drop=cols_to_drop)),
        ('convert_some_to_int', OrdinalConverter(feature_map=feature_map)),
        ('ordinal_col_transformer', ColumnTransformer(
            transformers=[
                ("ordinal", ordinal_transformer, categorical_cols)
            ],
            # ColumnTransformer drops every column not named in `transformers`
            # unless told otherwise. Without this, only the encoded categorical
            # columns would reach the regressor and all remaining columns
            # produced by the earlier steps would be silently discarded.
            remainder="passthrough"
        )),
        ('regressor', lgb_estimator)
    ]
)

In [None]:
# Separate the predictors from the target.
# NOTE: no train/test split is performed here; X and y cover every row of train_df.
# The target is excluded by name. Comparing column labels with `is not` would test
# object identity rather than string equality, which can let the target leak into X.
X = train_df.drop(columns=[TARGET_COL])
y = train_df[TARGET_COL]

In [None]:
# Fit all pipeline steps on the full X / y; Pipeline.fit returns the fitted pipeline itself.
mod = pipeline.fit(X, y)

In [None]:
# Predictions for the same rows the model was fitted on (in-sample).
preds = mod.predict(X=X)

In [None]:
# Summarize fit quality. The target is in seconds; the median and MAE are reported in days.
# These are in-sample numbers: `preds` was produced from the same X the model was fitted on.
_SECONDS_PER_DAY = 60 * 60 * 24.0

# scikit-learn metrics expect (y_true, y_pred) in that order. MAE and MSE are
# symmetric, so the values are unaffected, but the documented order is used so
# the calls stay correct if an asymmetric metric is added later.
_median_days = train_df[TARGET_COL].median() / _SECONDS_PER_DAY
_mae_days = sklearn.metrics.mean_absolute_error(y, preds) / _SECONDS_PER_DAY

print(f"median ticket duration: {round(_median_days, 2)} days")
print(f"MAE: {round(_mae_days, 2)} days")
print(f"MSE: {sklearn.metrics.mean_squared_error(y, preds)}")

## Save the model

Now that the model is trained, save it to local storage so it can be used in an application.

In [None]:
# Serialize the fitted pipeline to model.pkl in the current working directory.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(mod, model_file)

## Upload to Cloud Storage

At this point in the notebook, we've trained a model but that model only exists on the same machine as this notebook. Let's push it to [Amazon S3](https://aws.amazon.com/s3/) so that we can pull it and re-use it later.

In [None]:
import boto3
# Destination bucket for trained artifacts. Set the S3_TRAINING_ARTIFACT_BUCKET
# environment variable to target a different bucket; when it is unset or empty
# the original default below is used.
S3_TRAINING_ARTIFACT_BUCKET = (
    os.environ.get("S3_TRAINING_ARTIFACT_BUCKET")
    or "ticket-closure-model-artifacts-358790040914-us-east-1"
)
S3 = boto3.resource('s3')
# upload_file(local filename, bucket, object key)
S3.meta.client.upload_file(
    "model.pkl",
    S3_TRAINING_ARTIFACT_BUCKET,
    "model.pkl"
)