# Incident classification

In [95]:
import os

# import pandas as pd
from pandas import DataFrame

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import spacy
from spacy.language import Language

In [96]:
# Download the `en_core_web_sm` spaCy model that the next cell loads.
# NOTE(review): `python` is resolved by the shell, which may not be the kernel's
# interpreter, and the download is attempted on every run — confirm this is intended.
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [97]:
# Load the downloaded language model; `nlp` is later passed to `standardize_text`
nlp = spacy.load("en_core_web_sm")

## Read incidents

### Read development database data

In [98]:
# host = os.getenv("DB_HOST")
# db = os.getenv("DB_NAME")
# user = os.getenv("DB_USER")
# password = os.getenv("DB_PASSWORD")
# connection = f"postgresql://{user}:{password}@{host}/{db}"

In [99]:
# incidents_df = pd.read_sql(
#     "SELECT C.username AS user, A.subject, A.description, B.name AS category FROM core_incident A "
#     "LEFT JOIN core_category B ON A.category_id = B.id "
#     "LEFT JOIN auth_user C on A.opened_by_id = C.id "
#     "WHERE B.name IS NOT NULL "
#     "ORDER BY A.id;",
#     connection
# )

### Create test data

In [100]:
# Hand-written sample incidents, one tuple per incident:
# (user, subject, description, category). Descriptions are deliberately missing.
_sample_incidents = [
    ("user1", "The main process failed", None, "Error"),
    ("user1", "A function has a bug", None, "Error"),
    ("user2", "A class must be reviewed as it might contain a bug", None, "Error"),
    ("user2", "The new release has to be deployed in Production", None, "Production"),
    ("user2", "We found a bug in the code", None, "Error"),
    ("user3", "The database doesn't work", None, "Production"),
]
_sample_columns = ("user", "subject", "description", "category")

# Pivot the rows into one list per column and build the frame from that mapping.
incidents_df = DataFrame(
    data={
        column: [incident[position] for incident in _sample_incidents]
        for position, column in enumerate(_sample_columns)
    }
)

In [101]:
# Check data: display the incidents (user, subject, description, category) before preprocessing
incidents_df

Unnamed: 0,user,subject,description,category
0,user1,The main process failed,,Error
1,user1,A function has a bug,,Error
2,user2,A class must be reviewed as it might contain a...,,Error
3,user2,The new release has to be deployed in Production,,Production
4,user2,We found a bug in the code,,Error
5,user3,The database doesn't work,,Production


## Preprocess data

In [102]:
def standardize_text(nlp: Language, x: str) -> str:
    """Reduce a text to its lower-cased, space-separated content lemmas.

    The text is run through the language model and every token flagged as
    punctuation, bracket, currency symbol, digit, whitespace or stop word is
    dropped. The lemma of each remaining token is lower-cased and the lemmas
    are joined with single spaces, in their original order.

    :param nlp: Language model.
    :type nlp: Language
    :param x: Source text.
    :type x: str
    :return: Lower cased standardized text.
    :rtype: str
    """
    kept_lemmas = []

    for token in nlp(x):
        # Same checks, same order as a chain of "not ... and not ..." tests.
        is_noise = (
            token.is_punct
            or token.is_bracket
            or token.is_currency
            or token.is_digit
            or token.is_space
            or token.is_stop
        )
        if is_noise:
            continue
        kept_lemmas.append(token.lemma_.lower())

    return " ".join(kept_lemmas)

In [103]:
# Build one free-text field per incident from user, subject and description.
# Blank out missing values in all three columns (not only the description): a single
# NaN would turn the concatenated text into NaN, which `standardize_text` cannot process.
text_parts = incidents_df[["user", "subject", "description"]].fillna("")
raw_text = text_parts["user"] + " " + text_parts["subject"] + " " + text_parts["description"]

# Create the two-column frame directly instead of assigning into a column-subset
# copy through .loc, which is the pattern that raises SettingWithCopyWarning.
# NOTE: this replaces `incidents_df`; re-run the data-loading cell before re-running this one.
incidents_df = DataFrame(
    {
        "text": raw_text.apply(lambda x: standardize_text(nlp, x)),
        "category": incidents_df["category"],
    }
)

In [104]:
# Check data: only the standardized `text` and the `category` label remain after preprocessing
incidents_df

Unnamed: 0,text,category
0,user1 main process fail,Error
1,user1 function bug,Error
2,user2 class review contain bug,Error
3,user2 new release deploy production,Production
4,user2 find bug code,Error
5,user3 database work,Production


## Split dataset into training and test datasets

In [105]:
# Features are the standardized incident texts; targets are their categories.
x = incidents_df["text"]
y = incidents_df["category"]

# Hold out 20% of the incidents for evaluation. The split is shuffled, so the seed is
# fixed (same value the classifier uses) to get the same partition, and therefore
# comparable metrics, on every run.
# NOTE(review): consider stratify=y once every category has at least two incidents.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

## Create and train a classification model

In [106]:
# Classification pipeline: raw text -> TF-IDF features -> linear support vector classifier.
# The classifier's random_state is fixed so repeated fits on the same data agree.
cls_model = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("svc", LinearSVC(dual=True, random_state=10)),
    ]
)

In [107]:
# Train the model: fit the whole pipeline (TF-IDF vectorizer, then classifier) on the training split
cls_model.fit(x_train, y_train)

## Evaluate the model

In [108]:
# Predict the labels/categories of the held-out test texts
y_pred = cls_model.predict(x_test)

In [109]:
# Confusion matrix (rows = true category, columns = predicted category).
# Pin the label order explicitly and print that same order. By default the matrix is
# built over the sorted labels that occur in y_test / y_pred, which is not necessarily
# the same set as cls_model.classes_ (the categories seen during training), so using
# classes_ as the header could mislabel the rows and columns.
cm_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

print(cm_labels)
print()
print(cm)

['Error' 'Production']

[[1 0]
 [1 0]]


In [110]:
# Classification report: precision, recall, F1 and support per category.
# zero_division=0.0 reports 0.0 (without a warning) where a metric's denominator is zero,
# e.g. precision for a category that was never predicted.
cr = classification_report(y_test, y_pred, zero_division=0.0)
print(cr)

              precision    recall  f1-score   support

       Error       0.50      1.00      0.67         1
  Production       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



In [111]:
# Accuracy score: fraction of test incidents whose predicted category equals the true one
ac = accuracy_score(y_test, y_pred)
print(ac)

0.5
