In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline 
from collections import Counter
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
from collections import Counter
from sklearn import model_selection
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from datetime import datetime
from IPython import display
from collections import Counter
import imblearn
from imblearn.over_sampling import SMOTE
import pickle
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils.class_weight import compute_class_weight

# Background
This notebook makes the model results reproducible. The other notebooks in this repo show the entire data science process for cleaning the data and building the model. To run this notebook you only need the serialized, ready-to-use data. See the next cells for details.

[Pickle Documentation](https://docs.python.org/3/library/pickle.html)
<br>
The cleaned, preprocessed, and resampled data for final use is pickled into two objects: `X_final`, the normalized ticket text (it is converted to TF-IDF features inside the model pipeline below), and `y_final`, the label-encoded targets that represent the different IT teams. As long as both files are in your working directory, reading them in makes the data available for use. These pickle objects are the only data in the repo; the raw data itself is not included. This notebook will therefore be the only reproducible one, since the master notebook requires the raw data (which is not available in the repo).

In [2]:
# Restore the prepared inputs (X_final) and the encoded labels (y_final) from
# the working directory.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a source you trust.
with open('X_final.pickle', 'rb') as features_file:
    X_final = pickle.load(features_file)

with open('y_final.pickle', 'rb') as labels_file:
    y_final = pickle.load(labels_file)

After we have read in our data, we will train and test our model by splitting our data into two different sets. 

In [5]:
# Hold out 20% of the data for validation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the notebook describes this data as already resampled. If that
# resampling duplicated rows, copies of one ticket could land in both
# partitions and inflate the validation scores -- confirm.
train_X, valid_X, train_y, valid_y = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

# Two-step pipeline: TF-IDF features over unigrams and bigrams, followed by an
# L2-penalized logistic regression with balanced class weights.
model = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            stop_words=stopwords.words('english'),
            ngram_range=(1, 2),
            max_features=10000,
            max_df=0.75,
            min_df=2,
        ),
    ),
    (
        'clf',
        LogisticRegression(
            penalty='l2',
            C=10,
            solver='liblinear',
            class_weight='balanced',
            random_state=42,
        ),
    ),
])

In [8]:
# Fit on the training partition, then predict the held-out partition.
model.fit(train_X, train_y)
pred_y = model.predict(valid_X)

# Per-class precision / recall / F1 report; the confusion matrix is kept in
# `cm` but not displayed in this cell.
print(classification_report(valid_y, pred_y))
cm = confusion_matrix(valid_y, pred_y)

# Headline metric: micro-averaged precision, printed with two significant digits.
pc_micro = precision_score(valid_y, pred_y, average="micro")
print('Micro Average Precision Score: {0:.2g}'.format(pc_micro))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96      3015
           1       0.90      0.96      0.93      1696
           2       0.86      0.79      0.82      5047
           3       0.82      0.86      0.84      1763
           4       0.92      0.94      0.93      2336
           5       0.97      0.96      0.96      1995
           6       0.80      0.84      0.82      2394
           7       0.84      0.86      0.85      1954

    accuracy                           0.88     20200
   macro avg       0.88      0.90      0.89     20200
weighted avg       0.88      0.88      0.88     20200

Micro Average Precision Score: 0.88


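We choose precision as the metric to optimize because we are trying to reduce false positives. More on this [here](https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9). Note that for a single-label multiclass problem like this one, the micro-averaged precision is equal to the overall accuracy (0.88 above); the per-class precision values in the report show where the false positives occur.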

Next, since we have the results we want, we will fit the model to all of our data instead of only the training partition used above. This is the final model used in production.

In [3]:
# Same configuration as the validated model, this time fitted on the full data
# set rather than on the training partition only.
final_model = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            stop_words=stopwords.words('english'),
            ngram_range=(1, 2),
            max_features=10000,
            max_df=0.75,
            min_df=2,
        ),
    ),
    (
        'clf',
        LogisticRegression(
            penalty='l2',
            C=10,
            solver='liblinear',
            class_weight='balanced',
            random_state=42,
        ),
    ),
])

# Kept as the last expression so the notebook displays the fitted pipeline.
final_model.fit(X_final, y_final)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.75, max_features=10000, min_df=2,
                                 ngram_range=(1, 2),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('clf',
                 LogisticRegression(C=10, class_weight='balanced',
                                    random_state=42, solver='liblinear'))])

We save our final model for future use by pickling it.

In [None]:
# Serialize the fitted pipeline to disk ('wb' opens the file for binary writing).
with open('final_model.pickle', 'wb') as model_file:
    pickle.dump(final_model, model_file)

# How our Model works in Production

The following cells show how the model is used in production: the user inputs text, which is passed to the model. Remember, since we are dealing with raw text, we need to normalize it and convert it to vectors. The normalization is done by the pipeline shown below; the fitted TF-IDF step then produces the vectors. We can then apply our model and receive a prediction and the class probabilities.

In [12]:
# Shared helpers for text normalization: a tokenizer and NLTK's English
# stop-word list.
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')


def normalize_doc(doc):
    """Clean a single document and return it as one space-joined string.

    Every character that is not an ASCII letter or whitespace is removed (so
    digits and punctuation disappear, e.g. 'SL2' becomes 'sl'). The text is
    then lower-cased and stripped, tokenized with ``wpt``, and tokens found in
    ``stop_words`` are dropped.
    """
    cleaned = re.sub(r'[^a-zA-Z\s]', '', doc).lower().strip()

    kept = []
    for word in wpt.tokenize(cleaned):
        if word in stop_words:
            continue
        kept.append(word)

    return ' '.join(kept)

In [13]:
# User input field: a one-element list holding the raw ticket text to classify.
sample_text = ['SL2 self checkout printer not functioning']

In [14]:
# Wrap normalize_doc with np.vectorize so it can be called on a whole sequence
# of texts, then clean and normalize the sample ticket.
normalize_corpus = np.vectorize(normalize_doc)
norm_text = normalize_corpus(sample_text)

In [15]:
# Normalized text before feature extraction.
norm_text

array(['sl self checkout printer functioning'], dtype='<U36')

In [16]:
# Pull the two fitted steps out of the final pipeline so they can be applied
# by hand to a single ticket.
fitted_steps = final_model.named_steps
vect = fitted_steps['tfidf']
clf = fitted_steps['clf']

In [17]:
# Feature extraction: turn the normalized ticket into a dense TF-IDF array.
trans_text = vect.transform(norm_text).toarray()

# Classification: print the label the logistic regression assigns to it.
print(clf.predict(trans_text))

[7]


In [18]:
# Clean and normalize the ticket.
normalize_corpus = np.vectorize(normalize_doc)
norm_text = normalize_corpus(sample_text)

# Score the ticket with the fitted pipeline itself. Calling the pipeline runs
# the TF-IDF step and then the classifier, so the steps do not have to be
# extracted and chained by hand again, and the TF-IDF output no longer needs
# to be converted to a dense array first.
prediction = final_model.predict(norm_text)            # predicted label for each input text
probabilities = final_model.predict_proba(norm_text)   # one row of class probabilities per input text

In [19]:
def output_prediction(label_prediction):
    """Print the name of the team that owns the predicted label.

    Parameters
    ----------
    label_prediction : int or one-element array-like
        Encoded class label (0-7), e.g. the one-element array returned by
        ``clf.predict`` for a single ticket.

    Returns
    -------
    None
        The owner name is printed, not returned.

    Raises
    ------
    ValueError
        If more than one label is passed, or if the label is not one of the
        known classes. Unknown labels used to fall through to 'WMS Team',
        which would silently misroute a ticket.
    """
    owners = {
        0: 'EAM & MRO Inventory Support',
        1: 'HRIS Team',
        2: 'IT Service Desk Team',
        3: 'Network Team',
        4: 'OTM Support',
        5: 'Planning Support',
        6: 'System Admin Team',
        7: 'WMS Team',
    }

    # Accept both a plain scalar and a one-element array; reduce either to a
    # plain Python value so it can be used as a dictionary key.
    labels = np.ravel(label_prediction)
    if labels.size != 1:
        raise ValueError(
            'expected exactly one label, got {}'.format(labels.size)
        )
    label = labels[0].item()

    if label not in owners:
        raise ValueError('unknown label: {!r}'.format(label))

    print(owners[label])

In [20]:
# Print the owning team for the predicted label.
output_prediction(prediction)

WMS Team


In [21]:
def output_probabilities(array_prob):
    """Return a table pairing each team name with its formatted probability.

    Only the first row of ``array_prob`` is used. Each value in that row is
    formatted as a percentage with two decimals and paired, in order, with the
    hard-coded list of team names. The result is a DataFrame with the columns
    'Owner' and 'Probability'.
    """
    team_names = [
        'EAM & MRO Inventory Support',
        'HRIS Team',
        'IT Service Desk Team',
        'Network Team',
        'OTM Support',
        'Planning Support',
        'System Admin Team',
        'WMS Team',
    ]

    first_row = array_prob[0][:]
    as_percent = list(map("{:.2%}".format, first_row))

    # One (team, probability) tuple per table row.
    rows = list(zip(team_names, as_percent))
    return pd.DataFrame(rows, columns=['Owner', 'Probability'])

In [22]:
# Display the per-team probability table for the ticket.
output_probabilities(probabilities)

Unnamed: 0,Owner,Probability
0,EAM & MRO Inventory Support,0.06%
1,HRIS Team,0.04%
2,IT Service Desk Team,16.78%
3,Network Team,5.16%
4,OTM Support,0.05%
5,Planning Support,0.03%
6,System Admin Team,1.59%
7,WMS Team,76.28%
