In [None]:
import sys
import os
sys.path.insert(0, 'code')
sys.path.append(os.path.abspath('../'))
from data import  loadFile,removeValues,convertColumnToCategorical,categoryProportion,retrieveValues,categoryCount
import functions as utils
from featEng import (lemmatizePipeStep,htmlCleanerPipeStep,lowerCasesPipeStep,stopWordsPipeStep,stemmizePipeStep,
word2vecPipeStep,tfIdfVectorizerPipeStep)
from resampler import upsampleRandom,downsampleRandom,upsampleSvmSmote
from reporter import giveScore,giveWordCloud,writePathologicalCases
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import numpy as np
from sklearn.pipeline import Pipeline

# Data

In [None]:
# Load the ticket export. NOTE: this is toy data, used for confidentiality reasons.
df = loadFile('../../data/it_tickets.xlsx')

# Apply the removeValues filters one (column, value) pair at a time, in the same
# order as before: empty State, 'Removed' State, empty Component, empty Type.
for column_name, unwanted_value in (
    ('State', ''),
    ('State', 'Removed'),
    ('Component', ''),
    ('Type', ''),
):
    df = removeValues(df, column_name, unwanted_value)


Visualize the class distribution (counts and proportions) of `Type` and `Component`

In [None]:
# Bar chart of the number of tickets per Type value.
df['Type'].value_counts().plot.bar()
# Cell output: the categoryCount results for both target columns, as a tuple.
(categoryCount(df, 'Type'), categoryCount(df, 'Component'))

In [None]:

# Bar chart of the number of tickets per Component value.
df['Component'].value_counts().plot.bar()
# Cell output: the categoryProportion results for both target columns, as a tuple.
(categoryProportion(df, 'Type'), categoryProportion(df, 'Component'))

# Type Training

In [None]:
# Build the Type classification dataset: split, rebalance the training set,
# run the text feature-engineering pipeline and encode the labels.
df_type = df.copy()
df_type = convertColumnToCategorical(df_type,'Type')
print(categoryProportion(df_type,'Type'))
# 'Type' stays inside X because the resampling helpers below select rows by that column.
X = df_type.loc[:,['Title', "Description",'Type']]
y = df_type.loc[:,['Type']]
print("---Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    # Fixed seed so the split is repeatable across runs (same seed as the Component split).
    # NOTE(review): upsampleRandom/downsampleRandom may still add randomness of their own -- confirm.
    random_state=42,
    stratify=y
)
print(categoryCount(X_train,'Type'))

print("---Random Upsampling")
X_train = upsampleRandom(X_train,'Type','Incident', 2) # factor 2: double the 'Incident' training examples

print("---Random Downsampling")
X_train = downsampleRandom(X_train,'Type','Requirement', 0.80) # keep 80% of the 'Requirement' training examples

# Resampling changed the rows of X_train, so the targets are rebuilt from it.
y_train = X_train.loc[:,['Type']]
print(categoryCount(X_train,'Type'))

# Fail fast here if features and targets got out of sync.
assert X_train.shape[0] == y_train.shape[0] , 'Must have same numbers of both training and target examples'

# Text clean-up steps shared by the feature pipeline and the word-cloud cell.
type_pipeline_preprocess = Pipeline([
    ('clean', htmlCleanerPipeStep()),
    ('lower', lowerCasesPipeStep()),
    ('stopwords', stopWordsPipeStep('../../data/other/stopwords.txt')),
    ('lemmatize', lemmatizePipeStep())
])

# Full feature-engineering pipeline: clean-up followed by the word2vec step.
type_pipeline_fe = Pipeline([
    ('preprocess', type_pipeline_preprocess),
    ('word2vec', word2vecPipeStep())
])

print("---Applying Feat Eng")
# The pipeline is fitted on the training set only and then reused for the test set.
type_pipeline_fe.fit(X_train,y_train)
X_train_transform = type_pipeline_fe.transform(X_train)
y_train_transform = y_train
print(X_train.shape,y_train.shape)

# Optional SVM-SMOTE oversampling of the transformed training set.
# Not a good idea for NLP; kept here in case you want to try it.
# print("---Over sampling X_train")
# X_train_transform, y_train_transform=upsampleSvmSmote({'sampling_strategy':'minority',
#                                                       'n_jobs':-1,
#                                                       'k_neighbors':15,
#                                                       'm_neighbors':5},
#                                                       X_train_transform,y_train)
le = preprocessing.LabelEncoder()
# LabelEncoder expects 1-D targets; flattening the single-column frame avoids
# scikit-learn's column-vector DataConversionWarning without changing the encoding.
y_train_transform = le.fit_transform(np.ravel(y_train_transform))
print(X_train_transform.shape,y_train_transform.shape)
print("Transforming test set")
X_test_transform=type_pipeline_fe.transform(X_test)
y_test_transform = le.transform(np.ravel(y_test))
print("Done.")


In [None]:
from sklearn.neural_network import MLPClassifier

# Hyper-parameters of the Type classifier.
# Leftover note from the original cell: ",100,25,10" (presumably other layer sizes that were tried).
mlp_type_params = {
    'hidden_layer_sizes': (100, 75),
    'activation': 'relu',
    'random_state': 42,
    'tol': 0.001,
    'alpha': 1.3,
    'early_stopping': True,
    'n_iter_no_change': 20,
    'validation_fraction': 0.1,
    'verbose': False,
    'warm_start': False,
}
clf_t_nn = MLPClassifier(**mlp_type_params)

# Train on the transformed training set; the return value of fit() is bound to `_` as before.
_ = clf_t_nn.fit(X_train_transform, y_train_transform)

# Encoded classes present in the test set, decoded back to their original names.
labels = le.inverse_transform(np.unique(y_test_transform))
# Hard predictions and per-class probabilities; both are read by the next cell.
predictions = clf_t_nn.predict(X_test_transform)
predictions_proba = clf_t_nn.predict_proba(X_test_transform)
giveScore(
    clf_t_nn,
    X_train_transform, y_train_transform,
    X_test_transform, y_test_transform,
    True,
    labels=labels,
)

### Predicciones y casos patologicos

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Confidence of each prediction: the highest class probability in its row.
max_pred = predictions_proba.max(axis=1)

# Decode the predicted class ids back to the original Type names.
predicted_values = le.inverse_transform(np.asarray(predictions))

# Test rows annotated with the model output.
X_test_predicted = X_test.copy()
X_test_predicted['Predicted'] = predicted_values
X_test_predicted['Confidence'] = max_pred

# Wrong predictions and their confidence.
is_wrong = X_test_predicted['Type'] != X_test_predicted['Predicted']
casos_malos = X_test_predicted.loc[is_wrong]
confianza_de_casos_malos = casos_malos['Confidence']
# Worst cases: wrong predictions made with very high confidence (> 0.9).
is_overconfident = casos_malos['Confidence'] > 0.9
casos_peores = casos_malos.loc[is_overconfident]

# Right predictions and their confidence.
is_right = X_test_predicted['Type'] == X_test_predicted['Predicted']
casos_buenos = X_test_predicted.loc[is_right]
confianza_de_casos_buenos = casos_buenos['Confidence']

# Box plot comparing the confidence of wrong vs. right predictions.
plt.figure(figsize=(12, 10))
plt.boxplot(
    [confianza_de_casos_malos, confianza_de_casos_buenos],
    labels=['Erradas', 'Acertadas'],
)
plt.title("Distribucion de Predicciones de Type")
plt.show()

# Summary statistics of the confidence of the wrong predictions...
print("Valores de Distribucion de predicciones erradas")
print(confianza_de_casos_malos.describe())
print('----------------------------------------------------')
# ...and of the right predictions.
print("Valores de Distribucion de predicciones acertadas")
print(confianza_de_casos_buenos.describe())

# Histogram of the confidence of the wrong predictions, with the mean marked.
plt.figure(figsize=(8, 6))
plt.title("Wrong Predictions Distribution")
plt.hist(confianza_de_casos_malos, bins=10)
plt.xlabel("predict probability")
plt.ylabel("# cases")
mean = confianza_de_casos_malos.mean()
plt.axvline(mean, color='k', linestyle='dashed', linewidth=1)
min_ylim, max_ylim = plt.ylim()
# Place the label slightly to the right of the mean line, near the top of the plot.
plt.text(mean*1.05, max_ylim*0.9, 'Mean: {0:.3f}'.format(mean))
plt.show()



### Visualization

In [None]:
# Training rows of each Type class, selected with retrieveValues.
X_incident = retrieveValues(X_train, 'Type', 'Incident')
X_requirement = retrieveValues(X_train, 'Type', 'Requirement')
# One word cloud per class, built from the preprocessed (not vectorized) text.
# The cell output is the tuple of both giveWordCloud results.
(
    giveWordCloud(type_pipeline_preprocess.transform(X_incident), utils.get_stopwords()),
    giveWordCloud(type_pipeline_preprocess.transform(X_requirement), utils.get_stopwords()),
)


# Component Training

###### Se debe volver a entrenar antes de cualquier cosa, pues los nombres de las variables son los mismos que en Type

In [None]:
# Build the Component classification dataset: split, rebalance the training set,
# run the TF-IDF feature-engineering pipeline and encode the labels.
df_component = df.copy()
df_component = removeValues(df_component,'Component','Help Desk')
df_component = convertColumnToCategorical(df_component,'Component')
print(categoryProportion(df_component,'Component'))
# 'Component' stays inside X because the resampling helpers below select rows by that column.
X = df_component.loc[:,['Title', "Description",'Component']]
y = df_component.loc[:,['Component']]
print("---Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y
)
print(categoryProportion(X_train,'Component'))

print("---Random Upsampling")
X_train = upsampleRandom(X_train,'Component','Recambio de PC', 3) # factor 3: triple this class in the training set
X_train = upsampleRandom(X_train,'Component','Alta y Baja de Usuarios', 2) # factor 2: double this class in the training set
X_train = upsampleRandom(X_train,'Component','No Conformidad Compra / Garantia', 2) # factor 2: double this class in the training set

print("---Random Downsampling")
X_train = downsampleRandom(X_train,'Component','Administracion de Servidores y Herramientas', 0.75)


# Resampling changed the rows of X_train, so the targets are rebuilt from it.
y_train = X_train.loc[:,['Component']]
print(categoryProportion(X_train,'Component'))

# Fail fast here if features and targets got out of sync.
assert X_train.shape[0] == y_train.shape[0] , 'Must have same numbers of both training and target examples'

# Text clean-up steps shared by the feature pipeline and the word-cloud cell.
component_pipeline_preprocess = Pipeline([
    ('clean', htmlCleanerPipeStep()),
    ('lower', lowerCasesPipeStep()),
    ('stopwords', stopWordsPipeStep('../../data/other/stopwords.txt')),
    ('stemmize', stemmizePipeStep())
])
# Full feature-engineering pipeline: clean-up followed by the TF-IDF step (1- to 3-grams).
component_pipeline_fe = Pipeline([
    ('preprocess', component_pipeline_preprocess),
    ('tfidf', tfIdfVectorizerPipeStep({
        'stop_words':utils.get_stopwords(),
        'strip_accents':'unicode',
        'use_idf':True,
        'ngram_range':(1,3)
    }))
])

print("---Applying Feat Eng")
# The pipeline is fitted on the training set only and then reused for the test set.
component_pipeline_fe.fit(X_train,y_train)
X_train_transform = component_pipeline_fe.transform(X_train)
print(X_train.shape,y_train.shape)
le = preprocessing.LabelEncoder()
# LabelEncoder expects 1-D targets; flattening the single-column frame avoids
# scikit-learn's column-vector DataConversionWarning without changing the encoding.
y_train_transform = le.fit_transform(np.ravel(y_train))

print("Transforming test set")
X_test_transform=component_pipeline_fe.transform(X_test)
y_test_transform = le.transform(np.ravel(y_test))
print("Done.")

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier

# Linear Component classifier trained with SGD; hyper-parameters are passed straight
# to the constructor instead of a separate set_params() call.
clf_c = SGDClassifier(
    alpha=0.0005,
    learning_rate='optimal',
    penalty='l2',
    random_state=42,
    tol=0.0005,
    loss='modified_huber',
)

# The model is trained on the original Component names (not the LabelEncoder ids),
# so predict() returns names directly. y_train is a single-column frame; it is
# flattened to 1-D because scikit-learn warns on column-vector targets.
training = clf_c.fit(X_train_transform, np.ravel(y_train))
# Encoded classes present in the test set, decoded back to their original names.
labels = le.inverse_transform(np.unique(y_test_transform))
# Hard predictions and per-class probabilities; both are read by the next cell.
predictions = clf_c.predict(X_test_transform)
predictions_proba = clf_c.predict_proba(X_test_transform)
giveScore(clf_c,X_train_transform,y_train,X_test_transform,y_test,True,labels=labels)

### Predicciones y Casos Patologicos

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Confidence of each prediction: the highest class probability in its row.
max_pred = np.amax(predictions_proba,axis=1)

# The Component classifier was fitted on the original label names, so its
# predictions are used as they are (no inverse_transform needed).
predicted_values = np.array(predictions)

# Test rows annotated with the model output.
X_test_predicted = X_test.copy()
X_test_predicted['Predicted'] = predicted_values
X_test_predicted['Confidence'] = max_pred

# Wrong predictions and their confidence. The comparisons already yield boolean
# Series, so they are used directly as row masks.
casos_malos = X_test_predicted[X_test_predicted['Component'] != X_test_predicted['Predicted']]
confianza_de_casos_malos = casos_malos['Confidence']
# Worst cases: wrong predictions made with very high confidence (> 0.9).
casos_peores = casos_malos[casos_malos['Confidence'] > 0.9]

# Right predictions and their confidence.
casos_buenos = X_test_predicted[X_test_predicted['Component'] == X_test_predicted['Predicted']]
confianza_de_casos_buenos = casos_buenos['Confidence']


# Box plot comparing the confidence of wrong vs. right predictions.
plt.figure(figsize=(12,10))
plt.boxplot([confianza_de_casos_malos,confianza_de_casos_buenos],labels=['Erradas', 'Acertadas'])
plt.title("Distribucion de Predicciones de Component")
plt.show()

# Histogram of the confidence of the wrong predictions, with the mean marked.
plt.figure(figsize=(8,6))
plt.title("Wrong Predictions Distribution")
plt.hist(confianza_de_casos_malos,10)
plt.xlabel("predict probability")
plt.ylabel("# cases")
mean = np.mean(confianza_de_casos_malos)
plt.axvline(mean, color='k', linestyle='dashed', linewidth=1)
min_ylim, max_ylim = plt.ylim()
# Place the label slightly to the right of the mean line, near the top of the plot.
plt.text(mean*1.05, max_ylim*0.9, 'Mean: {0:.3f}'.format(mean))
plt.show()
print("Valores de Distribucion de predicciones erradas")
# Summary statistics for the cases that were predicted wrong.
print(confianza_de_casos_malos.describe())
print('----------------------------------------------------')
print("Valores de Distribucion de predicciones acertadas")
# Summary statistics for the cases that were predicted right.
print(confianza_de_casos_buenos.describe())




### Visualization
###### Se debe correr primero la celda de procesamiento de Component Training para que X_train esté bien definido

In [None]:
# Training rows of two Component classes, selected with retrieveValues; each shape
# is printed right after its selection.
X_alta_baja_usuarios = retrieveValues(X_train, 'Component', 'Alta y Baja de Usuarios')
print(X_alta_baja_usuarios.shape)
X_servidores = retrieveValues(X_train, 'Component', 'Administracion de Servidores y Herramientas')
print(X_servidores.shape)
# One word cloud per class, built from the preprocessed (not vectorized) text.
# The cell output is the tuple of both giveWordCloud results.
(
    giveWordCloud(component_pipeline_preprocess.transform(X_alta_baja_usuarios), utils.get_stopwords()),
    giveWordCloud(component_pipeline_preprocess.transform(X_servidores), utils.get_stopwords()),
)