In [1]:
import pandas as pd
import numpy as np

# Load the labelled customer utterances (read with latin1 encoding) and preview the first rows.
t1 = pd.read_csv("bank4.csv", encoding = 'latin1')
t1.head()

Unnamed: 0,Line,Final intent
0,Hi,greetings
1,Hi. My name is,greetings
2,I am facing issues with my credit card,others
3,please help waive my annual membership,Annual Fee Reversal
4,please reverse my annual charges,Annual Fee Reversal


Para cada intent, tenemos la siguiente respuesta para representar el comienzo del flujo de trabajo. Esto está en el conjunto de datos fp_qns.

In [2]:
# For each intent we have the following responses to start the workflow

fp_qns = pd.read_csv('follow_up_qns_v1.csv')
fp_qns.head()

Unnamed: 0,cat,count,sent
0,greetings,1525,Hi how can I help you
1,Foreign Travel,700,I understad you want to notify us on your trav...
2,Card Delivery,600,I will quickly check on your card delivery status
3,Report Fraud,600,Please contact XXXX-XXXXX for reporting fraud
4,branch_atm_locator,550,Please provide your address


Distribución de los intents

In [3]:
# Number of utterances per intent label.
t1["Final intent"].value_counts()

greetings               1525
Foreign Travel           700
Report Fraud             600
Card Delivery            600
branch_atm_locator       550
Track application        450
Pin related              400
Credit Limit related     400
Card Activation          350
Blocked Card             350
Card Cancellation        350
Payment related          300
others                   227
Annual Fee Reversal      150
Statement related        150
Name: Final intent, dtype: int64

Hacer los siguientes reemplazos usando expresiones regulares:

1. Reemplazar caracteres especiales como \r y \n por un espacio
2. Reemplazar los enlaces web por una palabra que indique que son URL. En este caso será url_pp.
3. Reemplazar fechas y números de teléfono móvil
4. Reemplazar porcentajes
5. Reemplazar valores monetarios
6. Reemplazar los números de tarjeta y otp
7. Reemplazar cualquier otro número que no esté incluido en el paso 6 por simp_digit_pp
8. Dado que se han convertido todos los números y caracteres especiales hasta ahora, tratar cualquier carácter que no sea una letra (excepto el espacio o "_") como un espacio

In [4]:
def preproc(newdf):
    """Normalise the raw text in ``newdf["Line"]`` into a new ``line1`` column.

    The text is lower-cased and then passed through an ordered list of regular
    expression substitutions that replace URLs, dates, mobile numbers,
    percentages, money amounts, card numbers and remaining digit runs with
    placeholder tokens (``url_pp``, ``date_pp``, ``mobile_pp``, ...).  Finally
    every character that is not a letter or underscore is turned into a single
    space.  The result may keep one leading and/or trailing space.

    Every substitution is applied with ``regex=True`` explicitly, because the
    default of ``Series.str.replace`` differs between pandas versions and the
    patterns below are only meaningful as regular expressions.

    Args:
        newdf: DataFrame with a string column named ``Line``.

    Returns:
        The same ``newdf`` object; the ``line1`` column is added (or
        overwritten) in place.
    """
    # Ordered (pattern, replacement) pairs. Order matters: specific patterns
    # (mobile numbers, money, card numbers) must run before the generic digit
    # rules, and the final clean-up must run after every placeholder is inserted.
    substitutions = [
        (r"inr|rupees", "rs"),
        (r"\r", " "),
        (r"\n", " "),
        (r"[\s]+", " "),

        (r"http[0-9A-Za-z:\/\/\.\?\=]*", " url_pp "),
        (r"[0-9]+\/[0-9]+\/[0-9]+", " date_pp "),
        (r"91[7-9][0-9]{9}", " mobile_pp "),
        (r"[7-9][0-9]{9}", " mobile_pp "),

        (r"[0-9]+%", " digits_percent_pp "),
        (r"[0-9]+percentage", " digits_percent_pp "),
        (r"[0-9]+th", " digits_th_pp "),
        (r"rs[., ]*[0-9]+[,.]?[0-9]+[,.]?[0-9]+[,.]?[0-9]+[,.]?", " money_digits_pp "),
        (r"rs[., ]*[0-9]+", " money_digits_small_pp "),

        (r"[0-9]+[x]+[0-9]*", " cardnum_pp "),
        (r"[x]+[0-9]+", " cardnum_pp "),
        (r"[0-9]{4,7}", " simp_digit_otp "),
        (r"[0-9]+", " simp_digit_pp "),

        (r"a/c", " ac_pp "),
        (r"[^a-z _]", " "),

        # The original pattern here carried a stray trailing comma and could
        # never match (commas are already removed by the previous rule).
        (r"[\s]+", " "),
        (r"[^A-Za-z_]+", " "),
    ]

    text = newdf["Line"].str.lower()
    for pattern, replacement in substitutions:
        text = text.str.replace(pattern, replacement, regex=True)

    newdf.loc[:, "line1"] = text
    return newdf

In [5]:
# preproc adds the "line1" column to t1 in place and returns that same frame,
# so t2 and t1 refer to the same DataFrame object.
t2 = preproc(t1)

Haremos un muestreo estratificado (stratified sampling) para dividir el conjunto de datos en entrenamiento y prueba. El conjunto de entrenamiento se utiliza para construir el modelo y el de prueba para la validación. El muestreo estratificado se utiliza para mantener similar la distribución de los 15 intents en el entrenamiento y en la prueba.

In [6]:
tgt = t2["Final intent"]

from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(test_size=0.1, random_state=42, n_splits=1)

# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(sss.split(t2, tgt))

# split() yields row *positions*, not index labels, so select with .iloc; this
# stays correct even when t2 does not carry a default 0..n-1 index.
# Sorting the positions keeps the rows in their original order within t2.
x_train, x_test = t2.iloc[np.sort(train_index)], t2.iloc[np.sort(test_index)]
y_train, y_test = x_train["Final intent"], x_test["Final intent"]

Construimos el modelo con una capa de embedding seguida de capas LSTM. Prepararemos la entrada como una matriz de 2 dimensiones: N filas por la longitud máxima (en palabras) de una oración. El valor de cada celda es un número que representa la palabra.

La función get_max_len obtiene la longitud máxima en el corpus. Esto se usa para rellenar las oraciones después de la tokenización.

In [7]:
def get_max_len(list1):
    """Return the length of the longest item in ``list1``.

    Args:
        list1: Iterable of sized items (e.g. a list of token-id lists).

    Returns:
        The largest ``len(item)`` found, or 0 when ``list1`` is empty
        (instead of letting ``max()`` raise ``ValueError``).
    """
    return max((len(item) for item in list1), default=0)

Usamos texts_to_sequences y pad_sequences para obtener el valor único de cada palabra en cada fila y luego completar con "0" para que coincida con la longitud máxima de la oración.

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Module-level placeholder tokenizer; conv_str_cols below builds and returns its own instance.
tokenizer = Tokenizer()

def conv_str_cols(col_tr, col_te, num_words=1000):
    """Tokenise and pad the training and test text columns.

    A new ``Tokenizer`` is fitted on the training texts only and then used to
    convert both columns to integer sequences.  Both sets are post-padded with
    zeros to the length of the longest *training* sequence.

    Args:
        col_tr: Iterable of training texts.
        col_te: Iterable of test texts.
        num_words: Vocabulary cap passed to ``Tokenizer``.  Defaults to 1000,
            the value that was previously hard-coded here.

    Returns:
        Tuple ``(col_tr2, col_te2, tokenizer, max_len1)``: the padded training
        array, the padded test array, the fitted tokenizer and the padding length.
    """
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(col_tr)

    col_tr1 = tokenizer.texts_to_sequences(col_tr)
    col_te1 = tokenizer.texts_to_sequences(col_te)

    # The padding length comes from the training data alone, so the test set
    # never influences the input shape of the model.
    max_len1 = get_max_len(col_tr1)

    col_tr2 = pad_sequences(col_tr1, maxlen=max_len1, dtype='int32', padding='post')
    col_te2 = pad_sequences(col_te1, maxlen=max_len1, dtype='int32', padding='post')

    return col_tr2, col_te2, tokenizer, max_len1

In [9]:
# Tokenise and pad the cleaned text; this rebinds the module-level `tokenizer` to the fitted one.
tr_padded,te_padded, tokenizer,max_len1 = conv_str_cols(x_train["line1"], x_test["line1"])

Obtenemos así una matriz de N × L (L = longitud con relleno)

In [10]:
# Shapes of the padded training and test matrices.
tr_padded.shape, te_padded.shape

((6391, 273), (711, 273))

Ahora convertimos la variable "dependiente" en un formato one-hot para que podamos aplicar un clasificador soft-max al final

In [11]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Fit the label -> integer mapping on the training targets only and reuse it
# for the test targets.  Re-fitting on y_test would rebuild the mapping from
# the test labels, which can silently shift class ids if the label sets differ
# and leaves `le` fitted on the test set for later inverse_transform calls.
y_train1 = le.fit_transform(y_train)
y_test1 = le.transform(y_test)

# Fix the one-hot width to the number of known classes so that the train and
# test matrices always have the same number of columns.
y_train2 = to_categorical(y_train1, num_classes=len(le.classes_))
y_test2 = to_categorical(y_test1, num_classes=len(le.classes_))

In [12]:
# Number of distinct intent labels in the training targets.
classes_num = y_train.nunique()

In [13]:
# Inspect the one-hot encoded training targets.
y_train2

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [14]:
# Show the number of classes computed from y_train.
classes_num

15

In [15]:
# Save the variables to .pkl files so they can be used for the final prediction
import pickle

# `with` guarantees each file is flushed and closed once its dump has finished;
# previously the handles were left open for the lifetime of the kernel.
with open('tokenizer.pkl', 'wb') as tokenizer_pkl:
    pickle.dump(tokenizer, tokenizer_pkl)

with open('y_train.pkl', 'wb') as y_train_pkl:
    pickle.dump(y_train, y_train_pkl)

with open('y_test.pkl', 'wb') as y_test_pkl:
    pickle.dump(y_test, y_test_pkl)

fp_qns.to_pickle('fp_qns.pkl')

Construcción del modelo LSTM

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM,Flatten,TimeDistributed

# Architecture: embedding -> two stacked LSTMs (each returning the full
# sequence) -> per-timestep dense layer -> flatten -> softmax over the intents.
model = Sequential()
# The vocabulary size (1000) matches the num_words cap used for the Tokenizer in conv_str_cols.
model.add(Embedding(1000, 100, input_length=max_len1))
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(50,return_sequences=True))

model.add(TimeDistributed(Dense(50, activation='relu')))
model.add(Flatten())
model.add(Dense(classes_num, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# Train the model
model.fit(tr_padded, y_train2, epochs=10, verbose=2,batch_size=30)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 273, 100)          100000    
                                                                 
 lstm (LSTM)                 (None, 273, 100)          80400     
                                                                 
 lstm_1 (LSTM)               (None, 273, 50)           30200     
                                                                 
 time_distributed (TimeDistr  (None, 273, 50)          2550      
 ibuted)                                                         
                                                                 
 flatten (Flatten)           (None, 13650)             0         
                                                                 
 dense_1 (Dense)             (None, 15)                204765    
                                                        

<keras.callbacks.History at 0x7eff10080890>

In [17]:
# Save the trained model to disk
model.save('virtual_assistant_LSTM_model.h5')

In [18]:
# Load the model back from disk to evaluate it on the validation data
from tensorflow.keras.models import load_model

model = load_model('virtual_assistant_LSTM_model.h5')

Obtenemos la exactitud (accuracy) con respecto a nuestros datos de prueba

In [19]:
from sklearn.metrics import accuracy_score, f1_score

# Predicted class index per test sentence: position of the largest model output.
pred_mat = model.predict(te_padded).argmax(axis=-1)

# Accuracy, then micro- and macro-averaged F1, against the encoded test labels.
print(accuracy_score(y_test1, pred_mat))
print(f1_score(y_test1, pred_mat, average='micro'))
print(f1_score(y_test1, pred_mat, average='macro'))

1.0
1.0
1.0


In [20]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the encoded test labels (y_test1) against the predicted
# class indices (pred_mat), wrapped in a DataFrame for display.
pd.DataFrame(confusion_matrix(y_test1, pred_mat))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,35,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,35,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,35,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,60,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,40,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,70,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,30,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,40,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,60,0,0,0,0,0


In [21]:
# How many test sentences were assigned to each predicted class index.
pd.Series(pred_mat).value_counts()

13    153
6      70
9      60
4      60
12     55
11     45
8      40
5      40
3      35
2      35
1      35
7      30
14     23
10     15
0      15
dtype: int64

Usamos el conjunto de datos "fp_qns" para obtener la respuesta correcta para el intent dado. Primero, cuando se ingresa una nueva oración en el bot, la preprocesamos, la convertimos en secuencia y la rellenamos para que corresponda con "maxlen". Luego, predecimos el intent y obtenemos la respuesta correspondiente del conjunto de datos "fp_qns".

In [22]:
def pred_new_text(txt1):
    """Print the bot's reply for a single customer utterance.

    The text is cleaned with ``preproc``, converted to a padded integer
    sequence with the module-level ``tokenizer`` and ``max_len1``, classified
    by ``model``, and the predicted class index is decoded with ``le``.  The
    reply is the first ``sent`` value in ``fp_qns`` whose ``cat`` equals the
    predicted intent; a generic contact message is used when no row matches.

    Args:
        txt1: Raw customer message (a single string).

    Returns:
        None.  The customer text and the reply are printed.
    """
    print ("customer_text:",txt1)

    newdf = pd.DataFrame({"Line": [txt1]})
    newdf1 = preproc(newdf)

    col_te1 = tokenizer.texts_to_sequences(newdf1["line1"])
    col_te2 = pad_sequences(col_te1, maxlen=max_len1, dtype='int32', padding='post')

    class_pred = le.inverse_transform(np.argmax(model.predict(col_te2), axis=-1))[0]

    # Explicit fallback for intents without a follow-up sentence, instead of a
    # bare `except:` that would also hide unrelated errors.
    matches = fp_qns.loc[fp_qns.cat==class_pred,"sent"]
    if len(matches) > 0:
        resp = matches.values[0]
    else:
        resp = 'Please contact XXXX-XXXXX'

    print ("Bot Response:",resp,"\n")

In [23]:
# Try the bot on a few sample customer messages.
pred_new_text("want to report card lost")
pred_new_text("good morning")
pred_new_text("where is atm")
pred_new_text("cancel my card")

customer_text: want to report card lost
Bot Response: Please contact XXXX-XXXXX 

customer_text: good morning
Bot Response: Hi how can I help you 

customer_text: where is atm
Bot Response: Please provide your address 

customer_text: cancel my card
Bot Response: Please contact XXXX-XXXXX for cancellation queries 

