# Data Preparation

In [None]:
def data_unzip(ip_file):
    with open(ip_file,"r", encoding="latin1") as f:
        ip = f.readlines()
    pr_list = [i.split(" ", 1) for i in ip]
    intent = []
    sub_intent = []
    que_text = []
    for i in pr_list:
        intent.append(i[0].split(':')[0])
        sub_intent.append(i[0].split(':')[1])
        que_text.append(i[1])    
    return intent, sub_intent, que_text

In [2]:
intents, sub_intents, ques = data_unzip("train_5500.label")

# Data Processing

In [3]:
import re
def replace_contraction(text):
    contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'can not'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
                         (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    return text
def replace_links(text, filler=' '):
        text = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
                      filler, text).strip()
        return text
def remove_numbers(text):
    text = ''.join([i for i in text if not i.isdigit()])
    return text
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = replace_contraction(text)
    text = replace_links(text, "link")
    text = remove_numbers(text)
    text = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]',"",text)
    text = text.lower()
    return text

In [4]:
X = [cleanText(i) for i in ques]

In [5]:
from collections import Counter
Counter(intents)

Counter({'DESC': 1162,
         'ENTY': 1250,
         'ABBR': 86,
         'HUM': 1223,
         'NUM': 896,
         'LOC': 835})

In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(intents)

LabelEncoder()

In [7]:
num_classes = le.classes_.shape[0]

In [8]:
import pickle
with open("le_quetopic.pickle","wb") as f:
    pickle.dump(le, f)

In [9]:
le.transform(intents[:2])

array([1, 2])

In [10]:
intents[:2]

['DESC', 'ENTY']

In [11]:
y = le.transform(intents)

In [12]:
import tensorflow as tf
y_encoded = tf.keras.backend.one_hot(y, num_classes)

In [13]:
y_encoded

<tf.Tensor 'one_hot:0' shape=(5452, 6) dtype=float32>

In [14]:
with tf.Session() as sess:
    y_encoded = sess.run(y_encoded )

In [15]:
y_encoded[1]

array([0., 0., 1., 0., 0., 0.], dtype=float32)

# Model Building

In [16]:
import tensorflow_hub as hub
embed = hub.Module("../module/module_useT")
def UseTEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

In [17]:
import tensorflow.keras as keras
from keras.layers import Input, Lambda, Dense
from keras.models import Model
import keras.backend as K
def build_model(): 
    input_text = Input(shape=(1,), dtype="string")
    embedding = Lambda(UseTEmbedding, output_shape=(512, ))(input_text)
    dense1 = Dense(256, kernel_regularizer=keras.regularizers.l2(0.001), \
                   activation=tf.nn.relu)(embedding)
    dense2 = Dense(256, kernel_regularizer=keras.regularizers.l2(0.001), \
                   activation=tf.nn.relu)(dense1)
    pred = Dense(num_classes, activation='sigmoid')(dense2)
    model = Model(inputs=[input_text], outputs=[pred])
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

Using TensorFlow backend.


In [18]:
model_useT = build_model()

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [19]:
model_useT.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 1542      
Total params: 198,662
Trainable params: 198,662
Non-trainable params: 0
_________________________________________________________________


In [20]:
y_encoded.shape

(5452, 6)

In [21]:
callbacks = [
         tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=0),
    ]

In [22]:
import numpy as np
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())  
    session.run(tf.tables_initializer())
    history = model_useT.fit(np.asarray(X), y_encoded, epochs=50, batch_size=2048, validation_split = 0.1,
                                 verbose = 1, callbacks = callbacks)
    model_useT.save_weights('./quetopic_model_useT.h5')

Train on 4906 samples, validate on 546 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50


In [63]:
##Evaluation

In [23]:
intents, sub_intents, ques = data_unzip("TREC_10.label")

In [24]:
!ls

definition.html		saved_model.pb	  train_3000.label  TREC_10.label
le_quetopic.pickle	train_1000.label  train_4000.label  Untitled.ipynb
quetopic_model_useT.h5	train_2000.label  train_5500.label  variables


In [25]:
test_X = [cleanText(i) for i in ques]
import pickle
with open("le_quetopic.pickle","rb") as f:
    le = pickle.load(f)
test_y = le.transform(intents)
import tensorflow as tf
num_classes = le.classes_.shape[0]
test_y_enc = tf.keras.backend.one_hot(test_y, num_classes)
with tf.Session() as sess:
    test_y_enc = sess.run(test_y_enc)

In [None]:
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())  
    session.run(tf.tables_initializer())
    pred = model_useT.predict(np.asarray(test_X))

In [None]:
test_X[2]

In [82]:
le.classes_

array(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'], dtype='<U4')

In [88]:
pred[2]

array([0.16691467, 0.15996051, 0.16240428, 0.169468  , 0.1740457 ,
       0.16720681], dtype=float32)

In [89]:
test_y_enc[1]

array([0., 0., 0., 0., 1., 0.], dtype=float32)

# Prediction

In [2]:
import re
def replace_contraction(text):
    contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'can not'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
                         (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    return text
def replace_links(text, filler=' '):
        text = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
                      filler, text).strip()
        return text
def remove_numbers(text):
    text = ''.join([i for i in text if not i.isdigit()])
    return text
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = replace_contraction(text)
    text = replace_links(text, "link")
    text = remove_numbers(text)
    text = re.sub(r'[,!@#$%^&*)(|/><";:.?\'\\}{]',"",text)
    text = text.lower()
    return text

In [3]:
import tensorflow_hub as hub
import tensorflow as tf
embed = hub.Module("../module/module_useT")
def UseTEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

W0315 20:28:07.462201 139888066234176 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


UnsupportedHandleError: unsupported handle format '../module/module_useT'. No resolvers found that can successfully resolve it. If the handle points to the local filesystem, the error indicates that the module directory does not exist. Supported handle formats: URLs pointing to a TGZ  file (e.g. https://address/module.tgz), or Local File System directory file (e.g. /tmp/my_local_module).

In [None]:
import tensorflow.keras as keras
import numpy as np
from keras.layers import Input, Lambda, Dense
from keras.models import Model
import keras.backend as K
def build_model(): 
    input_text = Input(shape=(1,), dtype="string")
    embedding = Lambda(UseTEmbedding, output_shape=(512, ))(input_text)
    dense1 = Dense(256, kernel_regularizer=keras.regularizers.l2(0.001), \
                   activation=tf.nn.relu)(embedding)
    dense2 = Dense(256, kernel_regularizer=keras.regularizers.l2(0.001), \
                   activation=tf.nn.relu)(dense1)
    pred = Dense(num_classes, activation='sigmoid')(dense2)
    model = Model(inputs=[input_text], outputs=[pred])
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [None]:
ip_text = ['''how are you?''',
          '''where does he go?''',
          '''What is your return policy?''']

In [None]:
ip_X = [cleanText(i) for i in ip_text]
import pickle
with open("le_quetopic.pickle","rb") as f:
    le = pickle.load(f)
num_classes = le.classes_.shape[0]
model = build_model()
pred = None
with tf.Session() as session:
    K.set_session(session)
    session.run(tf.global_variables_initializer())  
    session.run(tf.tables_initializer())
    model.load_weights('./quetopic_model_useT.h5')
    pred = model.predict(np.asarray(ip_X))

In [20]:
le.classes_

array(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'], dtype='<U4')

In [21]:
pred[2]/sum(pred[2])

array([0.01795071, 0.6648252 , 0.05166404, 0.00378014, 0.01417793,
       0.24760196], dtype=float32)

In [22]:
dict(zip(le.classes_, pred[2]/sum(pred[2])))

{'ABBR': 0.017950712,
 'DESC': 0.6648252,
 'ENTY': 0.051664036,
 'HUM': 0.0037801398,
 'LOC': 0.014177929,
 'NUM': 0.24760196}

In [None]:
result = []
for i in pred:
    result.append([])

In [6]:
pred.argsort(axis=1)

array([[4, 3, 0, 2, 5, 1],
       [0, 5, 2, 3, 1, 4]])

In [None]:
pred[0].argsort()[::-1]