In [1]:
!pip install nltk



In [2]:
import nltk
import numpy as np
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
corpus  = [ "We are seeing an increasing number of errors with our payment services. The issue has been reported by multiple users in the last 3 hours and this is affecting our revenue. We need to fix it immediately. ",
            "We need to make improvements to our landing page to convey our new branding guidelines.",
            "It looks like the issue is limited only to visa credit cards.",
            "We need to schedule a product meeting to discuss the new set of features and the roadmap.",
             ]
corpus

['We are seeing an increasing number of errors with our payment services. The issue has been reported by multiple users in the last 3 hours and this is affecting our revenue. We need to fix it immediately. ',
 'We need to make improvements to our landing page to convey our new branding guidelines.',
 'It looks like the issue is limited only to visa credit cards.',
 'We need to schedule a product meeting to discuss the new set of features and the roadmap.']

In [4]:
import re

stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc


In [5]:
normalize_corpus = np.vectorize(normalize_document)

norm_corpus = normalize_corpus(corpus)
norm_corpus

array(['seeing increasing number errors payment services issue reported multiple users last hours affecting revenue need fix immediately',
       'need make improvements landing page convey new branding guidelines',
       'looks like issue limited visa credit cards',
       'need schedule product meeting discuss new set features roadmap'],
      dtype='<U128')

In [6]:
!pip install transformers
#!pip install torch



In [7]:
#import torch
from transformers import TFBertModel, BertTokenizer


In [8]:
import tensorflow as tf

In [9]:
# Initialize the tokenizer with a pretrained model
#tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [11]:
#get max_len
length=[len(tokenizer.encode(x,add_special_tokens=True)) for x in norm_corpus ]

#print(length)
max_len=max(length)
print('Max sentence tokens: ', max_len)

Max sentence tokens:  19


In [12]:
def encoding(sentences):
    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids = []
    attention_masks = []

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
                          sent,                      
                          add_special_tokens = True,
                          max_length = 30,           
                          pad_to_max_length = True,
                          return_attention_mask = True,   
                          return_tensors = 'tf',   
                          truncation=True
                    )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = tf.concat(input_ids, axis=0)
    attention_masks = tf.concat(attention_masks, axis=0)

    return input_ids,attention_masks

In [13]:
def get_corpus_embedding(input_ids,attention_masks):
    out = model(input_ids, attention_mask= attention_masks)
    sentence_embedding = tf.reduce_mean(out[0], axis=1)
    #sentence_embedding= out[1]
    return sentence_embedding

In [14]:
def get_user_embedding(user_query):
    encoded_dict = tokenizer.encode_plus(
                        user_query,                      
                        add_special_tokens = True,
                        max_length = 30,           
                        pad_to_max_length = True,
                        return_attention_mask = True,   
                        return_tensors = 'tf',   
                        truncation=True
                   )
    output= model(encoded_dict['input_ids'],encoded_dict['attention_mask'] )
    user_query_embedding= tf.reduce_mean(output[0], axis=1)
    user_query_embedding= tf.squeeze(user_query_embedding)
    #user_query_embedding=output[1].squeeze()
    return user_query_embedding

In [15]:
from scipy import spatial
def cal_cosine_sim(emb1,emb2):
    output = 1 - spatial.distance.cosine(emb1,emb2)
    return output

In [16]:
def cal_cosine_sim2(emb1,emb2):
    output = tf.keras.losses.cosine_similarity(emb1,emb2,axis=0)
    return output

In [17]:
input_ids,attention_masks= encoding(norm_corpus)



In [18]:
#model = XLNetModel.from_pretrained('xlnet-base-cased', output_hidden_states=True)


model = TFBertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)



Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [19]:
sentence_embedding= get_corpus_embedding(input_ids,attention_masks)
sentence_embedding.shape

TensorShape([4, 768])

In [20]:
user_query= 'issue'

user_query_embedding= get_user_embedding(user_query)

user_query_embedding.shape

TensorShape([768])

In [21]:
for each in sentence_embedding:
  print(cal_cosine_sim(user_query_embedding, each))

0.5464975237846375
0.6496033072471619
0.6669994592666626
0.628105878829956


# ELASTIC SEARCH INSERTION AND SEARCHING

In [22]:
!pip install elasticsearch



In [23]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import numpy as np

In [24]:
es = Elasticsearch([{'host':'elasticsearch','port':9200}])

In [25]:
def getQuotes():
    for line in corpus:
        quote = line.strip().lower()
        if (len(quote.split()) <= 510): # 510 IS THE MAX
            vector = corpus
            yield {
                "_index": 'quotes',
                "quote" : quote,
                "vector" : vector
             }

In [26]:
bulk(client=es, actions = getQuotes(), chunk_size=1000, request_timeout = 120)

(4, [])

In [27]:
INDEX_NAME= 'conversation'

In [28]:
create_query = {
    "mappings": {
        "properties": {
            "description_vector": {
                "type": "dense_vector",
                "dims": 768
            }
        }
    }
}
es.indices.create(index=INDEX_NAME, body=create_query)

RequestError: RequestError(400, 'resource_already_exists_exception', 'index [conversation/OQKLO_IHTDCTgYVk-6GQfQ] already exists')

In [None]:
docs = [{
    '_index': INDEX_NAME,
    'description': sentence,
} for sentence in corpus]

requests = []
for i, doc in enumerate(docs):
    request = doc
    request['description_vector'] = sentence_embedding[i].numpy().tolist()
    requests.append(request)

In [None]:
from elasticsearch import helpers
helpers.bulk(es, requests)

In [None]:
search_query = {
    "size": 2,
    "_source": {
        "includes": ["description"]
    },
    "query": {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                "source": "cosineSimilarity(params.queryVector, 'description_vector') + 1.0",
                "params": {
                    "queryVector": user_query_embedding.numpy().tolist()
                }
            }
        }
    }
}

In [None]:
response = es.search(
    index=INDEX_NAME,
    body=search_query
)
response