In [1]:
# imports

import json
import os
import re

import nltk
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from pandasticsearch import Select

In [2]:
# credentials
#
# Every value can be overridden through an environment variable so that real
# secrets do not have to be stored in the notebook.
# NOTE(review): the literal fallbacks are kept only so that existing runs keep
# working unchanged; the password literal should be removed (and the password
# rotated) once the environment variables are set wherever this notebook runs.

credentials = {
    "ip_and_port": os.environ.get("ES_CREDENTIALS_HOST", "127.0.0.1:9200"),
    "username": os.environ.get("ES_CREDENTIALS_USERNAME", "elastic"),
    "password": os.environ.get("ES_CREDENTIALS_PASSWORD", "Welcometoerni!"),
}

credentials2 = {
    "ip_and_port": os.environ.get("ES_CREDENTIALS2_HOST", "ws-tst-adb.erni2.ch:9200/"),
    "username": os.environ.get("ES_CREDENTIALS2_USERNAME", "elastic"),
    "password": os.environ.get("ES_CREDENTIALS2_PASSWORD", "Welcometoerni!"),
}

In [28]:
# Read the text file with the paragraphs into a list of lines.
# Each entry keeps its trailing newline, so a blank line is exactly "\n";
# the paragraph-splitting cell compares neighbouring lines against "\n".
# The `with` block closes the file handle as soon as the lines are read.
with open("../data/rfc3095.txt") as f:
    lines = f.readlines()

In [29]:
# English stop-word list from NLTK, consulted by remove_stopwords
stop_words = stopwords.words('english')


def remove_stopwords(texts):
    """Tokenize ``texts`` and drop every token found in ``stop_words``.

    ``texts`` is converted with ``str()`` before it is handed to gensim's
    ``simple_preprocess``, so a list of words is tokenized through its string
    representation.

    Returns the surviving tokens as a list, in their original order.
    """
    kept = []
    for token in simple_preprocess(str(texts)):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [30]:
# function to lemmatize
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun
    return wordnet.NOUN

def lemmatize(words):
    """Lemmatize every word in ``words`` with NLTK's ``WordNetLemmatizer``.

    Each word is lemmatized using the part of speech that
    ``get_wordnet_pos`` reports for it.

    Parameters
    ----------
    words : iterable of str
        Words to lemmatize.

    Returns
    -------
    list of str
        One lemma per input word, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]

In [34]:
# pre-process paragraphs
#
# Walk the interior lines of the document (the first and last line are only
# used as neighbours) and cut it into paragraphs at blank lines ("\n").
# Every paragraph is flattened to a single line with runs of spaces collapsed,
# and each distinct paragraph text is stored once.
paragraphs = []
paragraphsOriginal = []  # not filled here; name kept for cells that may expect it
paragraphsPreProcessed = []
seenParagraphs = set()  # constant-time duplicate check instead of scanning `paragraphs`
start = 0
end = 0
for i in range(1, len(lines) - 1):
    # a line that follows a blank line opens a paragraph ...
    if lines[i - 1] == "\n":
        start = i
    # ... and a line that precedes a blank line closes one
    if lines[i + 1] == "\n":
        end = i
    # NOTE(review): `end > start` skips a paragraph whose opening and closing
    # line are the same (a one-line paragraph) - confirm that this is intended.
    # The condition also stays true on the lines after a paragraph has closed,
    # so the same text can be rebuilt; the duplicate check below discards it.
    if end <= start:
        continue
    paragraphString = " ".join(lines[start:end + 1]).replace("\n", " ").strip()
    paragraphString = re.sub(' +', ' ', paragraphString)
    if paragraphString not in seenParagraphs:
        seenParagraphs.add(paragraphString)
        paragraphs.append(paragraphString)
        paragraphsPreProcessed.append(paragraphString)

In [35]:
# Lower-case each paragraph, split it on single spaces, strip the stop words
# and lemmatize what is left: one token list per paragraph.
lemmatizedParagraphs = [
    lemmatize(remove_stopwords(text.lower().split(" ")))
    for text in paragraphs
]
# The same lemmas joined back into one space-separated string per paragraph.
lemmatizedParagraphsStrings = [" ".join(tokens) for tokens in lemmatizedParagraphs]

In [36]:
# Upload paragraphs to elasticsearch

def addParagraphs(credentials, paragraphsDF, indexName):
    """Bulk-index every row of ``paragraphsDF`` into an Elasticsearch index.

    Parameters
    ----------
    credentials : dict
        Must provide the keys "ip_and_port", "username" and "password".
    paragraphsDF : pandas.DataFrame
        Must provide the columns "paragraph" and "lemmatizedParagraphString".
    indexName : str
        Name of the index the documents are written to.

    Each row becomes one document whose "originalParagraph" field holds the
    "paragraph" value and whose "preProcessedParagraph" field holds the
    "lemmatizedParagraphString" value.
    """
    actions = [
        {
            "_index": indexName,
            "_source": {
                "originalParagraph": original,
                "preProcessedParagraph": preProcessed,
            },
        }
        for original, preProcessed in zip(
            paragraphsDF["paragraph"], paragraphsDF["lemmatizedParagraphString"]
        )
    ]
    # Hand the login to the client through http_auth instead of splicing it
    # into the URL: a password containing "@", "/" or ":" would corrupt the
    # URL, and the host string would carry the password wherever it is shown.
    es = Elasticsearch(
        ['http://' + credentials["ip_and_port"]],
        http_auth=(credentials["username"], credentials["password"]),
        timeout=600,
    )
    helpers.bulk(es, actions)


In [37]:
# Collect the technical-standard paragraphs together with their lemmatized
# forms in one frame and index them under "technical-paragraphs".
paragraphsDF = pd.DataFrame(
    {
        "paragraph": paragraphs,
        "lemmatizedParagraphString": lemmatizedParagraphsStrings,
        "lemmatizedParagraph": lemmatizedParagraphs,
    }
)

addParagraphs(credentials2, paragraphsDF, "technical-paragraphs")

In [11]:
# search function

def searchParagraphs(credentials, inputParagraph, field, indexName="paragraphs", size=500):
    """Run a ``multi_match`` query for ``inputParagraph`` against one field.

    Parameters
    ----------
    credentials : dict
        Must provide the keys "ip_and_port", "username" and "password".
    inputParagraph : str
        Query text.
    field : str
        Name of the document field the query is matched against.
    indexName : str, optional
        Index to search. Defaults to "paragraphs", the value that used to be
        hard-coded.
    size : int, optional
        Value of the "size" entry of the request body. Defaults to 500, the
        value that used to be hard-coded.

    Returns
    -------
    The response of ``Elasticsearch.search``, unmodified.
    """
    # Hand the login to the client through http_auth instead of splicing it
    # into the URL: a password containing "@", "/" or ":" would corrupt the
    # URL, and the host string would carry the password wherever it is shown.
    es = Elasticsearch(
        ['http://' + credentials["ip_and_port"]],
        http_auth=(credentials["username"], credentials["password"]),
        timeout=600,
    )
    doc = {
        "size": size,
        "query": {
            "multi_match": {
                "query": inputParagraph,
                "fields": [field],
            }
        },
    }
    # scroll='1m' is kept so the request is the same as before.
    # NOTE(review): nothing in this function continues or clears that scroll -
    # confirm whether the scroll parameter is still needed.
    return es.search(index=indexName, body=doc, scroll='1m')


In [23]:
# Sample input: three paragraphs of a patent document, kept verbatim in
# `originalParagraphs`; `paragraphs` holds their lower-cased copies.
originalParagraphs = [
    "A header decompression apparatus (709, 908) for decompressing a compressed header of a packet for transmission by referring to reference information being the same as reference information referred to for header compression by a transmitting side, said apparatus (709, 908) comprising",
    "The header decompression apparatus according to claim 1, wherein when X ≥ Y, said update request unit (708) determines that the reference information stored in said reference information manager (707) should be updated.",
    "A header decompression method for decompressing a compressed header of a packet for transmission by referring to reference information that is the same as reference information referred to for header compression by a transmitting side, said method comprising",
]

paragraphs = list(map(str.lower, originalParagraphs))

In [24]:
# Split every sample paragraph on single spaces and strip the stop words,
# giving one token list per paragraph.
paragraphsWithoutStopWords = [
    remove_stopwords(text.split(" ")) for text in paragraphs
]
paragraphsWithoutStopWords

[['header',
  'decompression',
  'apparatus',
  'decompressing',
  'compressed',
  'header',
  'packet',
  'transmission',
  'referring',
  'reference',
  'information',
  'reference',
  'information',
  'referred',
  'header',
  'compression',
  'transmitting',
  'side',
  'said',
  'apparatus',
  'comprising'],
 ['header',
  'decompression',
  'apparatus',
  'according',
  'claim',
  'wherein',
  'said',
  'update',
  'request',
  'unit',
  'determines',
  'reference',
  'information',
  'stored',
  'said',
  'reference',
  'information',
  'manager',
  'updated'],
 ['header',
  'decompression',
  'method',
  'decompressing',
  'compressed',
  'header',
  'packet',
  'transmission',
  'referring',
  'reference',
  'information',
  'reference',
  'information',
  'referred',
  'header',
  'compression',
  'transmitting',
  'side',
  'said',
  'method',
  'comprising']]

In [25]:
# Lemmatize each stop-word-free token list and join the lemmas into one
# space-separated string per paragraph.
paragraphsLemmatized = []
for paragraphWithoutStopWords in paragraphsWithoutStopWords:
    # NOTE: paragraphWordsLemmatized and paragraphWordsLemmatizedString keep
    # the values of the LAST paragraph once the loop has finished; the
    # "execute search" and "list words to highlight" cells read them.
    paragraphWordsLemmatized = lemmatize(paragraphWithoutStopWords)
    paragraphWordsLemmatizedString = " ".join(paragraphWordsLemmatized)
    paragraphsLemmatized.append(paragraphWordsLemmatizedString)
# bare expression: shown as the cell output
paragraphsLemmatized

['header decompression apparatus decompress compress header packet transmission refer reference information reference information refer header compression transmit side say apparatus comprise',
 'header decompression apparatus accord claim wherein say update request unit determines reference information store say reference information manager update',
 'header decompression method decompress compress header packet transmission refer reference information reference information refer header compression transmit side say method comprise']

In [26]:
# Pair each original patent paragraph with its lemmatized string and index
# the pairs under "patent-paragraphs".
paragraphsDF = pd.DataFrame(
    {
        "paragraph": originalParagraphs,
        "lemmatizedParagraphString": paragraphsLemmatized,
    }
)

addParagraphs(credentials2, paragraphsDF, "patent-paragraphs")

In [15]:
# Search the "preProcessedParagraph" field with the lemmatized string left
# over from the last iteration of the lemmatize loop, then store the response
# as JSON.
results = searchParagraphs(
    credentials,
    paragraphWordsLemmatizedString,
    "preProcessedParagraph",
)
with open('../data/data.json', 'w') as jsonFile:
    json.dump(results, jsonFile)

In [85]:
# list words to highlight
#
# searchParagraphs returns the search response as-is, so `results` has no
# "originalParagraph" entry and no to_excel method. The "_source" of every hit
# (the fields written by addParagraphs) is therefore loaded into a DataFrame
# first. A `results` that already is a DataFrame is used unchanged.
if isinstance(results, pd.DataFrame):
    resultsDF = results
else:
    resultsDF = pd.DataFrame(
        [hit["_source"] for hit in results["hits"]["hits"]],
        # explicit columns keep the frame usable when there are no hits
        columns=["originalParagraph", "preProcessedParagraph"],
    )

# dict.fromkeys drops repeated query words while keeping their order, so the
# highlighted words come out in the same order on every run.
queryWords = list(dict.fromkeys(paragraphWordsLemmatized))

listOfWordsToHighlight = []
for result in resultsDF["originalParagraph"]:
    resultWords = set(result.split(" "))
    listOfWordsToHighlight.append([word for word in queryWords if word in resultWords])
resultsDF["wordsToHighlight"] = listOfWordsToHighlight

resultsDF.to_excel("../data/searchResults20200714.xlsx")