In [None]:
!pip install rake-nltk
!pip install spacy
!pip install azure-ai-textanalytics

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Collecting nltk<4.0.0,>=3.6.2
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 13.0 MB/s 
Collecting regex>=2021.8.3
  Downloading regex-2022.4.24-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 65.6 MB/s 
Installing collected packages: regex, nltk, rake-nltk
  Attempting uninstall: regex
    Found existing installation: regex 2019.12.20
    Uninstalling regex-2019.12.20:
      Successfully uninstalled regex-2019.12.20
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.7 rake-nltk-1.0.6 regex-2022.4.24
Collecting azure-ai-textanalytics
  Downloading azure_ai_textanalytics-5.1.0-py2.py3-none-any.whl (153 kB)
[K     |████████████████████████████████| 153 kB 17.1 MB/s 
Colle

In [None]:
import numpy as np
import nltk
import os
from nltk.corpus import stopwords

# Fetch the NLTK data used at runtime: the 'stopwords' corpus feeds the
# stop-word filtering of the gold keywords in the evaluation loop.
nltk.download('stopwords')
# NOTE(review): 'punkt' is not referenced directly in this notebook; presumably
# it is required by rake-nltk's tokenization — confirm.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Example Queries

In [None]:
# Short sample paragraph used to try out each extractor before the full evaluation.
example_sentence = "BLEU is a standard algorithm for evaluating the machine translations against the human translations. At first I thought it should be very straightforward to use. However, it turns out that there are a lot of caveats."

## Spacy

In [None]:
# Based on https://betterprogramming.pub/extract-keywords-using-spacy-in-python-4a8415478fbf
# Download the large English spaCy model (about 828 MB according to the recorded output).
!python -m spacy download en_core_web_lg

import spacy
from collections import Counter
from string import punctuation  # used by extract_spacy and extract_rake to drop punctuation tokens
import en_core_web_lg

# Shared pipeline instance: extract_spacy runs it, extract_rake reads its stop-word list.
nlp = en_core_web_lg.load()

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=62ba7de7e41d1fc77e7dfc24856f21e79cfa7e8ea92c61849b3d4e6af3f0cbca
  Stored in directory: /tmp/pip-ephem-wheel-cache-483i7304/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
def extract_spacy(content):
    """Return the candidate keywords spaCy finds in ``content``.

    The text is lower-cased, run through the module-level ``nlp`` pipeline and
    reduced to the tokens tagged as proper noun, adjective or noun. Stop words
    and punctuation are skipped. Tokens come back in text order and duplicates
    are kept.
    """
    wanted_pos = ('PROPN', 'ADJ', 'NOUN')
    stop_words = nlp.Defaults.stop_words

    def is_candidate(tok):
        # Noise (stop words, punctuation) is rejected before the POS check.
        word = tok.text
        if word in stop_words or word in punctuation:
            return False
        return tok.pos_ in wanted_pos

    return [tok.text for tok in nlp(content.lower()) if is_candidate(tok)]

# Try the spaCy extractor on the sample text; the cell output lists the kept tokens.
extract_spacy(example_sentence)

['bleu',
 'standard',
 'algorithm',
 'machine',
 'translations',
 'human',
 'translations',
 'straightforward',
 'lot',
 'caveats']

## Rake

In [None]:
def extract_rake(content):
  """Return RAKE's ranked key phrases for ``content``.

  The text is lower-cased before extraction. A phrase is dropped when it is a
  spaCy stop word or a substring of ``string.punctuation``; the remaining
  phrases keep the order returned by ``get_ranked_phrases()``.

  Relies on two names defined in the spaCy cell: the module-level ``nlp``
  pipeline (for its stop-word list) and ``punctuation``.
  """
  from rake_nltk import Rake

  rake = Rake()
  # Populates the Rake instance in place; its return value is not needed,
  # the ranked phrases are read back from the instance below.
  rake.extract_keywords_from_text(content.lower())

  stop_words = nlp.Defaults.stop_words
  return [
    phrase for phrase in rake.get_ranked_phrases()
    if phrase not in stop_words and phrase not in punctuation
  ]

# Try the RAKE extractor on the sample text.
extract_rake(example_sentence)

['standard algorithm',
 'machine translations',
 'human translations',
 'use',
 'turns',
 'thought',
 'straightforward',
 'lot',
 'evaluating',
 'caveats',
 'bleu']

## Microsoft Cognitive Services

In [None]:
def extract_microsoft(content):
  """Extract key phrases from ``content`` with Azure Text Analytics.

  The key and endpoint are read from the ``AZURE_TEXT_ANALYTICS_KEY`` and
  ``AZURE_TEXT_ANALYTICS_ENDPOINT`` environment variables. When a variable is
  unset, the value previously written in this notebook is used, so existing
  setups keep working.

  Returns:
    list: the key phrases reported by the service. An empty list is returned
    when the input is blank, when the service reports a document-level error,
    or when the request raises; those failures are printed, not raised, so a
    long evaluation run is not aborted by one bad document.
  """
  from azure.ai.textanalytics import TextAnalyticsClient
  from azure.core.credentials import AzureKeyCredential

  # Keep real credentials out of the notebook: prefer the environment.
  key = os.environ.get("AZURE_TEXT_ANALYTICS_KEY", "<key>")
  endpoint = os.environ.get(
      "AZURE_TEXT_ANALYTICS_ENDPOINT",
      "https://psychmatch-textanalytics.cognitiveservices.azure.com/")

  # Nothing to analyse; skip the request altogether.
  if not content or not content.strip():
    return []

  # Authenticate the client using the key and endpoint
  client = TextAnalyticsClient(
      endpoint=endpoint,
      credential=AzureKeyCredential(key))

  try:
    response = client.extract_key_phrases(documents=[content])[0]
  except Exception as err:
    print("Encountered exception. {}".format(err))
    return []

  if response.is_error:
    print(response.id, response.error)
    return []

  # Always hand back a list so callers can safely build a set / take len().
  return list(response.key_phrases)

# Try the Azure extractor on the sample text (this sends a request to the service).
extract_microsoft(example_sentence)

['standard algorithm',
 'machine translations',
 'human translations',
 'BLEU',
 'lot',
 'caveats']

# Spacy and RAKE
Trying to combine spaCy's part-of-speech filtering with RAKE to improve the performance of RAKE.

In [None]:
def extract_spacy_rake(content):
  """Chain both extractors: spaCy filters the tokens, RAKE ranks what is left.

  The tokens kept by ``extract_spacy`` are joined with ". " and the resulting
  string is handed to ``extract_rake``, whose result is returned unchanged.
  """
  spacy_tokens = extract_spacy(content)
  return extract_rake(". ".join(spacy_tokens))

# Try the chained spaCy -> RAKE extractor on the sample text.
extract_spacy_rake(example_sentence)

['bleu', 'standard', 'algorithm', 'machine', 'translations', 'human', 'translations', 'straightforward', 'lot', 'caveats']


['translations',
 'translations',
 'straightforward',
 'standard',
 'machine',
 'lot',
 'human',
 'caveats',
 'bleu',
 'algorithm']

# Loading Test Data

In [None]:
class KeywordExtraction:
  """Pairs a named extractor callable with its running evaluation totals."""

  def __init__(self, name, extractor):
    # `name` is printed in the results; `extractor` is called with the document text.
    self.name, self.extractor = name, extractor
    # Accumulators filled in by the evaluation loop; all start at zero.
    self.total_precision = self.total_recall = self.total_fmeasure = 0

In [None]:
# variable section
# Directory holding the "<id>.key" (gold keywords) and "<id>.txt" (document) files
# that the evaluation loop reads.
data_dir = '/content/drive/MyDrive/Colab Notebooks/VM2/data/keyword/' # based on https://github.com/bohrjoce/keyword-extraction
number_of_files = 100
# Take exactly `number_of_files` directory entries; the previous
# `[0:number_of_files + 1]` slice picked up one file more than configured.
filenames = sorted(os.listdir(data_dir))[:number_of_files]
manual_keywords = []
total_precision = 0
total_recall = 0
total_docs = 0

language = 'english'

In [None]:
services = [
  KeywordExtraction("spacy", extract_spacy),
  KeywordExtraction("microsoft", extract_microsoft),
  KeywordExtraction("rake", extract_rake),
  KeywordExtraction("spacy_rake", extract_spacy_rake)
]

# ignored due to issue on Mac or empty keyfile
skipped_files = {"H-5.key", "C-86.key", "H-5.txt", "C-86.txt"}
# Only the first `max_chars` characters of each document are given to the extractors.
# NOTE(review): presumably the Azure Text Analytics per-document size limit — confirm.
max_chars = 5120

# Reset the per-run state here so that re-running this cell starts from zero
# instead of adding to the counts of a previous run.
total_docs = 0
manual_keywords = []
manual_keyword_set = set()
last_key_file = None
# Build the stop-word set once rather than re-reading the corpus for every token.
stop_words = set(stopwords.words(language))

# NOTE(review): matching below is exact string equality. The gold keywords are
# single whitespace-split words in their original case, while the extractors
# return lower-cased text (spaCy, RAKE) and multi-word phrases (RAKE, Azure),
# so multi-word or differently-cased hits never count as correct.
for filename in filenames:
  print(filename)

  if filename in skipped_files:
    continue

  # load golden truth
  if filename[-3:] == 'key':
    with open(data_dir + filename, 'r') as f:
      key_lines = f.read().splitlines()
    last_key_file = filename

    # split every line into words, then drop duplicates, one-character tokens and stop words
    unique_words = {word for line in key_lines for word in line.split()}
    manual_keywords = [t for t in unique_words if len(t) > 1 and t.lower() not in stop_words]
    manual_keyword_set = set(manual_keywords)

  # load sample text
  elif filename[-3:] == 'txt':
    # Fail loudly before doing any extraction work. The previous exit(0) ended
    # the session with a success code and ran only after the first extractor.
    if not manual_keywords:
      raise ValueError(
          "issue with this file: no gold keywords for {} (last key file: {})".format(
              filename, last_key_file))

    # Read the document once and close it; every service sees the same text.
    with open(data_dir + filename, 'r') as f:
      content = f.read()
    total_docs += 1

    for service in services:
      ## apply keyword extraction
      # An extractor may return None on failure; treat that as "no keywords".
      # make unique and flatten
      keywords = set(service.extractor(content[0:max_chars]) or [])
      correct = len(keywords & manual_keyword_set)

      # A service that found nothing contributes zero precision instead of
      # raising ZeroDivisionError.
      if keywords:
        service.total_precision += correct / float(len(keywords))
      service.total_recall += correct / float(len(manual_keywords))

C-1.key
C-1.txt
['scalable', 'grid', 'service', 'discovery', 'uddi', 'authors', 'alphabetical', 'order', 'sujata', 'banerjee$', 'sujoy', 'basu$', 'shishir', 'garg', 'sukesh', 'garg', 'sung', 'ju', 'lee$', 'pramila', 'mullan', 'puneet', 'sharma$', 'hp', 'labs', 'page', 'mill', 'road', 'palo', 'alto', 'usa', 'sujata.banerjee,sujoy.basu,sungju.lee,puneet.sharma}@hp.com', 'france', 'telecom', 'r&d', 'division', 'gateway', 'blvd', 'south', 'san', 'francisco', 'usa', '-875', 'abstract', 'efficient', 'discovery', 'grid', 'services', 'essential', 'success', 'grid', 'computing', 'standardization', 'grids', 'web', 'services', 'need', 'scalable', 'web', 'service', 'discovery', 'mechanisms', 'grids', 'uddi', 'industry', 'standard', 'web', 'services', 'discovery', 'requirements', 'tight', 'replication', 'registries', 'lack', 'autonomous', 'control', 'widespread', 'deployment', 'usage', 'advent', 'grid', 'scalability', 'issue', 'uddi', 'roadblock', 'deployment', 'grids', 'paper', 'web', 'service', '

In [None]:
# show output
for service in services:
  print("Service: " + service.name)

  # Average into locals: the accumulated sums on `service` are left untouched,
  # so re-running this cell prints the same numbers instead of dividing the
  # totals by `total_docs` a second time.
  if total_docs:
    avg_precision = service.total_precision / total_docs
    avg_recall = service.total_recall / total_docs
  else:
    # No documents were evaluated; report zeros instead of dividing by zero.
    avg_precision = avg_recall = 0.0

  # F1 is undefined when precision and recall are both zero; report 0 in that case.
  denominator = avg_precision + avg_recall
  service.total_fmeasure = 2 * avg_precision * avg_recall / denominator if denominator else 0.0

  print('total docs: ' + str(round(total_docs, 5)))
  print('total precision: ' + str(round(avg_precision, 5)))
  print('total recall: ' + str(round(avg_recall, 5)))
  print('total f1-score: ' + str(round(service.total_fmeasure, 5)))

Service: spacy
total docs: 98
total precision: 0.04603
total recall: 0.32893
total f1-score: 0.08075
Service: microsoft
total docs: 98
total precision: 0.00305
total recall: 0.01522
total f1-score: 0.00508
Service: rake
total docs: 98
total precision: 0.01384
total recall: 0.13183
total f1-score: 0.02504
Service: spacy_rake
total docs: 98
total precision: 0.04594
total recall: 0.32986
total f1-score: 0.08065
