# **Extracting keyphrases with EmbedRank**

In [10]:
import os
import embedrank.launch as launch
import json
import requests

**Note:** Start the Stanford CoreNLP server (expected at `http://localhost:9000`) before continuing!

In [2]:
# Health check: confirm that something answers HTTP on localhost:9000 before the
# POS tagger is loaded. The timeout keeps this cell from blocking indefinitely
# when the host accepts the connection but never responds.
r = requests.get(url="http://localhost:9000", timeout=10)
r.status_code

200

In [3]:
# Build the two shared resources once; extract_keyphrases() reads both of them
# as module-level globals on every call.
# NOTE(review): the POS tagger presumably talks to the CoreNLP server probed
# above -- confirm against embedrank.launch.
embedding_distributor = launch.load_local_embedding_distributor()
pos_tagger = launch.load_local_corenlp_pos_tagger()

## Preprocessing of datasets

In [4]:
# Read the training documents in the order given by the whitespace-separated
# ids in train_docs_order.txt; each id maps to "<id>.txt" in documents_dir.
documents_dir = "data/semeval2010-train-docs"

with open("data/train_docs_order.txt", "r", encoding="utf") as order_file:
    train_docs_order = order_file.read().split()

train_docs = []
for train_doc_id in train_docs_order:
    train_doc_name = train_doc_id + ".txt"
    with open(f"{documents_dir}/{train_doc_name}", "r", encoding="utf") as doc_file:
        train_docs.append(doc_file.read())

print(len(train_docs))

144


In [5]:
# Read the test documents in the order given by the whitespace-separated ids in
# test_docs_order.txt; each id maps to "<id>.txt" in documents_dir.
# Note: this rebinds documents_dir, which the training cell also assigns.
documents_dir = "data/semeval2010-test-docs"

with open("data/test_docs_order.txt", "r", encoding="utf") as order_file:
    test_docs_order = order_file.read().split()

test_docs = []
for test_doc_id in test_docs_order:
    test_doc_name = test_doc_id + ".txt"
    with open(f"{documents_dir}/{test_doc_name}", "r", encoding="utf") as doc_file:
        test_docs.append(doc_file.read())

print(len(test_docs))

100


In [6]:
def extract_keyphrases(docs, n, betas):
    """Run EmbedRank over every document for every requested beta.

    Relies on the module-level ``embedding_distributor`` and ``pos_tagger``.

    Args:
        docs: Iterable of raw document texts.
        n: Forwarded as ``N`` to ``launch.extract_keyphrases``.
        betas: Sized collection of beta values to try for each document.

    Returns:
        A list with one entry per document. When ``betas`` holds exactly one
        value, each entry is a flat list of that beta's keyphrases; otherwise
        each entry is a list with one keyphrase collection per beta, in the
        order of ``betas``. Keyphrases are returned unstemmed.
    """

    def _phrases_for(text, beta):
        # Only the first element of the launcher's return value is kept.
        outcome = launch.extract_keyphrases(
            embedding_distrib=embedding_distributor,
            ptagger=pos_tagger,
            raw_text=text,
            N=n,
            lang="en",
            beta=beta,
        )
        return outcome[0]

    per_document = []
    for text in docs:
        collected = []
        for beta in betas:
            phrases = _phrases_for(text, beta)
            # A single beta yields a flat list; several betas stay nested.
            store = collected.extend if len(betas) == 1 else collected.append
            store(phrases)
        per_document.append(collected)

    return per_document

## Extract keyphrases from train set

In [None]:
# Directory that the (currently commented-out) serialization cells below write
# their JSON files into. Named so it no longer shadows the built-in dir();
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
raw_extractions_dir = "../extractions/embedrank/raw/"
os.makedirs(raw_extractions_dir, exist_ok=True)

In [11]:
# betas = [0.4, 0.45, 0.5, 0.55, 0.6]

# n_to_extr_train = dict()
# for n in [5, 10, 15]:
#     n_to_extr_train[n] = extract_keyphrases(docs=train_docs, n=n, betas=betas)

# # Serialize extracted keyphrases
# n_to_extr_train_json = json.dumps(n_to_extr_train)
# with open("../extractions/embedrank/raw/train.json", "w+") as f:
#     f.write(n_to_extr_train_json)

## Extract keyphrases from test set

### 1. Exact-match evaluation

In [12]:
# n_to_beta = {5: 0.45, 10: 0.45, 15: 0.45}
# n_to_extr_test = dict()

# for n, beta in n_to_beta.items():
#     n_to_extr_test[n] = extract_keyphrases(
#         docs=test_docs, 
#         n=n, 
#         betas=[beta]
#         )
    
# # Serialize extracted keyphrases
# n_to_extr_test_json = json.dumps(n_to_extr_test)
# with open("../extractions/embedrank/raw/test_exact.json", "w+") as f:
#     f.write(n_to_extr_test_json)

### 2. Partial-match evaluation

In [13]:
# n_to_beta = {5: 0.55, 10: 0.55, 15: 0.55}
# n_to_extr_test = dict()

# for n, beta in n_to_beta.items():
#     n_to_extr_test[n] = extract_keyphrases(
#         docs=test_docs, 
#         n=n, 
#         betas=[beta]
#         )
    
# # Serialize extracted keyphrases
# n_to_extr_test_json = json.dumps(n_to_extr_test)
# with open("../extractions/embedrank/raw/test_partial.json", "w+") as f:
#     f.write(n_to_extr_test_json)

### 3. Cross Encoder-based evaluation

In [14]:
# n_to_beta = {5: 0.6, 10: 0.6, 15: 0.6}
# n_to_extr_test = dict()

# for n, beta in n_to_beta.items():
#     n_to_extr_test[n] = extract_keyphrases(
#         docs=test_docs, 
#         n=n, 
#         betas=[beta]
#         )
    
# # Serialize extracted keyphrases
# n_to_extr_test_json = json.dumps(n_to_extr_test)
# with open("../extractions/embedrank/raw/test_sim.json", "w+") as f:
#     f.write(n_to_extr_test_json)