## This notebook contains the code for creating the search index with ElasticSearch using the entity labels 

#### Required Files: 
Two files containing the preferred labels (primary labels) and alternate labels (secondary labels) of all the entities in the Wikidata-Biomedical SubGraph.

Refer to the README for instructions on obtaining the files.

In [1]:
import pandas as pd

In [2]:
## Helper for detecting emoticons, symbols etc. in the labels (labels that contain them are dropped later)
import re
# Compiled once at definition time: the function below is applied to every
# label, so rebuilding/looking up the pattern on each call is wasted work.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    u"\U00002500-\U00002BEF"  # box drawing ... misc symbols and arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # very broad: every code point in this span, incl. CJK text
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"  # all supplementary-plane code points
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"  # zero-width joiner
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # variation selector-16
    u"\u3030"
    "]+", re.UNICODE)


def remove_emojis(data):
    """Search `data` for emoji/symbol characters.

    Despite its name, this function does not remove anything: it only
    reports whether `data` contains at least one character from the
    character class above. Callers compare the result against ``None``
    to decide whether a label should be kept.

    Args:
        data: The string to inspect.

    Returns:
        The ``re.Match`` for the first run of matching characters, or
        ``None`` when `data` contains none of them.
    """
    return _EMOJI_PATTERN.search(data)

In [3]:
# File locations of the preferred label and alternate labels files.
# Both files are read as tab-separated text in the loading cells below.

# Replace these file locations with the downloaded prefLabel.rdf and altLabel.rdf files.
# NOTE(review): the literals below ("preLabels.rdf" / "altLabels.rdf") do not match the
# file names mentioned in the comment above — confirm the real names against the Readme.
labels_loc = "preLabels.rdf"
alt_labels_loc = "altLabels.rdf"

### Loading the preferred label and alternate label files
This might take some time.

In [4]:
# Read only the two columns that are used later (0: entity URI, 2: label).
# Loading them directly with `usecols` keeps the column names 0 and 2 and
# avoids assigning into a slice of a wider frame (SettingWithCopyWarning).
labels_df = pd.read_csv(labels_loc, sep="\t", header=None, usecols=[0, 2])

print("Starting: ", len(labels_df))

# Keep the text before "@en"; labels in which remove_emojis() finds a match
# become pd.NA and are dropped below.
# NOTE(review): split("@en") cuts at the FIRST "@en" in the string, so a label
# that itself contains "@en" would be truncated — confirm this is acceptable.
labels_df[2] = labels_df[2].apply(
    lambda x: x.split("@en")[0] if remove_emojis(x) is None else pd.NA
)

labels_df = labels_df.drop_duplicates().dropna()

print("Processed: ", len(labels_df))


Starting:  13649572
Processed:  13630189


In [5]:
# Read only the two columns that are used later (0: entity URI, 2: label).
# Loading them directly with `usecols` keeps the column names 0 and 2 and
# avoids assigning into a slice of a wider frame (SettingWithCopyWarning).
alt_labels_df = pd.read_csv(alt_labels_loc, sep="\t", header=None, usecols=[0, 2])

print("Starting: ", len(alt_labels_df))

# Keep the text before "@en"; labels in which remove_emojis() finds a match
# become pd.NA and are dropped below.
# NOTE(review): split("@en") cuts at the FIRST "@en" in the string, so a label
# that itself contains "@en" would be truncated — confirm this is acceptable.
alt_labels_df[2] = alt_labels_df[2].apply(
    lambda x: x.split("@en")[0] if remove_emojis(x) is None else pd.NA
)

alt_labels_df = alt_labels_df.drop_duplicates().dropna()

print("Processed: ", len(alt_labels_df))


Starting:  4995703
Processed:  4994795


### Concatenating the preferred label and alternate label data

In [6]:
# Stack the preferred labels on top of the alternate labels with a fresh
# 0..n-1 index, then give the two columns readable names.
all_labels_df = pd.concat(
    objs=[labels_df, alt_labels_df],
    ignore_index=True,
)
all_labels_df.columns = ["uri", "label"]

In [7]:
# Show the last five rows of the combined frame.
all_labels_df.tail(5)

Unnamed: 0,uri,label
18624979,wd:Q113656636,CIBT
18624980,wd:Q113657342,Hijokaidan albums discography
18624981,wd:Q113657342,Hijokaidan discography - albums
18624982,wd:Q113657355,tone or pitch accent
18624983,wd:Q113657830,Method of designating colors


## Processing the entity URIs

In [8]:
# Turn prefixed IDs such as "wd:Q113656636" into the bracketed URI form that is
# stored in the index. Values that already start with "<" are left untouched so
# that re-running this cell does not re-split and re-wrap URIs produced by an
# earlier run of the same cell.
all_labels_df['uri'] = all_labels_df['uri'].apply(
    lambda x: x if x.startswith("<") else "<https://www.wikidata.org/wiki/" + x.split(":")[-1] + ">"
)

### Creating a new search index in ElasticSearch

If the SEARCH_INDEX_NAME variable is changed, it needs to be updated in the evaluation notebooks as well.

In [9]:
import elasticsearch
# Connection settings for the Elasticsearch node the client talks to.
config = {
    'host': 'localhost'
}

es = elasticsearch.Elasticsearch([config,], timeout=300)

SEARCH_INDEX_NAME = "wikidata_bio_entity_index"

# Start from an empty index: an existing index with this name is DELETED
# (together with its documents) before a new one is created.
# The index name is passed by keyword; the positional form used previously
# is what the client echoed in this cell's recorded warning output.
if es.indices.exists(index=SEARCH_INDEX_NAME):
    es.indices.delete(index=SEARCH_INDEX_NAME)
es.indices.create(index=SEARCH_INDEX_NAME)

  if es.indices.exists(SEARCH_INDEX_NAME):


{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'wikidata_bio_entity_index'}

In [10]:
from elasticsearch import helpers
import tqdm.notebook as tq

def doc_generator(df, index_name):
    """Yield one bulk-indexing action per row of `df`.

    Args:
        df: DataFrame whose rows become documents. The index labels must
            support ``+ 1`` because they are used to derive document ids.
        index_name: Name of the index each action targets.

    Yields:
        dict: An action with the keys ``_index`` (`index_name`), ``_type``
        (``"_doc"``), ``_id`` (the row's index label plus one, as a string)
        and ``_source`` (a column-name -> value mapping for the row).
    """
    columns = list(df.columns)
    # itertuples yields plain tuples, avoiding the per-row Series that
    # iterrows constructs; pairing with df.index keeps the original labels.
    rows = zip(df.index, df.itertuples(index=False, name=None))
    # The zip has no length, so the total is passed explicitly to let the
    # progress bar show how far along the iteration is.
    for index, values in tq.tqdm(rows, total=len(df)):
        yield {
                "_index": index_name,
                "_type": "_doc",
                "_id" : f"{index+1}",
                "_source": dict(zip(columns, values)),
            }
        
    
    
# Stream every (uri, label) row into the search index through the bulk helper;
# the helper's return value is left as the cell's displayed result.
label_actions = doc_generator(all_labels_df, index_name=SEARCH_INDEX_NAME)
helpers.bulk(es, label_actions)

0it [00:00, ?it/s]



(18624984, [])

### Sanity check: is the index working?

In [13]:
# Peek at the label stored in the row whose index label is 0.
text = all_labels_df.loc[0, "label"]
text

'happiness'

In [16]:
# Fuzzy full-text match of the word "protein" against the indexed "label" field.
label_query = {
    "match": {
        "label": {
            "query": "protein",
            "fuzziness": "AUTO"
        }
    }
}

# The query and the result size are passed as keyword arguments instead of
# being wrapped in a `body` dict.
# NOTE(review): this needs an elasticsearch-py client that accepts `query=`
# as a keyword argument (7.15 or newer) — confirm the installed version.
elasticResults = es.search(index=SEARCH_INDEX_NAME, query=label_query, size=100)

hits = elasticResults["hits"]["hits"]
# Fail with a clear message rather than an IndexError when nothing matched.
assert hits, "Sanity check failed: the search returned no hits"
print(hits[0])

{'_index': 'wikidata_bio_entity_index', '_type': '_doc', '_id': '17064339', '_score': 5.541134, '_source': {'uri': '<https://www.wikidata.org/wiki/Q23627524>', 'label': 'proteic killer protein'}}


  elasticResults=es.search(index=SEARCH_INDEX_NAME, body={
