In [27]:
import traceback
import elasticsearch
import time
from elasticsearch import Elasticsearch, helpers
import nltk
from nltk.corpus import stopwords
import re
import warnings
warnings.filterwarnings("ignore")

es = Elasticsearch()
es.info()

{'name': 'LAPTOP-7MB0IEVT',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'oNpBH66-Rq-zeazU5rk3ew',
 'version': {'number': '7.17.6',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'f65e9d338dc1d07b642e14a27f338990148ee5b6',
  'build_date': '2022-08-23T11:08:48.893373482Z',
  'build_snapshot': False,
  'lucene_version': '8.11.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [28]:
FIELDS = ['abstract', 'instance']
INDEX_NAME = 'dbpedia1'
INDEX_SETTINGS = {
'settings': {
        'index': {
            'number_of_shards': 1,
            'number_of_replicas': 1,
            'similarity': {
                'default': {
                    'type': 'BM25'
                }
            }
        }
    },
    'mappings': {
            'properties': {
                'abstract': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'instance': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                }
            }
        }
    }

In [29]:
def generate_esindex():
    if es.indices.exists(INDEX_NAME):
        es.indices.delete(index=INDEX_NAME)    
    es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

generate_esindex()

In [30]:
def read_ttlfile(filename, size, enc='utf-8'):

    if size <= 0:
        print("size is larger")
        return

    with open(filename, encoding=enc) as f:
        for i,line in enumerate(f):
            if (size >= 0) and (i >= size+1):
                break
            if i == 0: 
                continue
            print(line.strip())

In [31]:
read_ttlfile("data/instance_types_en.ttl", 2)

<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .
<http://dbpedia.org/resource/Achilles> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .


In [32]:
read_ttlfile("data/short_abstracts_en.ttl",2)

<http://dbpedia.org/resource/Animalia_(book)> <http://www.w3.org/2000/01/rdf-schema#comment> "Animalia is an illustrated children's book by Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold. A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket."@en .
<http://dbpedia.org/resource/Actrius> <http://www.w3.org/2000/01/rdf-schema#comment> "Actresses (Catalan: Actrius) is a 1997 Catalan language Spanish drama film produced and directed by Ventura Pons and based on the award-winning stage play E.R. by Josep Maria Benet i Jornet. The film has no male actors, with all roles played by females. The film was produced in 1996."@en .


In [33]:
stop_words = stopwords.words('english')
typesof_question=  ['what', 'why', 'where','who', 'which','when', 'whom', 'whose']
stop_words = [word for word in stop_words if word not in typesof_question]

In [34]:
def text_preprocessing(text):
    text = re.sub(' +', ' ', text)
    stop_words = stopwords.words('english')
    word_list = [word for word in text.split() if word not in stop_words]
    text = " ".join(word_list)
    return text


def abstracts_preprocessing(text):
    page_list = re.findall('<.*?>', text)

    try:
        abstract = re.findall('\".*?\"', text)[0]
        abstract = text_preprocessing(abstract)
        if len(abstract)>2:
            abstract=abstract[1:-1]
        entity= page_list[0].split('/')[-1] 
        entity= entity[:-1].replace('_', ' ')
    except:
        abstract = ''
        entity = ''

    return entity, abstract
   

def entity_type_preprocessing(text):
    page_list = re.findall('<.*?>', text)

    try: 
        entity = page_list[0].split('/')[-1][:-1].replace('_', ' ')
        entity_type = page_list[-1].split('/')[-1][:-1].replace('owl#', '').replace('_', ' ')
        if entity_type=='Thing':
            entity_type='owl:Thing'
        else:
            entity_type='dbo:'+entity_type
    except:
        entity = ''
        entity_type = ''

    return entity, entity_type


def categories_preprocessing(text):
    page_list = re.findall('<.*?>', text) 
    entity = page_list[0].split('/')[-1][:-1]
    category = page_list[-1].split('/')[-1][:-1]
    category = category.replace('Category:','').replace('_', '')
    return entity.replace('_', ' '), category
    

In [35]:
size = 1500

def parse_abstracts(data, return_keys=False):   

    with open(file="data\short_abstracts_en.ttl", encoding='utf-8') as file:
        for i,line in enumerate(file):
            if (size >= 0) and (i >= size+1):
                break
            if i == 0: 
                continue
            entity, abstract = abstracts_preprocessing(line)

            if len(abstract)>0 and len(entity)>0:
                data.update({
                        entity:{ 
                        "_id": entity,
                            "abstract": abstract,
                            "instance":''
                        }
                        })
    if return_keys:                    
        return list(data.keys())

def parse_entity_type(data):
    entity_list = []
    with open(file="data\instance_types_en.ttl", encoding='utf-8') as f:
        for i,line in enumerate(f):
            if (size >= 0) and (i >= size+1):
                break
            if i == 0: 
                continue
            entity, entity_type = entity_type_preprocessing(line)

            if len(entity_type)>0 and len(entity)>0:
                entity_list.append(entity)
                try:
                    previous_entity = data[entity]['instance']
                    data[entity]['instance']=entity_type 
                except:
                    pass
            


In [36]:
data = {}

print("parseabstracts\n:",parse_abstracts(data))
print("--")
print("parseentity\n:", parse_entity_type(data))
print("---")

parseabstracts
: None
--
parseentity
: None
---


In [37]:
print("data")
data

data


{'Animalia (book)': {'_id': 'Animalia (book)',
  'abstract': "Animalia illustrated children's book Graeme Base. It originally published 1986, followed tenth anniversary edition 1996, 25th anniversary edition 2012. Over three million copies sold. A special numbered signed anniversary edition also published 1996, embossed gold jacket.",
  'instance': 'dbo:Book'},
 'Actrius': {'_id': 'Actrius',
  'abstract': 'Actresses (Catalan: Actrius) 1997 Catalan language Spanish drama film produced directed Ventura Pons based award-winning stage play E.R. Josep Maria Benet Jornet. The film male actors, roles played females. The film produced 1996.',
  'instance': 'dbo:Film'},
 'Alain Connes': {'_id': 'Alain Connes',
  'abstract': 'Alain Connes (French: [alɛ̃ kɔn]; born 1 April 1947) French mathematician, currently Professor Collège de France, IHÉS, The Ohio State University Vanderbilt University. He Invited Professor Conservatoire national des arts et métiers (2000).',
  'instance': 'dbo:Scientist'},

In [38]:
#removing no entity types
def entity_typeremoval(data):

    print("data before remove: ", len(data))
    items = []
    for k,v in data.items():
        if len(data[k]["instance"])==0:
            items.append(k)
    for i in items:
        data.pop(i)
    print("data after remove: ", len(data))

In [39]:
entity_typeremoval(data)

data before remove:  1500
data after remove:  943


In [40]:
list(data.values())

[{'_id': 'Animalia (book)',
  'abstract': "Animalia illustrated children's book Graeme Base. It originally published 1986, followed tenth anniversary edition 1996, 25th anniversary edition 2012. Over three million copies sold. A special numbered signed anniversary edition also published 1996, embossed gold jacket.",
  'instance': 'dbo:Book'},
 {'_id': 'Actrius',
  'abstract': 'Actresses (Catalan: Actrius) 1997 Catalan language Spanish drama film produced directed Ventura Pons based award-winning stage play E.R. Josep Maria Benet Jornet. The film male actors, roles played females. The film produced 1996.',
  'instance': 'dbo:Film'},
 {'_id': 'Alain Connes',
  'abstract': 'Alain Connes (French: [alɛ̃ kɔn]; born 1 April 1947) French mathematician, currently Professor Collège de France, IHÉS, The Ohio State University Vanderbilt University. He Invited Professor Conservatoire national des arts et métiers (2000).',
  'instance': 'dbo:Scientist'},
 {'_id': 'An American in Paris',
  'abstract'

In [41]:
batch_size = 100
doc = list(data.values())
for i in range(0, len(data), batch_size):
    actions = [{
            "_index": INDEX_NAME,
            "_id": x["_id"],
            "_source": {
                "abstract": x["abstract"],
                "instance": x["instance"]
            }
        } for x in doc[i:i+batch_size]]
    helpers.bulk(es, actions, index=INDEX_NAME, raise_on_error=False, raise_on_exception=False)


In [42]:
search_param={"match": {"instance": "owl:Thing"}}

In [43]:
response = es.search(index=INDEX_NAME, query=search_param)
print('matching files are\n', response['hits']['total']['value'])

matching files are
 358


In [44]:
response['hits']['hits'][0]['_source']['instance']

'owl:Thing'