In [1]:
import os
from tqdm import tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Problem 1

### Create Index

In [5]:
def create_index(index_name, stopwords):
    """Create an Elasticsearch index with a custom analyzer on its ``text`` field.

    The request is sent through the module-level ``es`` client. The index
    defines an analyzer named ``stopped`` (standard tokenizer, then lowercase,
    stop-word removal and snowball English stemming) and maps the ``text``
    field to it.

    Args:
        index_name: Name of the index to create.
        stopwords: Value passed as the ``stopwords`` setting of the
            ``english_stop`` token filter.

    Returns:
        None. The response from Elasticsearch is discarded.
    """
    token_filters = {
        "english_stop": {"type": "stop", "stopwords": stopwords},
        "stemmer": {"type": "snowball", "name": "english"},
    }
    stopped_analyzer = {
        "type": "custom",
        "tokenizer": "standard",
        # Filters are applied in list order.
        "filter": ["lowercase", "english_stop", "stemmer"],
    }
    index_settings = {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "max_result_window": 20000,
        "analysis": {
            "filter": token_filters,
            "analyzer": {"stopped": stopped_analyzer},
        },
    }
    text_field = {
        "type": "text",
        "fielddata": True,
        "analyzer": "stopped",
        "index_options": "positions",
        "term_vector": "yes",
    }
    index_body = {
        "settings": index_settings,
        "mappings": {"properties": {"text": text_field}},
    }

    # ignore=400 tells the client not to raise on an HTTP 400 response
    # (presumably so that re-running the cell on an existing index is harmless).
    es.indices.create(index=index_name, ignore=400, body=index_body)

### Index Documents - DUC 2001 Dataset

In [3]:
def _extract_tag(docs, tag):
    """Return the stripped text between ``<tag>`` and the next ``</tag>``.

    Returns None when the opening tag, or a closing tag after it, is missing.
    """
    opening, closing = f'<{tag}>', f'</{tag}>'
    start = docs.find(opening)
    if start == -1:
        return None
    start += len(opening)
    # Search for the closing tag only after the opening one.
    end = docs.find(closing, start)
    if end == -1:
        return None
    return docs[start:end].strip()


def parse_duc(docs, summary_dict):
    """Yield the bulk-index action for one DUC document.

    Args:
        docs: Raw contents of a DUC file; the first ``<DOCNO>`` and ``<TEXT>``
            elements are used.
        summary_dict: Mapping from document number to its gold summary.

    Yields:
        A single action dict targeting the ``duc-2001`` index, with the
        document number as ``_id``. ``text`` is ``''`` when no complete
        ``<TEXT>`` element is found, and ``gold_summary`` is ``''`` when the
        document number has no entry in ``summary_dict``. Nothing is yielded
        when the document has no usable ``<DOCNO>``, because there would be
        no meaningful id to index it under.
    """
    docno = _extract_tag(docs, 'DOCNO')
    if not docno:
        # str.find returns -1 for a missing tag; slicing with that would
        # produce a garbage id, so skip the document instead.
        return

    text = _extract_tag(docs, 'TEXT') or ''

    yield {
        '_index': 'duc-2001',
        '_id': docno,
        'text': text,
        'gold_summary': summary_dict.get(docno, '')
    }

def _iter_duc_actions(filepath, summary_dict):
    """Yield bulk actions for every document file directly under ``filepath``."""
    for file in tqdm(os.listdir(filepath), position=0, desc='index duc dataset'):
        # Only entries whose name contains '-' are treated as document files.
        if '-' not in file:
            continue
        with open(os.path.join(filepath, file), 'r') as f:
            docs = f.read()
        yield from parse_duc(docs, summary_dict)


def index_duc(filepath):
    """Index the DUC documents under ``filepath`` together with their summaries.

    Summaries are read from ``<filepath>/Summaries``; each file's name without
    its extension, upper-cased, is used as the document number it belongs to.
    All documents are then streamed to Elasticsearch through the module-level
    ``es`` client in a single ``bulk`` call, so the helper can batch them
    instead of sending one request per file.

    Args:
        filepath: Directory containing the document files and a ``Summaries``
            sub-directory.

    Returns:
        None.
    """
    print('indexing DUC documents')

    summaries_dir = os.path.join(filepath, 'Summaries')
    summary_dict = {}
    for filename in os.listdir(summaries_dir):
        # splitext tolerates names with no dot or several dots, where
        # unpacking filename.split('.') into two values would raise.
        summary_docno = os.path.splitext(filename)[0]
        with open(os.path.join(summaries_dir, filename)) as f:
            summary_dict[summary_docno.upper()] = f.read()

    bulk(es, _iter_duc_actions(filepath, summary_dict))

### Index Documents - 20NG Dataset

In [8]:
def parse_20ng(filename, category, text):
    """Yield the bulk-index action for a single 20NG file.

    Args:
        filename: Name of the source file; used as the document ``_id``.
        category: Stored in the document's ``category`` field.
        text: Stored in the document's ``text`` field.

    Yields:
        One action dict targeting the ``20-ng`` index.
    """
    # NOTE(review): the id is the bare filename, so two files with the same
    # name in different folders would share an id -- confirm names are unique.
    action = {'_index': '20-ng', '_id': filename}
    action['category'] = category
    action['text'] = text
    yield action

def _iter_20ng_actions(filepath):
    """Yield bulk actions for every file under ``<filepath>/<folder>/<category>``."""
    # Entries with a '.' in their name are skipped at the top level.
    valid_folders = [folder for folder in os.listdir(filepath) if '.' not in folder]
    for folder in valid_folders:
        folder_path = os.path.join(filepath, folder)
        categories = os.listdir(folder_path)
        for category in tqdm(categories, position=0, desc=f"inspecting all categories from folder '{folder}'"):
            category_path = os.path.join(folder_path, category)
            for filename in os.listdir(category_path):
                with open(os.path.join(category_path, filename), "r", encoding="ISO-8859-1") as f:
                    text = f.read()
                yield from parse_20ng(filename, category, text)


def index_20ng(filepath):
    """Index every 20NG file found under ``filepath``.

    The directory is expected to be laid out as
    ``<filepath>/<folder>/<category>/<file>``. Files are read as ISO-8859-1
    and streamed to Elasticsearch through the module-level ``es`` client in a
    single ``bulk`` call, so the helper can batch documents instead of issuing
    one request per file.

    Args:
        filepath: Root directory of the dataset.

    Returns:
        None.
    """
    bulk(es, _iter_20ng_actions(filepath))

In [3]:
# The cluster URL (including any embedded credentials) is read from the ES_URL
# environment variable so that secrets are never stored in the notebook.
# NOTE(review): a password was previously committed in this cell -- rotate it.
es = Elasticsearch([os.environ['ES_URL']])

# The context manager closes the stop-list file once it has been read.
with open('./stoplist.txt') as stoplist_file:
    stopwords = [line.strip() for line in stoplist_file]

In [6]:
# Create the DUC index first, then load the documents from ./DUC2001 into it.
create_index(index_name='duc-2001', stopwords=stopwords)
index_duc(filepath='DUC2001')

indexing DUC documents


HBox(children=(FloatProgress(value=0.0, max=313.0), HTML(value='')))




In [9]:
# Create the 20NG index first, then load the documents from ./20NG into it.
create_index(index_name='20-ng', stopwords=stopwords)
index_20ng(filepath='20NG')

inspecting all categories from folder '20news-bydate-test': 100%|██████████| 20/20 [04:32<00:00, 13.64s/it]
inspecting all categories from folder '20news-bydate-train': 100%|██████████| 20/20 [06:59<00:00, 20.95s/it]


In [2]:
# NOTE(review): scratch expression, not used by the indexing code above --
# looks like a leftover experiment; confirm it can be removed.
' '.join(['one', 'two'])

'one two'