In [1]:
#!pip install elasticsearch==8.4.3

Collecting elasticsearch==8.4.3
  Downloading elasticsearch-8.4.3-py3-none-any.whl (384 kB)
Installing collected packages: elasticsearch
  Attempting uninstall: elasticsearch
    Found existing installation: elasticsearch 8.5.0
    Uninstalling elasticsearch-8.5.0:
      Successfully uninstalled elasticsearch-8.5.0
Successfully installed elasticsearch-8.4.3


In [1]:
import elasticsearch

In [2]:
import os
import re

In [3]:
es = elasticsearch.Elasticsearch("http://localhost:9200")

In [4]:
es.info()

ObjectApiResponse({'name': 'DESKTOP-3PJFB14', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'HNdRkjgTQOu3TndAoHzH_w', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### 3. Define an ES analyzer for Polish texts

In [5]:
SYNONYMS = ["kpk,kodeks postępowania karnego",
            "kpc,kodeks postępowania cywilnego",
            "kk,kodeks karny",
            "kc,kodeks cywilny"]

ANALYZER_NAME = 'bills_analyzer'
INDEX_NAME = 'bills_index'

In [6]:
analyzer = {
    f"{ANALYZER_NAME}": {
        'type': 'custom',
        'tokenizer': 'standard',
        'filter':[
            'lowercase',
            'synonym',
            'morfologik_stem'
        ]
    }
}

synonym = {
    'synonym': {
        'type':'synonym',
        'synonyms': SYNONYMS
    }
}

mapping = {
    "properties": {
      "text": { 
        "type": "text",
        "analyzer": f"{ANALYZER_NAME}"
      }
    }
}

### 4. Define an ES index

In [7]:
indx = es.indices.create(index = INDEX_NAME,
                        mappings = mapping,
                        settings = {
                            'analysis' : {
                                'analyzer' : analyzer,
                                'filter': synonym
                            }
                        })

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [bills_index/PqatbuxbQD-VdUk3cvLC9w] already exists')

### 5. Load the data to the ES index.

In [8]:
directory = 'ustawy'

def load_bills(directory):
    bills = {}
    for file_name in os.listdir(directory):
        path = os.path.join(directory, file_name)
        if os.path.isfile(path):
            with open(path, encoding='utf-8') as f:
                content = f.read()
                content = content.replace("\xa0", " ").replace("\xad", "")
                content = re.sub(r"\s+", " ", content)
                bills[file_name] = content
    return bills

In [9]:
bills = load_bills(directory)

In [10]:
for name, content in bills.items():
    es.index(index=INDEX_NAME,
    id=name,
    document={
        "text": content
    })

In [11]:
es.count(index=INDEX_NAME)

ObjectApiResponse({'count': 1178, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

### 6. Determine the number of legislative acts containing the word ustawa (in any form).

In [12]:
es.count(index=INDEX_NAME, 
         query={'match': 
                    {'text': 
                         {'query': "ustawa"}
                    }
               })

ObjectApiResponse({'count': 1178, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

### 7. Determine the number of occurrences of the word ustawa by searching for this particular form, including the other inflectional forms.


In [13]:
es.termvectors(
    index=INDEX_NAME,
    id="2000_5.txt",
    fields=["text"],
    filter_path=["term_vectors.text.terms.ustawa.ttf"],
    term_statistics=True
)
# total term frequency

ObjectApiResponse({'term_vectors': {'text': {'terms': {'ustawa': {'ttf': 24937}}}}})

### 8. Determine the number of occurrences of the word ustaw by searching for this particular form, including the other inflectional forms.


In [18]:
ustaw_occur = es.indices.analyze(index=INDEX_NAME, body={
    "analyzer": f'{ANALYZER_NAME}',
    "text": "ustaw"
})['tokens']

  ustaw_occur = es.indices.analyze(index=INDEX_NAME, body={


In [34]:
ustaw_forms = [token['token'] for token in ustaw_occur]

In [35]:
ustaw_forms

['ustawa', 'ustawić']

In [44]:
count_forms = 0
for form in ustaw_forms:
    res = es.termvectors(
            index=INDEX_NAME,
            id="1993_599.txt",
            fields=["text"],
            filter_path=[f"term_vectors.text.terms.{form}.ttf"],
            term_statistics=True
            )['term_vectors']['text']['terms'][f'{form}']['ttf']
    count_forms += res
print(count_forms)

25850


In [39]:
es.termvectors(
    index=INDEX_NAME,
    id="1993_599.txt",
    fields=["text"],
    filter_path=["term_vectors.text.terms.ustawić.ttf"],
    term_statistics=True
)['term_vectors']['text']['terms'][form]['ttf']

913

### 9. Determine the number of legislative acts containing the words kodeks postępowania cywilnego in the specified order, but in any inflection form.

In [57]:
es.count(index=INDEX_NAME, 
         query={'match_phrase': {'text': "kodeks postępowania cywilnego"}})

ObjectApiResponse({'count': 99, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

### 10. Determine the number of legislative acts containing the words wchodzi w życie (in any form) allowing for up to 2 additional words in the searched phrase.

In [58]:
es.count(index=INDEX_NAME, 
         query={'match_phrase': {'text': {'query': "wchodzi w życie", 'slop': 2}}})

ObjectApiResponse({'count': 1174, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

### 11. Determine the 10 documents that are the most relevant for the phrase konstytucja.

In [85]:
konstytucja = es.search(
    index=INDEX_NAME,
    query={'match': {'text': "konstytucja"}},
    filter_path=["hits.hits._id", "hits.hits._score"],
    size=10
)

In [93]:
konstytucja['hits']

{'hits': [{'_id': '1997_629.txt', '_score': 6.8676424},
  {'_id': '2000_443.txt', '_score': 6.6627803},
  {'_id': '1997_604.txt', '_score': 6.632101},
  {'_id': '1996_350.txt', '_score': 6.6268387},
  {'_id': '1997_642.txt', '_score': 6.2516546},
  {'_id': '2001_23.txt', '_score': 6.058013},
  {'_id': '1996_199.txt', '_score': 5.928105},
  {'_id': '1999_688.txt', '_score': 5.8497677},
  {'_id': '1997_681.txt', '_score': 5.466618},
  {'_id': '2001_1082.txt', '_score': 5.466618}]}

### 12. Print the excerpts containing the word konstytucja (up to three excerpts per document) from the previous task. 

In [94]:
es.search(
    index=INDEX_NAME,
    query={'match': {'text': "konstytucja"}},
    highlight={'fields': {'text': {'number_of_fragments': 3}}},
    filter_path=["hits.hits._id", "hits.hits.highlight"],
    size=10
)['hits']["hits"]

[{'_id': '1997_629.txt',
  'highlight': {'text': ['Inicjatywa ustawodawcza w zakresie przedstawienia Zgromadzeniu Narodowemu projektu nowej <em>Konstytucji</em>',
    'Do zgłoszenia projektu <em>Konstytucji</em> załącza się wykaz obywateli popierających zgłoszenie, zawierający',
    'Zasady, na których opierać się ma <em>Konstytucja</em> mogą być poddane pod referendum. 2.']}},
 {'_id': '2000_443.txt',
  'highlight': {'text': ['umowy międzynarodowej lub załącznika nie wypełnia przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>',
    'okoliczności, a umowa międzynarodowa nie wypełnia przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>',
    'Polskiej do ratyfikacji jest dokonywane po uzyskaniu zgody, o której mowa w art. 89 ust. 1 i art. 90 <em>Konstytucji</em>']}},
 {'_id': '1997_604.txt',
  'highlight': {'text': ['W razie powstania wątpliwości co do zgodności z <em>Konstytucją</em> celów lub zasad działania partii politycznej',
    'Jeżeli Tr