# Set up the core DenseIndex for classical keyword search

In [None]:
import xml.etree.ElementTree as ET
import re
import pysolr
import requests
from IPython.display import display,HTML

Create a Solr client instance. We configure neither a timeout nor authentication.
Note: always_commit=True makes pysolr commit every write to the core immediately. The parameter is
False by default. Committing on every write is convenient, but comes at the expense of performance.

In [None]:
# Client bound to the DenseIndex core. always_commit=True is passed so that
# writes made through this client are committed right away.
solr = pysolr.Solr(
    "http://localhost:8983/solr/DenseIndex",
    always_commit=True,
)

A variable that holds the path to the original XML file

In [None]:
# Absolute location of the FIFA source XML file.
file_path = "/home/bfh/irsed/daten/FIFA/fifa.xml"

Define a function for some basic text cleaning

In [None]:
def clean_the_text(text):
    """Return *text* with line breaks and bracketed segments removed.

    Line breaks are deleted (not replaced by a space) first, then every
    ``[...]`` span is stripped using a non-greedy match, so a bracketed
    segment that was split across lines is removed as well.

    Args:
        text: The raw string to clean, or ``None`` (for example the
            ``.text`` of an empty XML element).

    Returns:
        The cleaned string; ``""`` when *text* is ``None``.
    """
    # ElementTree reports the text of an empty element as None; treat that
    # as empty content instead of failing on None.replace.
    if text is None:
        return ""
    text = text.replace("\n", "")
    return re.sub(r"\[.*?\]", "", text)

Define a function to add documents to Solr 

Note: For practical reasons, we do not use embeddings here. If necessary, embeddings could also be inserted here right away.

In [None]:
def add_to_solr(year, title, description):
    """Index a single document in Solr and commit it.

    Uses the module-level ``solr`` client.

    Args:
        year: Value stored in the document's ``year`` field.
        title: Value stored in the document's ``title`` field.
        description: Value stored in the document's ``description`` field.
    """
    document = {
        "year": year,
        "title": title,
        "description": description,
    }
    # The commit is requested on the add call itself, so no separate
    # solr.commit() round trip is needed after it.
    solr.add([document], commit=True)

Parse the xml file and display some basic information

In [None]:
# Load the XML document and take its top-level element.
tree = ET.parse(file_path)
root = tree.getroot()

# Summarise the root element: tag name, attribute dict, and the number of
# direct child elements.
root_tag, root_attrib, num_children = root.tag, root.attrib, len(root)

# Bare expression so the notebook echoes the summary tuple as cell output.
(root_tag, root_attrib, num_children)

Extract all fields named 'year', 'title', and 'description', and index them in Solr

In [None]:
# Walk every <field> element in document order. A record is sent to Solr as
# soon as its 'description' field is seen, so this relies on 'year' and
# 'title' preceding 'description' within each record -- TODO confirm against
# the XML layout.
year = None
title_content = None
for field in root.iter('field'):
    field_name = field.attrib.get('name')
    if field_name == 'year':
        year = field.text
    elif field_name == 'title':
        title_content = field.text
    elif field_name == 'description':
        # An empty element has text None; index it as an empty description
        # rather than passing None to the cleaner.
        description_content = field.text or ""
        add_to_solr(year, title_content, clean_the_text(description_content))
        # Reset so that a record lacking 'year' or 'title' does not inherit
        # the values of the previous record.
        year = None
        title_content = None

# Query the Index

In [None]:
# Base URL of the Solr server and the name of the collection to query; the
# two are concatenated with "/select" when the request is sent.
solr_url = "http://localhost:8983/solr/"
collection = 'DenseIndex'

# The double quotes are part of the query string itself (a quoted phrase).
query = '"toxic atmosphere"'

Execute the query

In [None]:
# JSON request body that is POSTed to the collection's /select handler.
string = {
    "query": query,
    "params": {
        # Exactly one "df" entry: a dict literal keeps only the last value
        # of a repeated key, and "description" is the value that was in
        # effect, so the search behaviour is unchanged.
        "df": "description",
        "defType": "edismax",
        "indent": "true",
        "sort": "score desc",
    },
}

Display the result

In [None]:
# POST the JSON request body to the collection's /select handler.
response = requests.post(solr_url + collection + "/select", json=string)
# Surface an HTTP error status directly instead of failing later with a
# KeyError when the error payload has no "response" entry.
response.raise_for_status()
docs = response.json()["response"]["docs"]

# Render each hit: the title as a heading, followed by the description.
for doc in docs:
    display(HTML("<h3>"+doc['title']+"</h3>"))
    display(HTML(doc['description']))
    print("")

Note: The description, which originally comes from Wikipedia, explicitly mentions a "toxic atmosphere".