# Solr Baseline

The easiest way to set up and run an ad hoc, single-instance Solr is to use Docker. Make sure Docker is installed, then run the following commands in a terminal:

```
docker pull solr
docker run -d -p 8983:8983 --name solr -t solr
docker exec -it solr /bin/bash
bin/solr create -c cord19_2020_05_19_abstract
```

Let's test it on `abstract` to get a baseline.

## Import necessary libs

In [None]:
import pysolr
import json
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm

## Verify Solr connection

In [None]:
def create_solr_connection(collection_name, solr_connection_url):
    """Build a pysolr client bound to a single Solr collection.

    The client targets ``<solr_connection_url>/solr/<collection_name>`` and is
    created with ``always_commit=True`` and ``timeout=10``.
    """
    collection_url = f"{solr_connection_url}/solr/{collection_name}"
    return pysolr.Solr(collection_url, always_commit=True, timeout=10)


# Connect to the collection created during setup and print the result of
# pysolr's ping() call to confirm the server is reachable.
solr = create_solr_connection("cord19_2020_05_19_abstract", "http://0.0.0.0:8983")
print(solr.ping())

## Load dataset

In [None]:
# Location of the CORD-19 CSV file, relative to the notebook's working directory.
CORD19_PATH = Path('../data/input/trec_cord19_v0.csv')

def load_cord19(input_fpath: Path, dtype: str = 'csv', cols_to_keep: list = ['cord_uid', 'abstract'], index_col = 'cord_uid') -> pd.DataFrame:
    """Load CORD-19 records from disk into a pandas data frame.

    Text columns are stripped of leading/trailing whitespace and every run of
    two or more whitespace characters is collapsed into a single space. Rows
    with a missing value in any kept column are then dropped.

    Args:
        input_fpath: Path of the file to read.
        dtype: Input file format; only ``'csv'`` is supported.
        cols_to_keep: Columns to read from the file (passed as ``usecols``).
        index_col: Column used as the data frame index.

    Returns:
        The cleaned data frame, indexed by ``index_col``.

    Raises:
        ValueError: If ``dtype`` is not ``'csv'``.
    """
    if dtype != 'csv':
        # Previously this fell through to ``return df`` with ``df`` unbound.
        raise ValueError(f"Unsupported dtype {dtype!r}; only 'csv' is supported")

    df = pd.read_csv(input_fpath, quotechar='"', index_col=index_col, usecols=cols_to_keep)
    for col in df.columns:
        # Judge the column by its non-null values: on recent pandas an object
        # column that still contains NaN is not reported as a string column,
        # which would silently skip the clean-up.
        if pd.api.types.is_string_dtype(df[col].dropna()):
            df[col] = df[col].str.strip()  # remove leading and trailing whitespace
            # regex=True is required: newer pandas treats the pattern as a
            # literal string by default, so nothing would be collapsed.
            df[col] = df[col].str.replace(r'\s{2,}', ' ', regex=True)

    return df.dropna()


# Load the data with load_cord19's defaults (cord_uid as index, abstract as
# the only column) and preview the first rows as the cell output.
cord19 = load_cord19(CORD19_PATH)
cord19.head()

In [None]:
# Print a summary of the loaded frame (index, column dtypes, non-null counts).
cord19.info()

In [None]:
# Map each index value (cord_uid) to its abstract text; the trailing len(...)
# shows the number of entries as the cell output.
abstracts_dict = cord19['abstract'].to_dict()
len(abstracts_dict)

## Build Solr Index

In [None]:
def build_solr_index(data_dict: dict, solr: "pysolr.Solr", batch_size: int = 1000):
    """Send every entry of ``data_dict`` to Solr in batches.

    Each entry becomes one document of the form ``{"id": <key>, "text": <value>}``.
    Documents are sent with ``solr.add`` in lists of at most ``batch_size``
    items; a final, smaller batch is sent for whatever remains.

    Args:
        data_dict: Mapping of document id to document text.
        solr: Client object exposing ``add(list_of_documents)``.
        batch_size: Maximum number of documents per ``solr.add`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    solr_payloads = []
    for uid, text in data_dict.items():
        solr_payloads.append({"id": uid, "text": text})
        # Flush inside the loop; checking only after the loop meant nothing
        # was ever sent unless the input had exactly one batch worth of items.
        if len(solr_payloads) >= batch_size:
            solr.add(solr_payloads)
            solr_payloads = []  # rebind so the list just sent is not mutated

    # Send the trailing partial batch (skipped when the input was empty or an
    # exact multiple of batch_size).
    if solr_payloads:
        solr.add(solr_payloads)

# Run the indexing step over the cord_uid -> abstract mapping.
build_solr_index(abstracts_dict, solr)

## Load topics 

In [None]:
def load_queries(input_fpath: Path, dtype: str = 'csv', cols_to_keep=['topic-id', 'query'], index_col=['topic-id']) -> pd.DataFrame:
    """Load the queries (topics) file into a pandas data frame.

    Text columns are stripped of leading/trailing whitespace and every run of
    two or more whitespace characters is collapsed into a single space. Unlike
    ``load_cord19``, rows with missing values are kept.

    Args:
        input_fpath: Path of the file to read.
        dtype: Input file format; only ``'csv'`` is supported.
        cols_to_keep: Columns to read from the file (passed as ``usecols``).
        index_col: Column(s) used as the data frame index.

    Returns:
        The cleaned data frame, indexed by ``index_col``.

    Raises:
        ValueError: If ``dtype`` is not ``'csv'``.
    """
    if dtype != 'csv':
        # Previously this fell through to ``return df`` with ``df`` unbound.
        raise ValueError(f"Unsupported dtype {dtype!r}; only 'csv' is supported")

    df = pd.read_csv(input_fpath, quotechar='"', index_col=index_col, usecols=cols_to_keep)
    for col in df.columns:
        # Judge the column by its non-null values: on recent pandas an object
        # column that still contains NaN is not reported as a string column,
        # which would silently skip the clean-up.
        if pd.api.types.is_string_dtype(df[col].dropna()):
            df[col] = df[col].str.strip()  # remove leading and trailing whitespace
            # regex=True is required: newer pandas treats the pattern as a
            # literal string by default, so nothing would be collapsed.
            df[col] = df[col].str.replace(r'\s{2,}', ' ', regex=True)
    return df

# Topics CSV (presumably the round-3 topic set, judging by the file name —
# confirm), loaded with load_queries' defaults and previewed as the cell output.
QUERY_FPATH = Path('../data/CORD-19/CORD-19/topics-rnd3.csv')
topics = load_queries(QUERY_FPATH)
topics.head()

In [None]:
# Print a summary of the topics frame (index, column dtypes, non-null counts).
topics.info()

In [None]:
# Map each index value (topic-id) to its query text; the trailing len(...)
# shows the number of topics as the cell output.
topics_dict = topics['query'].to_dict()
len(topics_dict)

### Test Query

In [None]:
# Pick one topic and keep its raw query text separate from the labelled,
# human-readable line that is printed.
topic_id = 1
query_text = topics_dict[topic_id]
full_text_query = f"Topic id: {topic_id}, Query: {query_text}"
print(full_text_query)

# Return only the document id and relevance score of the top 10 hits.
solr_query_param = {
    "fl": "id, score",
    "rows": 10
}
# Search with the bare topic text. The labelled string must not be sent to
# Solr: in the standard query syntax "id:" and "Query:" are read as field
# names, which changes the query (or fails on the unknown "Query" field).
results = solr.search(query_text, **solr_query_param).docs
print("results", "\n".join(str(e) for e in results))