# Upload to OpenSearch

In [1]:
from opensearchpy import OpenSearch
from tqdm import tqdm
import pickle
import os
from dotenv import load_dotenv

In [2]:
def opensearch_create_index(database_conn, index_name, os_mapping):
    """Create ``index_name`` with ``os_mapping`` unless it already exists.

    Args:
        database_conn: Client object exposing ``indices.exists`` and
            ``indices.create`` (an ``OpenSearch`` instance in this notebook).
        index_name: Name of the index to create.
        os_mapping: Request body (settings and mappings) for the new index.

    Returns:
        bool: ``True`` when this call created the index, ``False`` when the
        index was already present.

    Raises:
        RuntimeError: If the create request is answered with an error other
            than "index already exists". Without this check a rejected
            mapping would go unnoticed and the index would later be created
            implicitly, without the intended mapping, by the first upload.
    """
    if database_conn.indices.exists(index=index_name):
        return False

    # ``ignore`` makes the client return the error body for HTTP 400/404
    # instead of raising, so the response has to be inspected explicitly.
    response = database_conn.indices.create(
        index=index_name,
        ignore=[400, 404],
        body=os_mapping,
    )

    error = response.get("error") if isinstance(response, dict) else None
    if not error:
        return True

    error_type = error.get("type", "") if isinstance(error, dict) else str(error)
    if "resource_already_exists_exception" in str(error_type):
        # The index appeared between the exists check and the create call.
        return False

    raise RuntimeError(f"Could not create index {index_name!r}: {error}")

In [3]:
def opensearch_connection(index_name):
    """Open an OpenSearch client from environment settings and ensure the index exists.

    Reads ``DB_USERNAME``, ``DB_PASSWORD``, ``DB_HOSTNAME`` and ``DB_PORT``
    from the process environment, builds the client, then calls
    ``opensearch_create_index`` with the mapping returned by
    ``arxiv_index_mapping``.

    Args:
        index_name: Name of the index that must exist on the server.

    Returns:
        The ``OpenSearch`` client that was created.

    Raises:
        ValueError: If ``DB_USERNAME``, ``DB_PASSWORD`` or ``DB_HOSTNAME`` is
            not set, or if ``DB_PORT`` is set but is not an integer.
    """
    required = ("DB_USERNAME", "DB_PASSWORD", "DB_HOSTNAME")
    settings = {name: os.getenv(name) for name in required}
    missing = [name for name in required if settings[name] is None]
    if missing:
        # Fail here with a clear message rather than handing None to the client.
        raise ValueError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    raw_port = os.getenv("DB_PORT")
    if raw_port is None:
        # An unset port is passed through as None, exactly as before.
        port = None
    else:
        try:
            port = int(raw_port)
        except ValueError:
            raise ValueError(
                f"DB_PORT must be an integer, got {raw_port!r}"
            ) from None

    # TLS and certificate checks are switched off for this connection.
    database_conn = OpenSearch(
        hosts=[{"host": settings["DB_HOSTNAME"], "port": port}],
        http_auth=(settings["DB_USERNAME"], settings["DB_PASSWORD"]),
        use_ssl=False,
        verify_certs=False,
        ssl_assert_hostname=False,
        ssl_show_warn=False,
    )

    os_index_mapping = arxiv_index_mapping()

    opensearch_create_index(database_conn, index_name, os_index_mapping)

    return database_conn

In [4]:
def load_articles_vector(index_connection, article_info, index_name):
    """Index every entry of ``article_info`` as its own document.

    Each entry is read positionally: item 0 becomes the document id, item 1
    is stored as ``chunk_vector`` and item 2 is a mapping that supplies the
    remaining fields.

    Args:
        index_connection: Client object exposing ``index(index=, body=, id=)``.
        article_info: Iterable of entries shaped as described above.
        index_name: Name of the index the documents are written to.
    """
    # Fields copied unchanged from the metadata mapping, in storage order.
    copied_fields = (
        "authors",
        "journal",
        "text_chunk_id",
        "publication_year",
        "abstract",
        "chunk",
        "first_author",
    )

    progress = tqdm(article_info, desc="Saving articles to database")
    for entry in progress:
        metadata = entry[2]

        # Square brackets are stripped from the title before it is stored.
        clean_title = metadata["title"].replace("[", "").replace("]", "")

        document = {"title": clean_title, "chunk_vector": entry[1]}
        for field in copied_fields:
            document[field] = metadata[field]

        # One request per entry.
        index_connection.index(index=index_name, body=document, id=entry[0])

In [5]:
def read_articles(filename):
    """Return the object stored in the pickle file ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object the file was pickled from.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(filename, "rb") as pickle_file:
        return pickle.load(pickle_file)

### With Model all-mpnet-base-v2

In [6]:
def arxiv_index_mapping(dimension=768, ef_construction=40, m=8):
    """Build the index-creation body for the article chunk index.

    The body enables k-NN on the index and maps ``chunk_vector`` as a
    ``knn_vector`` field using the HNSW method of the ``nmslib`` engine with
    cosine similarity. Called without arguments it returns the same mapping
    as before.

    Args:
        dimension: Length of the stored vectors. The default of 768 is the
            value used in this notebook section, whose heading names the
            all-mpnet-base-v2 model; pass another value when the vectors
            come from a model with a different output size.
        ef_construction: HNSW ``ef_construction`` parameter.
        m: HNSW ``m`` parameter.

    Returns:
        dict: Body with ``settings`` and ``mappings`` keys, suitable for an
        index-creation request.
    """
    hnsw_method = {
        "engine": "nmslib",
        "name": "hnsw",
        "space_type": "cosinesimil",
        "parameters": {
            "ef_construction": ef_construction,
            "m": m,
        },
    }

    index_settings = {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "knn": True,
    }

    return {
        "settings": {"index": index_settings},
        "mappings": {
            "properties": {
                "chunk_vector": {
                    "type": "knn_vector",
                    "dimension": dimension,
                    "method": hnsw_method,
                },
            }
        },
    }

In [7]:
# Load settings from a .env file into the process environment.
load_dotenv()

# Name of the target index; None when DB_INDEX is not set.
db_index = os.environ.get("DB_INDEX")

# Opens the client and makes sure the index exists.
database_connection = opensearch_connection(db_index)

In [10]:
# Location of the pickled vectors. Set VECTOR_DATA_PATH (for example in the
# .env file loaded above) to run this notebook on another machine; the
# fallback is the path that was originally hard-coded here.
vector_data_path = os.getenv(
    "VECTOR_DATA_PATH",
    "/Users/renke/Documents/23:24ws/NLPT/final_project/QA-System-INLPT-WS2023/data/vector_data.pkl",
)
data = read_articles(vector_data_path)
load_articles_vector(database_connection, data, db_index)

Saving articles to database: 100%|██████████| 106334/106334 [04:19<00:00, 409.88it/s]
