# First, set up and run MongoDB
**Tutorial: https://www.mongodb.com/docs/manual/installation/**

# Download Papers

In [None]:
# Jupyter already runs its own asyncio event loop. nest_asyncio.apply() patches
# asyncio so that async co-routines started by later cells can run inside the
# notebook instead of failing because a loop is already running.
import nest_asyncio
nest_asyncio.apply()

In [None]:
import pathlib

# Absolute path of the folder used to cache downloaded Scopus files.
# Resolving it once here means every later cell refers to the same location.
CACHING_DIR = pathlib.Path('scopus_cache').resolve()

In [None]:
from TELF.pre_processing.iPenguin.Scopus import Scopus
import os

# Read the Scopus API key from the environment with a single lookup.
# A missing variable is reported and replaced by an empty key so that the
# notebook keeps running; a variable that is set (even to "") is used as-is.
API_KEY = os.environ.get("SCOPUS_KEY")
if API_KEY is None:
    print("Variable does not exist. Export Scopus API key on your environment using the variable name SCOPUS_KEY.")
    API_KEY = ""
else:
    print("Found SCOPUS_KEY environment variable")

In [None]:
# Build the Scopus client used by the search cell below.
# The API key is passed as a one-element list and results are cached on disk.
scopus = Scopus(
    keys=[API_KEY],
    mode='fs',         # file system caching mode (default)
    name=CACHING_DIR,  # where to cache the files
    verbose=True,
)

In [None]:
# Search for 'Boian Alexandrov' in all author fields (Scopus AUTH(...) query).
query = 'AUTH(Boian Alexandrov)'
# The call returns a pair, unpacked here as (df, paper_ids).
# NOTE(review): n=100 presumably caps the number of papers retrieved — confirm
# against the Scopus client documentation.
df, paper_ids = scopus.search(query, n=100)
# Print a summary of the returned table (df is presumably a pandas DataFrame — confirm).
df.info()

# Penguin Storage

In [None]:
from TELF.applications.Penguin import Penguin
import os

# Connection settings: a MongoDB server on this machine, no credentials.
MONGO_URI = "localhost:27017"
DB_NAME = "Penguin"
USERNAME = None
PASSWORD = None

# Open the Penguin handle on that database and confirm which DB is in use.
penguin_db = Penguin(
    uri=MONGO_URI,
    db_name=DB_NAME,
    username=USERNAME,
    password=PASSWORD,
    verbose=True,
)
print("Connected to DB:", penguin_db.db_name)

### Add Documents

In [None]:
# Load the files cached under CACHING_DIR (written by the Scopus download above)
# into the database, labelled with source "Scopus".
# NOTE(review): overwrite=True presumably replaces documents that are already
# stored — confirm against the Penguin API documentation.
penguin_db.add_many_documents(directory=CACHING_DIR, source="Scopus", overwrite=True)

In [None]:
# Ask the database how many documents it holds and print the result as returned.
counts = penguin_db.count_documents()
print("Number of documents:", counts)

In [None]:
# Display every document found in the "Penguin" collection of the underlying
# database handle (penguin_db.db is presumably a pymongo Database — confirm).
# NOTE(review): find() without a filter or limit materialises the whole
# collection in memory; acceptable for the ~100 papers fetched here, but add a
# limit before running this against a large database.
# NOTE(review): the collection name equals DB_NAME; verify that documents are
# really stored in a collection called "Penguin", otherwise this shows [].
list(penguin_db.db["Penguin"].find())

# Search

In [None]:
# Text search for the term "Tensor".
# NOTE(review): the scopus/s2 flags presumably select which sources are
# searched; only Scopus documents were added above — confirm s2=True is wanted.
search_results = penguin_db.text_search(target="Tensor", scopus=True, s2=True)
# Last expression of the cell: the notebook displays the number of hits.
len(search_results)

In [None]:
# Look documents up by identifier; the ID carries a "doi:" prefix naming its type.
ids = ["doi:10.1038/s41598-017-09537-y"]
id_results = penguin_db.id_search(ids=ids)
# Last expression of the cell: the notebook displays the number of matches.
len(id_results)

# Tagging

In [None]:
# Attach the tag "Tensors" to one paper, addressed by its "eid:"-prefixed ID.
paper_id = "eid:2-s2.0-85028463178"
penguin_db.add_tag(document_id=paper_id, tag="Tensors")
# Display how many documents carry the tag now.
len(penguin_db.find_by_tag("Tensors"))

In [None]:
# Remove the "Tensors" tag from the paper again. This relies on paper_id set in
# the tagging cell above, so that cell must have been run first.
penguin_db.remove_tag(document_id=paper_id, tag="Tensors")

# Bloom Filter

In [None]:
# Fetch the Bloom filter over document IDs for source "Scopus".
bloom_filter = penguin_db.get_id_bloom(source="Scopus")
# Last expression of the cell: the notebook displays the filter object.
bloom_filter

In [None]:
# Membership test against the filter built in the previous cell; nothing is
# printed when the ID is not found.
# NOTE(review): the ID tested has an "s2id:" prefix while the filter was built
# with source="Scopus" and the Scopus paper above uses an "eid:" prefix.
# Confirm this is intended; an "eid:..." ID or a filter for the S2 source may
# be what was meant.
# NOTE(review): assuming a standard Bloom filter, a hit can be a false positive.
if "s2id:319fd71e72e4b9c76f40429f3fb40aa98a3b0918" in bloom_filter:
    print("Paper exists in DB (checked via Bloom filter)")