# First, set up and run MongoDB
**Tutorial:** https://www.mongodb.com/docs/manual/installation/

# Download Papers

In [1]:
# Allow async coroutines to run inside the Jupyter notebook:
# nest_asyncio.apply() patches asyncio so that event loops can be nested.
import nest_asyncio
nest_asyncio.apply()

In [2]:
import pathlib

# Absolute path of the directory used to cache downloaded Scopus papers
# (resolved relative to the current working directory).
CACHING_DIR = pathlib.Path('scopus_cache').resolve()

In [4]:
from TELF.pre_processing.iPenguin.Scopus import Scopus
import os

# Read the Scopus API key from the environment. When the variable is not set,
# tell the user how to provide it and fall back to an empty key.
API_KEY = os.environ.get("SCOPUS_KEY")
if API_KEY is None:
    print("Variable does not exist. Export Scopus API key on your environment using the variable name SCOPUS_KEY.")
    API_KEY = ""
else:
    print("Found SCOPUS_KEY environment variable")

Found SCOPUS_KEY environment variable


In [5]:
# Create the Scopus client used for the searches below.
scopus = Scopus(
    keys=[API_KEY],
    mode='fs',          # file system caching mode (default)
    name=CACHING_DIR,   # where to cache the files
    verbose=True,
)

In [6]:
# search for 'Boian Alexandrov' in all author fields
query = 'AUTH(Boian Alexandrov)'
df, paper_ids = scopus.search(query, n=100)
df.info()

[Scopus API]: Remaining API calls: 9981
              Quota resets at:     2025-03-25 05:25:34

100%|██████████| 41/41 [00:06<00:00,  6.18it/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   eid               41 non-null     object
 1   doi               41 non-null     object
 2   title             41 non-null     object
 3   year              41 non-null     int64 
 4   abstract          41 non-null     object
 5   authors           41 non-null     object
 6   author_ids        41 non-null     object
 7   affiliations      41 non-null     object
 8   funding           29 non-null     object
 9   PACs              27 non-null     object
 10  publication_name  41 non-null     object
 11  subject_areas     41 non-null     object
 12  num_citations     41 non-null     int64 
dtypes: int64(2), object(11)
memory usage: 4.3+ KB


[Scopus]: Finished downloading 41 papers in 11.03s
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


# Penguin Storage

In [7]:
from TELF.applications.Penguin import Penguin
import os

# Connection settings: a MongoDB instance on localhost, no credentials supplied.
DB_NAME = "Penguin"
MONGO_URI = "localhost:27017"
USERNAME = None
PASSWORD = None

# Open the Penguin database handle and report which database it is bound to.
penguin_db = Penguin(
    uri=MONGO_URI,
    db_name=DB_NAME,
    username=USERNAME,
    password=PASSWORD,
    verbose=True,
)
print("Connected to DB:", penguin_db.db_name)

Connected to DB: Penguin


[Penguin]: MongoDB connection successful and database is valid.


### Add Documents

In [8]:
# Add the papers cached under CACHING_DIR to the database, tagged with source "Scopus".
# NOTE(review): overwrite=True presumably replaces documents that are already stored — confirm.
penguin_db.add_many_documents(directory=CACHING_DIR, source="Scopus", overwrite=True)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  41 out of  41 | elapsed:    0.1s finished


In [9]:
# Report how many documents the database holds; the result is a dict keyed by
# source ('scopus', 's2').
# NOTE(review): the recorded run shows 42 Scopus documents although 41 were added
# above — presumably one document was already present from an earlier session; confirm.
counts = penguin_db.count_documents()
print("Number of documents:", counts)

Number of documents: {'scopus': 42, 's2': 0}


In [10]:
# Query the underlying database handle directly for everything in a collection named "Penguin".
# NOTE(review): this returned [] in the recorded run even though count_documents()
# reports stored Scopus documents. "Penguin" is the database name (DB_NAME), so the
# papers presumably live in differently named collections — confirm the intended name.
list(penguin_db.db["Penguin"].find())

[]

# Search

In [17]:
# Text search for "Tensor" with both the Scopus and S2 sources enabled.
search_results = penguin_db.text_search(target="Tensor", scopus=True, s2=True)
# Display the number of matches.
len(search_results)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


13

In [24]:
# Look up documents by identifier; the identifier here is a DOI written with a "doi:" prefix.
ids = ["doi:10.1038/s41598-017-09537-y"]
id_results = penguin_db.id_search(ids=ids)
# Display the number of documents found.
len(id_results)

1

# Tagging

In [29]:
# Attach the tag "Tensors" to one paper, identified by its Scopus EID ("eid:" prefix),
# then display how many documents carry that tag.
paper_id = "eid:2-s2.0-85028463178"
penguin_db.add_tag(document_id=paper_id, tag="Tensors")
len(penguin_db.find_by_tag("Tensors"))

1

In [30]:
# Remove the "Tensors" tag again from the paper tagged in the previous cell
# (relies on paper_id defined there).
penguin_db.remove_tag(document_id=paper_id, tag="Tensors")

# Bloom Filter

In [33]:
# Request a Bloom filter for source "Scopus" and display its summary repr.
# NOTE(review): presumably the filter is populated with the IDs of the stored Scopus documents — confirm.
bloom_filter = penguin_db.get_id_bloom(source="Scopus")
bloom_filter

<Bloom size_in_bits=752 approx_items=41.3>

In [34]:
# Membership test against bloom_filter. A Bloom filter never misses an item it
# contains, but it can occasionally report an item that was never added
# (false positive), so a hit means "probably present".
# NOTE(review): the ID below uses an "s2id:" prefix while bloom_filter was built
# with source="Scopus" — confirm this is the ID intended for the demonstration.
if "s2id:319fd71e72e4b9c76f40429f3fb40aa98a3b0918" in bloom_filter:
    print("Paper exists in DB (checked via Bloom filter)")
else:
    # Report the negative case explicitly so the cell never finishes silently.
    print("Paper not found in DB (checked via Bloom filter)")