In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%pip install -e ../backend

In [3]:
import json
import pandas as pd
import os
import re
import string


DATA_PATH = "arxiv/arxiv-metadata-oai-snapshot.json"
YEAR_CUTOFF = 2012
YEAR_PATTERN = r"(19|20[0-9]{2})"
ML_CATEGORY = "cs.LG"

In [43]:
def process(paper: dict):
    paper = json.loads(paper)
    if paper['journal-ref']:
        years = [int(year) for year in re.findall(YEAR_PATTERN, paper['journal-ref'])]
        years = [year for year in years if (year <= 2022 and year >= 1991)]
        year = min(years) if years else None
    else:
        year = None
    return {
        'id': paper['id'],
        'title': paper['title'],
        'year': year,
        'authors': paper['authors'],
        'categories': ','.join(paper['categories'].split(' ')),
        'abstract': paper['abstract']
    }

def papers():
    with open(DATA_PATH, 'r') as f:
        for paper in f:
            paper = process(paper)
            if paper['year']:
                if paper['year'] >= YEAR_CUTOFF and ML_CATEGORY in paper['categories']:
                    yield paper

In [44]:
df = pd.DataFrame(papers())
len(df)

11361

In [45]:
# Avg length of the abstracts
df.abstract.apply(lambda a: len(a.split())).mean()

169.8317049555497

In [46]:
def clean_description(description: str):
    if not description:
        return ""
    # remove unicode characters
    description = description.encode('ascii', 'ignore').decode()

    # remove punctuation
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # clean up the spacing
    description = re.sub('\s{2,}', " ", description)

    # remove urls
    #description = re.sub("https*\S+", " ", description)

    # remove newlines
    description = description.replace("\n", " ")

    # remove all numbers
    #description = re.sub('\w*\d+\w*', '', description)

    # split on capitalized words
    description = " ".join(re.split('(?=[A-Z])', description))

    # clean up the spacing again
    description = re.sub('\s{2,}', " ", description)

    # make all words lowercase
    description = description.lower()

    return description

In [12]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [13]:
# Create embeddings from the title and abstract
emb = model.encode(df.apply(lambda r: clean_description(r['title'] + ' ' + r['abstract']), axis=1).tolist())

In [14]:
# Add embeddings to df
df = df.reset_index().drop('index', axis=1)
df['vector'] = emb.tolist()

In [15]:
import pickle

# Export to file!
with open('arxiv_embeddings_10000.pkl', 'wb') as f:
    data = pickle.dumps(df)
    f.write(data)

In [85]:
%pip install -q redis-om

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [5]:
import getpass

REDIS_PASSWORD = getpass.getpass('REDIS_PASSWORD=')
len(REDIS_PASSWORD)

REDIS_PASSWORD= ········


8

In [6]:
import os

In [158]:
os.environ['REDIS_HOSTPORT'] = 'redis-14540.c21985.us-central1-1.gcp.cloud.rlrcp.com:14540'
os.environ['REDIS_USERNAME'] = 'machine-untitled-dev'
os.environ['REDIS_PASSWORD'] = REDIS_PASSWORD

os.environ['REDIS_OM_URL'] = f'redis://{REDIS_USERNAME}:{REDIS_PASSWORD}@{REDIS_HOSTPORT}'

os.environ['DATA_LOCATION'] = '.'

In [159]:
from vecsim_app.load_data import load_all_data

In [160]:
import redis.asyncio as redis

REDIS_HOSTPORT = os.environ['REDIS_HOSTPORT']
REDIS_USERNAME = os.environ['REDIS_USERNAME']
REDIS_PASSWORD = os.environ['REDIS_PASSWORD']

In [130]:
await load_all_data()

Ping successful: True
Loading papers into Vecsim App
Papers loaded!
Creating vector search index
Search index created


In [131]:
redis_conn = redis.from_url(f'redis://{REDIS_HOSTPORT}',
                                username=REDIS_USERNAME,
                                password=REDIS_PASSWORD)
print(f"Ping successful: {await redis_conn.ping()}")

Ping successful: True


In [132]:
from vecsim_app.search_index import SearchIndex
import asyncio

In [133]:
search_index = SearchIndex()

In [134]:
categories = 'cs.LG,cs.CG,cs.CV,math.OC,cs.LG'.split(',')
years = [2014, 2015]
similar_paper_id = '0705.4485'  # https://arxiv.org/pdf/0705.4485.pdf "Mixed Membership Stochastic Blockmodels"
# paper_vector:1104.1990
# paper_vector:0808.3231
# paper_vector:1009.3613
# paper_vector:1207.1655
# paper_vector:1101.4752

In [135]:
query = search_index.vector_query(
    categories=categories,
    years=years,
)
count_query = search_index.count_query(
    categories=categories,
    years=years,
)

In [161]:
INDEX_NAME = 'papers'

In [163]:
# # see routes.py

paper_vector_key = "paper_vector:" + similar_paper_id
vector = await redis_conn.hget(paper_vector_key, "vector")
    
total, results = await asyncio.gather(
    redis_conn.ft().search(count_query),
    redis_conn.ft().search(query, query_params={"vec_param": vector})
)

In [175]:
p = results.docs[0]

In [177]:
vars(p)


{'id': 'paper_vector:0705.4485',
 'payload': None,
 'vector_score': '0',
 'paper_id': '0705.4485',
 'paper_pk': '01GGRAGY16BCN43QR59J6DPSEH'}

In [180]:
from vecsim_app.models import Paper

async def process_paper(p, i: int) -> dict:
    # paper = await Paper.get(p.paper_pk)
    d = vars(p)
    score = 1 - float(p.vector_score)
    d['similarity_score'] = score
    return d

async def papers_from_results(total, results) -> dict:
    # extract papers from VSS results
    return {
        'total': vars(total),
        'papers': [
            await process_paper(p, i)
            for i, p in enumerate(results.docs)
        ]
    }

In [181]:
await papers_from_results(total, results)

{'total': {'total': 402,
  'duration': 31.348228454589844,
  'docs': [Document {'id': 'paper_vector:1012.3697', 'payload': None},
   Document {'id': 'paper_vector:1303.7117', 'payload': None},
   Document {'id': 'paper_vector:1304.1014', 'payload': None},
   Document {'id': 'paper_vector:1402.1473', 'payload': None},
   Document {'id': 'paper_vector:1405.4807', 'payload': None},
   Document {'id': 'paper_vector:1201.5604', 'payload': None},
   Document {'id': 'paper_vector:1308.5200', 'payload': None},
   Document {'id': 'paper_vector:1310.2273', 'payload': None},
   Document {'id': 'paper_vector:1408.3693', 'payload': None},
   Document {'id': 'paper_vector:1606.07286', 'payload': None}]},
 'papers': [{'id': 'paper_vector:0705.4485',
   'payload': None,
   'vector_score': '0',
   'paper_id': '0705.4485',
   'paper_pk': '01GGRAGY16BCN43QR59J6DPSEH',
   'similarity_score': 1.0},
  {'id': 'paper_vector:1411.5404',
   'payload': None,
   'vector_score': '0.363725543022',
   'paper_id': '1