In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
%pip install -q -r requirements.txt

You should consider upgrading via the '/home/jovyan/workspace/untitled1-vector-search/.venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [7]:
import json
import pandas as pd
import os
import re
import string

from vecsim_app.embeddings import Embeddings
from vecsim_app.data_utils import papers


DATA_PATH = "/home/jovyan/arxiv/arxiv-metadata-oai-snapshot.json"
YEAR_CUTOFF = 2012
YEAR_PATTERN = r"(19|20[0-9]{2})"
ML_CATEGORY = "cs.LG"

In [8]:
df = pd.DataFrame(papers(data_path=DATA_PATH, year_cutoff=YEAR_CUTOFF, year_pattern=YEAR_PATTERN, ml_category=ML_CATEGORY))
len(df)

11419

In [9]:
# Avg length of the abstracts
df.abstract.apply(lambda a: len(a.split())).mean()

169.84534547683685

In [10]:
df

Unnamed: 0,id,title,year,authors,categories,abstract
0,0705.4485,Mixed membership stochastic blockmodels,2014,"Edoardo M Airoldi, David M Blei, Stephen E Fie...","stat.ME,cs.LG,math.ST,physics.soc-ph,stat.ML,s...",Observations consisting of measurements on r...
1,0808.3231,Multi-Instance Multi-Label Learning,2012,"Zhi-Hua Zhou, Min-Ling Zhang, Sheng-Jun Huang,...","cs.LG,cs.AI","In this paper, we propose the MIML (Multi-In..."
2,0811.4413,A Spectral Algorithm for Learning Hidden Marko...,2012,"Daniel Hsu, Sham M. Kakade, Tong Zhang","cs.LG,cs.AI",Hidden Markov Models (HMMs) are one of the m...
3,0903.4817,An Exponential Lower Bound on the Complexity o...,2012,"Bernd G\""artner, Martin Jaggi and Cl\'ement Maria","cs.LG,cs.CG,cs.CV,math.OC,stat.ML",For a variety of regularized optimization pr...
4,0909.5175,Bounding the Sensitivity of Polynomial Thresho...,2013,"Prahladh Harsha, Adam Klivans, Raghu Meka","cs.CC,cs.LG",We give the first non-trivial upper bounds o...
...,...,...,...,...,...,...
11414,2211.00724,Privacy Induces Robustness: Information-Comput...,2022,"Kristian Georgiev, Samuel B. Hopkins","stat.ML,cs.DS,cs.LG",We establish a simple connection between rob...
11415,2211.01107,Deep Reinforcement Learning for Power Control ...,2022,Ziad El Jamous and Kemal Davaslioglu and Yalin...,"cs.NI,cs.LG,cs.SY,eess.SY",This paper presents a deep reinforcement lea...
11416,2211.01373,Interpretable Modeling and Reduction of Unknow...,2022,"Maryam Toloubidokhti, Nilesh Kumar, Zhiyuan Li...","eess.IV,cs.AI,cs.LG",Prior knowledge about the imaging physics pr...
11417,2211.01500,Learning to Grasp the Ungraspable with Emergen...,2022,"Wenxuan Zhou, David Held","cs.RO,cs.AI,cs.LG",A simple gripper can solve more complex mani...


In [11]:
df['authors_clean'] = df['authors'].apply(lambda a: ' '.join(re.findall(r'\w\w+', a)).strip())
df['authors_clean'][:3]

0    Edoardo Airoldi David Blei Stephen Fienberg Er...
1    Zhi Hua Zhou Min Ling Zhang Sheng Jun Huang Yu...
2                    Daniel Hsu Sham Kakade Tong Zhang
Name: authors_clean, dtype: object

In [12]:
df['text'] = df.apply(lambda r: Embeddings.clean_description(r['authors_clean'] + ' ' + r['title'] + ' ' + r['abstract']), axis=1)
df['text'][:1]

0     edoardo airoldi david blei stephen fienberg e...
Name: text, dtype: object

In [13]:
from sentence_transformers import SentenceTransformer
#see models here: https://www.sbert.net/docs/pretrained_models.html
model_name = 'sentence-transformers/all-distilroberta-v1'

model = SentenceTransformer(model_name)

In [14]:
# Create embeddings from the title and abstract
emb = model.encode(df['text'].tolist())

In [15]:
# Add embeddings to df
df = df.reset_index().drop('index', axis=1)
df['vector'] = emb.tolist()

In [16]:
import pickle

# Export to file!
with open(f'arxiv_embeddings_10000.pkl', 'wb') as f:
    data = pickle.dumps(df)
    f.write(data)

In [17]:
!ls -lh .

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 106M
-rw-rw-r-- 1 jovyan jovyan 106M Nov  6 05:23 arxiv_embeddings_10000.pkl
-rw-rw-r-- 1 jovyan jovyan  16K Nov  6 05:23 arxiv-embeddings.ipynb
-rw-rw-r-- 1 jovyan jovyan 1.9K Nov  5 23:59 requirements.txt
