In [2]:
import json
import pandas as pd
import os
import re
import string


DATA_PATH = "arxiv-metadata-oai-snapshot.json"
YEAR_CUTOFF = 2012
YEAR_PATTERN = r"(19|20[0-9]{2})"
ML_CATEGORY = "cs.LG"

In [3]:
def process(paper: dict):
    paper = json.loads(paper)
    if paper['journal-ref']:
        years = [int(year) for year in re.findall(YEAR_PATTERN, paper['journal-ref'])]
        years = [year for year in years if (year <= 2022 and year >= 1991)]
        year = min(years) if years else None
    else:
        year = None
    return {
        'id': paper['id'],
        'title': paper['title'],
        'year': year,
        'authors': paper['authors'],
        'categories': ','.join(paper['categories'].split(' ')),
        'abstract': paper['abstract']
    }

def papers():
    with open(DATA_PATH, 'r') as f:
        for paper in f:
            paper = process(paper)
            if paper['year']:
                if paper['year'] >= YEAR_CUTOFF and ML_CATEGORY in paper['categories']:
                    yield paper

In [4]:
df = pd.DataFrame(papers())
len(df)

11304

In [5]:
# Avg length of the abstracts
df.abstract.apply(lambda a: len(a.split())).mean()

169.8122788393489

In [6]:
def clean_description(description: str):
    if not description:
        return ""
    # remove unicode characters
    description = description.encode('ascii', 'ignore').decode()

    # remove punctuation
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # clean up the spacing
    description = re.sub('\s{2,}', " ", description)

    # remove urls
    #description = re.sub("https*\S+", " ", description)

    # remove newlines
    description = description.replace("\n", " ")

    # remove all numbers
    #description = re.sub('\w*\d+\w*', '', description)

    # split on capitalized words
    description = " ".join(re.split('(?=[A-Z])', description))

    # clean up the spacing again
    description = re.sub('\s{2,}', " ", description)

    # make all words lowercase
    description = description.lower()

    return description

In [8]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Create embeddings from the title and abstract

'''
Feature Set
    - Keyword Extraction
        - Category Specific
        - Author Specific
    - Paper Ranking Algorithm
        - Journal Specific Ranking
        - Category Specific Ranking
        - Author Specific Ranking
    - General Search using Title
    - General Search using Abstracts
    
User Experience[Faceits]
    - Author
    - Category
    - Title
    - Abstract
    - Year

Dimitrios
    - Extract features from Categories
    -   Finding simularity between category
    -     How close is Computer Science to Computer Engineering
    -     How close is Physics to Astrophyiscs
    -      What metrics are used to rank simularity?
    -      Which ML Models or NN Models would be used to "segment" the categories
    -      Merge two or more features that look a like using the Abstract or Title and/or Author
    -        Look for the most expressive features and match those together
Jitu
    - Title; create a vector/embedding from the title
    - Suggest different ways to phrase the title; create vectors from that
    -   very very very tricky
    -   Searching through description content, to find simular description and suggest title alterations or title simularity
Ugo
    - Authors
    -   Finding simularity between author contirbutation
    -      What metrics are used to rank simularity?
    -      Which ML Models or NN Models would be used to "segment" the categories
    -      Merge two or more features that look a like using the Abstract or Title and/or Author
    -        Look for the most expressive features and match those together
    -   Citation ranking
Joe
    - Break the descirption into sentences; create vector/embedding from each sentence
    - Keyword Extraction: Most common terms in Abstracts
'''

emb = model.encode(df.apply(lambda r: clean_description(r['title'] + ' ' + r['abstract']), axis=1).tolist())

In [14]:
# Add embeddings to df
df = df.reset_index().drop('index', axis=1)
df['vector'] = emb.tolist()

In [15]:
import pickle

# Export to file!
with open('arxiv_embeddings_10000.pkl', 'wb') as f:
    data = pickle.dumps(df)
    f.write(data)