# Scientific Paper Analysis

This notebook (a) analyses a 'seed' scientific paper, and (b) extracts the references found in that paper and downloads any that are available on arXiv.org. It can perform step (b) recursively, to build up a small graph of papers referred to by other papers. 

It uses a lot of heuristics and regular expressions to parse PDF files. Due to the huge variety of citation formats, it is very brittle. 



## Install and import dependencies, other preparation tasks

(Note: dependencies should be installed via requirements.txt).

In [1]:
# conda install pytorch torchvision torchaudio -c pytorch
# pip install transformers
# pip install nltk
# pip install numpy 
# pip install pandas
#pip install wordcloud
# conda install matplotlib
# pip install scholarly
#pip install icecream

In [840]:

# Common Python libraries, used for regular expressions, file downloads and other things
import os, os.path
import re
import urllib, urllib.request, urllib.parse
# Parse arXiv API responses (in ATOM XML format)
import xml.etree.ElementTree as ET

# Alternative to print()
from icecream import ic

# Used to extract PDF text
import textract

# Numpy and Pandas for data manipulation.
# All references stored in a Pandas DataFrame for easy analysis.
import numpy as np
import pandas as pd

# Not used yet - but helpful for simple machine learning tasks.
# Maybe helpful for text summarisation in particular.
from transformers import pipeline

# Natural Language Toolkit - for tokenisation and basic linguistic analysis
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn

# Simple word cloud generator
from wordcloud import WordCloud

# Google Scholar wrapper - not that useful yet
from scholarly import scholarly


The code below downloads a set of English language stopwords ('a', 'the' etc), and adds commonly occurring terms in PDFs, to remove further noise.

In [4]:
# Preparatory steps
nltk.download('stopwords')

# Start from NLTK's English stopword list.
full_stop_words = set(nltk.corpus.stopwords.words('english'))

# Stray single letters picked up from PDF text extraction.
full_stop_words.update([
    'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
    'n', 'p', 'q', 'r', 'u', 'v', 'x', 'w', 'y', 'z',
])

# Citation boilerplate and other noise tokens.
full_stop_words.update(['pp', 'et', 'al', 'ha', 'li', 'sij', 'arxiv'])

[nltk_data] Downloading package stopwords to /Users/liam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reference Extraction and Parsing


A series of regular expression utilities for parsing and extracting whole references from PDFs.



In [829]:
# Common functions for parsing references from plain text

def re_matches(text, regex):
    """Return all non-overlapping matches of ``regex`` in ``text`` as a new list.

    ``regex`` may be a pattern string or a compiled pattern. As with
    ``re.findall``, each item is the whole match, the single captured group,
    or a tuple of groups, depending on how many groups the pattern has.
    """
    return list(re.findall(regex, text))

def re_digit():
    """Regex fragment for a bracketed citation number followed by a space, e.g. ``[12] ``.

    The digits are optional (``\\d*``), so an empty ``[] `` marker also matches.
    """
    marker = r'\[\d*\]\ '
    return marker

def re_first_first():
    """Regex fragment for one author written initials-first, e.g. ``Y. Guo, ``.

    One or more (optionally hyphen-prefixed) initials, each followed by a
    space, then one or more surname words. A surname word may start with a
    capital letter or with ``v``/``d`` (presumably for particles such as
    "van"/"de" - TODO confirm) and may be followed by an optional comma and
    an optional space.
    """
    return r'(?:(?:\-?[A-Z]\.\ )+(?:[A-Zvd][A-Za-z\-\u0080-\uFFFF]+\,?\ ?)+)'

def re_first_first_multiple():
    """Optional regex fragment for initials-first authors followed by ``and ``.

    Matches one or more ``re_first_first()`` authors and then the literal
    ``and `` (e.g. ``Y. Guo, and ``). The whole fragment is optional.
    """
    # Raw strings throughout: '\ ' in a normal string literal is an invalid
    # escape sequence and triggers a warning on modern Python.
    return r'(?:' + re_first_first() + r'+and\ )?'

def re_first_last():
    """Regex fragment for one author written surname-first, e.g. ``Liu, Z.``.

    A surname (starting with a capital letter or ``v``/``d``), a comma and a
    space, then one or more initials, each optionally followed by a space.
    """
    return r'(?:(?:[A-Zvd][A-Za-z\-\u0080-\uFFFF]+)+\,\ (?:\-?[A-Z]\.\ ?)+)'

def re_first_last_multiple():
    """Optional regex fragment for a surname-first author list ending in ``and ``.

    Matches one or more ``re_first_last()`` authors each followed by ``, ``,
    then one more author, an optional comma and the literal `` and ``
    (e.g. ``Liu, Z., Luo, P., and ``). The whole fragment is optional.
    """
    # Raw strings throughout: '\,' and '\ ' in a normal string literal are
    # invalid escape sequences and trigger a warning on modern Python.
    return r'(?:(?:' + re_first_last() + r'\,\ )+' + re_first_last() + r'\,?\ and\ )?'

def re_full_first():
    """Regex fragment for one author with spelled-out given name(s), e.g. ``Angel X. Chang, ``."""
    given_names = r'(?:[A-Zvd][A-Za-z\-\u0080-\uFFFF]+\ )+'
    middle_initials = r'(?:\-?[A-Z]\.?\ )*'
    # Each surname word may be followed by an optional comma and optional space.
    surnames = r'(?:[A-Zvd][A-Za-z\-\u0080-\uFFFF]+\,?\ ?)+'
    return r'(?:' + given_names + middle_initials + surnames + r')'

def re_year():
    """Regex fragment for a four-digit year, optionally wrapped in parentheses."""
    open_paren = r'\(?'
    four_digits = r'(?:\d{4})'
    close_paren = r'\)?'
    return open_paren + four_digits + close_paren

def re_trailing_digit():
    """Optional regex fragment for a run of digits, spaces and commas ending in a space."""
    digit_run = r'[\d\ \,]+'
    return r'(?:' + digit_run + r'\ ' + r')?'


def re_in_journal():
    """Regex fragment for an optional ``in``, a space, comma-free text, then ``, ``."""
    optional_in = r'(?:in)?'
    venue_text = r'\ [^\,]+'
    separator = r'\,\ '
    return r'(?:' + optional_in + venue_text + separator + r')'

def re_pages():
    """Optional regex fragment for a page number or range, e.g. `` pp. 27-48`` or `` 27-48``."""
    optional_pp = r'(?:pp\.\ )?'
    # At least two digits overall, with an optional hyphen between them.
    page_numbers = r'\d+\-?\d+'
    return r'(?:' + r'\ ' + optional_pp + page_numbers + r')?'


# Type 0
# Example: 2106.12139.pdf
def gen_refs_with_pages(text):
    """Extract numbered references that end in a year and optional pages.

    Returns the text captured after each ``[n] `` marker: an initials-first
    author, any bracket-free text, a year and an optional page range.
    """
    pattern = ''.join((
        re_digit(),
        r'(',
        re_first_first(),
        r'[^\[]*?\,?\ ',   # lazily skip everything up to the year
        re_year(),
        re_pages(),
        r')',
        r'\.?',
    ))
    return re_matches(text, re.compile(pattern))

# Type 1
# Example: Fashionista: A fashion-aware graphical system for exploring
def gen_refs_end_year_firstname_first(text):
    """Extract numbered references with initials-first authors that end in a year.

    Returns the text captured after each ``[n] `` marker: an optional
    "..., and" author list, a final author, bracket-free text containing a
    journal-like ``... , `` segment, and a closing year.
    """
    pattern = ''.join((
        re_digit(),
        r'(',
        re_first_first_multiple(),
        re_first_first(),
        r'[^\[]*?',
        re_in_journal(),
        r'[^\[]*?',
        re_year(),
        r')',
        r'\.?',
    ))
    return re_matches(text, re.compile(pattern))

# Type 2
# Example: Fashion-gen: The generative fashion dataset and challenge.pdf
def gen_refs_end_year_firstname_last(text):
    """Extract unnumbered references with surname-first authors that end in ``<year>.``.

    Returns each whole reference: an optional "..., and" author list, a
    final author, bracket-free text, a year and a closing full stop.
    """
    # The earlier hand-written pattern that used to be assigned to ``regex``
    # here was dead code (immediately overwritten) and has been removed.
    r = (
        r'('
        + re_first_last_multiple()
        + re_first_last()
        + r'[^\[]*?\,?\ '
        + re_year()
        + r'\.'
        + r')'
        )
    regex = re.compile(r)
    return re_matches(text, regex)


# Type 3
# Example: A compact embedding for facial expression similarity.pdf
def gen_refs_with_pages_conjoined(text):
    """Extract unnumbered references with initials-first authors ending in ``, <year>.``.

    Returns each whole reference: an optional "..., and" author list, a
    final author followed by a full stop, bracket-free text, ``, <year>.``
    and an optional trailing run of digits.
    """
    pattern = ''.join((
        r'(',
        re_first_first_multiple(),
        re_first_first(),
        r'\.[^\[]*?\,\ ',
        re_year(),
        r'\.',
        re_trailing_digit(),
        r')',
    ))
    return re_matches(text, re.compile(pattern))

# Type 4
# Example: Shapenet/ An information-rich 3d model repository.pdf
def gen_refs_expanded_name(text):
    """Extract numbered references whose authors have spelled-out given names.

    Returns the text captured after each ``[n] `` marker (at least one digit
    required): a full-name author, bracket-free text, a year and an optional
    full stop.
    """
    pattern = ''.join((
        r'\[(?:\d+)\]\ ',
        r'(',
        re_full_first(),
        r'[^\[]*?\,?\ ',
        re_year(),
        r'\.?',
        r')',
    ))
    return re_matches(text, re.compile(pattern))


# Defunct functions

# def gen_refs_end_year(text):
#     regex = r'[A-Z][A-Za-z\-]+\,\ [A-Z]\.(?:.*\n.*){1,4}\,\ \d\d\d\d[a-b]?\.'
#     return re_matches(text, regex) 

# def gen_refs_end_year_with_text_brackets(text):
#     regex = r'\[(?:[A-Za-z\ ]*\.?\,\ \d+)\]\ (?:.*\n.*){1,4}\d\d\d\d[a-b]?\.'
#     return re_matches(text, regex) 

# def gen_refs_end_year_with_number_brackets(text):
#     regex = r'\[(?:\d+)\]\ (?:.*\n.*){1,4}\d\d\d\d[a-b]?\.'
#     return re_matches(text, regex) 
    
# def gen_refs_end_pages(text):
#     text = text.replace('\n', ' ')
#     regex = r'\[\d*\]\ [^[]*pp\.\ \d+\.\ '
#     return re_matches(text, regex) 


# Main reference 'generator'. Looks for a variety of reference patterns,
# and returns, along with the extracted references, a number that acts to flag
# the best internal reference parsing option to apply.
def gen_refs_multiple_pass(text):
    """Run every reference extractor and keep the one that finds the most.

    Returns ``(refs, ref_type)``. ``ref_type`` is the index (0-4) of the
    winning extractor and selects the matching ``parse_refN`` parser. Ties go
    to the lowest type, except that type 1 is preferred over type 0 when
    both find the same number of references.
    """
    candidates = [
        gen_refs_with_pages(text),                # type 0
        gen_refs_end_year_firstname_first(text),  # type 1
        gen_refs_end_year_firstname_last(text),   # type 2
        gen_refs_with_pages_conjoined(text),      # type 3
        # Known to be greedy, but it does take part in the selection.
        gen_refs_expanded_name(text),             # type 4
    ]

    # max() returns the first maximal element, so ties go to the lowest type.
    ref_type = max(range(len(candidates)), key=lambda idx: len(candidates[idx]))

    # Specific check: prefer type 1 over type 0 on equal counts.
    if ref_type == 0 and len(candidates[0]) == len(candidates[1]):
        ref_type = 1

    return candidates[ref_type], ref_type




Again, messy code to parse a reference string into parts (authors, title, journal / conference, publisher, volume, pages, year).

The type returned by gen_refs_multiple_pass() corresponds to the parse_refX function invoked here.

In [823]:
# Common functions for extracting references parts

# Builds a single-row data frame from the parts of one parsed reference
def make_dataframe(authors, title, journal, publisher, volume, year, pages, ref, source_file, source_title):
    """Return a one-row DataFrame describing a reference.

    ``authors`` is an iterable of strings and is stored joined by ``', '``.
    ``ref`` is stored in the ``full_ref`` column.
    """
    column_names = ['authors', 'title', 'journal', 'publisher', 'volume',
                    'year', 'pages', 'full_ref', 'source_file', 'source_title']
    row = [', '.join(authors), title, journal, publisher, volume,
           year, pages, ref, source_file, source_title]
    return pd.DataFrame([row], columns=column_names)

# Uses a general citation pattern to extract the authors, title, journal, volume, year, and pages
# The author pattern extracts individual authors. 
# The mapping object enables different orders in the regular expression.
def _parse_ref_internals(ref, citation_pattern, authors_pattern, mapping = {
        'first_authors': 0,
        'last_author': 1,
        'title': 2,
        'journal': 3,
        'publisher': 4,
        'volume': 5,
        'year': 6,
        'pages': 7
    }):
    all_authors = []
    title = ''
    journal = ''
    publisher = ''
    volume = ''
    year = ''
    pages = ''

    iterator = re.findall(citation_pattern, ref)
    for i in iterator:
        first_authors = i[mapping['first_authors']]
        # ic(first_authors)
        last_author = i[mapping['last_author']].strip()
        title = i[mapping['title']]
        journal = i[mapping['journal']]
        publisher = i[mapping['publisher']]
        volume = i[mapping['volume']]
        year = i[mapping['year']]
        pages = i[mapping['pages']]
        if first_authors != '':
            authors = re.findall(authors_pattern, first_authors)
            for a in authors:
                all_authors.append(a)
        all_authors.append(last_author)
    return all_authors, title, journal, publisher, volume, year, pages

# Parses a reference that follows this pattern:
# Y. Guo, Y. Liu, A. Oerlemans, S. Lao, S. Wu, M.S. Lew, Deep learning for visual understanding: A review, Neurocomputing, 187 (2016) 27-48
# From 2106.12139.pdf
# - Year before pages
# - All fields are comma-separated
# - Authors are not joined by a conjunction
def parse_ref0(ref):
    """Parse a type-0 reference (initials-first authors, comma-separated fields).

    Capture groups follow the default order expected by
    ``_parse_ref_internals``: first authors, last author, title, journal,
    publisher, volume, year, pages. Returns that function's 7-tuple.
    """
    citation_pattern = re.compile(
        r'((?:(?:\-?[A-Z]\.)+\ (?:[A-Zvd][A-Za-z\u0080-\uFFFF\-\ ]+)\,\ )*)' # authors
        + r'((?:\-?[A-Z]\.)+\ (?:[A-Zvd][A-Za-z\u0080-\uFFFF\-\ ]+))\,\ ' # last author
        + r'([^\,\.]*)\,\ ' # title
        + r'((?:[^\,]*)\,\ )?' # journal
        + r'((?:[^\,]*)\,\ )?' # publisher
        + r'(\d+\ )?' # volume
        + r'\(?(\d{4})\)?\,?\ ?' # year
        + r'(?:pp\.\ )?(\d+\-\d+)?') # pages
    # Single capture group, so findall yields one plain string per author.
    authors_pattern = re.compile(
        r'((?:\-?[A-Z]\.)+\ (?:[A-Zvd][A-Za-z\u0080-\uFFFF\ \-]+))')

    return _parse_ref_internals(ref, citation_pattern, authors_pattern)


# Parses a type-1 reference: initials-first authors, with an optional
# "..., and" author list before the last author.
def parse_ref1(ref):
    """Parse a type-1 reference into authors, title, journal and so on.

    Capture groups follow the default order expected by
    ``_parse_ref_internals``: first authors (optional, ends with "and "),
    last author, title, journal, publisher, volume, year, pages.
    """
    citation_pattern = re.compile(
        r'((?:(?:\-?[A-Z]\.\ )+(?:[A-Zvd][A-Za-z\u0080-\uFFFF\-\ ]+)\,?\ )*and\ )?' 
        + r'((?:\-?[A-Zvd]\.\ )+(?:[A-Zvd][A-Za-z\u0080-\uFFFF\-\ ]+))[\,\.]\ '  # last author
        # The title may be wrapped in non-ASCII characters (presumably typographic quotes - TODO confirm)
        + r'[\u0080-\uFFFF]?([^\,\.]*)[\,\.\u0080-\uFFFF]+' # title
        + r'(?:\ in\ )?([^\,\.]*)' # journal
        + r'(?:([^\,]*)\,\ )?' # publisher 
        + r'(\d+\ )?' # volume
        + r'(\d{4})?\,?\ ?' # year
        + r'(?:pp\.\ )?(\d+\-?\d+)?' # pages
        ) 
    authors_pattern = re.compile(r'(?:\-?[A-Z]\.\ )+[A-Zvd][A-Za-z\u0080-\uFFFF\-\ ]+')

    return _parse_ref_internals(ref, citation_pattern, authors_pattern)
# Parses a type-2 reference: surname-first authors ("Liu, Z., Luo, P., and Tang, X."),
# a title ending in a full stop, and the year after the final comma.
def parse_ref2(ref):
    """Parse a type-2 reference into authors, title, journal and so on.

    The capture groups are not in the default order, so an explicit mapping
    is passed: pages is group 4, publisher 5, year 6 and volume 7.
    """
    
    citation_pattern = re.compile(
        r'((?:(?:[A-Zvd][A-Za-z\-\ \u0080-\uFFFF]+)+\,\ (?:\-?[A-Z]\.\ ?)+(?:\,\ )?)*)(?:\,?\ and\ )?'
        # NOTE(review): 'a-za' below looks like a garbled 'a-zá'; the \u0080-\uFFFF range covers accented letters either way.
        + r'([A-Zvd][A-Za-za\-\ \u0080-\uFFFF]+\,\ (?:\-?[A-Z]\.\ ?)+)' 
        + r'([^\.]*)' # title
        + r'\.\ '
        + r'((?:[^\,]*))?'  # journal
        # NOTE(review): 'd+\(d+\)' matches literal 'd' characters; '\d+\(\d+\)' was probably intended - confirm.
        + r'(?:\,\ (.*(?:pp\.\ |d+\(d+\)\:)?\d+(?:\-\d*)?))?' # pages 
        + r'(\.\ (?:[^\,]*))?' # publisher
        + r'\,\ '
        + r'(\d{4})'
        + r'(\,\ \d+\ )?' # volume
        # + r'\.'
        )
    authors_pattern = re.compile(
        r'(?:[A-Zvd][A-Za-za\-\u0080-\uFFFF]+)+\,\ (?:\-?[A-Z]\.\ ?)+')

    return _parse_ref_internals(ref, citation_pattern, authors_pattern, mapping = {
        'first_authors': 0,
        'last_author': 1,
        'title': 2,
        'journal': 3,
        'pages': 4,
        'publisher': 5,
        'volume': 7,
        'year': 6
    })

def parse_ref3(ref):
    """Parse a type-3 reference: initials-first authors joined by a mandatory "and ".

    Capture groups follow the default order expected by
    ``_parse_ref_internals``. The last-author group is optional and, when
    present, includes the author's trailing full stop.
    """
    citation_pattern = re.compile(
        r'((?:(?:\-?[A-Z]\.)+\ (?:[A-Zvd][A-Za-z\u0080-\uFFFF\ \-]+\,\ ))*)and\ ((?:\-?[A-Z]\.)+\ (?:[A-Zvd][A-Za-z\u0080-\uFFFF\ \-]+\.))?' 
        + r'([^\,]*)\,\ ([^\,]*)\,\ (?:([^\,]*)\,\ )?' # title, journal, publisher
        + r'(?:(\d+)\ )?' # volume
        + r'\(?(\d{4})\)?\,?\ ?' # year
        + r'(?:pp\.\ )?(\d+\-?\d+)?') # pages
    # Single capture group, so findall yields one plain string per author.
    authors_pattern = re.compile(r'((?:\-?[A-Z]\.)+\ (?:[A-Zvd][A-Za-z\u0080-\uFFFF\ \-]+))')

    return _parse_ref_internals(ref, citation_pattern, authors_pattern)


def parse_ref4(ref):
    """Parse a type-4 reference: authors with spelled-out given names, joined by "and ".

    The capture groups are not in the default order, so an explicit mapping
    is passed: volume is group 4, pages 5, publisher 6 and year 7.
    """
    citation_pattern = re.compile(
        r'((?:(?:[A-Zvd][A-Za-z\-\u0080-\uFFFF]+\ )+(?:\-?[A-Z]\.?\ )*(?:[A-Zvd][A-Za-z\u0080-\uFFFF\-\ ]+)\,?\ )*)and\ ' 
        + r'((?:[A-Zvd][A-Za-z\-\u0080-\uFFFF]+\ )+(?:\-?[A-Zvd]\.?\ )*(?:[A-Zvd][A-Za-z\u0080-\uFFFF\-\ ]+))[\,\.]\ (?:et\ al\.\ )?'  # last author
        + r'([^\.]*)\.\ ' # title
        + r'(?:In\ )?([^\,]*)(?:[\,\.]\ )' # journal
        + r'(\d+(?:\(\d+\))?\:?)?' # volume
        # NOTE(review): the range separator class holds an en dash, '|' and an em dash, but not an ASCII hyphen - confirm.
        + r'(?:pages\ )?(\d+(?:[\–|\u2014]\d+)?)?' # pages
        + r'(?:([^\,]*)\,\ )?' # publisher 
        + r'.*(\d{4})\.' # year
        ) 
    authors_pattern = re.compile(r'(?:[A-Zvd][A-Za-z\-\u0080-\uFFFF]+\ )+(?:\-?[A-Z]\.\ )*[A-Zvd][A-Za-z\u0080-\uFFFF\-\ ]+')

    return _parse_ref_internals(ref, citation_pattern, authors_pattern, mapping = {
        'first_authors': 0,
        'last_author': 1,
        'title': 2,
        'journal': 3,
        'volume': 4,
        'pages': 5,
        'publisher': 6,
        'year': 7
    })

def parse_ref(ref, ref_type):
    """Dispatch ``ref`` to the ``parse_refN`` parser selected by ``ref_type``.

    ``ref_type`` values 0-4 select ``parse_ref0`` to ``parse_ref4``. Any
    other value falls back to ``parse_ref0``.
    """
    parsers = (parse_ref0, parse_ref1, parse_ref2, parse_ref3, parse_ref4)
    for type_id, parser in enumerate(parsers):
        if ref_type == type_id:
            return parser(ref)
    # Unknown type: fall back to the type-0 parser.
    return parse_ref0(ref)


def parse_ref_and_makedataframe(ref, ref_type, source_file, source_title):
    """Parse ``ref`` with the parser for ``ref_type`` and return it as a one-row DataFrame.

    ``source_file`` and ``source_title`` identify the paper the reference
    was found in and are stored alongside the parsed fields.
    """
    # Seven fields: authors, title, journal, publisher, volume, year, pages.
    parsed_fields = parse_ref(ref, ref_type)
    return make_dataframe(*parsed_fields, ref, source_file, source_title)




A set of utility functions to help with processing PDF and analysing text.

In [848]:
# Other utility functions

def get_lemma(word):
    """Return the WordNet lemma of ``word`` using the lemmatizer's default settings."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

def remove_set_from_dict(s, d):
    """Delete every key of ``d`` that appears in ``s``; mutates and returns ``d``."""
    for key in s:
        # pop() with a default is a no-op for keys that are absent.
        d.pop(key, None)
    return d
    
def word_frequencies(text):
    """Count lemmatised, lower-cased, non-stopword tokens in ``text``.

    Returns a new dict mapping each lemma to its number of occurrences.
    Tokenising, lemmatising and stop-word filtering are delegated to
    ``word_frequencies_dist`` so the two functions cannot drift apart (the
    body used to be a line-for-line copy of it).
    """
    return word_frequencies_dist(text, {})

def word_frequencies_dist(text, frequencies):
    """Add the token counts of ``text`` to an existing ``frequencies`` dict.

    Tokens are runs of ASCII letters and digits starting with a letter. Each
    is lower-cased and lemmatised, and skipped if it is in
    ``full_stop_words``. ``frequencies`` is updated in place and returned.
    """
    for token in re.findall('[A-Za-z][A-Za-z0-9]*', text):
        lemma = get_lemma(token.lower())
        if lemma in full_stop_words:
            continue
        frequencies[lemma] = frequencies.get(lemma, 0) + 1
    return frequencies



def wc(freqs, file_name):
    """Render a word cloud from ``freqs`` and save it as a PNG.

    Parameters:
        freqs: dict mapping word -> frequency.
        file_name: label used in the output path
            ``data/word-cloud-<file_name>.png``.

    Returns:
        The rendered image, so that a notebook cell ending in ``wc(...)``
        displays it. Previously the result of ``to_image()`` was discarded
        and the function returned ``None``.
    """
    wordcloud = WordCloud(background_color="white", width=600, height=600, max_words=5000, contour_width=3, contour_color='steelblue')
    wordcloud.generate_from_frequencies(freqs)
    # Create the output directory if needed so saving does not fail on a fresh checkout.
    os.makedirs('data', exist_ok=True)
    wordcloud.to_file(f'data/word-cloud-{file_name}.png')
    return wordcloud.to_image()



def arxiv_results(title):
    """Query the arXiv API for ``title`` and return the raw response text.

    The title is URL-encoded and searched across all fields, requesting at
    most one result. The response body is decoded as UTF-8 and returned
    unparsed; the notebook parses it as Atom XML elsewhere.
    """
    t = urllib.parse.quote_plus(title)
    url = 'http://export.arxiv.org/api/query?search_query=all:'+t+'&start=0&max_results=1'
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as data:
        return data.read().decode('utf-8')

def make_up_file_name(file_name, directory):
    """Return the path of ``<file_name>.pdf`` inside ``directory``."""
    pdf_name = file_name + '.pdf'
    return os.path.join(directory, pdf_name)

def extract_and_save_pdf_from_atom(atom_xml, title, title_no_colon, title_stem, directory, pass_through=False):
    """Download the PDF of every Atom entry whose title starts with ``title_stem``.

    Parameters:
        atom_xml: Atom feed text, as returned by ``arxiv_results``.
        title: full reference title; used for the saved file name.
        title_no_colon: unused here; kept for interface compatibility.
        title_stem: prefix that an entry title must start with
            (case-insensitive) for the entry to be accepted.
        directory: directory the PDF is saved into.
        pass_through: when True, matching links are only logged, not
            downloaded.
    """
    atom_ns = '{http://www.w3.org/2005/Atom}'
    root = ET.fromstring(atom_xml)

    # Collapse whitespace runs on both sides so a title that is line-wrapped
    # in the feed still matches the stem.
    wanted_prefix = ' '.join(title_stem.split()).lower()

    for e in root.findall(atom_ns + 'entry'):
        t = e.find(atom_ns + 'title')
        # An entry with a missing or empty <title> cannot match; skip it
        # instead of raising AttributeError.
        if t is None or not t.text:
            continue
        entry_title = ' '.join(t.text.split()).lower()
        if not entry_title.startswith(wanted_prefix):
            continue
        for link in e.findall(atom_ns + 'link'):
            # Title must match and link must be a pdf
            if link.attrib.get('type') != 'application/pdf':
                continue
            u = link.attrib['href']
            ic(u, title)
            if not pass_through:
                urllib.request.urlretrieve(u,
                    make_up_file_name(title, directory))

def test_element(atom_xml):
    root = ET.fromstring(atom_xml)
    entry = root.find('{http://www.w3.org/2005/Atom}entry')
    if entry is not None:
        return True
    else:
        return False

def top_dist(freqs, n):
    """Print the ``n`` most frequent words of ``freqs``, one ``word count`` pair per line.

    Words are listed in descending order of frequency. If ``n`` is never
    reached (including ``n <= 0``), every word is printed.
    """
    ranked = sorted(freqs, key=freqs.get, reverse=True)
    for rank, word in enumerate(ranked, start=1):
        print(word, freqs[word])
        if rank == n:
            break

## Processing files

The code below analyses frequencies and extracts references for a 'seed' paper.

Both file name and title must be supplied, and the PDF file downloaded to the 'refs' directory first.

In [517]:

# Input
file_name = '2106.12139.pdf'
title = 'PatentNet: A Large-Scale Incomplete Multiview, Multimodal, Multilabel Industrial Goods Image Database'


# Extract the text
text = textract.process("refs/" + file_name).decode('utf-8')

# Obtain word frequencies
freqs = word_frequencies(text)

# Extract the references
refs, ref_type = gen_refs_multiple_pass(text)

# Parse each reference into a one-row frame. Line breaks inside a reference
# are flattened to spaces before parsing.
reference_frames = [
    parse_ref_and_makedataframe(r.replace('\n', ' '), ref_type, file_name, title)
    for r in refs
]

# Build the references dataframe with a single concat. DataFrame.append was
# removed in pandas 2.0, and appending row by row is quadratic anyway.
if reference_frames:
    references = pd.concat(reference_frames)
else:
    references = pd.DataFrame(columns = ['authors', 'title', 'journal', 'publisher', 'volume', 'year', 'pages', 'full_ref', 'source_file', 'source_title'])

# Save references to a csv
references.to_csv('data/references.csv')



In [60]:
# Test code

# t = references['title'].iloc[7]
# print(t)
# r = arxiv_results(t.strip())
# print(r)
# # ET.tostring(r)

In [839]:
## Test code

# s = """Liu, Z., Luo, P., Wang, X., and Tang, X. Deep learning
# face attributes in the wild. In Proceedings of the IEEE
# International Conference on Computer Vision, pp. 3730–
# 3738, 2015."""

# # \1{5}.*\d\d\d\d\.
# # regex = r'[A-Z][A-Za-z\,\.\ ]+[A-Z]\.\ [A-Z](?:.*\n.*){0,5}\,\ \d\d\d\d\.'
# regex = r'[A-Z][A-Za-z\-]+\,\ [A-Z]\.(?:.*\n.*){0,5}\,\ \d\d\d\d\.'
# pattern = re.compile(regex)
# iterator = re.findall(pattern, s)
# print(iterator)


In [844]:


# More test code.
# This is used to process and extract references from downloaded pdfs.
# It helps to check how the parsing code is identifying types of citations, 
# before actual parsing occurs.
def test_ref_type(file):
    """Log which reference type is detected in a PDF and how many references match.

    Extracts the PDF text, flattens line breaks to spaces, runs
    ``gen_refs_multiple_pass`` and logs the file, the detected type and the
    number of references with ``ic``. Nothing is parsed or returned.
    """
    pdf_path = os.path.join(file)
    flattened_text = textract.process(pdf_path).decode('utf-8').replace('\n', ' ')

    # To debug a single extractor, call one of the gen_refs_* functions
    # directly in place of gen_refs_multiple_pass.
    refs_local, ref_type = gen_refs_multiple_pass(flattened_text)

    ic(file)
    ic(ref_type)
    ic(len(refs_local))



# test_ref_type('./refs/2106.12139.pdf')
# test_ref_type('./refs/download/A compact embedding for facial expression similarity.pdf')
# test_ref_type('./refs/download/Central similarity quantization for efficient image and video retrieval.pdf')
# test_ref_type('./refs/download/Doubly Aligned Incomplete Multi-view Clustering.pdf')
# test_ref_type('./refs/download/Efficientnet: Rethinking model scaling for convolutional neural networks.pdf')
# test_ref_type('./refs/download/Fashion-gen: The generative fashion dataset and challenge.pdf')
# test_ref_type('./refs/download/Fashion-mnist: a novel image dataset for benchmarking machine learning algorithms.pdf')
# test_ref_type('./refs/download/Fashionista: A fashion-aware graphical system for exploring visually similar items.pdf')
# test_ref_type('./refs/download/Hashnet: Deep learning to hash by continuation.pdf')
# test_ref_type('./refs/download/Imagenet: A large-scale hierarchical image database.pdf')
# test_ref_type('./refs/download/Partnet: A large-scale benchmark for finegrained and hierarchical part-level 3d object understanding.pdf')
# test_ref_type('./refs/download/Shapenet: An information-rich 3d model repository.pdf')





In [865]:


# Attempt to locate references on Arxiv, and save any found files as pdfs
def download_from_arxiv(titles, directory):
    """Search arXiv for each title and save any matching PDF into ``directory``.

    Empty titles are skipped, as are titles whose PDF already exists in
    ``directory``. Up to three queries are tried per title, stopping at the
    first that returns an entry: the full title, the title with its first
    colon removed, and the part of the title before the first colon.
    """
    counter = 0
    for title in titles:

        # Check if the string is empty
        if len(title) == 0:
            continue

        f = make_up_file_name(title, directory)

        counter = counter + 1

        # Nothing to do if the file has already been downloaded
        if os.path.isfile(f):
            continue

        # Create abridged versions
        title_stem = title
        title_no_colon = title
        colon_at = title.find(':')
        if colon_at > -1:
            title_stem = title[:colon_at]
            title_no_colon = title[:colon_at] + title[colon_at + 1:]

        # Try progressively looser queries. The colon seems to confuse the
        # arXiv API, so retry without it, then with only the text before it.
        r = ''
        has_entry = False
        tried = []
        for query in (title, title_no_colon, title_stem):
            if query in tried:
                # Titles without a colon give three identical queries.
                continue
            tried.append(query)
            r = arxiv_results(query)
            has_entry = test_element(r)
            if has_entry:
                break

        print(counter, title)
        if has_entry:
            # Bug fix: ``directory`` was missing from this call, so ``False``
            # was passed as the directory and the save path could not be built.
            extract_and_save_pdf_from_atom(r, title, title_no_colon, title_stem, directory, False)




In [866]:

# List the titles parsed from the seed paper's references, one per line.
titles = references['title']
for t in titles:
    print(str(t))
# Uncomment to attempt the arXiv downloads for these titles:
# download_from_arxiv(titles, './refs/download')

Deep learning for visual understanding: A review
Deep convolutional neural networks for image classification: A comprehensive review
Imagenet: A large-scale hierarchical image database
Deepfashion: Powering robust clothes recognition and retrieval with rich annotations

3d shapenets: A deep representation for volumetric shapes
Seeing 3d chairs: exemplar part-based 2d-3d alignment using a large dataset of cad models
Benchmark Datasets for Fault Detection and Classification in Sensor Data
A style-based generator architecture for generative adversarial networks
A compact embedding for facial expression similarity
Partnet: A large-scale benchmark for finegrained and hierarchical part-level 3d object understanding
3d object representations for fine-grained categorization
Fashionista: A fashion-aware graphical system for exploring visually similar items
Fashionai: A hierarchical dataset for fashion understanding
Fashion-mnist: a novel image dataset for benchmarking machine learning algorithm

In [837]:
d = './refs/download'
# d = './refs'

# Start from the seed paper's frequencies and references. The dict is copied
# so the seed-only counts in `freqs` are left untouched.
all_freqs = {**freqs}
reference_frames = [references]

for file in os.listdir(d):
    # NOTE(review): files with 'Deepfashion' in the name are skipped; the reason is not recorded here.
    if file.endswith(".pdf") and file.find('Deepfashion') == -1:
        f = os.path.join(d, file)
        ic(f)
        try:
            text_local = textract.process(f).decode('utf-8')
            text_local = text_local.replace('\n', ' ')
            all_freqs = word_frequencies_dist(text_local, all_freqs)
            refs_local, ref_type  = gen_refs_multiple_pass(text_local)

            ic(len(refs_local))
            ic(ref_type)

            for r in refs_local:
                reference_frames.append(parse_ref_and_makedataframe(r, ref_type, f, ''))

        except Exception as e:
            # Best effort: log the failure and carry on with the next PDF.
            ic(e)

# Build the combined table with one concat. DataFrame.append was removed in
# pandas 2.0; the resulting AttributeError used to be swallowed by the
# except above, so no references were collected at all.
all_references = pd.concat(reference_frames)
    

ic| f: ('./refs/download/Fashionista: A fashion-aware graphical system for exploring '
        'visually similar items.pdf')
ic| len(refs_local): 7
ic| ref_type: 1
ic| f: './refs/download/Fashion-gen: The generative fashion dataset and challenge.pdf'
ic| len(refs_local): 31
ic| ref_type: 2
ic| f: ('./refs/download/Central similarity quantization for efficient image and '
        'video retrieval.pdf')
ic| len(refs_local): 44
ic| ref_type: 3
ic| f: './refs/download/Doubly Aligned Incomplete Multi-view Clustering.pdf'
ic| len(refs_local): 0
ic| ref_type: 1
ic| f: ('./refs/download/Partnet: A large-scale benchmark for finegrained and '
        'hierarchical part-level 3d object understanding.pdf')
ic| len(refs_local): 43
ic| ref_type: 1
ic| f: ('./refs/download/Efficientnet: Rethinking model scaling for convolutional '
        'neural networks.pdf')
ic| len(refs_local): 52
ic| ref_type: 2
ic| f: './refs/download/A compact embedding for facial expression similarity.pdf'
ic| len(refs_local)

In [838]:
# Save the combined references (seed paper plus downloaded PDFs) to a csv
all_references.to_csv('data/all_references.csv')

# Language processing

In [841]:
# Generate the word cloud from the seed file frequencies.
# The image is written to data/word-cloud-<file_name>.png.
wc(freqs, file_name)

# Generate the word cloud from the combined frequencies of the seed paper
# and all downloaded files (written to data/word-cloud-all_freqs.png).
wc(all_freqs, 'all_freqs')



In [121]:
# Five most frequent terms in the seed paper.
top_dist(freqs, 5)

image 95
industrial 92
good 81
view 67
patentnet 61


In [842]:
# Ten most frequent terms across the seed paper and all downloaded PDFs.
top_dist(all_freqs, 10)

image 495
model 309
part 283
hash 280
method 267
learning 262
category 261
dataset 256
shape 249
data 247
