In [136]:
# conda install pytorch torchvision torchaudio -c pytorch
# pip install transformers
# pip install nltk
# pip install numpy
# pip install pandas
# pip install wordcloud
# pip install textract
# conda install matplotlib
# pip install scholarly

In [1]:
import textract
from transformers import pipeline
import nltk
import numpy as np
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from wordcloud import WordCloud
from nltk.corpus import wordnet as wn

import urllib, urllib.request, urllib.parse
import xml.etree.ElementTree as ET
import re
import os.path

from scholarly import scholarly

In [2]:
# Preparatory steps
# NLTK resources this notebook relies on: the English stop-word list read just
# below, and the WordNet corpus that WordNetLemmatizer (used by get_lemma)
# loads on first use. Without 'wordnet' a fresh environment raises LookupError
# the first time a word is lemmatised. nltk.download is a no-op when the
# package is already up to date.
nltk.download('stopwords')
nltk.download('wordnet')
full_stop_words = set(nltk.corpus.stopwords.words('english'))

# Single letters that should never count as words.
full_stop_words.update('b c d e f g h j k l m n p q r u v x w y z'.split())

# Citation boilerplate and short fragments (presumably artefacts of PDF text
# extraction / lemmatisation) that should be ignored as well.
full_stop_words.update(['pp', 'et', 'al', 'ha', 'li', 'sij', 'arxiv'])

[nltk_data] Downloading package stopwords to /Users/liam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
# Common functions

def re_matches(text, regex):
    """Return every non-overlapping match of `regex` in `text` as a new list.

    The items follow `re.findall` semantics: the whole match when the pattern
    has no capture group, the group's text when it has exactly one, and a
    tuple of group texts when it has several.
    """
    return list(re.findall(regex, text))

def gen_refs(text):
    """Extract numbered references of the form "[12] A. Author, ...".

    Each returned string is the captured text after the bracketed number: it
    starts at a capital initial followed by a period and runs (possibly over
    several lines) up to the last newline before the next '[' character.
    """
    numbered_initial_first = r'\[\d*\]\ ([A-Z]\.[^\[]*)\n'
    return re_matches(text, numbered_initial_first)
    
def gen_refs_no_number(text):
    """Extract references using the same pattern as `gen_refs`.

    NOTE(review): despite the name, the pattern still requires a leading
    "[<digits>] " marker and is identical to the one in `gen_refs` - confirm
    whether a number-less variant was intended here.
    """
    bracketed_reference = r'\[\d*\]\ ([A-Z]\.[^\[]*)\n'
    return re_matches(text, bracketed_reference)

def gen_refs_end_year(text):
    """Extract un-numbered references that begin "Surname, X." and end in a year.

    A match starts at a capitalised surname, a comma and an initial, spans one
    to four line breaks, and finishes with ", YYYY." (optionally "YYYYa." or
    "YYYYb."). The pattern has no capture group, so whole matches are returned
    with their newlines intact.
    """
    surname_to_year = r'[A-Z][A-Za-z\-]+\,\ [A-Z]\.(?:.*\n.*){1,4}\,\ \d\d\d\d[a-b]?\.'
    return re_matches(text, surname_to_year)

def gen_refs_end_year_with_text_brackets(text):
    """Extract references labelled with a textual key such as "[Smith et al., 2017]".

    The bracket must hold letters/spaces, an optional period, ", " and digits.
    The reference body then spans one to four line breaks and ends with a
    four-digit year (optionally suffixed a/b) and a period. Whole matches are
    returned, label included.
    """
    text_label_to_year = r'\[(?:[A-Za-z\ ]*\.?\,\ \d+)\]\ (?:.*\n.*){1,4}\d\d\d\d[a-b]?\.'
    return re_matches(text, text_label_to_year)

def gen_refs_end_year_with_number_brackets(text):
    """Extract references labelled "[12] " that end in a year.

    After the numeric label the reference spans one to four line breaks and
    ends with a four-digit year (optionally suffixed a/b) and a period. Whole
    matches are returned, label included.
    """
    number_label_to_year = r'\[(?:\d+)\]\ (?:.*\n.*){1,4}\d\d\d\d[a-b]?\.'
    return re_matches(text, number_label_to_year)
    
def gen_refs_end_pages(text):
    """Extract "[12] ..." references that end with a page marker "pp. <n>. ".

    Newlines are turned into spaces first, so each reference is matched on a
    single logical line. A match may not contain another '[' between its label
    and the page marker. Whole matches are returned.
    """
    single_line = ' '.join(text.split('\n'))
    label_to_pages = r'\[\d*\]\ [^[]*pp\.\ \d+\.\ '
    return re_matches(single_line, label_to_pages)

def get_lemma(word):
    """Return the WordNet lemma of `word`, using the lemmatizer's default part of speech."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

def remove_set_from_dict(s, d):
    """Delete from `d` every key that appears in `s`.

    `d` is modified in place and also returned; members of `s` that are not
    keys of `d` are ignored.
    """
    for unwanted_key in s:
        d.pop(unwanted_key, None)
    return d
    
def word_frequencies(text):
    """Count lemmatised, non-stop-word tokens in `text`.

    Tokenising, lower-casing, lemmatising and stop-word filtering are exactly
    the steps `word_frequencies_dist` performs, so this delegates to it with a
    fresh dictionary instead of repeating the loop.

    Returns:
        dict mapping each lemma to its number of occurrences in `text`.
    """
    return word_frequencies_dist(text, {})

def word_frequencies_dist(text, frequencies):
    """Add the word counts of `text` to an existing frequency table.

    Tokens are runs of letters/digits that start with a letter. Each token is
    lower-cased and lemmatised; lemmas found in `full_stop_words` are skipped.
    `frequencies` is updated in place and returned.
    """
    for token in re.findall('[A-Za-z][A-Za-z0-9]*', text):
        lemma = get_lemma(token.lower())
        if lemma in full_stop_words:
            continue
        frequencies[lemma] = frequencies.get(lemma, 0) + 1
    return frequencies



def wc(freqs, file_name):
    """Render a word cloud from a frequency table, save it and return the image.

    Args:
        freqs: mapping of word -> count, passed to
            `WordCloud.generate_from_frequencies`.
        file_name: label embedded in the output path
            `data/word-cloud-<file_name>.png`.

    Returns:
        The image produced by `WordCloud.to_image()`. Returning it (the
        original discarded it) lets a notebook cell that ends with `wc(...)`
        display the cloud inline.
    """
    wordcloud = WordCloud(background_color="white", width=600, height=600, max_words=5000, contour_width=3, contour_color='steelblue')
    wordcloud.generate_from_frequencies(freqs)
    wordcloud.to_file(f'data/word-cloud-{file_name}.png')
    return wordcloud.to_image()


def parse_ref_default(ref, source_file, source_title):
    """Split a comma-separated reference string into bibliographic fields.

    The reference is cut at every comma. Leading segments that contain a '.'
    are collected as authors; the first segment without one becomes the title
    and, if the next segment also has no '.', that becomes the journal. What
    remains is interpreted by how many segments follow the authors:

    * 2 trailing segments: the last one is parsed as "<volume>(<year>)<pages>";
    * 3 trailing segments: the last two are year and pages;
    * 4 trailing segments: the last three are volume (all digits) or
      publisher, then year, then pages.

    Only authors, title and journal are stripped of surrounding whitespace.

    Returns:
        A one-row DataFrame with columns authors, title, journal, publisher,
        volume, year, pages, full_ref, source_file and source_title.
    """
    segments = ref.split(',')
    last = len(segments) - 1

    authors = []
    title = journal = publisher = volume = year = pages = ''
    finished_authors = False
    author_count = 0

    for position, segment in enumerate(segments):
        has_dot = '.' in segment
        from_end = last - position        # 0 for the final segment
        trailing = last - author_count    # segments that follow the authors

        if has_dot and not finished_authors:
            authors.append(segment.strip())
            author_count += 1
        elif not has_dot and position < author_count + 2:
            if finished_authors:
                journal = segment.strip()
            else:
                finished_authors = True
                title = segment.strip()
        elif (from_end, trailing) == (0, 2):
            match = re.search(r'(.*)\((\d\d\d\d)\)(.*)', segment)
            if match is not None:
                volume, year, pages = match.groups()
        elif trailing == 3:
            # For when year and pages are comma-separated
            if from_end == 1:
                year = segment
            elif from_end == 0:
                pages = segment
        elif trailing == 4:
            # For when volume / publisher, year and pages are comma-separated
            if from_end == 2:
                if segment.isdigit():
                    volume = segment
                else:
                    publisher = segment
            elif from_end == 1:
                year = segment
            elif from_end == 0:
                pages = segment

    row = [', '.join(authors), title, journal, publisher, volume, year, pages,
           ref, source_file, source_title]
    return pd.DataFrame(
        [row],
        columns=['authors', 'title', 'journal', 'publisher', 'volume', 'year',
                 'pages', 'full_ref', 'source_file', 'source_title'])

def parse_ref_1(ref, source_file, source_title):
    """Placeholder parser for "Surname, X. ... YYYY." style references.

    No field extraction is implemented yet: every bibliographic column is an
    empty string and only `full_ref`, `source_file` and `source_title` are
    filled in. The unused locals of the original stub were dropped.

    TODO: implement the parsing. The original stub prepared (but never used)
    the author pattern r'[A-Z][a-z\\-]+\\, [A-Z\\.]+'.

    Returns:
        A one-row DataFrame with the same columns as `parse_ref_default`.
    """
    row = ['', '', '', '', '', '', '', ref, source_file, source_title]
    return pd.DataFrame(
        [row],
        columns=['authors', 'title', 'journal', 'publisher', 'volume', 'year',
                 'pages', 'full_ref', 'source_file', 'source_title'])

def parse_ref(ref, ref_type, source_file, source_title):
    """Parse one reference with the parser that matches `ref_type`.

    `ref_type` 1 selects `parse_ref_1`; every other value (including 0) uses
    `parse_ref_default`. The type is echoed to stdout before parsing.
    """
    print(ref_type)
    parser = parse_ref_1 if ref_type == 1 else parse_ref_default
    return parser(ref, source_file, source_title)

def arxiv_results(title):
    """Query the arXiv export API for `title` and return the raw Atom XML.

    The title is URL-encoded and searched across all fields (`all:`); at most
    one result is requested.

    Args:
        title: free-text title to search for.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: propagated from `urlopen` on network or HTTP
            failures.
    """
    query = urllib.parse.quote_plus(title)
    url = 'http://export.arxiv.org/api/query?search_query=all:' + query + '&start=0&max_results=1'
    # Close the HTTP response deterministically instead of leaving the
    # connection to the garbage collector.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')

def make_up_file_name(file_name):
    """Return the download path for `file_name`: 'refs/download/<file_name>.pdf'."""
    return ''.join(['refs/download/', file_name, '.pdf'])

def extract_and_save_pdf_from_atom(atom_xml, title, title_no_colon, title_stem, pass_through=False):
    """Download the PDF of every Atom entry whose title starts with `title_stem`.

    Args:
        atom_xml: Atom feed as a string (e.g. the result of `arxiv_results`).
        title: reference title; used for the log line and, through
            `make_up_file_name`, for the local file name.
        title_no_colon: accepted for interface compatibility; not used.
        title_stem: case-insensitive prefix the entry title must start with.
        pass_through: when True, matching links are only printed and nothing
            is downloaded.

    Entries without a title element (or with empty title text) and links
    without an `href` are skipped; the original raised on them.
    """
    atom = '{http://www.w3.org/2005/Atom}'
    wanted_prefix = title_stem.lower()

    root = ET.fromstring(atom_xml)
    for entry in root.findall(atom + 'entry'):
        title_node = entry.find(atom + 'title')
        entry_title = title_node.text if title_node is not None else None
        if entry_title is None or not entry_title.lower().startswith(wanted_prefix):
            continue
        for link in entry.findall(atom + 'link'):
            # Title must match and link must be a pdf
            if link.attrib.get('type') != 'application/pdf':
                continue
            u = link.attrib.get('href')
            if u is None:
                continue
            print(u, title)
            if not pass_through:
                urllib.request.urlretrieve(u, make_up_file_name(title))

def test_element(atom_xml):
    root = ET.fromstring(atom_xml)
    entry = root.find('{http://www.w3.org/2005/Atom}entry')
    if entry is not None:
        return True
    else:
        return False

def top_dist(freqs, n):
    """Print the `n` highest-count words of `freqs`, one "word count" pair per line.

    Words are listed by descending count. Printing stops after the n-th line;
    if `n` is never reached (for example when it exceeds the number of words)
    every word is printed.
    """
    ranked = sorted(freqs, key = freqs.get, reverse = True)
    for rank, word in enumerate(ranked, start = 1):
        print(word, freqs[word])
        if rank == n:
            break

In [6]:
# Seed paper: extract its text, count word frequencies and parse its reference list.
file_name = '2106.12139.pdf'
title = 'PatentNet: A Large-Scale Incomplete Multiview, Multimodal, Multilabel Industrial Goods Image Database'


text = textract.process("refs/" + file_name).decode('utf-8')
freqs = word_frequencies(text)

refs = gen_refs(text)

# parse_ref takes (ref, ref_type, source_file, source_title). References found
# by gen_refs are type 0 (the default "[n] A. Author, ..." layout), so the
# type is passed explicitly; the earlier three-argument call raises TypeError.
parsed_refs = [parse_ref(r.replace('\n', ' '), 0, file_name, title) for r in refs]

# DataFrame.append was removed in pandas 2.0; build the table with a single
# concat instead of growing it row by row.
if parsed_refs:
    references = pd.concat(parsed_refs, ignore_index=True)
else:
    references = pd.DataFrame(columns = ['authors', 'title', 'journal', 'publisher', 'volume', 'year', 'pages', 'full_ref', 'source_file', 'source_title'])

references.to_csv('data/references.csv')



In [59]:
# Generate the word cloud from the seed file's word frequencies; wc() writes
# it to data/word-cloud-<file_name>.png.
wc(freqs, file_name)



In [60]:
# Test code

# t = references['title'].iloc[7]
# print(t)
# r = arxiv_results(t.strip())
# print(r)
# # ET.tostring(r)

In [56]:
# Attempt to locate references on Arxiv, and save
# The loop variable is `ref_title` so that the seed paper's `title` is not
# overwritten by this cell.
counter = 0
for counter, ref_title in enumerate(references['title'], start=1):
    # Nothing to search for when the parser could not recover a title.
    if not isinstance(ref_title, str) or not ref_title.strip():
        continue

    f = make_up_file_name(ref_title)
    if os.path.isfile(f):
        # Already downloaded on a previous run.
        continue

    # title_stem: everything before the first colon.
    # title_no_colon: the title with that first colon removed.
    title_stem, _, after_colon = ref_title.partition(':')
    title_no_colon = title_stem + after_colon

    # Query with the full title first; then with the colon removed (it seems
    # to confuse the Arxiv API); then with only the part before the colon.
    # Variants identical to an earlier query are skipped so that titles
    # without a colon cost one request instead of three.
    queries = [ref_title]
    for variant in (title_no_colon, title_stem):
        if variant and variant not in queries:
            queries.append(variant)

    has_entry = False
    for query in queries:
        r = arxiv_results(query)
        has_entry = test_element(r)
        if has_entry:
            break

    print(counter, ref_title)
    if has_entry:
        extract_and_save_pdf_from_atom(r, ref_title, title_no_colon, title_stem, False)



1 Deep learning for visual understanding: A review
2 Deep convolutional neural networks for image classification: A comprehensive review
4 Microsoft coco: Common objects in context
5 Deepfashion: Powering robust clothes recognition and retrieval with rich annotations
http://arxiv.org/pdf/1901.07973v1 Deepfashion: Powering robust clothes recognition and retrieval with rich annotations


KeyboardInterrupt: 

In [179]:
def gen_refs_multiple_pass(text):
    """Try each reference extractor in turn and return the first non-empty result.

    Returns:
        A `(refs, ref_type)` tuple where `ref_type` is the position of the
        extractor that produced `refs`:

        0 - gen_refs
        1 - gen_refs_end_year
        2 - gen_refs_end_year_with_text_brackets
        3 - gen_refs_end_year_with_number_brackets
        4 - gen_refs_end_pages

        When no extractor finds anything, the (empty) result of the last one
        is returned with type 4.
    """
    extractors = (
        gen_refs,
        gen_refs_end_year,
        gen_refs_end_year_with_text_brackets,
        gen_refs_end_year_with_number_brackets,
        gen_refs_end_pages,
    )
    for ref_type, extract in enumerate(extractors):
        refs = extract(text)
        if len(refs) != 0:
            break
    return refs, ref_type



import os
d = './refs/download'
all_freqs = {**freqs}
all_references = references.copy()

# Compiled once here rather than once per file.
# whole_pattern groups: (1) every author but the last, (2) the last author,
# (3) the title (text up to the next period), (4) venue / journal text,
# (5) the four-digit year.
whole_pattern = re.compile(r'((?:[A-Z][A-Za-zá\-\ ]+\,\ (?:\-?[A-Z]\.\ ?)+(?:\,\ )?)*)(?:\,?\ and\ )?([A-Z][A-Za-zá\-\ ]+\,\ (?:\-?[A-Z]\.\ ?)+)([^\.]*)\.\ (.*)\,\ (\d{4})')
# Matches a single "Surname, X. Y." author inside group 1.
author_subpatten = re.compile(r'[A-Z][A-Za-zá\-\ ]+\,\ (?:\-?[A-Z]\.\ ?)+')

for file in os.listdir(d):
    # Currently restricted to a single paper while the parsing is developed.
    if file.endswith("Fashion-gen: The generative fashion dataset and challenge.pdf"):
    # if file.endswith(".pdf"):
        print(os.path.join(d, file))
        f = os.path.join(d, file)
        try:
            text_local = textract.process(f).decode('utf-8')
            all_freqs = word_frequencies_dist(text_local, all_freqs)
            refs_local, ref_type = gen_refs_multiple_pass(text_local)

            for r in refs_local:
                r = r.replace('\n', ' ')
                print(r)
                # Example of the reference layout being parsed:
                # Han, X., Wu, Z., Wu, Z., Yu, R., and Davis, L. S. Viton: An image-based virtual try-on network. arXiv preprint arXiv:1711.08447, 2017.
                for first_authors, last_author, ref_title, ref_journal, ref_year in whole_pattern.findall(r):
                    # findall on an empty group 1 yields no authors, so no
                    # separate empty-string check is needed.
                    all_authors = author_subpatten.findall(first_authors)
                    all_authors.append(last_author.strip())
                    print(all_authors)
                    print(ref_title)
                    print(ref_journal)
                    print(ref_year)

                # all_references = all_references.append(parse_ref(r, ref_type, f, ''))

        except Exception as exc:
            # Best effort: report the file and the actual error, then carry on
            # with the next file. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt able to stop the loop.
            print('Failed to process', f, '-', repr(exc))

    

./refs/download/Fashion-gen: The generative fashion dataset and challenge.pdf
Han, X., Wu, Z., Wu, Z., Yu, R., and Davis, L. S. Viton: An image-based virtual try-on network. arXiv preprint arXiv:1711.08447, 2017.
['Han, X.', 'Wu, Z.', 'Wu, Z.', 'Yu, R.', 'Davis, L. S.']
Viton: An image-based virtual try-on network
arXiv preprint arXiv:1711.08447
2017
Huang, X., Li, Y., Poursaeed, O., Hopcroft, J., and Belongie, S. Stacked generative adversarial networks. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR), volume 2, pp. 4, 2017.
['Huang, X.', 'Li, Y.', 'Poursaeed, O.', 'Hopcroft, J.', 'Belongie, S.']
Stacked generative adversarial networks
In IEEE Conference on Computer Vision and Pattern Recognition (CVPR), volume 2, pp. 4
2017
Isola, P., Zhu, J.-Y., Zhou, T., and Efros, A. A. Image-toimage translation with conditional adversarial networks. arXiv preprint, 2017.
['Isola, P.', 'Zhu, J.-Y.', 'Zhou, T.', 'Efros, A. A.']
Image-toimage translation with conditional adversar

In [48]:
# Scratch cell: dump the extracted text of one downloaded paper and try a
# candidate reference regex against it.
# f = './refs/download/Shapenet: An information-rich 3d model repository.pdf'
f = './refs/download/Fashion-gen: The generative fashion dataset and challenge.pdf'
# f = './refs/download/Doubly Aligned Incomplete Multi-view Clustering.pdf'
# f = './refs/download/Shapenet: An information-rich 3d model repository.pdf'

t = textract.process(f).decode('utf-8')
print(t)

# Earlier candidate patterns, kept for reference:
# t = t.replace('\n', ' ')
# regex = r'\[\d*\]\ [^\n]*\n[^\n]*\d\d\d\d\.'
# regex = r'[A-Z][A-Za-z\-]+\,(.*\n.*){0,5}\,\ \d\d\d\d\.'
# regex = r'[A-Z][A-Za-z\-]+\,\ [A-Z]\.(?:.*\n.*){1,4}\,\ \d\d\d\d[a-b]?\.'
# regex = r'\ [A-Z][A-Za-z\,\.\ \-\n]+[A-Z]\.\ [A-Z](?:.*\n.*){0,5}\,\ \d\d\d\d\.'
# regex = r'[A-Z][A-Za-z\,\.\ ]+[A-Z]\.\ [A-Z](.\n.)*\1{5}.*\d\d\d\d\.\ '
# regex = r'\[(?:[A-Za-z\ \.]*\,\ \d+)\]\ (?:.*\n.*){1,4}\d\d\d\d[a-b]?\.'
regex = r'\[(?:\d+)\]\ (?:.*\n.*){1,4}\d\d\d\d[a-b]?\.'
pattern = re.compile(regex)
iterator = pattern.findall(t)

print(len(iterator))
refs = list(iterator)
print(refs)

ShapeNet: An Information-Rich 3D Model Repository
http://www.shapenet.org
1

Angel X. Chang , Thomas Funkhouser2 , Leonidas Guibas1 , Pat Hanrahan1 , Qixing Huang3 , Zimo Li3 ,
Silvio Savarese1 , Manolis Savva∗1 , Shuran Song2 , Hao Su∗1 , Jianxiong Xiao2 , Li Yi1 , and Fisher Yu2

arXiv:1512.03012v1 [cs.GR] 9 Dec 2015

1

Stanford University — 2 Princeton University — 3 Toyota Technological Institute at Chicago
Authors listed alphabetically

Abstract

tial scans is a research goal shared by computer graphics
and vision. Scene understanding from 2D images is a grand
challenge in vision that has recently benefited tremendously
from 3D CAD models [28, 34]. Navigation of autonomous
robots and planning of grasping manipulations are two large
areas in robotics that benefit from an understanding of 3D
shapes. At the root of all these research problems lies
the need for attaching semantics to representations of 3D
shapes, and doing so at large scale.
Recently, data-driven methods from the mac

In [52]:
# Word cloud over the combined frequency table (seed paper plus the processed
# downloads); wc() writes it to data/word-cloud-all_freqs.png.
wc(all_freqs, 'all_freqs')



In [26]:
# Sample multi-line reference used to check the "Surname, X. ... , YYYY." pattern.
s = ("Liu, Z., Luo, P., Wang, X., and Tang, X. Deep learning\n"
     "face attributes in the wild. In Proceedings of the IEEE\n"
     "International Conference on Computer Vision, pp. 3730–\n"
     "3738, 2015.")

# Earlier attempts:
# \1{5}.*\d\d\d\d\.
# regex = r'[A-Z][A-Za-z\,\.\ ]+[A-Z]\.\ [A-Z](?:.*\n.*){0,5}\,\ \d\d\d\d\.'
regex = r'[A-Z][A-Za-z\-]+\,\ [A-Z]\.(?:.*\n.*){0,5}\,\ \d\d\d\d\.'
pattern = re.compile(regex)
iterator = pattern.findall(s)
print(iterator)


['Liu, Z., Luo, P., Wang, X., and Tang, X. Deep learning\nface attributes in the wild. In Proceedings of the IEEE\nInternational Conference on Computer Vision, pp. 3730–\n3738, 2015.']


In [121]:
# Five most frequent lemmas in the seed paper.
top_dist(freqs, 5)

image 95
industrial 92
good 81
view 67
patentnet 61


In [58]:
# Up to 1000 most frequent lemmas in the combined frequency table.
top_dist(all_freqs, 1000)

image 495
model 309
part 283
hash 280
method 267
learning 262
category 261
dataset 256
shape 249
data 247
network 240
view 207
object 190
deep 187
segmentation 185
figure 181
annotation 177
center 162
training 154
different 153
code 149
using 146
retrieval 135
set 131
result 130
similarity 129
hashnet 129
scale 127
large 126
function 126
table 126
level 123
imagenet 122
fashion 122
multi 122
accuracy 119
instance 118
loss 116
label 115
recognition 114
information 110
datasets 110
number 109
bit 109
expression 107
ieee 105
triplet 105
use 103
fine 101
two 100
depth 100
based 99
three 99
visual 98
semantic 98
class 97
computer 97
efficientnet 97
performance 96
vision 96
grained 95
scaling 95
hashing 95
hierarchical 94
page 94
feature 93
also 93
point 93
embedding 93
industrial 92
good 92
similar 92
one 92
matrix 91
item 89
work 88
conference 88
neural 87
space 86
example 85
algorithm 85
classification 84
show 83
proposed 82
incomplete 80
top 79
used 78
clustering 78
first 76
shapenet 76


In [53]:
# Persist the combined reference table.
# NOTE(review): all_references starts as a copy of `references`, and the line
# that would add parsed references from the downloads is commented out, so
# this file currently appears to hold the seed paper's references only.
all_references.to_csv('data/all_references.csv')