# Scraping

### Setup and configuration

In [1]:
import os
import sys
import re
import requests
from bs4 import BeautifulSoup
import bibtexparser
import pickle
sys.path.append('../')
import confana as common
#from common import *

### Execute scraping

In [None]:
# Configuration for the scrape: target site, conference prefix and year range.
# NOTE(review): the meaning of the positional arguments is inferred from how
# `param` is used below (site, conf_prefix, yearFrom, yearTo, interval) --
# confirm against common.set_param.
param = common.set_param('https://openaccess.thecvf.com/', 'CVPR', 2014, 2023, 1, 1)

# Histogram accumulated over all processed years; plotted after the loop.
histall = {}
# NOTE(review): range() excludes its stop value. If set_param stores yearTo
# unchanged, 2023 is never processed here even though later cells read
# 'summary2023' -- verify.
for year in range(param.yearFrom, param.yearTo, param.interval):
    yr = str(year)
    pklfl = 'summary' + yr  # per-year cache file in the working directory

    if os.path.exists(pklfl):
        # Reuse the cached result instead of scraping the site again.
        # NOTE: pickle must only be used on cache files written by this
        # notebook; never load a 'summary*' file from an untrusted source.
        with open(pklfl, 'rb') as f:
            (authorss, titles, hist, jpauthors, jptitles,
             numallauthors, numjpauthors, numpapers, numjppaper) = pickle.load(f)
        print(f'loading {pklfl}...')
    else:
        conf = param.conf_prefix + yr
        print(conf, '...')
        pu = common.parseUrl(param.site, conf)
        normal, authorss, titles = pu.parseCVF()
        parse_failed = normal < 0
        if parse_failed:
            print('parsing failed...')

        pAT = common.parseATlist(titles, authorss)
        (jpauthors, jptitles, numallauthors,
         numjpauthors, numpapers, numjppaper) = pAT.selectJP()
        hist = pAT.makehist(jpauthors, yr)

        # Only cache successful parses. Caching a failed parse would make
        # every later run load the bad result instead of retrying the scrape.
        if not parse_failed:
            with open(pklfl, 'wb') as f:
                pickle.dump(
                    (authorss, titles, hist, jpauthors, jptitles,
                     numallauthors, numjpauthors, numpapers, numjppaper),
                    f,
                )

    print(f'\t{numjpauthors} Japanese authors out of all {numallauthors} authors')
    print(f'\t{numjppaper} Japanese related papers out of all {numpapers} papers')

    # Fold this year's histogram into the running total.
    pAT = common.parseATlist(titles, authorss)
    histall = pAT.mergehist(histall, hist, yr)

ph = common.parseHist(param, histall)
ph.plotsorted()

### Analysis of trends by titles

In [None]:
import pickle
import re

from collections import Counter

# Year whose cached summary (written by the scraping cell) is analysed.
year = 2023

# Raw strings keep "\s" as a regex token; in a normal string literal "\s" is
# an invalid escape sequence that Python warns about.
# seplist: where a title is split -- at ": " and at common function words
# (articles, prepositions, conjunctions, forms of "be") delimited by
# whitespace or by the start/end of the title.
seplist = re.compile(
    r":\s|(?:\s|^)(?:a|an|the|in|on|to|of|with|without|by|from|for|via|toward|towards|beyond|using|and|or|as|is|am|are)(?:\s|$)",
    re.IGNORECASE,
)
# replist: what is stripped from each fragment -- leftover a/an/the/in, a
# trailing "s", a fragment that is a single whitespace character, and a
# fragment that is just "in".
replist = re.compile(
    r"^(?:a|an|the|in)\s|\s(?:a|an|the|in)\s|s$|^\s$|^in$",
    re.IGNORECASE,
)

# NOTE: pickle must only be used on cache files written by this notebook.
with open('summary' + str(year), 'rb') as f:
    (authorss, titles, hist, jpauthors, jptitles,
     numallauthors, numjpauthors, numpapers, numjppaper) = pickle.load(f)

# Count every non-empty cleaned fragment over all titles.
wordhist = Counter()
for title in titles:
    for item in seplist.split(title):
        word = replist.sub("", item)
        if word:
            wordhist[word] += 1

# (fragment, count) pairs, most frequent first; ties keep first-seen order.
stwordhist = wordhist.most_common()
for word, count in stwordhist:
    # Only report fragments that occur more than twice.
    if count > 2:
        print(f"{word:>50}: {count:>5}")


### NER

In [None]:
import pickle
import re
from rake_nltk import Rake

from collections import Counter

# Configuration
year = 2014
# Raw strings keep "\s" as a regex token; in a normal string literal "\s" is
# an invalid escape sequence that Python warns about.
# These two patterns are not applied in this cell: the RAKE phrases are
# counted exactly as returned.
seplist = re.compile(
    r":\s|(?:\s|^)(?:a|an|the|in|on|to|of|with|without|by|from|for|via|toward|towards|beyond|using|and|or|as|is|am|are)(?:\s|$)",
    re.IGNORECASE,
)
replist = re.compile(
    r"^(?:a|an|the|in)\s|\s(?:a|an|the|in)\s|s$|^\s$|^in$",
    re.IGNORECASE,
)

# Uses stopwords for english from NLTK, and all punctuation characters by default
r = Rake()

# Load the cached summary for the selected year.
# NOTE: pickle must only be used on cache files written by this notebook.
with open('summary' + str(year), 'rb') as f:
    (authorss, titles, hist, jpauthors, jptitles,
     numallauthors, numjpauthors, numpapers, numjppaper) = pickle.load(f)

# Count how often each extracted key phrase occurs over all titles.
wordhist = Counter()
for title in titles:
    r.extract_keywords_from_text(title)
    words = r.get_ranked_phrases()
    wordhist.update(word for word in words if word != "")

# (phrase, count) pairs, most frequent first; ties keep first-seen order.
stwordhist = wordhist.most_common()
for word, count in stwordhist:
    # Only report phrases that occur more than twice.
    if count > 2:
        print(f"{word:>50}: {count:>5}")


In [None]:
import pickle
import re

# A word is printed only when (count / number of titles) exceeds this ratio.
min_ratio = 0.005

# For each year, print the words of the titles together with their frequency
# relative to the number of titles in that year.
for year in range(2014, 2024):
    print("----")
    print(year)
    # NOTE: pickle must only be used on cache files written by this notebook.
    with open('summary' + str(year), 'rb') as f:
        (authorss, titles, hist, jpauthors, jptitles,
         numallauthors, numjpauthors, numpapers, numjppaper) = pickle.load(f)

    numpaper = len(titles)
    if numpaper == 0:
        # Without titles the ratio below is undefined (division by zero),
        # so report the empty year and move on.
        print("no titles found; skipped")
        continue

    pt = common.parseTitle(titles)
    # NOTE(review): wordfreq() is assumed to return (word, count) pairs,
    # based on how the items are indexed below -- confirm in common.parseTitle.
    stwordhist = pt.wordfreq()

    for item in stwordhist:
        ratio = item[1] / numpaper
        if ratio > min_ratio:
            print(f"{item[0]}\t{ratio:.5f}")
