# Scraping

### Setup and configuration

In [1]:
import os
import sys
import re
import requests
from bs4 import BeautifulSoup
import pickle
sys.path.append('../')
import confana
#from common import *

### Execute scraping (CVF website)

In [9]:
# Configuration for scraping the CVF open-access site.
# NOTE(review): the cells below read param.site, param.conf_prefix,
# param.yearFrom, param.yearTo, param.interval and param.cachedir, so the
# arguments are presumably (site URL, conference prefix, first year, last
# year, year interval, ...). The meaning of the final argument is not
# visible here -- confirm against confana.set_param.
param = confana.set_param(
    'https://openaccess.thecvf.com/',
    'CVPR',
    2014,
    2025,
    1,
    1,
)

In [None]:
# Build the merged histogram over all configured years for the CVF
# open-access site. Each year's result is cached as a pickle named
# '<param.cachedir>/summary<year>' and reused when that file already exists.
# NOTE(review): range() excludes param.yearTo itself; whether the last
# configured year is meant to be processed depends on what
# confana.set_param stores in yearTo -- confirm.
histall = {}
for year in range(param.yearFrom, param.yearTo, param.interval):
    yr = str(year)
    pklfl = param.cachedir + '/' + 'summary' + yr

    if os.path.exists(pklfl):
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(pklfl,'rb') as f:
            authorss,titles,hist,jpauthors, jptitles, numallauthors, numjpauthors, numpapers, numjppaper = pickle.load(f)
            print(f'loading {pklfl}...')
    else:
        conf = param.conf_prefix + yr
        print(conf,'...')
        pu = confana.parseUrl(param,yr)
        normal, authorss, titles = pu.parseCVF()
        if normal < 0:
            # A negative status is reported as a parse failure. Skip the year
            # instead of pickling and merging the failed result: a cached
            # failure would be loaded silently on every later run.
            print('parsing failed...')
            continue

        pAT = confana.parseATlist(titles, authorss)
        jpauthors, jptitles, numallauthors, numjpauthors, numpapers, numjppaper = pAT.selectJP()
        hist = pAT.makehist(jpauthors,yr)

        # Cache everything the later cells read back for this year.
        with open(pklfl, 'wb') as f:
            pickle.dump((authorss,titles,hist,jpauthors, jptitles, numallauthors, numjpauthors, numpapers, numjppaper), f)

    # Same statistics are reported for cached and freshly scraped years.
    print(f'\t{numjpauthors} Japanese authors out of all {numallauthors} authors')
    print(f'\t{numjppaper} Japanese related papers out of all {numpapers} papers')

    # A fresh parseATlist is built for the merge in both paths, because the
    # cached path has no instance from the scraping step.
    pAT = confana.parseATlist(titles, authorss)
    histall = pAT.mergehist(histall,hist,yr)

# Plot the histogram accumulated over all processed years.
ph = confana.parseHist(param,histall)
ph.plotsorted()

### Analysis by accepted list
- e.g. https://cvpr.thecvf.com/Conferences/2025/AcceptedPapers

In [None]:
# Configuration for the accepted-papers page (a single, year-specific URL).
param = confana.set_param(
    'https://cvpr.thecvf.com/Conferences/2025/AcceptedPapers',
    'CVPR',
    2025,
    2025,
    1,
    1,
)

# NOTE(review): yearFrom and yearTo are both passed as 2025; if set_param
# stores them unchanged, range(2025, 2025, 1) is empty and the loop body
# never runs -- confirm against confana.set_param.
histall = {}
for year in range(param.yearFrom, param.yearTo, param.interval):
    yr = str(year)
    pklfl = f'{param.cachedir}/summary{yr}'

    conf = param.conf_prefix + yr
    print(conf,'...')
    pu = confana.parseUrl(param,yr)
    url = param.site
    bs = pu.beautifulsoup(url)
    cols = bs.find_all('td')

    # Collect one (title, author list) pair per table cell that has both.
    titles = []
    authorss = []
    print(len(cols))
    for cell in cols:
        # The title is taken from the first <a>, falling back to <strong>.
        title_tag = cell.find('a')
        if title_tag is None:
            title_tag = cell.find('strong')
        if title_tag is None:
            continue
        paper_title = title_tag.get_text().strip()

        # The author line is taken from the first <i>; cells without one are skipped.
        author_tag = cell.find('i')
        if author_tag is None:
            continue
        names = re.split(' · ', author_tag.get_text().strip())
        if not names[0]:
            # Empty author line: do not record the paper.
            continue

        titles.append(paper_title)
        authorss.append(names)

    pAT = confana.parseATlist(titles, authorss)
    (jpauthors, jptitles, numallauthors,
     numjpauthors, numpapers, numjppaper) = pAT.selectJP()
    hist = pAT.makehist(jpauthors,yr)

    # Store the per-year summary in the same tuple layout the other cells load.
    summary = (authorss,titles,hist,jpauthors, jptitles, numallauthors, numjpauthors, numpapers, numjppaper)
    with open(pklfl, 'wb') as f:
        pickle.dump(summary, f)

    print(f'\t{numjpauthors} Japanese authors out of all {numallauthors} authors')
    print(f'\t{numjppaper} Japanese related papers out of all {numpapers} papers')
    pAT = confana.parseATlist(titles, authorss)
    histall = pAT.mergehist(histall,hist,yr)

# Plot the merged histogram.
ph = confana.parseHist(param,histall)
ph.plotsorted()

### Keyword trend

In [None]:
import pickle
import re

# For each year, load the cached summary written by the scraping cells and
# print every title word whose frequency ratio exceeds the threshold.
# Relies on `param` (for param.cachedir) and `confana` from earlier cells.
for year in range(2014,2026,1):
    print("----")
    print(year)
    summary_path = param.cachedir + '/' + 'summary' + str(year)
    try:
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(summary_path, 'rb') as f:
            authorss,titles,hist,jpauthors, jptitles, numallauthors, numjpauthors, numpapers, numjppaper = pickle.load(f)
    except FileNotFoundError:
        # A year that was never scraped should not abort the remaining years.
        print(f'\tno cached summary found at {summary_path}; skipping')
        continue

    numpaper = len(titles)
    if numpaper == 0:
        # Avoid dividing by zero below when the cached title list is empty.
        print('\tno titles in cached summary; skipping')
        continue

    pt = confana.parseTitle(titles)
    stwordhist = pt.wordfreq()

    # assumes each item is a (word, count) pair -- TODO confirm against
    # confana.parseTitle.wordfreq
    for item in stwordhist:
        ratio = item[1]/numpaper
        # Only report entries whose ratio to the paper count exceeds 0.005.
        if ratio > 0.005:
            print("{0:}\t{1:.5f}".format(item[0],ratio))
