In [235]:
import pandas as pd
import numpy as np

import re
import urllib.request
from bs4 import BeautifulSoup

# Sub-categories to scrape: display label -> value substituted into the
# `vq` query parameter of the top-venues URL. Insertion order matters,
# because the per-category flag columns are created in this order.
sub_categories = dict([
    ('Artificial intelligence', 'eng_artificialintelligence'),
    ('Data Mining & Analysis', 'eng_datamininganalysis'),
    ('Databases & Information Systems', 'eng_databasesinformationsystems'),
    ('Computer Vision & Pattern Recognition', 'eng_computervisionpatternrecognition'),
    ('Computational Linguistics', 'eng_computationallinguistics'),
    ('Multimedia', 'eng_multimedia'),
    ('Signal Processing', 'eng_signalprocessing'),
])
# Display labels only, in the same order as the mapping above.
sub_categories_list = list(sub_categories)

In [236]:
def isconf(x):
    """Flag venue names that look like a conference.

    Parameters
    ----------
    x : str
        Venue name to inspect.

    Returns
    -------
    int
        1 if ``x`` contains 'Conference' or 'Meeting' (case-sensitive),
        otherwise 0.
    """
    keywords = ('Conference', 'Meeting')
    return int(any(word in x for word in keywords))

def _fetch_category_conferences(key, q):
    """Scrape one top-venues page and return only its conference rows.

    Parameters
    ----------
    key : str
        Display label of the sub-category; used as the name of a flag
        column that is set to 1 on every returned row.
    q : str
        Value substituted into the ``vq`` query parameter of the URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``h5_mean``, ``h5_median``, ``is_conference`` and
        ``key``, restricted to rows for which ``isconf`` returned 1.

    Raises
    ------
    RuntimeError
        If the page has no ``gsc_mp_content`` element.
    ValueError
        If fewer metric cells than venue names were found.
    """
    url = 'https://scholar.google.co.jp/citations?view_op=top_venues&hl=ja&vq=%s'%q
    # Close the HTTP response deterministically instead of leaving it open.
    with urllib.request.urlopen(url) as response:
        html = response.read()

    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find('div', id="gsc_mp_content")
    if content is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError('No "gsc_mp_content" element found at %s' % url)

    names = [cell.text for cell in content.find_all('td', class_="gsc_mvt_t")]
    # NOTE(review): every row of the saved output has h5_mean > h5_median,
    # so these two labels may not match the metrics the page actually shows.
    # The column names are kept because later cells and the CSV use them.
    # TODO confirm which metric each selector yields.
    h5_mean = [int(cell.text) for cell in content.find_all('span', class_="gs_ibl gsc_mp_anchor")]
    h5_median = [int(cell.text) for cell in content.find_all('a', class_="gs_ibl gsc_mp_anchor")]
    if len(h5_mean) < len(names) or len(h5_median) < len(names):
        raise ValueError(
            'Found %d venue names but only %d/%d metric cells for %r'
            % (len(names), len(h5_mean), len(h5_median), key)
        )

    # zip stops at the shortest input, which the guard above makes `names`.
    venues = pd.DataFrame(
        list(zip(names, h5_mean, h5_median)),
        columns=['name', 'h5_mean', 'h5_median'],
    )
    venues['is_conference'] = venues['name'].apply(isconf)
    venues[key] = 1  # membership flag for this sub-category
    return venues[venues['is_conference'] == 1]


# Scrape every sub-category, then concatenate once; growing a frame inside
# the loop re-copies all earlier rows on every iteration.
category_frames = [
    _fetch_category_conferences(key, q) for key, q in sub_categories.items()
]
if category_frames:
    # sort=False keeps columns in first-appearance order:
    # name, h5_mean, h5_median, is_conference, then one flag per category.
    df = pd.concat(category_frames, sort=False)
else:
    df = pd.DataFrame(columns=['name', 'h5_mean', 'h5_median'])

In [237]:
# Rows scraped for one sub-category have no value in the flag columns of the
# other sub-categories after concatenation; replace every missing value
# in the frame with 0.
df = df.fillna(value=0)
# Display (rows, columns) as a quick sanity check.
df.shape

(56, 11)

In [238]:
# Collapse venues that were listed under several sub-categories into one row:
# the category flag columns are summed, every other column takes the value
# from the venue's first occurrence. Grouping by the column itself avoids
# building a query string from the venue name (which breaks on names that
# contain a double quote) and avoids relying on the positions of columns.
original_columns = list(df.columns)
aggregations = {
    col: ('sum' if col in sub_categories_list else 'first')
    for col in original_columns
    if col != 'name'
}
df = (
    df.groupby('name', sort=False, as_index=False)  # sort=False keeps first-seen order
      .agg(aggregations)
      [original_columns]                            # restore the original column order
)

In [239]:
def get_short_kakko(x):
    """Extract the text inside the first pair of parentheses in ``x``.

    Parameters
    ----------
    x : str
        Text to search, e.g. a venue name with its abbreviation in parentheses.

    Returns
    -------
    str or None
        The matched text without the enclosing parentheses, or ``None`` when
        the pattern does not match.
    """
    match = re.search(r'\(.+?\)', x)
    # Slice off the leading "(" and trailing ")" of the matched span.
    return match.group()[1:-1] if match is not None else None

In [240]:
# Derive each venue's abbreviation from the parenthesised part of its name.
df["shortname"] = df["name"].apply(get_short_kakko)
# Move "shortname" to the front. Selecting the remaining columns by name
# (rather than slicing off the last one) keeps this cell safe to re-run:
# on a second run "shortname" is already first and no other column is lost.
df = df.loc[:, ["shortname"] + [col for col in df.columns if col != "shortname"]]

In [241]:
# Persist the merged venue table (index included) for manual editing.
df.to_csv(path_or_buf='cs_conf_list_v2.csv')

In [242]:
# Add manually... (NOTE(review): presumably venues are hand-added to the CSV outside this notebook; confirm)

In [243]:
# NOTE(review): this reads 'cs_conf_list.csv', not the 'cs_conf_list_v2.csv'
# written earlier; presumably it is the hand-edited copy. Confirm.
df = (
    pd.read_csv('cs_conf_list.csv', index_col=0)
      .drop(columns=['name', 'is_conference'])   # keep only shortname, metrics and flags
      .sort_values('h5_mean', ascending=False)   # highest h5_mean first
)
# Displayed with a fresh 0..n-1 index; the result is not assigned, so `df`
# itself keeps the index it was loaded with.
df.reset_index(drop=True)

Unnamed: 0,shortname,h5_mean,h5_median,Artificial intelligence,Data Mining & Analysis,Databases & Information Systems,Computer Vision & Pattern Recognition,Computational Linguistics,Multimedia,Signal Processing
0,CVPR,302,188,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,ICCV,204,124,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,ICML,193,113,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ECCV,180,104,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,EMNLP,132,76,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,ACL,130,87,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,ICASSP,122,79,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,WWW,118,76,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,KDD,117,77,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,AAAI,101,69,1.0,0.0,0.0,0.0,0.0,0.0,0.0
