In [None]:
import os
import sys
import fnmatch
import numpy as np
import pandas as pd
import json
import gzip
import pickle
import csv
import scipy.sparse
Xauth = None
from collections import defaultdict

In [None]:
# Faculty affiliation table; its `name` column is turned into
# `faculty_affil_set` further down in this cell.
faculty_affil = pd.read_csv('faculty-affiliations.csv')
# Ranking table from other_ranks/. NOTE(review): `ranks` is not referenced by
# any of the cells visible here — confirm it is still needed.
ranks = pd.read_csv('other_ranks/ranks.csv')
def csv2dict_str_str(fname):
    """Read a two-column CSV file into a dict of stripped strings.

    Parameters
    ----------
    fname : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    dict
        Maps the first cell of every row to the second cell, both with
        surrounding whitespace removed. Cells beyond the second are ignored.
        When the same first cell appears more than once, the last row wins.

    Notes
    -----
    Rows with fewer than two cells (blank lines in particular) are skipped
    instead of raising ``IndexError``. Blank rows do occur in practice when a
    file was appended to with explicit '\\r\\n' terminators in text mode on a
    platform that translates '\\n' itself.
    """
    # newline='' hands line-ending handling to the csv module, as the csv
    # documentation asks for; otherwise quoted fields containing newlines and
    # '\r\r\n' terminators are misread.
    with open(fname, mode='r', newline='') as infile:
        rdr = csv.reader(infile)
        d = {row[0].strip(): row[1].strip() for row in rdr if len(row) >= 2}
    return d
# Hand-maintained alias table: first CSV column -> second CSV column.
alias_dict = csv2dict_str_str('dblp-aliases.csv')
# Set views of both sides of the table for fast membership tests.
alias_dict_keys = set(alias_dict)
alias_dict_values = {target for target in alias_dict.values()}
# Every name that appears in the faculty affiliation table.
faculty_affil_set = {person for person in faculty_affil.name}

In [None]:
# Load the parsed paper records. The file is opened in a `with` block so the
# gzip handle is closed as soon as unpickling finishes (it used to be left
# open for the lifetime of the kernel).
# NOTE: unpickling can execute arbitrary code — only load a
# parsed_files.pkl.gz that you produced yourself or otherwise trust.
with gzip.open('parsed_files.pkl.gz','rb') as fp:
    papers = pickle.load(fp)

In [None]:
# Each record is a 13-field sequence; unpack the first one to document the
# field order used by every loop below, then show it next to the record count.
(tag, title, authors, venue, pages, startPage, year,
 volume, number, url, publtype, eb_toofew, eb_skip) = papers[0]
print(papers[0], len(papers))

In [None]:
from shutil import copyfile
# Start the expanded alias file as a fresh copy of the hand-maintained one;
# a later cell appends the automatically derived aliases to this copy.
copyfile('dblp-aliases.csv', 'dblp-aliases-expanded.csv')
# Load the automatically derived alias groups (iterated below as groups of
# names). The `with` block closes the gzip handle once unpickling is done.
# NOTE: unpickling can execute arbitrary code — only load a file you trust.
with gzip.open('dblp_aliases_auto.pkl.gz','rb') as fp:
    more_aliases = pickle.load(fp)

In [None]:
# Append the automatically derived aliases to the expanded alias file.
#
# For every group of names that refer to the same person, one canonical
# ("true") name is chosen and an `alias,true_name` row is written for each
# other name that the hand-maintained table does not already mention.
#
# Rows go through csv.writer so that a name containing a comma or a quote is
# quoted instead of corrupting the file. newline='' makes the writer's '\r\n'
# terminator reach the file unchanged on every platform (text mode would turn
# it into '\r\r\n' on Windows).
#
# This cell appends: run the copy cell again before re-running it, otherwise
# the same rows are added a second time.
# NOTE(review): assumes the copied file already ends with a line terminator;
# if it does not, the first appended row is glued onto its last line.
with open('dblp-aliases-expanded.csv', 'at', newline='') as fp:
    alias_writer = csv.writer(fp)
    for names in more_aliases:
        # an empty group has nothing to alias (it used to raise IndexError)
        if not names:
            continue

        # Pick the canonical name, highest priority first:
        #   1. a name that is already an alias target,
        #   2. a name that appears in the faculty affiliations,
        #   3. a name whose last space-separated token is not a number,
        #   4. otherwise simply the first name of the group.
        true_name = next((n for n in names if n in alias_dict_values), None)
        if true_name is None:
            true_name = next((n for n in names if n in faculty_affil_set), None)
        if true_name is None:
            true_name = next(
                (n for n in names if not n.split(' ')[-1].isdigit()),
                names[0])

        # Optional restriction (disabled): skip groups in which nobody has a
        # faculty affiliation.

        for name in names:
            # don't need an alias for the true name
            if name == true_name:
                continue
            # leave names alone that already exist as a source or a target
            if name in alias_dict_keys or name in alias_dict_values:
                continue
            alias_writer.writerow([name, true_name])

In [None]:
# Re-read the alias table from the expanded file (hand-maintained rows plus
# the rows appended above); used below to map author names to one spelling.
aliasdict = csv2dict_str_str('dblp-aliases-expanded.csv')

In [None]:
# Per-venue bookkeeping gathered in a single pass over all papers:
#   total_venues    venue -> number of papers
#   informal_venues venue -> number of papers whose publtype is not None
#   pubtypes        every publtype value seen (used as a set)
#   pagecounts      page count -> number of papers
#   conf_ambiguity  first three url path components -> {venue: paper count}
informal_venues = {}
total_venues = {}
pubtypes = {}
pagecounts = {}
conf_ambiguity = defaultdict(dict)
for paper in papers:
    (tag, title, authors, venue, pages, startPage, year,
     volume, number, url, publtype, eb_toofew, eb_skip) = paper
    total_venues[venue] = total_venues.get(venue, 0) + 1
    pagecounts[pages] = pagecounts.get(pages, 0) + 1
    pubtypes[publtype] = 1
    if publtype is not None:
        informal_venues[venue] = informal_venues.get(venue, 0) + 1
    if url != '':
        key = '/'.join(url.split('/')[:3])
        venues_for_key = conf_ambiguity[key]
        venues_for_key[venue] = venues_for_key.get(venue, 0) + 1

# Share of each venue's papers that carry a publtype ...
frac_informal_venues = {
    venue_name: count / total_venues[venue_name]
    for venue_name, count in informal_venues.items()
}
# ... and the venues where that share exceeds one half.
skip_venues = {
    venue_name: frac
    for venue_name, frac in frac_informal_venues.items()
    if frac > 0.5
}

In [None]:
# Find url prefixes that are shared by more than one venue name.
# Venue names containing '@', 'Workshop' or 'Companion' are ignored.
# Prefixes under db/series/ are recorded in combine_venues (venue -> prefix);
# any other prefix containing '/' is only printed for manual inspection.
combine_venues = {}
for k in conf_ambiguity:
    venues = [
        venue_name for venue_name in conf_ambiguity[k]
        if '@' not in venue_name
        and 'Workshop' not in venue_name
        and 'Companion' not in venue_name
    ]

    if len(venues) <= 1:
        continue
    if 'db/series/' in k:
        for v in venues:
            combine_venues[v] = k
    elif '/' in k:
        print("'" + k +"':",venues,",")



In [None]:
# Show the venue counts recorded under one url prefix.
# conf_ambiguity is a defaultdict, so indexing a prefix that is not present
# would silently insert an empty entry; .get() looks it up without mutating
# the mapping and still displays {} when the prefix is missing.
conf_ambiguity.get('db/journals/tvcg', {})
#combine_venues

In [None]:
# Debug scan for venues containing 'n Designing Interactive Systems'.
# The print is disabled, so the loop only rebinds the unpacked field names.
for paper in papers:
    (tag, title, authors, venue, pages, startPage, year,
     volume, number, url, publtype, eb_toofew, eb_skip) = paper
    if 'n Designing Interactive Systems' not in venue:
        continue
    #print(year,title,'\n',venue)

In [None]:
#sorted([(v,k,frac_informal_venues[k]) for k,v in informal_venues.items()],reverse=True)

In [None]:
#pubtypes

In [None]:
#skip_venues

In [None]:
# Displays whatever `eb_toofew` is currently bound to: the preceding
# `for paper in papers` loops leave the fields of the last record they
# unpacked in the notebook namespace.
# NOTE(review): this depends on cell execution order — confirm it is intended.
eb_toofew

In [None]:
# Keep only the papers that pass every filter below, and collect the venues
# and (alias-resolved) author names that occur in the kept papers.
useful_papers = []
all_venues = set()
all_authors = set()
for paper in papers:
    (tag, title, authors, venue, pages, startPage, year,
     volume, number, url, publtype, eb_toofew, eb_skip) = paper

    # Drop whole url families outright: series (could be merged through
    # combine_venues instead) and collections that look like noise.
    # 'db/conf/ifip' (that's not a typo!) is deliberately NOT in the list.
    if any(fragment in url for fragment in (
            'db/series/',
            'db/conf/dagstuhl/',
            'db/conf/dimacs/',
            'db/conf/ac/',
            'db/conf/birthday/',
            'db/books/collections/')):
        continue

    # The checks run in a fixed order and stop at the first failure.
    if publtype is not None:
        continue
    if venue in skip_venues:
        continue
    # the page-count limits only apply when eb_toofew is set
    if not (pages != 0 or (not eb_toofew)):
        continue
    if tag == 'incollection' or eb_skip:
        continue
    # publication year must lie in [1970, 2020)
    if not (year >= 1970 and year < 2020):
        continue
    if not (pages == -1 or pages >= 4 or (not eb_toofew)):
        continue
    if not (pages <= 100 or (not eb_toofew)):
        continue

    # replace every author name that has an alias entry by its target
    authors = [aliasdict.get(author, author) for author in authors]
    all_authors.update(authors)
    all_venues.add(venue)
    useful_papers.append((tag,title, authors, venue, pages, startPage,year,volume,number,url,publtype,eb_toofew,eb_skip))

In [None]:
# Sanity check: number of distinct venues, distinct authors and kept papers.
len(all_venues),len(all_authors),len(useful_papers)

In [None]:
# Turn the sets into sorted lists so that positions are reproducible.
all_venues = sorted(all_venues)
all_authors = sorted(all_authors)
# Order the papers by year (field 6), then by venue (field 3).
useful_papers = sorted(useful_papers, key=lambda record: (record[6], record[3]))

In [None]:
# Position of every venue / author in its sorted list.
conf_idx = dict(zip(all_venues, range(len(all_venues))))
name_idx = dict(zip(all_authors, range(len(all_authors))))

In [None]:
# Persist the three results as gzip-compressed pickles, using the highest
# pickle protocol available.
for out_name, payload in (
        ('useful_venue_list.pkl.gz', all_venues),
        ('useful_authors_list.pkl.gz', all_authors),
        ('useful_papers.pkl.gz', useful_papers)):
    with gzip.open(out_name, 'wb') as fp:
        pickle.dump(payload, fp, pickle.HIGHEST_PROTOCOL)