In [209]:
from j2v import mag
from j2v import w2v
import numpy as np
import pandas as pd
import random
import json
from collections import defaultdict
from itertools import combinations
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import re

In [2]:
# Target journals to produce rankings for; the first CSV row is the header.
# Later cells read its 'jid' and 'disc_name' columns and append one ranking column per model.
top = pd.read_csv(mag.DATA_ROOT+'top_journals.csv', header=0)

In [3]:
# Lookup from venue id to venue name (used below to turn ranked ids into readable names).
vid2vname = mag.get_venue_dict()

processing /l/nx/data/MicrosoftAcademicGraph/Journals.txt...
finished processing!

processing /l/nx/data/MicrosoftAcademicGraph/Conferences.txt...
finished processing!



In [4]:
# focus on all matched journals that are covered in the model.
mapping = mag.load_map_jid_discipline('journal_discipline_map.csv')
tem = pd.read_csv(mag.DATA_ROOT+'jid_2d_projection.csv', header = None)
tem.columns = ['jid', 'x', 'y', 'hex_color']
jids = set(tem['jid'].tolist())
# Keep only mapped journals whose id appears in the projection file,
# then renumber the rows 0..n-1 so the row position can serve as a matrix index.
candidates = mapping.loc[mapping['jid'].isin(jids), ['disc_name', 'jname', 'jid']]
candidates = candidates.reset_index(drop=True)

14113 journals in MAG's Journal.txt were matched to UCSD data, and 29 of them are interdisciplinary journals in UCSD catelog.


## Rank all candidate journals for each target

### Baseline model (top J in the same discipline)

In [204]:
# Group the candidate journal ids by discipline name.
disc_jids = defaultdict(set)
# jid -> discipline name; a jid listed more than once keeps its last discipline.
tem_dict = dict(zip(candidates.jid, candidates.disc_name.values))
for jid, disc in tem_dict.items():
    disc_jids[disc].add(jid)

In [205]:
# see `select_top_journals.ipynb`
# Score per venue id read from a two-column CSV (presumably PageRank, given the `pr` naming).
vid_pr = {}
for vid, pr in mag.yield_one_line('pr_vid.csv', delimiter=','):
    vid_pr[vid] = float(pr)

processing /l/nx/data/MicrosoftAcademicGraph/pr_vid.csv...
finished processing!



In [224]:
def top_same_disc(row):
    """Rank the other journals of the row's discipline by their `vid_pr` score.

    Parameters
    ----------
    row : mapping with keys 'jid' and 'disc_name' (e.g. a DataFrame row).

    Returns
    -------
    str
        Names (via `vid2vname`) of every journal in `disc_jids[disc_name]`
        except the target itself, ordered by decreasing `vid_pr` score and
        joined with '||'. Empty string when there are no such journals.

    Raises
    ------
    KeyError
        If a peer journal has no entry in `vid_pr` or `vid2vname`.
    """
    target_jid, disc = row['jid'], row['disc_name']
    # Filter instead of list.remove(): the target may be absent from the
    # candidate set, in which case remove() would raise ValueError.
    # .get() avoids inserting an empty set into the defaultdict for an unseen discipline.
    peers = [j for j in disc_jids.get(disc, ()) if j != target_jid]
    peers.sort(key=lambda j: vid_pr[j], reverse=True)
    return '||'.join(vid2vname[j] for j in peers)

In [225]:
# Baseline ranking for each target journal: the '||'-joined names returned by
# top_same_disc, computed row by row (axis=1).
top['pr_disc'] = top.apply(top_same_disc, axis=1)

### Baseline model (citation)

In [214]:
# File names handed to the `mag` loaders below:
# `refs` holds paper-to-paper reference pairs, `paps` the paper records.
refs = "PaperReferences_J_C.csv"
paps = "Papers_J_C.csv"

In [215]:
# Paper id -> venue id lookup, used below to lift paper-level references to venue level.
# The second return value is not needed in this notebook.
pid_vid_dict, _ = mag.get_pid_vid_dict(paps)

processing /l/nx/data/MicrosoftAcademicGraph/Papers_J_C.csv...
processed 1000000 lines...
processed 2000000 lines...
processed 3000000 lines...
processed 4000000 lines...
processed 5000000 lines...
processed 6000000 lines...
processed 7000000 lines...
processed 8000000 lines...
processed 9000000 lines...
processed 10000000 lines...
processed 11000000 lines...
processed 12000000 lines...
processed 13000000 lines...
processed 14000000 lines...
processed 15000000 lines...
processed 16000000 lines...
processed 17000000 lines...
processed 18000000 lines...
processed 19000000 lines...
processed 20000000 lines...
processed 21000000 lines...
processed 22000000 lines...
processed 23000000 lines...
processed 24000000 lines...
processed 25000000 lines...
processed 26000000 lines...
processed 27000000 lines...
processed 28000000 lines...
processed 29000000 lines...
processed 30000000 lines...
processed 31000000 lines...
processed 32000000 lines...
processed 33000000 lines...
processed 34000000 lin

In [216]:
# Rows of the citation matrices are candidate journals (row index <-> journal id).
row_index2jid = candidates['jid'].to_dict()
jid2row_index = dict(zip(row_index2jid.values(), row_index2jid.keys()))
# Columns are all distinct venues that own at least one paper (column index <-> venue id).
col_index2vid = dict(enumerate(set(pid_vid_dict.values())))
vid2col_index = dict(zip(col_index2vid.values(), col_index2vid.keys()))

In [217]:
# Dense count matrices of shape (candidate journals, venues), initialised to zero:
# out_deg is filled from references made by a journal, in_deg from references to it.
out_deg = np.zeros((len(row_index2jid), len(col_index2vid)))
in_deg = np.zeros_like(out_deg)

In [218]:
# Accumulate venue-level counts from paper-level reference pairs
# (presumably each line is: source paper id, referenced paper id).
for src, des in mag.yield_one_line(refs, delimiter=','):
    s_vid, d_vid = pid_vid_dict[src], pid_vid_dict[des]
    # Source venue is a candidate journal: count the venue it points to.
    s_row = jid2row_index.get(s_vid)
    if s_row is not None:
        out_deg[s_row, vid2col_index[d_vid]] += 1
    # Destination venue is a candidate journal: count the venue pointing at it.
    d_row = jid2row_index.get(d_vid)
    if d_row is not None:
        in_deg[d_row, vid2col_index[s_vid]] += 1

processing /l/nx/data/MicrosoftAcademicGraph/PaperReferences_J_C.csv...
processed 1000000 lines...
processed 2000000 lines...
processed 3000000 lines...
processed 4000000 lines...
processed 5000000 lines...
processed 6000000 lines...
processed 7000000 lines...
processed 8000000 lines...
processed 9000000 lines...
processed 10000000 lines...
processed 11000000 lines...
processed 12000000 lines...
processed 13000000 lines...
processed 14000000 lines...
processed 15000000 lines...
processed 16000000 lines...
processed 17000000 lines...
processed 18000000 lines...
processed 19000000 lines...
processed 20000000 lines...
processed 21000000 lines...
processed 22000000 lines...
processed 23000000 lines...
processed 24000000 lines...
processed 25000000 lines...
processed 26000000 lines...
processed 27000000 lines...
processed 28000000 lines...
processed 29000000 lines...
processed 30000000 lines...
processed 31000000 lines...
processed 32000000 lines...
processed 33000000 lines...
processed 340

processed 285000000 lines...
processed 286000000 lines...
processed 287000000 lines...
processed 288000000 lines...
processed 289000000 lines...
processed 290000000 lines...
processed 291000000 lines...
processed 292000000 lines...
processed 293000000 lines...
processed 294000000 lines...
processed 295000000 lines...
processed 296000000 lines...
processed 297000000 lines...
processed 298000000 lines...
processed 299000000 lines...
processed 300000000 lines...
processed 301000000 lines...
processed 302000000 lines...
processed 303000000 lines...
processed 304000000 lines...
processed 305000000 lines...
processed 306000000 lines...
processed 307000000 lines...
processed 308000000 lines...
processed 309000000 lines...
processed 310000000 lines...
processed 311000000 lines...
processed 312000000 lines...
processed 313000000 lines...
processed 314000000 lines...
processed 315000000 lines...
processed 316000000 lines...
processed 317000000 lines...
processed 318000000 lines...
processed 3190

In [219]:
def norm_2d_array_in_place(a):
    """Rescale every row of `a` so that it sums to 1, modifying `a` in place.

    Rows whose sum is not strictly positive (for example all-zero rows) are
    left exactly as they are. Nothing is returned.
    """
    for row_idx in range(len(a)):
        row_total = np.sum(a[row_idx])
        if not row_total > 0:
            # Nothing to scale; also avoids dividing by zero.
            continue
        a[row_idx] /= row_total

In [221]:
# Turn raw counts into per-journal distributions (each non-empty row sums to 1) ...
norm_2d_array_in_place(in_deg)
norm_2d_array_in_place(out_deg)
# ... and describe each journal by its outgoing profile followed by its incoming profile.
matrix = np.hstack((out_deg, in_deg))

In [222]:
sim = cosine_similarity(matrix)
# Rank all *other* journals by decreasing similarity for every row.
# Do not assume a journal sorts last in its own row: an all-zero row has
# self-similarity 0, and ties at 1.0 can sort after the diagonal entry.
# Dropping "the last column" would then keep the journal in its own ranking
# and discard a real neighbour, so the self index is removed explicitly.
n_journals = sim.shape[0]
desc_order = np.argsort(sim, axis=1)[:, ::-1]
not_self = desc_order != np.arange(n_journals)[:, None]
# Each row of desc_order is a permutation, so exactly one entry per row is masked out.
top_N_index = desc_order[not_self].reshape(n_journals, n_journals - 1)
del desc_order, not_self  # large (n x n) temporaries

In [226]:
# journal id -> '||'-joined names of the other journals, in the order given by top_N_index
baseline_dict = {}
for i, jid in row_index2jid.items():
    ranked_names = (vid2vname[row_index2jid[j]] for j in top_N_index[i])
    baseline_dict[jid] = '||'.join(ranked_names)

In [227]:
# Attach the citation-based ranking to each target journal via its jid;
# a jid that is not a key of baseline_dict yields NaN.
top['citation'] = top['jid'].map(baseline_dict)

In [None]:
# Drop the references to the large citation matrices so their memory can be reclaimed.
# NOTE(review): this cell has no execution count, i.e. it was not run in the saved session.
del in_deg, out_deg, matrix

### J2v model

In [232]:
model = w2v.load_j2v(mag.DATA_ROOT+'100feat_50minwords_10context_2016')
# venue id -> model index (position of the venue in the model's vocabulary list).
vid2index = {vid: i for i, vid in enumerate(model.index2word)}


model shape is: (20835, 100) 



In [234]:
# Embedding rows for the candidate journals, in candidate row order.
ixs = [vid2index[row_index2jid[i]] for i in range(len(row_index2jid))]
matrix_w2v = model.syn0[ixs, :].astype('float64')
sim_w2v = cosine_similarity(matrix_w2v)
# Rank all *other* journals by decreasing similarity for every row.
# The self index is removed explicitly instead of assuming it sorts last:
# duplicate or zero vectors (and floating-point ties) can place another
# journal after the diagonal entry, which would leave a journal in its own ranking.
n_journals_w2v = sim_w2v.shape[0]
desc_order_w2v = np.argsort(sim_w2v, axis=1)[:, ::-1]
not_self_w2v = desc_order_w2v != np.arange(n_journals_w2v)[:, None]
# Each row of desc_order_w2v is a permutation, so exactly one entry per row is masked out.
top_N_index_w2v = desc_order_w2v[not_self_w2v].reshape(n_journals_w2v, n_journals_w2v - 1)
del desc_order_w2v, not_self_w2v  # large (n x n) temporaries

In [235]:
# journal id -> '||'-joined names of the other journals, in the order given by top_N_index_w2v
w2v_dict = {}
for i, jid in row_index2jid.items():
    ranked_names = (vid2vname[row_index2jid[j]] for j in top_N_index_w2v[i])
    w2v_dict[jid] = '||'.join(ranked_names)

In [236]:
# Attach the embedding-based ranking to each target journal via its jid;
# a jid that is not a key of w2v_dict yields NaN.
top['j2v'] = top['jid'].map(w2v_dict)

In [238]:
# Persist the target journals with their ranking columns; the row index is not written.
top.to_csv(mag.get_path('top_journal_rank_all_candidate.csv'), index=False)