In [None]:
from j2v import mag
from j2v import w2v
from j2v import predict

import numpy as np
import pandas as pd
import random
from collections import Counter
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import networkx as nx

In [2]:
def dict_to_csv(csv_file, csv_columns, mdict, write_header=False):
    """Write a mapping to ``csv_file`` as two-column CSV rows.

    Parameters
    ----------
    csv_file : str
        Path of the file to create (an existing file is overwritten).
    csv_columns : sequence of str
        Field names; ``csv_columns[0]`` labels the keys of ``mdict`` and
        ``csv_columns[1]`` labels its values.
    mdict : dict
        Mapping whose items become one row each, in iteration order.
    write_header : bool, optional
        Emit a header row before the data. Defaults to False (no header).
    """
    # newline='' is what the csv module asks for: the writer then controls
    # line endings itself, so no blank lines appear between rows on Windows.
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        if write_header:
            writer.writeheader()
        for key, value in mdict.items():
            writer.writerow({csv_columns[0]: key, csv_columns[1]: value})

### Candidates

In [3]:
# File names of the paper-to-paper reference list and of the paper table;
# both are handed to the `mag` loaders in later cells.
refs, paps = "PaperReferences_J_C.csv", "Papers_J_C.csv"

In [4]:
# Lookup used by later cells to turn a venue/journal id into a printable name.
vid2vname = mag.get_venue_dict()

processing /l/nx/data/MicrosoftAcademicGraph/Journals.txt...
finished processing!

processing /l/nx/data/MicrosoftAcademicGraph/Conferences.txt...
finished processing!



In [5]:
# Journal-to-discipline table; later cells read its `jid`, `jname` and
# `disc_name` columns.
mapping = mag.load_map_jid_discipline('journal_discipline_map.csv')

14113 journals in MAG's Journal.txt were matched to UCSD data, and 29 of them are interdisciplinary journals in UCSD catelog.


In [7]:
# focus on all matched journals that are covered in the model.
# The projection file is read without a header row, so the four column names
# are assigned by hand.
tem = pd.read_csv(mag.DATA_ROOT+'jid_2d_projection.csv', header = None)
tem.columns = ['jid', 'x', 'y', 'hex_color']
jids = set(tem.jid.tolist())
# Keep only the mapped journals whose id also appears in the projection file.
candidates = mapping[mapping.jid.isin(jids)][['disc_name', 'jname', 'jid']]
# Relabel rows 0..n-1: later cells use the row label as a matrix row index.
candidates.index = range(len(candidates))

In [9]:
# Number of similar journals each method reports per candidate journal.
top_N = 5

### Baseline model 1 (random journal in the same discipline)

In [11]:
# Group the candidate journal ids by discipline: disc_name -> {jid, ...}.
# `tem_dict` (jid -> disc_name) is kept because a later cell inspects it.
tem_dict = pd.Series(
    candidates['disc_name'].values,
    index=candidates['jid'],
).to_dict()
disc_jids = defaultdict(set)
for jid in tem_dict:
    disc = tem_dict[jid]
    disc_jids[disc].add(jid)

In [117]:
# Sanity check: number of distinct journal ids among the candidates.
len(tem_dict)

12780

In [116]:
def rand_top_same_disc(row):
    """Pick up to ``top_N`` random other journals from the row's discipline.

    Parameters
    ----------
    row : mapping
        Must provide ``'jid'`` (the journal to find neighbours for) and
        ``'disc_name'`` (its discipline, a key of ``disc_jids``).

    Returns
    -------
    str
        Names of the sampled journals joined with ``'||'``. Fewer than
        ``top_N`` names (possibly an empty string) are returned when the
        discipline does not contain enough other journals.
    """
    jid, disc = row['jid'], row['disc_name']
    # random.sample() needs a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11 on. Sorting also makes the
    # draw reproducible under random.seed(), independent of set ordering.
    pool = sorted(disc_jids[disc] - {jid})
    # Asking for more items than the pool holds would raise ValueError.
    picked = random.sample(pool, min(top_N, len(pool)))
    return '||'.join(vid2vname[other] for other in picked)

In [117]:
# Baseline 1 column: one random same-discipline draw per candidate row.
candidates['top_five_disc'] = candidates.apply(rand_top_same_disc, axis=1)

### Baseline model 2 (top PR journals in the same discipline)

In [11]:
# Same discipline grouping as built for baseline 1: disc_name -> {jid, ...},
# with `tem_dict` holding the underlying jid -> disc_name lookup.
tem_dict = pd.Series(
    candidates['disc_name'].values,
    index=candidates['jid'],
).to_dict()
disc_jids = defaultdict(set)
for jid in tem_dict:
    disc = tem_dict[jid]
    disc_jids[disc].add(jid)

In [9]:
# see `select_top_journals.ipynb`
# Venue id -> PR score (as float), one (vid, pr) pair per line of the file.
vid_pr = {
    vid: float(pr)
    for vid, pr in mag.yield_one_line('pr_vid.csv', delimiter=',')
}

processing /l/nx/data/MicrosoftAcademicGraph/pr_vid.csv...
finished processing!



In [28]:
def top_same_disc(row):
    """Return the ``top_N`` highest-PR journals of the row's discipline.

    The discipline's journals are ranked by their ``vid_pr`` score in
    descending order, the row's own journal is taken out of the ranking, and
    the names of the first ``top_N`` remaining journals are joined with
    ``'||'``.
    """
    jid, disc = row['jid'], row['disc_name']
    ranked = list(disc_jids[disc])
    ranked.sort(key=lambda vid: vid_pr[vid], reverse=True)
    ranked.remove(jid)  # never recommend the journal to itself
    return '||'.join(vid2vname[other] for other in ranked[:top_N])

In [40]:
# Baseline 2 column: highest-PR journals of the same discipline, per row.
candidates['top_five_pr_disc'] = candidates.apply(top_same_disc, axis=1)

### Baseline model 3 (edge weights)

In [11]:
# Paper id -> venue id lookup; the loader's second return value is not used.
pid_vid_dict, _ = mag.get_pid_vid_dict(paps)

processing /l/nx/data/MicrosoftAcademicGraph/Papers_J_C.csv...
processed 1000000 lines...
processed 2000000 lines...
processed 3000000 lines...
processed 4000000 lines...
processed 5000000 lines...
processed 6000000 lines...
processed 7000000 lines...
processed 8000000 lines...
processed 9000000 lines...
processed 10000000 lines...
processed 11000000 lines...
processed 12000000 lines...
processed 13000000 lines...
processed 14000000 lines...
processed 15000000 lines...
processed 16000000 lines...
processed 17000000 lines...
processed 18000000 lines...
processed 19000000 lines...
processed 20000000 lines...
processed 21000000 lines...
processed 22000000 lines...
processed 23000000 lines...
processed 24000000 lines...
processed 25000000 lines...
processed 26000000 lines...
processed 27000000 lines...
processed 28000000 lines...
processed 29000000 lines...
processed 30000000 lines...
processed 31000000 lines...
processed 32000000 lines...
processed 33000000 lines...
processed 34000000 lin

In [15]:
# Share of all papers that each venue published:
# vid -> (#papers in vid) / (#papers overall).
vid_frac_paper = defaultdict(int)
for vid in pid_vid_dict.values():
    vid_frac_paper[vid] += 1
n_papers = len(pid_vid_dict)
for vid, n_in_venue in vid_frac_paper.items():
    vid_frac_paper[vid] = n_in_venue / n_papers

In [12]:
# Aggregate the paper-level reference pairs to venue level: the key is the
# ordered pair (venue of src paper, venue of des paper), the value its count.
vid_vid_total = defaultdict(int)
for src, des in mag.yield_one_line(refs, delimiter=','):
    s_vid = pid_vid_dict[src]
    d_vid = pid_vid_dict[des]
    vid_vid_total[(s_vid, d_vid)] += 1 # directed

processing /l/nx/data/MicrosoftAcademicGraph/PaperReferences_J_C.csv...
processed 1000000 lines...
processed 2000000 lines...
processed 3000000 lines...
processed 4000000 lines...
processed 5000000 lines...
processed 6000000 lines...
processed 7000000 lines...
processed 8000000 lines...
processed 9000000 lines...
processed 10000000 lines...
processed 11000000 lines...
processed 12000000 lines...
processed 13000000 lines...
processed 14000000 lines...
processed 15000000 lines...
processed 16000000 lines...
processed 17000000 lines...
processed 18000000 lines...
processed 19000000 lines...
processed 20000000 lines...
processed 21000000 lines...
processed 22000000 lines...
processed 23000000 lines...
processed 24000000 lines...
processed 25000000 lines...
processed 26000000 lines...
processed 27000000 lines...
processed 28000000 lines...
processed 29000000 lines...
processed 30000000 lines...
processed 31000000 lines...
processed 32000000 lines...
processed 33000000 lines...
processed 340

processed 285000000 lines...
processed 286000000 lines...
processed 287000000 lines...
processed 288000000 lines...
processed 289000000 lines...
processed 290000000 lines...
processed 291000000 lines...
processed 292000000 lines...
processed 293000000 lines...
processed 294000000 lines...
processed 295000000 lines...
processed 296000000 lines...
processed 297000000 lines...
processed 298000000 lines...
processed 299000000 lines...
processed 300000000 lines...
processed 301000000 lines...
processed 302000000 lines...
processed 303000000 lines...
processed 304000000 lines...
processed 305000000 lines...
processed 306000000 lines...
processed 307000000 lines...
processed 308000000 lines...
processed 309000000 lines...
processed 310000000 lines...
processed 311000000 lines...
processed 312000000 lines...
processed 313000000 lines...
processed 314000000 lines...
processed 315000000 lines...
processed 316000000 lines...
processed 317000000 lines...
processed 318000000 lines...
processed 3190

In [26]:
# Total weight leaving each venue: sum of its pair counts over all targets.
vid_out_total = defaultdict(int)
for pair, cn in vid_vid_total.items():
    vid_out_total[pair[0]] += cn

In [26]:
# Total weight arriving at each venue: sum of its pair counts over all sources.
vid_in_total = defaultdict(int)
for pair, cn in vid_vid_total.items():
    vid_in_total[pair[1]] += cn

In [21]:
# Directed venue graph: one edge per (source, target) pair, with the pair's
# count stored in the edge attribute 'weight'.
g = nx.DiGraph((x, y, {'weight': v}) for (x, y), v in vid_vid_total.items())

In [60]:
def top_out_edge_weight(row):
    """Rank the journals that the row's journal points to by excess weight.

    For every out-edge ``u -> v`` of ``row['jid']`` whose target ``v`` is a
    matched journal (in ``jids``) other than ``u`` itself, the score is the
    share of ``u``'s total outgoing weight carried by that edge minus ``v``'s
    share of all papers (``vid_frac_paper[v]``). The names of the ``top_N``
    highest-scoring targets are returned, joined with ``'||'``.
    """
    scored = []
    for u, v in g.out_edges(row['jid']):
        # only focus on matched journals, and skip self-loops
        if v not in jids or v == u:
            continue
        share = g.get_edge_data(u, v)['weight']/vid_out_total[u]
        scored.append((v, share - vid_frac_paper[v]))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return '||'.join(vid2vname[target] for target, _ in scored[:top_N])

def top_in_edge_weight(row):
    """Rank the journals that point to the row's journal by excess weight.

    For every in-edge ``u -> v`` of ``row['jid']`` whose source ``u`` is a
    matched journal (in ``jids``) other than ``v`` itself, the score is the
    share of ``v``'s total incoming weight carried by that edge minus ``u``'s
    share of all papers (``vid_frac_paper[u]``). The names of the ``top_N``
    highest-scoring sources are returned, joined with ``'||'``.
    """
    jid = row['jid']
    sim = []
    for u, v in g.in_edges(jid):
        if u in jids and v != u: # only focus on matched journals
            in_frac = g.get_edge_data(u, v)['weight']/vid_in_total[v]
            delta_weight = in_frac - vid_frac_paper[u]
            # Record the source u. On an in-edge, v is always the queried
            # journal, so storing v would return its own name repeated.
            sim.append((u, delta_weight))
    sim = sorted(sim, key=lambda x: x[1], reverse=True)
    sim = [j[0] for j in sim[:top_N]]
    return '||'.join([vid2vname[j] for j in sim])

In [61]:
# Baseline 3 column. Only the in-edge variant is computed here; the out-edge
# variant below is left disabled.
# candidates['top_five_out_edge_weight'] = candidates.apply(top_out_edge_weight, axis=1)
candidates['top_five_in_edge_weight'] = candidates.apply(top_in_edge_weight, axis=1)

### Baseline model 4 (citation matrix)

In [45]:
# Index maps for the citation matrix: rows are candidate journals (keyed by
# their row label in `candidates`), columns are all venues that have papers.
row_index2jid = candidates['jid'].to_dict()
jid2row_index = dict((jid, i) for i, jid in row_index2jid.items())
col_index2vid = dict(enumerate(set(pid_vid_dict.values())))
vid2col_index = dict((vid, i) for i, vid in col_index2vid.items())

In [69]:
# Number of matrix rows (candidate journals).
len(row_index2jid)

12780

In [70]:
# Number of matrix columns (distinct venues in pid_vid_dict).
len(col_index2vid)

24020

In [80]:
# Dense count matrices, one row per candidate journal and one column per venue.
# With the sizes displayed above (12780 x 24020, float64) each one takes
# roughly 2.5 GB of memory.
n_rows, n_cols = len(row_index2jid), len(col_index2vid)
out_deg = np.zeros((n_rows, n_cols))
in_deg = np.zeros((n_rows, n_cols))

In [81]:
# Fill both matrices in one pass over the reference pairs. A pair adds one to
# the out-matrix row of its source venue and one to the in-matrix row of its
# destination venue, whenever that venue is a candidate journal.
for src, des in mag.yield_one_line(refs, delimiter=','):
    s_vid = pid_vid_dict[src]
    d_vid = pid_vid_dict[des]
    if s_vid in jid2row_index:
        out_row, out_col = jid2row_index[s_vid], vid2col_index[d_vid]
        out_deg[out_row, out_col] += 1
    if d_vid in jid2row_index:
        in_row, in_col = jid2row_index[d_vid], vid2col_index[s_vid]
        in_deg[in_row, in_col] += 1

processing /l/nx/data/MicrosoftAcademicGraph/PaperReferences_J_C.csv...
processed 1000000 lines...
processed 2000000 lines...
processed 3000000 lines...
processed 4000000 lines...
processed 5000000 lines...
processed 6000000 lines...
processed 7000000 lines...
processed 8000000 lines...
processed 9000000 lines...
processed 10000000 lines...
processed 11000000 lines...
processed 12000000 lines...
processed 13000000 lines...
processed 14000000 lines...
processed 15000000 lines...
processed 16000000 lines...
processed 17000000 lines...
processed 18000000 lines...
processed 19000000 lines...
processed 20000000 lines...
processed 21000000 lines...
processed 22000000 lines...
processed 23000000 lines...
processed 24000000 lines...
processed 25000000 lines...
processed 26000000 lines...
processed 27000000 lines...
processed 28000000 lines...
processed 29000000 lines...
processed 30000000 lines...
processed 31000000 lines...
processed 32000000 lines...
processed 33000000 lines...
processed 340

In [84]:
def norm_2d_array_in_place(a):
    """Scale every row of the 2-D float array ``a`` to sum to 1, in place.

    Rows whose sum is not strictly positive are left unchanged, which also
    avoids dividing an all-zero row by zero. ``a`` is modified directly and
    nothing is returned.
    """
    # One vectorized pass instead of a Python-level loop over rows. `out=a`
    # keeps the update in place and `where` skips rows with a non-positive sum.
    row_totals = a.sum(axis=1, keepdims=True)
    np.divide(a, row_totals, out=a, where=row_totals > 0)

In [86]:
# Turn the raw counts into per-journal distributions (each non-empty row sums
# to 1). Both arrays are modified in place.
norm_2d_array_in_place(in_deg)
norm_2d_array_in_place(out_deg)

In [90]:
# Feature vector per candidate journal: its out-row followed by its in-row.
matrix = np.concatenate((out_deg, in_deg), axis = 1)

In [91]:
# The two halves now live in `matrix`; drop the names so the arrays can be freed.
del in_deg, out_deg

In [92]:
sim = cosine_similarity(matrix)
# Exclude each journal from its own neighbour list explicitly. Relying on
# "the last sorted entry is itself" is wrong for all-zero rows (self-similarity
# 0) and for ties or rounding, where another journal can sort after the diagonal.
# NOTE: this overwrites the diagonal of `sim` in place to avoid a second copy.
np.fill_diagonal(sim, -np.inf)
# Indices of the top_N most similar other journals, most similar first.
top_N_index = np.argsort(sim, axis=1)[:, :-(top_N+1):-1]

In [113]:
# jid -> '||'-joined names of its nearest journals under the citation-matrix
# similarity (row positions in top_N_index are mapped back to journal ids).
baseline_dict = {
    jid: '||'.join(vid2vname[row_index2jid[j]] for j in top_N_index[i])
    for i, jid in row_index2jid.items()
}

In [None]:
# Baseline 4 column: look up each candidate's citation-matrix neighbours by jid.
candidates['top_five_citation'] = candidates['jid'].map(baseline_dict)

### w2v model

In [4]:
# Load the trained journal-embedding model from the data directory.
model = w2v.load_j2v(mag.DATA_ROOT+'100feat_50minwords_10context_2016')
# venue id -> model index. 
vid2index = {vid: i for i, vid in enumerate(model.index2word)}


model shape is: (20835, 100) 



In [94]:
# Embedding rows of the candidate journals, in the same row order as
# `row_index2jid` (row labels 0..n-1).
ixs = [vid2index[row_index2jid[i]] for i in range(len(row_index2jid))]
matrix_w2v = model.syn0[ixs, :].astype('float64')
sim_w2v = cosine_similarity(matrix_w2v)
# Exclude each journal from its own neighbour list explicitly. Ties or
# rounding can place another journal after the diagonal entry in the sort, so
# "drop the last one" is not a reliable way to remove the journal itself.
# NOTE: this overwrites the diagonal of `sim_w2v` in place.
np.fill_diagonal(sim_w2v, -np.inf)
# Indices of the top_N most similar other journals, most similar first.
top_N_index_w2v = np.argsort(sim_w2v, axis=1)[:, :-(top_N+1):-1]

In [113]:
# jid -> '||'-joined names of its nearest journals in the embedding space
# (row positions in top_N_index_w2v are mapped back to journal ids).
w2v_dict = {
    jid: '||'.join(vid2vname[row_index2jid[j]] for j in top_N_index_w2v[i])
    for i, jid in row_index2jid.items()
}

In [114]:
# Embedding-model column: look up each candidate's nearest journals by jid.
candidates['top_five_j2v'] = candidates['jid'].map(w2v_dict)

### Output to a file

In [42]:
# Widen displayed columns so the long '||'-joined name lists are not truncated.
pd.options.display.max_colwidth = 500

In [62]:
# Spot check: show every method's suggestions for the journal named 'Nature'.
candidates.loc[candidates.jname=='Nature']

Unnamed: 0,disc_name,jname,jid,top_five_disc,top_five_citation,top_five_j2v,top_five_pr_disc,top_five_edge_weight,top_five_out_edge_weight,top_five_in_edge_weight
9783,Interdiscipline,Nature,8364228,Hydrobiologia||Journal of Theoretical Biology||Environmental Science & Technology||Physical Review E||Journal of Biological Chemistry,Science||BioEssays||PLOS Biology||Proceedings of the National Academy of Sciences of the United States of America||Cold Spring Harbor Symposia on Quantitative Biology,Science||Proceedings of the National Academy of Sciences of the United States of America||Cold Spring Harbor Symposia on Quantitative Biology||Nature Communications||BioEssays,Science||Proceedings of the National Academy of Sciences of the United States of America||Journal of Biological Chemistry||Journal of the Acoustical Society of America||Journal of the American Chemical Society,Eas Publications Series||Astronomy and Astrophysics||Experimental and Clinical Immunogenetics||Current Drug Targets - Cns & Neurological Disorders||Science,Science||Cell||Journal of Biological Chemistry||Proceedings of the National Academy of Sciences of the United States of America||The Astrophysical Journal,Eas Publications Series||Experimental and Clinical Immunogenetics||Astronomy and Astrophysics||Current Drug Targets - Cns & Neurological Disorders||Harvey Lectures


In [10]:
# Display the candidate table with all similarity columns.
candidates

Unnamed: 0,disc_name,jname,jid,top_five_disc,top_five_citation,top_five_j2v,top_five_pr_disc,top_five_out_edge_weight,top_five_und_edge_weight
0,Electrical Engineering & Computer Science,Formal Aspects of Computing,07A4F1E7,International Journal of Computer Vision||Inte...,The Journal of Logic and Algebraic Programming...,The Journal of Logic and Algebraic Programming...,IEEE Transactions on Information Theory||IEEE ...,Theoretical Computer Science||ACM Transactions...,Theoretical Computer Science||Electronic Notes...
1,Medical Specialties,Revista Brasileira De Hematologia E Hemoterapia,0837204E,Contact Dermatitis||Mmw-fortschritte Der Mediz...,Jornal Brasileiro De Patologia E Medicina Labo...,Indian Journal of Hematology and Blood Transfu...,Cell||The New England Journal of Medicine||Cir...,Blood||Journal of Clinical Oncology||Transfusi...,Blood||Journal of Clinical Oncology||Transfusi...
2,Social Sciences,Research in Rural Sociology and Development,0AF80E42,Law Library Journal||Religion||Journal of Educ...,Sociologia Ruralis||Journal of Rural Studies||...,Ecumene||cultural geographies||Urban History R...,Journal of Personality and Social Psychology||...,Journal of Rural Studies||Sociologia Ruralis||...,Journal of Rural Studies||Sociologia Ruralis||...
3,Medical Specialties,Zeitschrift Fur Gastroenterologie,0B51FE5C,Nephron Clinical Practice||Vascular Medicine||...,Journal of Gastroenterology and Hepatology||Gu...,Acta Gastro-enterologica Belgica||Reviews in G...,Cell||The New England Journal of Medicine||Cir...,Gastroenterology||Hepatology||Gut||Journal of ...,World Journal of Gastroenterology||The America...
4,Biology,Journal of Nematology,08DE9C2C,Wetlands Ecology and Management||Gayana||PLOS ...,Nematropica||Nematology||Russian Journal of Ne...,Nematology||Nematropica||Russian Journal of Ne...,Genetics||Plant Physiology||Ecology||Animal Be...,Phytopathology||Plant Disease||Genetics||Molec...,Applied Soil Ecology||Plant Disease||Biologica...
5,Social Sciences,Journal of Consumer Policy,05F5BC16,Indian Journal of Pharmaceutical Education and...,Journal of Consumer Affairs||Journal of Public...,International Journal of Consumer Studies||Jou...,Journal of Personality and Social Psychology||...,Journal of Consumer Research||Journal of Consu...,Journal of Consumer Research||Journal of Consu...
6,Social Sciences,Economia Chilena,39535F77,Local Environment||European Integration Online...,Applied Economics||Revista De Economia Aplicad...,Topics in Macroeconomics||Economia Aplicada||P...,Journal of Personality and Social Psychology||...,Journal of Banking and Finance||Journal of Fin...,Applied Economics||Journal of Banking and Fina...
7,Medical Specialties,Acc Current Journal Review,033F62AA,Reports of Practical Oncology & Radiotherapy||...,Cardiology Clinics||Journal of the American Co...,European Heart Journal||Circulation-cardiovasc...,Cell||The New England Journal of Medicine||Cir...,Circulation||Journal of the American College o...,Journal of the American College of Cardiology|...
8,Health Professionals,Womens Health Issues,04DAE960,Psychiatrische Praxis||Journal of Pain and Pal...,Journal of Womens Health||Journal of Community...,Journal of Womens Health||Health Care for Wome...,BMC Public Health||Journal of Consulting and C...,BMC Public Health||American Journal of Public ...,BMC Public Health||American Journal of Public ...
9,Health Professionals,Human & Experimental Toxicology,08CDBE1D,Journal of Psychoactive Drugs||Medical Clinics...,Toxicology Letters||Toxicology||Toxicology Mec...,Clinical Toxicology||Archives of Toxicology||T...,BMC Public Health||Journal of Consulting and C...,Environmental Health Perspectives||Toxicology|...,Toxicology||Toxicology Letters||Toxicology and...


In [64]:
# Table of selected top journals; its first line is used as the header and it
# must contain a `jid` column for the join in the next cell.
top = pd.read_csv(mag.DATA_ROOT+'top_journals.csv', header=0)

In [65]:
# Attach the similarity columns to the top-journal table, matching on `jid`.
# disc_name and jname are dropped from `candidates` first (presumably because
# `top` already carries them - TODO confirm).
top = top.join(candidates.drop(columns = ['disc_name', 'jname']).set_index('jid'), on = 'jid')

In [66]:
# Save the joined table for the similarity task.
# NOTE(review): index=None appears to suppress the index column only because
# None is falsy; the documented spelling is index=False - confirm and switch.
top.to_csv(mag.DATA_ROOT+'top_journals_similarity_task.csv', index=None)