In [1]:
import pandas as pd
import glob
import csv
import gc
import numpy as np
from scipy.spatial import distance
from tqdm.notebook import tqdm

datadir = "./parsed_scopus/"

In [2]:
scopus_files = glob.glob(datadir + "*.out")
scopus_asjc = pd.concat((pd.read_csv(f, sep="\t") for f in scopus_files))

In [3]:
scopus_asjc["asjc"] = scopus_asjc["asjc"].str.split(";")

In [4]:
scopus_asjc["authorcountry"] = scopus_asjc["authorcountry"].str.split(";")

In [5]:
sorted(scopus_asjc["pubyear"].unique())

[1902,
 1915,
 1947,
 1954,
 1961,
 1962,
 1965,
 1967,
 1968,
 1970,
 1976,
 1981,
 1982,
 1983,
 1987,
 1988,
 1989,
 1990,
 1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020]

In [6]:
scopus_asjc.explode("authorcountry")

Unnamed: 0,#eid,pubyear,asjc,srcid,doctype,srctype,authorcountry
0,2-s2.0-33750599153,2006,[1000],19472,ar,j,chn
1,2-s2.0-33750926841,2006,"[2503, 1501]",12378,cp,k,kor
2,2-s2.0-33750859998,2006,[2200],100147003,cp,p,deu
3,2-s2.0-33750681684,2006,"[3207, 3312]",14230,ar,j,usa
4,2-s2.0-33750599167,2005,[3500],52142,ar,j,bra
...,...,...,...,...,...,...,...
5995,2-s2.0-35748984933,2007,"[1311, 1312, 2402, 2404]",19706,ar,j,deu
5996,2-s2.0-35548990519,2007,"[2210, 1507, 2207]",29207,ar,d,
5997,2-s2.0-35648968738,2007,"[3306, 3314, 1201, 2738]",36956,ar,j,usa
5998,2-s2.0-35748934177,2007,[1300],29302,re,j,ita


In [7]:
scopus_asjc = scopus_asjc.explode("authorcountry").explode("asjc")

In [8]:
#scopus_asjc = scopus_asjc[scopus_asjc["srctype"]=="j"]

In [9]:
scopus_asjc = scopus_asjc[scopus_asjc["pubyear"] >= 1996]

In [10]:
scopus_asjc_count = scopus_asjc.groupby(["authorcountry", "asjc", "pubyear"])["#eid"].count().fillna(0).reset_index()

In [11]:
scopus_asjc_count = scopus_asjc_count.rename(columns={"#eid":"count"})

In [12]:
current = scopus_asjc_count[scopus_asjc_count["pubyear"]==1996].set_index(["authorcountry", "asjc"])["count"].unstack().fillna(0).drop("-", axis=1)

In [13]:
reslist = []
for current_year in tqdm(range(1996, 2020)):
    current = scopus_asjc_count[scopus_asjc_count["pubyear"]==current_year].set_index(["authorcountry", "asjc"])["count"].unstack().fillna(0).drop("-", axis=1, errors='ignore')
    for row in current.itertuples():
        vec1 = np.array(row[1:])
        ctry1 = row[0]
        for row2 in current.itertuples():
            vec2 = np.array(row2[1:])
            ctry2 = row2[0]
            if(ctry1 == ctry2):
                continue
            else:
                cossim = 1 - distance.cosine(vec1, vec2)
                jacsim = 1 - distance.jaccard(vec1, vec2)
            reslist.append((ctry1, ctry2, current_year, cossim, jacsim))

  0%|          | 0/24 [00:00<?, ?it/s]

In [14]:
df_sim01 = pd.DataFrame(reslist, columns=["ctry01", "ctry02", "year", "cosinesim", "jaccardsim"])
display(df_sim01)
df_sim01.to_csv("./output/scopus_asjc_similarity_unnormalized.tsv", sep="\t", index=None)

Unnamed: 0,ctry01,ctry02,year,cosinesim,jaccardsim
0,afg,ago,1996,0.000000,0.000000
1,afg,alb,1996,0.027940,0.014706
2,afg,and,1996,0.258199,0.250000
3,afg,ant,1996,0.000000,0.000000
4,afg,are,1996,0.110576,0.005464
...,...,...,...,...,...
1247587,zwe,wlf,2019,0.144794,0.000000
1247588,zwe,wsm,2019,0.291859,0.030303
1247589,zwe,yem,2019,0.246348,0.067265
1247590,zwe,zaf,2019,0.717432,0.000000


In [15]:
reslist2 = []
for current_year in tqdm(range(1996, 2020)):
    current = scopus_asjc_count[scopus_asjc_count["pubyear"]==current_year].set_index(["authorcountry", "asjc"])["count"].unstack().fillna(0).drop("-", axis=1, errors='ignore')
    current = current / current.sum()
    for row in current.itertuples():
        vec1 = np.array(row[1:])
        ctry1 = row[0]
        for row2 in current.itertuples():
            vec2 = np.array(row2[1:])
            ctry2 = row2[0]
            if(ctry1 == ctry2):
                continue
            else:
                cossim = 1 - distance.cosine(vec1, vec2)
                jacsim = 1 - distance.jaccard(vec1, vec2)
            reslist2.append((ctry1, ctry2, current_year, cossim, jacsim))

  0%|          | 0/24 [00:00<?, ?it/s]

In [16]:
df_sim02 = pd.DataFrame(reslist2, columns=["ctry01", "ctry02", "year", "cosinesim", "jaccardsim"])
display(df_sim02)
df_sim02.to_csv("./output/scopus_asjc_similarity_asjcnormalized.tsv", sep="\t", index=None)

Unnamed: 0,ctry01,ctry02,year,cosinesim,jaccardsim
0,afg,ago,1996,0.000000,0.000000
1,afg,alb,1996,0.154770,0.014706
2,afg,and,1996,0.006209,0.250000
3,afg,ant,1996,0.000000,0.000000
4,afg,are,1996,0.004121,0.005464
...,...,...,...,...,...
1247587,zwe,wlf,2019,0.065759,0.000000
1247588,zwe,wsm,2019,0.096312,0.030303
1247589,zwe,yem,2019,0.167920,0.067265
1247590,zwe,zaf,2019,0.592288,0.000000
