In [1]:
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import json
import re
import os

In [2]:
# Load the ISSN -> detected-languages mapping.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's default locale encoding.
with open(os.path.join("data", "issn2langs.json"), "r", encoding="utf-8") as infile:
    issn2langs = json.load(infile)

In [3]:
# Read the OJS journal/language table into a DataFrame.
# NOTE(review): the file is decoded with the platform default encoding;
# confirm the CSV's actual encoding and consider passing it explicitly.
ojs_csv_path = os.path.join("data", "OJS_languages_v3.csv")
with open(ojs_csv_path) as infile:
    ojs = pd.read_csv(infile)

In [4]:
# Quick structural check of the loaded table: column names, non-null counts
# and dtypes (shows which columns contain missing values).
ojs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22561 entries, 0 to 22560
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   issn          22561 non-null  object
 1   issn_alt      8314 non-null   object
 2   context_name  22561 non-null  object
 3   journal_url   22561 non-null  object
 4   gcld3_code    22555 non-null  object
 5   language      22561 non-null  object
dtypes: object(6)
memory usage: 1.0+ MB


In [5]:
# Map each journal's ISSN to its single `gcld3_code` value from the OJS table.
# If an ISSN occurs on more than one row, the last row's code wins.
issn2mono = {
    issn: code
    for issn, code in zip(ojs["issn"], ojs["gcld3_code"])
}

In [6]:
# Language codes that are tracked individually; every other code is
# collapsed into the catch-all "xx" bucket by the classification loop.
allowed_langs = ["en", "id", "es", "pt"]

In [7]:
# Classify every journal in `issn2langs` by the set of languages associated
# with it, and tally how many journals fall into each language combination.
#
# Results left in the notebook namespace:
#   d        -- {sorted tuple of language codes: number of journals}
#   count    -- number of journals processed
#   id_check -- ISSNs whose combination is listed in ID_CHECK_COMBOS
#
# Raises KeyError if an ISSN in `issn2langs` is missing from `issn2mono`.

# A detected language only counts if it occurs MORE than this many times.
MIN_DETECTIONS = 5
# Codes treated as detector noise ("common errors") and skipped outright.
IGNORED_LANGS = {"af", "ja"}
# Combinations involving Indonesian that are collected for manual checking.
ID_CHECK_COMBOS = {
    ("id", "pt"),
    ("en", "es", "id", "pt"),
    ("en", "es", "id", "xx"),
    ("en", "es", "id"),
    ("en", "id", "pt"),
}

d = defaultdict(int)
d2df = {}  # not used in this cell; presumably filled further down -- TODO confirm
count = 0
id_check = []

# assumes each value `v` is an iterable of language codes -- TODO confirm
for k, v in issn2langs.items():

    # Journal-level code (stable code for each journal); anything outside
    # `allowed_langs` -- including a missing value -- becomes "xx".
    mono = issn2mono[k]
    langs = {mono if mono in allowed_langs else "xx"}

    # Add every sufficiently frequent detected language.
    for lang, n_hits in Counter(v).items():
        if n_hits <= MIN_DETECTIONS:
            continue
        if lang in allowed_langs:
            langs.add(lang)
        elif lang in IGNORED_LANGS:
            continue
        else:
            langs.add("xx")

    # Sorted tuple => one canonical, hashable key per combination.
    langtup = tuple(sorted(langs))
    d[langtup] += 1
    count += 1

    # checking indonesian journals
    if langtup in ID_CHECK_COMBOS:
        id_check.append(k)

In [8]:
# Number of journals processed by the classification loop
# (incremented once per entry of issn2langs).
print(count)

22559


In [9]:
# Show every observed language combination with its journal count, shortest
# combinations first; the sort is stable, so combinations of equal length
# keep the order in which they were first seen.
for k, n_journals in sorted(d.items(), key=lambda item: len(item[0])):
    print(k, n_journals)

('en',) 6646
('xx',) 446
('id',) 2605
('pt',) 1134
('es',) 852
('en', 'id') 4417
('id', 'xx') 121
('en', 'es') 1762
('en', 'xx') 2164
('en', 'pt') 821
('es', 'pt') 192
('es', 'xx') 56
('id', 'pt') 4
('pt', 'xx') 36
('es', 'pt', 'xx') 10
('en', 'id', 'xx') 335
('en', 'es', 'pt') 550
('en', 'pt', 'xx') 41
('en', 'es', 'xx') 218
('en', 'es', 'id') 47
('en', 'id', 'pt') 30
('id', 'pt', 'xx') 1
('en', 'es', 'pt', 'xx') 49
('en', 'es', 'id', 'pt') 12
('en', 'es', 'id', 'xx') 10


In [10]:
# ISSNs collected by the classification loop for the Indonesian-language
# combinations singled out for checking.
print(id_check)

['2599-1353', '2253-900X', '2356-1955', '1510-5091', '2722-6689', '2528-2344', '2318-5422', '2526-6675', '2613-9812', '2580-4553', '2443-3187', '2721-4192', '2723-3367', '2747-0733', '2621-3559', '2086-9754', '2745-8563', '2548-3366', '1411-545X', '0100-1965', '2745-5955', '2447-6536', '1907-5995', '2448-8232', '2145-888X', '1858-0262', '2615-5850', '2477-250X', '2655-2515', '2656-1832', '2614-7904', '1576-3420', '1693-6191', '2745-7168', '2655-6812', '2709-4685', '2722-9017', '1018-5674', '2597-7989', '2579-8766', '0121-2923', '2526-110X', '2599-3224', '2086-8162', '2151-2612', '2716-0807', '1411-4143', '0430-5027', '2715-0658', '2715-4882', '2301-9263', '2777-0362', '1858-2400', '2621-4148', '1679-1010', '2660-4418', '2615-6881', '2477-3557', '2655-7533', '0187-0173', '2656-2022', '2221-755X', '2359-0033', '2614-512X', '0211-111X', '2723-9535', '2460-1780', '2216-0973', '2589-8019', '2166-7918', '2078-1938', '2599-0136', '2412-4338', '2656-1794', '2595-9980', '2683-2100', '2318-4507'