# 3. Disciplines of Journals using OJS <a name=languages></a>

### Notebook objectives:
1. Obtain <a href='https://github.com/tgweber/fosc'>field of study classifications</a> (Weber et al., 2020) for concatenated abstracts of articles published in a sample of 20,420 journals supported by OJS.
2. Classify journals by their primary field of study.
3. Group the journals by division: STEM, Social Science, Humanities.

Import packages:

In [None]:
import os
import time
import json
import ijson
from collections import Counter
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

Instantiate <a href='https://direct.mit.edu/qss/article/1/2/525/96148/Using-supervised-learning-to-classify-metadata-of'>Weber et al.'s</a> feedforward neural net for classifying academic fields of study:

In [None]:
from fosc import load_model, vectorize
from fosc.config import config
# Identifier of the pretrained fosc model. vectorize() is also called with a
# model id, so keep the value in one place.
model_id = 'mlp_l'
fosc_model = load_model(model_id)
# Also expose the loaded model under the short name `model`, so cells may use
# either spelling.
model = fosc_model

Create a dict mapping each output class index of the fosc classifier (`int`) to its field-of-study label (`str`):

In [None]:
# Field-of-study labels listed in index order; enumerate() assigns each label
# its integer key, starting at 0 and ending at 19.
fosc_dict = dict(enumerate([
    'Mathematical Sciences',
    'Physical Sciences',
    'Chemical Sciences',
    'Earth and Environmental Sciences',
    'Biological Sciences',
    'Agricultural and Veterinary Sciences',
    'Information and Computing Sciences',
    'Engineering and Technology',
    'Medical and Health Sciences',
    'Built Environment and Design',
    'Education',
    'Economics',
    'Commerce, Management, Tourism and Services',
    'Studies in Human Society',
    'Psychology and Cognitive Sciences',
    'Law and Legal Studies',
    'Studies in Creative Arts and Writing',
    'Language, Communication and Culture',
    'History and Archaeology',
    'Philosophy and Religious Studies',
]))

In [None]:
def assign_discipline(row):
    """Return the field-of-study label for a row's predicted class index.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'discipline' entry whose value is a key of
        ``fosc_dict``.

    Returns
    -------
    str
        The label stored in ``fosc_dict`` for that key.

    Raises
    ------
    KeyError
        If ``row['discipline']`` is not a key of ``fosc_dict``.
    """
    # Look the index up in `fosc_dict`, the index-to-label table of this
    # notebook.
    return fosc_dict[row['discipline']]

# Division membership: every field-of-study label belongs to exactly one of
# the three lists below.

# Science, technology, engineering, mathematics and medicine.
STEM = [
    "Agricultural and Veterinary Sciences",
    "Biological Sciences",
    "Built Environment and Design",
    "Chemical Sciences",
    "Earth and Environmental Sciences",
    "Engineering and Technology",
    "Information and Computing Sciences",
    "Mathematical Sciences",
    "Medical and Health Sciences",
    "Physical Sciences",
]

# Social sciences.
SOCSCI = [
    "Commerce, Management, Tourism and Services",
    "Economics",
    "Education",
    "Law and Legal Studies",
    "Psychology and Cognitive Sciences",
    "Studies in Human Society",
]

# Humanities.
HUM = [
    "History and Archaeology",
    "Language, Communication and Culture",
    "Philosophy and Religious Studies",
    "Studies in Creative Arts and Writing",
]

### English <a name=en></a>

# Load the ISSN -> payload mapping for English-language journals.
# Decode explicitly instead of relying on the platform default encoding
# (assumes the file was written as UTF-8 -- TODO confirm).
with open('data/issn_to_payload_en.json', 'r', encoding='utf-8') as infile:
    issn_to_payload_en = json.load(infile)
print(len(issn_to_payload_en))

#data: one row per journal, with its ISSN and its payload
dict_en = {'issn': list(issn_to_payload_en.keys()),
           'payload': list(issn_to_payload_en.values())}
payloadDF_en = pd.DataFrame.from_dict(dict_en)
print(payloadDF_en.shape)

#fosc: vectorize the payloads and score them with the model loaded above
# ('mlp_l' is the id that was passed to load_model).
vectorized_en = vectorize(payloadDF_en.payload, 'mlp_l')
preds_en = pd.DataFrame(fosc_model.predict(vectorized_en))
payloadDF_en = payloadDF_en.join(preds_en)

#select primary field of study classification label: take the column (0-19)
# holding the row maximum, then translate that index to its label.
payloadDF_en['discipline'] = (
    payloadDF_en[list(range(20))].idxmax(axis=1).map(fosc_dict)
)

#create another DF that maps journal issn to discipline
discDF_en = payloadDF_en[['issn', 'discipline']].copy()
print(discDF_en.shape)

#create a final DF of discipline counts
# Name the columns explicitly: the names produced by value_counts() followed
# by reset_index() differ between pandas versions.
countDF_en = (discDF_en['discipline']
              .value_counts()
              .rename_axis('Discipline')
              .reset_index(name='Count'))

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(4, 10))

# Plot the number of journals per discipline as horizontal bars
sns.set_color_codes("pastel")
sns.barplot(x="Count", y="Discipline", data=countDF_en,
            label="Total", color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence, so the mathtext
# command \it must not sit in a plain string literal.
ax.set(xlim=(0, 3000),
       xlabel='Active journals using OJS',
       ylabel='Discipline',
       title=r'Disciplines of English-language journals ($\it{n}$ = 17,761)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
plt.xticks([0, 500, 1000, 1500, 2000, 2500],
           ['0', '500', '1,000', '1,500', '2,000', '2,500'])

# Annotate every bar with its count and its share of the 17,761 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 17761) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.25
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 150, _y, value, ha='left', weight='bold')

plt.savefig('disc_en.png', bbox_inches='tight')

# Wide table with one column per discipline; the counts sit in its first row.
pivot_en = countDF_en.pivot_table(columns='Discipline')

# Total per division. reindex(..., fill_value=0) keeps the sum working when a
# discipline received no journals and is therefore absent from the counts.
triDF_en = {'Division': ['Social Sciences', 'STEM', 'Humanities'],
            'Count': [pivot_en.reindex(columns=SOCSCI, fill_value=0).values.sum(axis=1)[0],
                      pivot_en.reindex(columns=STEM, fill_value=0).values.sum(axis=1)[0],
                      pivot_en.reindex(columns=HUM, fill_value=0).values.sum(axis=1)[0]]}
triDF_en

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(10,4))

sns.barplot(x="Count", y="Division", data=triDF_en, color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence.
ax.set(xlim=(0, 10000),
       xlabel='Active journals using OJS',
       ylabel='Division',
       title=r'English-language journals ($\it{n}$ = 17,761)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
plt.xticks([0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000],
           ['0', '1,000', '2,000', '3,000', '4,000', '5,000', '6,000', '7,000', '8,000', '9,000', ''])

# Annotate every bar with its count and its share of the 17,761 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 17761) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.33
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 125, _y, value, ha='left', weight='bold')

plt.savefig('div_en.png', bbox_inches='tight')

### Bahasa Indonesia <a name=id></a>

# Load the ISSN -> payload mapping for the Indonesian journals.
# Decode explicitly instead of relying on the platform default encoding
# (assumes the file was written as UTF-8 -- TODO confirm).
with open('data/issn_to_translation_id.json', 'r', encoding='utf-8') as infile:
    issn_to_payload_id = json.load(infile)
print(len(issn_to_payload_id))

#data: one row per journal, with its ISSN and its payload
dict_id = {'issn': list(issn_to_payload_id.keys()),
           'payload': list(issn_to_payload_id.values())}
payloadDF_id = pd.DataFrame.from_dict(dict_id)
print(payloadDF_id.shape)

#fosc: vectorize the payloads and score them with the model loaded above
# ('mlp_l' is the id that was passed to load_model).
vectorized_id = vectorize(payloadDF_id.payload, 'mlp_l')
preds_id = pd.DataFrame(fosc_model.predict(vectorized_id))
payloadDF_id = payloadDF_id.join(preds_id)

#select primary field of study classification label: take the column (0-19)
# holding the row maximum, then translate that index to its label.
payloadDF_id['discipline'] = (
    payloadDF_id[list(range(20))].idxmax(axis=1).map(fosc_dict)
)

#create another DF that maps journal issn to discipline
discDF_id = payloadDF_id[['issn', 'discipline']].copy()
print(discDF_id.shape)

#create a final DF of discipline counts
# Name the columns explicitly: the names produced by value_counts() followed
# by reset_index() differ between pandas versions.
countDF_id = (discDF_id['discipline']
              .value_counts()
              .rename_axis('Discipline')
              .reset_index(name='Count'))

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(4, 10))

# Plot the number of journals per discipline as horizontal bars
sns.set_color_codes("pastel")
sns.barplot(x="Count", y="Discipline", data=countDF_id,
            label="Total", color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence, so the mathtext
# command \it must not sit in a plain string literal.
ax.set(xlim=(0, 1500),
       xlabel='Active journals using OJS',
       ylabel='Discipline',
       title=r'Disciplines of Indonesian journals ($\it{n}$ = 8,138)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
plt.xticks([0, 250, 500, 750, 1000, 1250],
           ['0', '250', '500', '750', '1,000', '1,250'])

# Annotate every bar with its count and its share of the 8,138 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 8138) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.25
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 50, _y, value, ha='left', weight='bold')

plt.savefig('disc_id.png', bbox_inches='tight')

# Wide table with one column per discipline; the counts sit in its first row.
pivot_id = countDF_id.pivot_table(columns='Discipline')

# Total per division. reindex(..., fill_value=0) keeps the sum working when a
# discipline received no journals and is therefore absent from the counts.
triDF_id = {'Division': ['Social Sciences', 'STEM', 'Humanities'],
            'Count': [pivot_id.reindex(columns=SOCSCI, fill_value=0).values.sum(axis=1)[0],
                      pivot_id.reindex(columns=STEM, fill_value=0).values.sum(axis=1)[0],
                      pivot_id.reindex(columns=HUM, fill_value=0).values.sum(axis=1)[0]]}
triDF_id

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(8,4))

sns.barplot(x="Count", y="Division", data=triDF_id, color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence.
ax.set(xlim=(0, 5000),
       xlabel='Active journals using OJS',
       ylabel='Division',
       title=r'Indonesian-language journals ($\it{n}$ = 8,138)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
plt.xticks([0, 1000, 2000, 3000, 4000, 5000],
           ['0', '1,000', '2,000', '3,000', '4,000', ''])

# Annotate every bar with its count and its share of the 8,138 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 8138) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.33
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 100, _y, value, ha='left', weight='bold')

plt.savefig('div_id.png', bbox_inches='tight')

### Spanish <a name=es></a>

# Load the ISSN -> payload mapping for Spanish-language journals.
# Decode explicitly instead of relying on the platform default encoding
# (assumes the file was written as UTF-8 -- TODO confirm).
with open('data/issn_to_translation_es.json', 'r', encoding='utf-8') as infile:
    issn_to_payload_es = json.load(infile)
print(len(issn_to_payload_es))

#data: one row per journal, with its ISSN and its payload
dict_es = {'issn': list(issn_to_payload_es.keys()),
           'payload': list(issn_to_payload_es.values())}
payloadDF_es = pd.DataFrame.from_dict(dict_es)
print(payloadDF_es.shape)

#fosc: vectorize the payloads and score them with the model loaded above
# ('mlp_l' is the id that was passed to load_model).
vectorized_es = vectorize(payloadDF_es.payload, 'mlp_l')
preds_es = pd.DataFrame(fosc_model.predict(vectorized_es))
payloadDF_es = payloadDF_es.join(preds_es)

#select primary field of study classification label: take the column (0-19)
# holding the row maximum, then translate that index to its label.
payloadDF_es['discipline'] = (
    payloadDF_es[list(range(20))].idxmax(axis=1).map(fosc_dict)
)

#create another DF that maps journal issn to discipline
discDF_es = payloadDF_es[['issn', 'discipline']].copy()
print(discDF_es.shape)

#create a final DF of discipline counts
# Name the columns explicitly: the names produced by value_counts() followed
# by reset_index() differ between pandas versions.
countDF_es = (discDF_es['discipline']
              .value_counts()
              .rename_axis('Discipline')
              .reset_index(name='Count'))

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(4, 10))

# Plot the number of journals per discipline as horizontal bars
sns.set_color_codes("pastel")
sns.barplot(x="Count", y="Discipline", data=countDF_es,
            label="Total", color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence, so the mathtext
# command \it must not sit in a plain string literal.
ax.set(xlim=(0, 1500),
       xlabel='Active journals using OJS',
       ylabel='Discipline',
       title=r'Disciplines of Spanish-language journals ($\it{n}$ = 4,468)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
plt.xticks([0, 250, 500, 750, 1000, 1250],
           ['0', '250', '500', '750', '1,000', '1,250'])

# Annotate every bar with its count and its share of the 4,468 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 4468) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.25
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 40, _y, value, ha='left', weight='bold')

plt.savefig('disc_es.png', bbox_inches='tight')

# Wide table with one column per discipline; the counts sit in its first row.
pivot_es = countDF_es.pivot_table(columns='Discipline')

# Total per division. reindex(..., fill_value=0) keeps the sum working when a
# discipline received no journals and is therefore absent from the counts.
triDF_es = {'Division': ['Social Sciences', 'STEM', 'Humanities'],
            'Count': [pivot_es.reindex(columns=SOCSCI, fill_value=0).values.sum(axis=1)[0],
                      pivot_es.reindex(columns=STEM, fill_value=0).values.sum(axis=1)[0],
                      pivot_es.reindex(columns=HUM, fill_value=0).values.sum(axis=1)[0]]}
triDF_es

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(8,4))

sns.barplot(x="Count", y="Division", data=triDF_es, color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence.
ax.set(xlim=(0, 2500),
       xlabel='Active journals using OJS',
       ylabel='Division',
       title=r'Spanish-language journals ($\it{n}$ = 4,468)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
# NOTE(review): the last tick (3000) lies beyond the xlim upper bound of 2500
# set above, so matplotlib may widen the axis to include it -- confirm intent.
plt.xticks([0, 500, 1000, 1500, 2000, 2500, 3000],
           ['0', '500', '1,000', '1,500', '2,000', '2,500', ''])

# Annotate every bar with its count and its share of the 4,468 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 4468) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.33
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 50, _y, value, ha='left', weight='bold')

plt.savefig('div_es.png', bbox_inches='tight')

### Portuguese <a name=pt></a>

# Load the ISSN -> payload mapping for Portuguese-language journals.
# Decode explicitly instead of relying on the platform default encoding
# (assumes the file was written as UTF-8 -- TODO confirm).
with open('data/issn_to_translation_pt.json', 'r', encoding='utf-8') as infile:
    issn_to_payload_pt = json.load(infile)
print(len(issn_to_payload_pt))

#data: one row per journal, with its ISSN and its payload
dict_pt = {'issn': list(issn_to_payload_pt.keys()),
           'payload': list(issn_to_payload_pt.values())}
payloadDF_pt = pd.DataFrame.from_dict(dict_pt)
print(payloadDF_pt.shape)

#fosc: vectorize the payloads and score them with the model loaded above
# ('mlp_l' is the id that was passed to load_model).
vectorized_pt = vectorize(payloadDF_pt.payload, 'mlp_l')
preds_pt = pd.DataFrame(fosc_model.predict(vectorized_pt))
payloadDF_pt = payloadDF_pt.join(preds_pt)

#select primary field of study classification label: take the column (0-19)
# holding the row maximum, then translate that index to its label.
payloadDF_pt['discipline'] = (
    payloadDF_pt[list(range(20))].idxmax(axis=1).map(fosc_dict)
)

#create another DF that maps journal issn to discipline
discDF_pt = payloadDF_pt[['issn', 'discipline']].copy()
print(discDF_pt.shape)

#create a final DF of discipline counts
# Name the columns explicitly: the names produced by value_counts() followed
# by reset_index() differ between pandas versions.
countDF_pt = (discDF_pt['discipline']
              .value_counts()
              .rename_axis('Discipline')
              .reset_index(name='Count'))

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(4, 10))

# Plot the number of journals per discipline as horizontal bars
sns.set_color_codes("pastel")
sns.barplot(x="Count", y="Discipline", data=countDF_pt,
            label="Total", color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence, so the mathtext
# command \it must not sit in a plain string literal.
ax.set(xlim=(0, 1500),
       xlabel='Active journals using OJS',
       ylabel='Discipline',
       title=r'Disciplines of Portuguese-language journals ($\it{n}$ = 3,372)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
plt.xticks([0, 250, 500, 750, 1000, 1250],
           ['0', '250', '500', '750', '1,000', '1,250'])

# Annotate every bar with its count and its share of the 3,372 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 3372) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.25
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 40, _y, value, ha='left', weight='bold')

plt.savefig('disc_pt.png', bbox_inches='tight')

# Wide table with one column per discipline; the counts sit in its first row.
pivot_pt = countDF_pt.pivot_table(columns='Discipline')

# Total per division. reindex(..., fill_value=0) keeps the sum working when a
# discipline received no journals and is therefore absent from the counts.
triDF_pt = {'Division': ['Social Sciences', 'STEM', 'Humanities'],
            'Count': [pivot_pt.reindex(columns=SOCSCI, fill_value=0).values.sum(axis=1)[0],
                      pivot_pt.reindex(columns=STEM, fill_value=0).values.sum(axis=1)[0],
                      pivot_pt.reindex(columns=HUM, fill_value=0).values.sum(axis=1)[0]]}
triDF_pt

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(8,4))

sns.barplot(x="Count", y="Division", data=triDF_pt, color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence.
ax.set(xlim=(0, 2500),
       xlabel='Active journals using OJS',
       ylabel='Division',
       title=r'Portuguese-language journals ($\it{n}$ = 3,372)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
plt.xticks([0, 500, 1000, 1500, 2000, 2500],
           ['0', '500', '1,000', '1,500', '2,000', ''])

# Annotate every bar with its count and its share of the 3,372 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 3372) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.33
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 50, _y, value, ha='left', weight='bold')

plt.savefig('div_pt.png', bbox_inches='tight')

### All languages combined

# Merge the four per-language mappings into a single ISSN -> payload mapping.
# A journal present in several languages gets its payloads appended in the
# order English, Indonesian, Spanish, Portuguese.
# NOTE(review): payloads are joined without a separator, so the last word of
# one payload may fuse with the first word of the next -- confirm intended.
issn_to_payload_all = defaultdict(str)
for per_language in (issn_to_payload_en,
                     issn_to_payload_id,
                     issn_to_payload_es,
                     issn_to_payload_pt):
    for issn, text in per_language.items():
        issn_to_payload_all[issn] += text
print(len(issn_to_payload_all))

#data: one row per journal, with its ISSN and its combined payload
dict_all = {'issn': list(issn_to_payload_all.keys()),
            'payload': list(issn_to_payload_all.values())}
payloadDF_all = pd.DataFrame.from_dict(dict_all)
print(payloadDF_all.shape)

#fosc: vectorize the payloads and score them with the model loaded above
# ('mlp_l' is the id that was passed to load_model).
vectorized_all = vectorize(payloadDF_all.payload, 'mlp_l')
preds_all = pd.DataFrame(fosc_model.predict(vectorized_all))
payloadDF_all = payloadDF_all.join(preds_all)

#select primary field of study classification label: take the column (0-19)
# holding the row maximum, then translate that index to its label.
payloadDF_all['discipline'] = (
    payloadDF_all[list(range(20))].idxmax(axis=1).map(fosc_dict)
)

#create another DF that maps journal issn to discipline
discDF_all = payloadDF_all[['issn', 'discipline']].copy()
print(discDF_all.shape)

#create a final DF of discipline counts
# Name the columns explicitly: the names produced by value_counts() followed
# by reset_index() differ between pandas versions.
countDF_all = (discDF_all['discipline']
               .value_counts()
               .rename_axis('Discipline')
               .reset_index(name='Count'))

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(6, 15))

# Plot the number of journals per discipline as horizontal bars
sns.set_color_codes("pastel")
sns.barplot(x="Count", y="Discipline", data=countDF_all, color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence, so the mathtext
# command \it must not sit in a plain string literal.
ax.set(xlim=(0, 3000),
       xlabel='Active journals using OJS',
       ylabel='Discipline',
       title=r'Disciplines of active journals ($\it{n}$ = 20,181)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
plt.xticks([0, 500, 1000, 1500, 2000, 2500],
           ['0', '500', '1,000', '1,500', '2,000', '2,500'])

# Annotate every bar with its count and its share of the 20,181 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 20181) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.25
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 40, _y, value, ha='left', weight='bold')

plt.savefig('disc_all.png', bbox_inches='tight')

# Wide table with one column per discipline; the counts sit in its first row.
pivot_all = countDF_all.pivot_table(columns='Discipline')

# Total per division. reindex(..., fill_value=0) keeps the sum working when a
# discipline received no journals and is therefore absent from the counts.
triDF_all = {'Division': ['Social Sciences', 'STEM', 'Humanities'],
             'Count': [pivot_all.reindex(columns=SOCSCI, fill_value=0).values.sum(axis=1)[0],
                       pivot_all.reindex(columns=STEM, fill_value=0).values.sum(axis=1)[0],
                       pivot_all.reindex(columns=HUM, fill_value=0).values.sum(axis=1)[0]]}
triDF_all

sns.set_theme(style="whitegrid")

# Initialize the matplotlib figure
fig, ax = plt.subplots(figsize=(10,4))

sns.barplot(x="Count", y="Division", data=triDF_all, color="grey")

sns.despine(bottom=True)

# Raw string: '\i' is not a valid Python escape sequence.
ax.set(xlim=(0, 10000),
       xlabel='Active journals using OJS',
       ylabel='Division',
       title=r'English, Indonesian, Spanish, and Portuguese-language journals ($\it{n}$ = 20,181)')

# pyplot is imported as `plt`; the bare name `matplotlib` is not bound here.
plt.xticks([0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000],
           ['0', '1,000', '2,000', '3,000', '4,000', '5,000', '6,000', '7,000', '8,000', '9,000', ''])

# Annotate every bar with its count and its share of the 20,181 journals.
for p in ax.patches:
    count = int(p.get_width())
    percent = round(((p.get_width() / 20181) * 100), 1)
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.33
    # The ',' format spec adds thousands separators for any number of digits.
    value = '{:,} ({}%)'.format(count, percent)
    ax.text(_x + 150, _y, value, ha='left', weight='bold')

plt.savefig('div_all.png', bbox_inches='tight')