# 2. Languages of Journals using OJS <a name=languages></a>

### Notebook objectives:
1. Obtain <a href='https://github.com/google/cld3'>gcld3 language classifications</a> for the abstracts of articles published in a sample of 20,420 journals supported by OJS. 
2. Classify journals by their primary language of publishing.
3. Classify journals based on whether they publish in multiple languages.

<a href='https://docs.google.com/document/d/103l90P0OuM0muOsmUYlnProG_Xo9yBR4IQ6INB21WaE/edit?usp=sharing'>This link</a> navigates to a Google doc with examples of journals using OJS to publish open access articles in **56 different languages**. <br>

Import packages:

In [None]:
import json
import os
import re
import time
from collections import Counter
from collections import defaultdict

import ijson
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#### Google's Compact Language Detector v3 (gcld3)

Initialize <a href='https://github.com/google/cld3'>gcld3</a>:

In [None]:
import gcld3

# One shared gcld3 identifier, reused for every abstract in the notebook.
# The byte limits are handed straight to the NNetLanguageIdentifier constructor.
classifier = gcld3.NNetLanguageIdentifier(
    min_num_bytes=0,
    max_num_bytes=10000,
)

Store a list of gcld3 language codes corresponding to the 56 languages known to be supported by OJS:

In [None]:
# gcld3 language codes accepted as journal languages, grouped by initial
# letter. The '-Latn' entries are the codes gcld3 reports for romanized text.
known_langs = [
    'af', 'ar',
    'bg', 'bg-Latn', 'bs',
    'ca', 'cs',
    'da', 'de',
    'el', 'el-Latn', 'en', 'es', 'et', 'eu',
    'fa', 'fi', 'fr',
    'gd', 'gl',
    'hi', 'hi-Latn', 'hr', 'hu', 'hy',
    'id', 'ig', 'is', 'it',
    'ja', 'ja-Latn',
    'ka', 'kk', 'ko',
    'lt',
    'mk', 'ms',
    'ne', 'nl', 'no',
    'pl', 'pt',
    'ro', 'ru', 'ru-Latn',
    'si', 'sk', 'sl', 'sr', 'sv', 'sw',
    'ta', 'th', 'tr',
    'uk', 'ur', 'uz',
    'vi',
    'zh', 'zh-Latn',
]

Create a function that:
<br>
1. Opens and streams a 15 GB .json file of selected metadata [title, description, subject, language]  for the most recent 100 journal articles published in 20,420 OJS contexts;
<br><br>
2. Passes the 'description' values (article abstracts) to gcld3 to generate lists of predicted languages for each journal;
<br><br>
3. Returns a dict mapping journal issn to a list of gcld3-predicted language codes for article abstracts;

In [None]:
# Location of the metadata dump that classify_abstracts() streams,
# relative to the notebook's working directory.
path_to_json = os.path.join(
    'data',
    'beacon_metadata.json',
)

In [None]:
def classify_abstracts(path_to_json, classifier):
    """Stream the metadata JSON and collect predicted abstract languages per journal.

    The file is parsed event by event with ``ijson.parse`` so it is never
    loaded into memory as a whole. Every ``description`` string longer than
    10 characters that belongs to the most recently seen ISSN key is passed
    to the language identifier; only predictions flagged ``is_reliable`` are
    kept.

    Args:
        path_to_json: Path to a JSON file shaped like
            ``{issn: {"description": [abstract, ...], ...}, ...}``.
        classifier: A gcld3 ``NNetLanguageIdentifier`` (any object exposing
            ``FindLanguage(text=...)``), or a plain callable that accepts
            ``text=`` and returns an object with ``is_reliable`` and
            ``language`` attributes.

    Returns:
        A ``defaultdict(list)`` mapping each ISSN to the list of reliable
        language codes predicted for its abstracts. Journals without any
        reliable prediction do not appear in the mapping.
    """
    # Compiled once, outside the parsing loop. The last character of an ISSN
    # is a check digit that may be 'X', so '\d{4}-\d{4}' alone would silently
    # drop those journals.
    issn_pattern = re.compile(r'\d{4}-\d{3}[\dXx]')

    # gcld3 identifiers are not callable; predictions come from FindLanguage().
    # Plain callables are still accepted so existing callers keep working.
    find_language = getattr(classifier, 'FindLanguage', classifier)

    issn_to_langs = defaultdict(list)

    issn = ''
    description_prefix = ''
    article_count = 0

    # Binary mode: ijson decodes the JSON bytes itself, so the result does not
    # depend on the platform's default text encoding.
    with open(path_to_json, 'rb') as f:
        for prefix, event, value in ijson.parse(f):  # parse each json event iteratively

            if event == 'map_key':  # a dictionary key was read
                if issn_pattern.search(value):  # and the key is an issn
                    issn = value
                    # ijson prefix under which this journal's abstracts are reported
                    description_prefix = issn + '.' + 'description' + '.' + 'item'

            elif (prefix, event) == (description_prefix, 'string'):
                if len(value) > 10:  # ignore strings too short to be an abstract
                    article_count += 1

                    pred_ = find_language(text=value)  # run gcld3
                    if pred_.is_reliable:
                        # {journal issn: [predicted language code per abstract]}
                        issn_to_langs[issn].append(pred_.language)

    print('Number of article abstracts: {}'.format(article_count))
    # Only journals with at least one reliable prediction are counted here.
    print('Number of journals: {}'.format(len(issn_to_langs)))
    return issn_to_langs

In [None]:
%%time
issn_to_langs = classify_descriptions(path_to_json, classifier)

Sanity check:

In [None]:
print(type(issn_to_langs))
# Peek at the first journal only; nothing more is printed for an empty mapping.
first_entry = next(iter(issn_to_langs.items()), None)
if first_entry is not None:
    first_issn, first_langs = first_entry
    print(first_issn)  # issn for one journal
    print(first_langs)  # gcld3 language classifications for that journal's abstracts

Create a function that:
<br>
1. Filters the languages in issn_to_langs by inclusion in the list of `known_langs`;
<br><br>
2. Determines the most common language code in the list of abstract language classifications for each journal;
<br><br>
3. Returns a dict mapping each issn to a single primary language code;

In [None]:
def classify_journals(issn_to_langs, known_langs):
    """Pick the most frequent known language code for each journal.

    Args:
        issn_to_langs: Mapping of ISSN to a list of predicted language codes.
            It is filtered IN PLACE: after the call each list only contains
            codes present in ``known_langs``. Later cells read the filtered
            lists, so this side effect is kept on purpose.
        known_langs: Iterable of language codes to keep.

    Returns:
        A dict mapping each ISSN to its most common remaining language code.
        Journals whose list is empty after filtering are left out. Ties go
        to the code that appears first in the journal's list.
    """
    known = frozenset(known_langs)  # build once for O(1) membership tests

    issn_to_primary = {}
    for issn, langs in issn_to_langs.items():
        kept = [lang for lang in langs if lang in known]
        # Re-assigning an existing key is safe while iterating over the dict.
        issn_to_langs[issn] = kept
        if kept:  # most_common(1) is empty for an empty list
            issn_to_primary[issn] = Counter(kept).most_common(1)[0][0]

    return issn_to_primary

In [None]:
# Reduce each journal's list of abstract languages to one primary language code.
issn_to_primary = classify_journals(issn_to_langs=issn_to_langs,
                                    known_langs=known_langs)

Count the number of journals for each primary language:

In [None]:
# Tally how many journals have each language code as their primary language.
language_counts = defaultdict(int)
for primary in issn_to_primary.values():
    if primary:  # skip journals without a primary language
        language_counts[primary] += 1

In [None]:
#Store the language codes and their counts in a pd.Series
#(index: language code, value: number of journals with that primary language)
language_distribution = pd.Series(Counter(issn_to_primary.values()), dtype='int64')
#Sort the language code Series by count, most common language first
language_distribution = language_distribution.sort_values(ascending=False)
#Convert the Series to a DataFrame with the columns 'language' and 'count'
primaryLangs = language_distribution.rename_axis('language').reset_index(name='count')
#Total number of journals:
total = int(primaryLangs['count'].sum())
#Print n
print('Total: {} journals'.format(total))

#### Bar plot of the 10 most common languages in which OJS users publish their articles  (*n*=20,416) <br>
Each bar represents the proportion of journals for which the specified language is their primary publishing language.

In [None]:
%matplotlib inline

fig, ax = matplotlib.pyplot.subplots()

sns.set_style('whitegrid')

lang = sns.barplot(x=primaryLangs['count'][:10],
                   y=['English', 'Bahasa Indonesia', 'Spanish', 'Portuguese', 'Ukrainian',
                      'Russian', 'German', 'French', 'Polish', 'Arabic'],
                   data=primaryLangs,
                   orient='h',
                   color='grey')

ax.set(xlim=(0, 12000),
       xlabel="Active journals using OJS",
       ylabel="Language")


matplotlib.pyplot.xticks([2000, 4000, 6000, 8000, 10000],
                         ['2,000', '4,000', '6,000', '8,000', '10,000'])

for p in lang.patches:
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.2
    percent = round(((p.get_width() / total) * 100), 1)
    value = '{}'.format(str(percent)+'%')
    lang.text(_x + 100, _y, value, ha='left', weight='bold')

fig.savefig('OJS_top10_langs.png', bbox_inches=('tight'))

#### Bar plot of multilingualism among journals using OJS  (*n*=20,228) <br>
Each bar represents the proportion of journals that published **5 or more articles in each of their publishing languages**.

In [None]:
# For each journal, list the languages that at least 5 of its abstracts were tagged with.
multilingual = defaultdict(list)
for issn, langs in issn_to_langs.items():
    for language, n_abstracts in Counter(langs).items():
        if n_abstracts >= 5:  # 5 or more abstracts tagged as this language (e.g., 'en')
            multilingual[issn].append(language)  # journals below the threshold get no entry

In [None]:
# Number of journals per multilingualism category, and the number of
# publishing languages found for each journal.
multilingual_counts = defaultdict(int)
array_lengths = []

for langs in multilingual.values():
    n_langs = len(langs)
    array_lengths.append(n_langs)
    if n_langs >= 3:
        multilingual_counts['Multi- (3+ languages)'] += 1
    elif n_langs == 2:
        multilingual_counts['Bi- (2 languages)'] += 1
    elif n_langs == 1:
        multilingual_counts['Mono- (1 language)'] += 1

# Journals that fell into one of the three categories; the plot below uses
# this value as the denominator for its percentages.
total = sum(multilingual_counts.values())
#Print n
print('Total: {} journals'.format(total))

#Print average length of language list per journal (nan when there are no journals)
average_langs = np.mean(array_lengths) if array_lengths else float('nan')
print('Average number of languages per journal: {}'.format(average_langs))

In [None]:
# Journal counts per multilingualism category, largest category first.
multilingual_dist = pd.Series(multilingual_counts).sort_values(ascending=False)

# Last expression of the cell: displays the category labels in plotting order.
multilingual_dist.index

In [None]:
%matplotlib inline

sns.set_style('whitegrid')

fig, ax = matplotlib.pyplot.subplots()

mult = sns.barplot(y=multilingual_dist.index,
                   x=multilingual_dist.values,
                   orient='h',
                   color='grey')

ax.set(xlim=(0, 12000),
       xlabel="Active journals using OJS",
       ylabel="*-lingual journals")

matplotlib.pyplot.xticks([2000, 4000, 6000, 8000, 10000],
                         ['2,000', '4,000', '6,000', '8,000', '10,000'])

for p in mult.patches:
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.3
    percent = round(((p.get_width() / total) * 100), 1)
    value = '{}'.format(str(percent)+'%')
    mult.text(_x + 250, _y, value, ha='left', weight='bold')

fig.savefig('OJS_multilingual.png', bbox_inches=('tight'))