In [1]:
import pandas as pd
import nltk
import string
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import json
import os
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
def parse_paper_json(paper_json):
    """
    Builds and returns a dict containing the paper_id, title, and
    body_text (joined sections) for further processing

    :param paper_json: loaded JSON object
    :return: dict containing targeted information
    """
    return \
        {
            'paper_id':     paper_json['paper_id'],
            'title':        paper_json['metadata']['title'],
            'body_text':    " ".join([x['text'] for x in paper_json['body_text']])
        }


def text_lang_likely(text):
    """
    Compares tokenized text to set of stopwords for each language contained
    within the NLTK stopwords corpus and outputs the likely language based on
    the number of common words.

    Adapted from http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/

    :param text: body of text
    :return: most likely language of text
    """
    wp_words = set(wd.lower() for wd in wordpunct_tokenize(text))
    lang_scores = {}
    for lang in stopwords.fileids():
        sw_set = set(stopwords.words(lang))
        intersection = wp_words & sw_set
        lang_scores[lang] = len(intersection)
    return max(lang_scores, key=lang_scores.get) # return language with highest score

In [4]:
n100_base = '../update1_explore/n100_files'
n100_paths = [f'{n100_base}/{filename}' for filename in os.listdir(n100_base)]

papers = []
for path in n100_paths:
    with open(path, 'r') as f:
        data = json.load(f)
        data_dict = parse_paper_json(data)
        data_dict['language'] = text_lang_likely(data_dict['body_text'])

        papers.append(data_dict)

In [5]:
df = pd.DataFrame(papers)

df[df['language'] != 'english']

Unnamed: 0,paper_id,title,body_text,language
5,6f545900ad20d4a7b92cab9581d507cfca3264fb,Journal Pre-proof CUIDAR AL QUE CUIDA: EL IMPA...,noticias en prensa y en las redes sociales. En...,spanish
51,e9be227f3e9abfdbb2a22312d4a69d8160704590,Journal Pre-proof Paniculitis eosinofílica sec...,"Sección de Dermatología, Hospital Universitari...",spanish
64,d8b3e15a543a14c8412496a26fea778c0fa8cd9a,9. Globalizzazione e salute: nuove prospettive...,Il binomio globalizzazione/salute salta usualm...,italian
