In [1]:
"""Create the corpus of articles in .txt format, filtered by language.

Extract articles from the Dialog website in .pdf format. Then, parse the
text out from the articles and save the result in .txt format. Finally,
detect the language of the articles and partition the articles by language.

The goal is to get a directory of raw text parsed from articles published in 
Russian in 2010 from the Dialog conference.

Typical usage example:

    articles = ArticleExtractor.extract_article_urls(JournalUri.DIALOG, 2010)
    ArticleExtractor.extract_articles(articles, target="../data/pdfs/2010")
    
    make_txt_from_pdf.convert(source_dir=input_dir, saving_dir=output_dir)
    # Or: create the output directory and
        run the convert_pdfs_to_text script in cmd prompt or shell
    
    langs = [Language.RUSSIAN.value, Language.ENGLISH.value]
    ArticleManipulatorByLanguage.move_text_files(input_dir, output_dir, langs)
    ArticleManipulatorByLanguage.peek_text_files(output_dir).resolve())
"""

# Reset the kernel (ipython magic command)
%reset -f

## Extract articles in .pdf format from the Dialog journal

In [4]:
from bs4 import BeautifulSoup, SoupStrainer
from django.utils.text import slugify
from enum import Enum
from typing import Dict, List, Tuple
import os
import re
import requests
import sys


def to_file_safe_string(value: str, max_strlen: int = 200) -> str:
    """Parses a string into a valid filename.
    
    Args:
        value: The string to parse.
        max_strlen: Optional; The maximum length of a file name (not including the file
        extension). Default is 200 characters
        
    Returns:
        A string that conforms to the django standards of a valid URL slug. For example:
        
        "управление-лексиконом-в-онтологической-семантике-p-636.txt"
        
    """
    slug = slugify(value, allow_unicode=True)
    return slug[0:max_strlen]


class JournalUri(Enum):
    """All possible journals from which to extract articles."""
    DIALOG = ("dialog", "http://www.dialog-21.ru")
    
    def __init__(self, journal, uri):
        self.journal = journal
        self.uri = uri

        
class ArticleExtractor(object):
    """Extracts articles from various Russian NLP converences in .pdf format.
    """
    
    @classmethod
    def extract_years(cls, journal: JournalUri) -> List[int]:
        """Extracts all years in which articles were published for the given journal.
        
        Args:
            journal: An enum representing the journal from which to extract years.
        
        Returns:
            A list of years as ints for each year in which the journal had a publication.
            For example:
            
            [2020, 2019, 2018, 2017]
            
        Raises:
            ValueError if the specified journal is not a valid JournalUri.
        """
        if journal == JournalUri.DIALOG:
            r = requests.get(journal.uri + "/digest") # www.dialog-21.ru/digest/
            journal_html = r.text
            only_digest_menu = SoupStrainer(id="digest-menu")
            digest_menu_html = BeautifulSoup(journal_html, 'html.parser', parse_only=only_digest_menu)

            years_html = digest_menu_html.findAll(attrs={"class": "digest-year-label"})
            years_lists = [year.findAll(text=True) for year in years_html]
            years = [int(year[0]) for year in years_lists]
            return years
        else:
            raise ValueError("Unsupported journal: ", journal.journal)
    
    
    @classmethod
    def extract_article_urls(cls, 
                             journal: JournalUri, 
                             year: int, 
                             online_articles: bool = True) -> List[Tuple[str, str, str]]:
        """Extracts all articles titles and urls for a given year and a given journal.
        
        Args:
            journal: An enum representing the journal from which to extract articles.
            year: The year of publication from which to extract articles.
            online_articles: Optional; Extract online articles as well or not. The default is 
                to include online articles in the extraction.
        
        Returns:
            A list of tuples, with each tuple containing (author, article title, article url).
            For example:
            
            [('author1', 'Устное дистантное общение', 'url1.pdf'), 
            ('author2', 'Именные модели управления', 'url2.pdf')]
        
        Raises:
            ValueError if the specified journal is not a valid JournalUri.
        """
        if journal == JournalUri.DIALOG:
            article_tuples = []
            
            # Non-online articles - www.dialog-21.ru/digest/<year>/articles/
            request_url = '/'.join([journal.uri, "digest", str(year), "articles"])
            r = requests.get(request_url)
            full_html = r.text
            
            only_article = SoupStrainer("article")
            article_divs = BeautifulSoup(full_html, 'html.parser', parse_only=only_article)

            for article in article_divs:
                children = article.findAll('div')

                authors = children[0].text.strip()
                title = children[1].text.strip()
                uri = children[1].a['href'].strip()
                article_tuples.append((authors, title, journal.uri + uri))
                
            # Online articles - www.dialog-21.ru/digest/<year>/online/
            if online_articles:
                request_url = '/'.join([journal.uri, "digest", str(year), "online"])
                r = requests.get(request_url)
                full_html = r.text

                only_article = SoupStrainer("article")
                article_divs = BeautifulSoup(full_html, 'html.parser', parse_only=only_article)

                for article in article_divs:
                    children = article.findAll('div')

                    authors = children[0].text.strip()
                    title = children[1].text.strip()
                    uri = children[1].a['href'].strip()
                    article_tuples.append((authors, title, journal.uri + uri))

            return article_tuples
        else:
            raise ValueError("Unsupported journal: ", journal.journal)
    
    
    @classmethod
    def extract_articles(cls, 
                         articles: List[Tuple[str, str, str]],
                         target: str = "../data") -> None:
        """Downloads .pdfs to the target directory for a given mapping of article titles and urls.
        
        Args:
            articles: A list of tuples, with each tuple containing (_, article title, article url)
            target: Optional; The directory in which to download the articles. The default is
                to download articles in the '../data' directory 
            
        The target directory will be created if it doesn't already exist.
        """
        if not os.path.exists(target):
            os.makedirs(target)

        for article in articles:
            filename = to_file_safe_string(article[1]) + '.pdf'
            filepath = os.path.join(target, filename)
            
            if not os.path.isfile(filepath):
                r = requests.get(article[2])
                with open(filepath, "wb") as file:
                    file.write(r.content)


In [5]:
# Extract the range of years from which the dialog has published journals
years = ArticleExtractor.extract_years(JournalUri.DIALOG)
print(years)

[2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000]


In [6]:
# Extract the authors, article titles, and article urls for all articles from 2010
urls = ArticleExtractor.extract_article_urls(JournalUri.DIALOG, 2010)
print(urls)

[('Алхимова И.С', 'Устное дистантное общение: о некоторых текстовых особенностях диалогов по мобильному телефону\n(p. 1)', 'http://www.dialog-21.ru/media/1476/1.pdf'), ('Антошина С.А., Ляшевская О.Н.', 'Именные модели управления с точки зрения грамматики конструкций\n(p. 7)', 'http://www.dialog-21.ru/media/1477/2.pdf'), ('Апресян В.Ю.', 'Семантическая структура слова и его взаимодействие с отрицанием\n(p. 13)', 'http://www.dialog-21.ru/media/1478/3.pdf'), ('Баранов А.Н.', 'Еще раз о факторах идиоматичности: тавтология и онимизация\n(p. 20)', 'http://www.dialog-21.ru/media/1479/4.pdf'), ('Баранов А.Н., Добровольский Д.О.', 'Семантика фразеологизмов: иерархия или сеть?\n(p. 25)', 'http://www.dialog-21.ru/media/1480/5.pdf'), ('Бергельсон М.Б., Некрасова А.Е.', 'Лингвистический анализ стереотипов: баланс между текстом и смыслом\n(p. 30)', 'http://www.dialog-21.ru/media/1481/6.pdf'), ('Богданова Н. В', 'О корпусе текстов живой речи: новые поступления и первые результаты исследования\n(p. 35

In [7]:
# Extract all articles from 2010
list_to_extract = ArticleExtractor.extract_article_urls(JournalUri.DIALOG, 2010)
ArticleExtractor.extract_articles(list_to_extract, target="../data/pdfs/2010")

## Transform articles from .pdf format to .txt format

In [None]:
"""Method used in:

@article{bakarovrussian,
  title={Russian Computational Linguistics: Topical Structure in 2007-2017 Conference Papers},
  journal={Komp'yuternaya Lingvistika i Intellektual'nye Tekhnologii},
  year={2018},
  author={Bakarov, Amir and Kutuzov, Andrey and Nikishina, Irina}
}

Note: Page header and footer must be removed after conversation.

The python wrapper for pdftotext doesn't allow for specifying the
margins of the bounding box from which to read text in the .pdf files.
"""

import make_txt_from_pdf

input_dir = str(Path("../data/pdfs").resolve())
output_dir = str(Path("../data/raw_txt").resolve())

if not os.path.exists(target):
    os.makedirs(target)
make_txt_from_pdf.convert(source_dir=input_dir, saving_dir=output_dir)

# <Parse header and footer from the text files in output_dir here, using regex>

In [None]:
"""This doesn't work in jupyter. Run the bash script in a shell on linux / mac, or 
run the batch script in a windows command prompt:

Linux/Mac:
    ./convert_pdfs_to_txt.sh input_dir output_dir

Windows:
    convert_pdfs_to_txt.cmd input_dir output_dir

For example, on Windows from a cmd shell from src dirctory: 
    convert_pdfs_to_txt.cmd ..\data\pdfs\2010 ..\data\raw_txt\2010

Note: Page header and footer are already removed in the conversion step.

The shell script and bash script runs the pdftotext command line interface, which allows for
specifying the margins of the bounding box from which to read text in the .pdf files. 
"""

from pathlib import Path
import subprocess

input_dir = str(Path("../data/pdfs/2010").resolve())
output_dir = str(Path("../data/raw_txt/2010").resolve())

out = None
err = None
if sys.platform.startswith('win'):
    proc = subprocess.Popen(["convert_pdfs_to_txt.cmd", input_dir, output_dir],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    out, err = proc.communicate()
else:
    proc = subprocess.Popen(["convert_pdfs_to_txt.sh", input_dir, output_dir],
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()

if out != None:
    print("Output: ", out.decode("utf-8"))
if err != None:
    print("Error: ", err.decode("utf-8"))

## Filter out papers into directories by language

In [12]:
import langid
from pathlib import Path
import shutil

def get_text(filename: Path) -> str:
    """Get the text from a specified file.
    
    Args:
        filename: The file from which to get text.
    
    Returns:
        The text from a specified file.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()


class Language(Enum):
    """All possible languages in which articles are written."""
    ENGLISH = "en"
    RUSSIAN = "ru"
    
    @classmethod
    def values(cls) -> List[str]:
        """Returns a list of the values for all possible Language enums.
        For example:
        
        ['en', ru']
        
        """
        return [lang.value for lang in cls]


class ArticleManipulatorByLanguage(object):
    """Manipulate and classify text files by language.
    """
    
    @classmethod
    def filter_text_files(cls,
                          dir_to_classify: Path,
                          languages: List[Language]) -> List[Tuple[Path, str]]:
        """Get a list of text files written in a specified set of languages.

        Args:
            dir_to_classify: The directory from which to filter text files.
            languages: A list of the languages, specified as enum values, to accept.

        Returns:
            A list of tuples, with each tuple containing the path and language of the
            filtered file. For example:

            [(path/to/file/1.txt, 'ru'),
            (path/to/file/2.txt, 'en')]

        """
        selected_files: List[Tuple[Path, str]] = []
        for root_dir, dirs, files in os.walk(dir_to_classify):
            for file in files:
                if file.endswith(".txt"):
                    filepath = Path(root_dir, file)
                    text = get_text(filepath)
                    text_lang = langid.classify(text)[0]
                    if text_lang in set(languages):
                        selected_files.append((Path(root_dir, file), text_lang))
        return selected_files


    @classmethod
    def move_text_files(cls,
                        input_dir: Path, 
                        output_dir: Path, 
                        filter_languages: List[str]) -> None:
        """Move text files into their respective language folders.

        Args:
            input_dir: The directory from which to move text files.
            output_dir: The directory to which text files will be moved.
            filter_languages: A list of the languages, specified as strings, to accept.
        
        The output directory will be created if it doesn't already exist.
        """
        filtered_files = cls.filter_text_files(input_dir, filter_languages)
        for lang in filter_languages:
            lang_output_dir = Path(output_dir, lang)
            if not os.path.exists(lang_output_dir):
                os.makedirs(lang_output_dir)
        
        # Move the files
        [shutil.move(filename, Path(output_dir, filelang, os.path.basename(filename))) 
         for (filename, filelang) in filtered_files]

        
    @classmethod
    def peek_text_files(cls,
                        directory: Path,
                        window_min: int = 50, 
                        window_max: int = 70) -> None:
        """Print a small window of the text files from the specified directory.

        Args:
            directory: The directory from which to peek at text files.
            window_min: Start printing after this amount of new lines.
            window_max: Keep printing until this amount of new lines.
        """
        for root_dir, dirs, files in os.walk(directory):
            for file in files:
                filename = Path(root_dir, file)
                text = get_text(filename).split('\n')
                print("file: ", file, "\n\n", 
                      ' '.join(text[window_min:window_max]), "\n\n", "-"*25, "\n")


In [10]:
# At this point, the articles have been converted from .pdf to .txt
# Filter the articles into subdirectories by language
langid.set_languages(Language.values())

filter_languages = [Language.RUSSIAN.value, Language.ENGLISH.value]
input_dir = Path("../data/raw_txt/2010").resolve()
output_dir = Path("../data/raw_txt/2010").resolve()

ArticleManipulatorByLanguage.move_text_files(input_dir, output_dir, filter_languages)

In [13]:
# Peek at a small snippet of all the articles identified to be written in English
ArticleManipulatorByLanguage.peek_text_files(Path("../data/raw_txt/2010/en").resolve())

file:  comparison-of-deep-neural-network-architectures-for-authorship-attribution-of-russian-social-media-texts.txt 

 Ключевые слова: глубокое обучение, глубокие нейронные сети, определение авторства, символьные n-граммы, социальные медиа  1. Introduction Author identification task for text documents published via Internet is one of the hottest issues in modern cybersecurity. User anonymity and irresponsible distribution of illegal materials regularly become subject of discussions at the highest level as being one of the most important issues of national security. Widespread use of social networks, instant messengers and blogs creates a great demand in the field of computer forensics for effective methods of authorship attribution for short texts. Despite the recent development of text classification methods, based on deep neural networks, there are still almost no studies of their application to authorship identification of short Russian texts. The main goal of this research is to an

In [14]:
# Peek at a small snippet of all the articles identified to be written in Russian
ArticleManipulatorByLanguage.peek_text_files(Path("../data/raw_txt/2010/ru").resolve())

file:  автоматическая-расстановка-пауз-в-системе-синтеза-русской-речи-по-тексту-p-531.txt 

 Система расстановки пауз, изначально разработанная в рамках системы синтеза речи по тексту «Оратор», включает в себя несколько этапов анализа текста для определения мест синтагматических границ. В первую очередь учитываются знаки препинания: в большинстве случаев, пунктуационные знаки соответствуют границам синтагм. При этом в исходной системе паузации «Оратора» имелось небольшое количество правил для тех случаев, когда, несмотря на наличие знаков пунктуации, пауза в определенном месте текста нежелательна или невозможна. На следующем этапе происходит поиск возможных мест пауз в последовательностях слов, не разделенных знаками препинания. При этом анализируются только те отрезки, которые оказались длиннее критического значения (для системы паузации «Оратора» это пять слов). Кроме того, рассматривается небольшой список «неделимых» последовательностей слов (например, «так как»). При нахождении пос