# 1. Базовый парсер заголовков

Вытаскивает из latex-кода заголовки статей и их расположение в файлах.

Разбивка происходит в полуручном режиме, т.к. нет уверенности в формате заголовков.

В тексте ищутся слова, содержащие в своём составе заглавные буквы на русском и английском языках в отношении, большем или равном заданному (по умолчанию 0.51, при меньших значениях количество вхождений значительно возрастает, например за счёт двухбуквенных предлогов). Предполагается, что таким образом удаётся обнаруживать неправильно машинно распознанный капс. Слова или цепочки слов, состоящие из одного строчного символа, включаются в заголовок, если стоят между слов, определённых как часть заголовка. При этом одиночные заглавные буквы, а также инициалы не воспринимаются как начало заголовка.

## Использование
- При удовлетворительном определении заголовка нажать `Enter` без дополнительного ввода.
- Если предложенное место заголовком не является, ввести `"n"`.
- При неправильном определении границ заголовка ввести два корректировочных числа для сдвига левой и правой границы.
  - ЗАМЕЧАНИЕ: сдвиг производится попробельно, т.е. двойной пробел будет распознан как слово нулевой длины.
  - ЗАМЕЧАНИЕ: границы отображаемого фрагмента текста будут передвинуты автоматически. Длины левой и правой границ в словах задаются в параметрах.
  - ПРИМЕРЫ:
    - `out: a [B C] d e f` -> `in: 0 2` -> `out: a [B C D E] f`
    - `out: a b c [D E] f` -> `in: 2 -1` -> `out: a [B C D] e f`
- Также возможен посимвольный сдвиг правой границы в случае "сращивания" заголовка статьи и её текста. Ввести одно число, начиная с точки.
  - ПРИМЕРЫ:
    - `out: a[BC]def` -> `in: .2` -> `out: a[BCDE]f`
    - `out: a[BCDE]f` -> `in: .-1` -> `out: a[BCD]ef`

В выводе в терминале переносы строк для удобства заменены на `"$"`

### Прочее
- Для определителя капса доступны исключения, которые никогда не будут рассматриваться как потенциальные начала заголовков, см. опции. По умолчанию: первые 10 римских цифр, "МэВ" и "ГэВ".
- Использовать системный терминал для взаимодействия оказывается удобнее, чем использовать jupyter, поэтому можно скопировать ячейку с кодом в файл `scripter.py` и запускать его.
- При положительном определении заголовка файл дополняется немедленно, прервать процесс можно в любой момент, как и продолжить после -- итоговый файл будет дополняться, а не перезаписываться с нуля при новом запуске программы (главное -- не забыть предварительно удалить из конца файла дубликаты, если вы начинаете с той страницы, на которой закончили в прошлый раз, а не со следующей).

In [None]:
from os import walk
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
import codecs


############################ VARS ################################
# Directory that is scanned for page files named "rp-<page>_*.mmd"
PAGES_DIR = "./matphys/rpages/"
# Output directory and file name; they are concatenated as-is, so EXIT_DIR must end with "/"
EXIT_DIR = "./matphys/"
EXIT_FILE = "FMEv2.xml"
# First and last pages to be parsed (both inclusive)
START_PAGE = 225
END_PAGE = 300
# How many words to display before and after a potential title
LEAD_WORDS = 5
AFT_WORDS = 5
# Minimal share of capital letters in a word (punctuation is not counted) for the word to be treated as caps
CAPS_QUOT = 0.51
# The list below holds the words that are never treated as caps, whatever their share of capital letters
EXCEPTIONS = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'МэВ', 'ГэВ']
##################################################################



class Article:
	# Plain record describing one detected title.
	# start_title is the index of the separator right before the title and
	# end_title the index right after its last character, both inside the
	# text of the page file called filename.
	filename = ''
	start_title = end_title = 0



# Write xml tree to file
def prettify(elem):
	"""Return *elem* serialized as an XML document string indented by two spaces."""
	serialized = ET.tostring(elem, 'utf-8')
	return minidom.parseString(serialized).toprettyxml(indent="  ")
def xml_write(root):
	"""Serialize *root* with ``prettify`` and overwrite EXIT_DIR + EXIT_FILE with it.

	The document is rendered before the file is opened. Opening with mode 'w'
	truncates the file at once, so rendering afterwards would leave an empty
	file behind (and lose every article stored so far) if ``prettify`` raised,
	e.g. on a character that is not allowed in XML.
	"""
	document = prettify(root)
	with codecs.open(EXIT_DIR + EXIT_FILE, 'w', 'utf-8') as f:
		f.write(document)


# Collect the page files to process, page after page ("rp-<page>_*.mmd")
filenames_raw = next(walk(PAGES_DIR), (None, None, []))[2]  # [] if the directory cannot be listed
filenames = []
for i in range(START_PAGE, END_PAGE + 1):
	beginning = f"rp-{i}_"
	for filename in filenames_raw:
		if filename.startswith(beginning) and filename.endswith(".mmd"):
			filenames.append(filename)


# Create an empty output xml if there is none yet
filenames_raw = next(walk(EXIT_DIR), (None, None, []))[2]  # [] if the directory cannot be listed
if EXIT_FILE not in filenames_raw:
	root = ET.Element('data')
	xml_write(root)


def remove_xml_spaces(elem):
	"""Strip formatting residue from *elem* and all of its descendants.

	Every tail is cleared, and a text made only of space characters (the empty
	string included) is replaced by None. The tree is changed in place and
	*elem* itself is returned.
	"""
	elem.tail = None
	content = elem.text
	if content is not None and all(char == ' ' for char in content):
		elem.text = None
	for child in elem:
		remove_xml_spaces(child)
	return elem
def parse_xml():
	"""Load EXIT_DIR + EXIT_FILE and return its root without formatting whitespace.

	The file is glued into one string with all line breaks removed, so that the
	newlines added by ``prettify`` do not pile up on every read-write cycle.
	NOTE: a line break inside an element's text is removed the same way.

	``splitlines`` removes only real line terminators. Cutting the last
	character of every line (``line[:-1]``) destroyed the closing ">" of the
	root element whenever the file did not end with a newline (e.g. after it
	was edited by hand) and left a stray "\\r" behind for "\\r\\n" line ends.
	"""
	with codecs.open(EXIT_DIR + EXIT_FILE, 'r', 'utf-8') as f:
		exit_string = ''.join(f.read().splitlines())
	root = ET.fromstring(exit_string)
	# Remove empty tails and texts
	root = remove_xml_spaces(root)
	return root
# Load the tree stored so far; a new article gets the number "children of the root + 1"
root = parse_xml()
num = len(root) + 1


# Add article title and metadata to xml tree
def add_artice(elem, root, num):
	"""Append an <article n="num"> record for *elem* to *root* and save the tree.

	The title text is cut out of the module-level ``file`` string (the page
	being processed), not taken from *elem*. <title-start> stores the index of
	the first title character (``elem.start_title + 1``), <title-end> stores
	``elem.end_title``. The whole tree is written out through ``xml_write``.
	"""
	first_char = elem.start_title + 1
	article = ET.SubElement(root, 'article', {'n': str(num)})
	ET.SubElement(article, 'title').text = file[first_char:elem.end_title]
	title_meta = ET.SubElement(article, 'title-meta')
	for tag, value in (
		('title-file', elem.filename),
		('title-start', str(first_char)),
		('title-end', str(elem.end_title)),
	):
		ET.SubElement(title_meta, tag).text = value
	xml_write(root)


# Count number of alphabetic letters in word
def count_letters(word):
	"""Return how many characters of *word* are Latin or Russian letters.

	"Ё" and "ё" are listed explicitly: they lie outside the "А-Я" / "а-я"
	code point ranges, so a pattern built from the ranges alone skips them.
	"""
	return len(re.findall(r"[A-Za-zА-Яа-яЁё]", word))

# Check if word is written in CAPS
def check_caps(word):
	"""Return the number of capital letters in *word*, or 0 if it is not written in caps.

	A word is in caps when the share of capital Latin/Russian letters among its
	characters (the punctuation listed below is not counted) is at least
	CAPS_QUOT and the word is not one of EXCEPTIONS (e.g. Roman numerals).

	"Ё" is matched explicitly because it lies outside the "А-Я" range.
	Digits and punctuation are deliberately not counted as capitals: with them
	too many math formulas were detected.
	"""
	num = 0
	len_word = 0
	for letter in word:
		# Same set as the former "+-." range, which also covered "," and "-"
		if re.match(r"[!#$%&'*+,\-.^_`|~:]", letter) is None:
			len_word += 1
		if re.match(r"[A-ZА-ЯЁ]", letter) is not None:
			num += 1
	if len_word == 0 or num / len_word < CAPS_QUOT or word in EXCEPTIONS:
		return 0
	return num

# Check for initials like "I.E."
def check_initials(word):
	"""Return True if *word* contains no two adjacent letters.

	That is the case for initials such as "I.E.", for a single letter and for
	an empty string. "Ё" and "ё" are listed explicitly because they lie outside
	the "А-Я" / "а-я" ranges; without them a word like "Ёж" passed as initials.
	"""
	return re.search(r"[A-Za-zА-Яа-яЁё]{2}", word) is None


# Find next ot prev word boundary (space / newline)
def prev_from(pos, file):
	"""Return the index of the last space or newline strictly before *pos*, or -1.

	A negative *pos* is treated as 0, so nothing is found for it.
	"""
	limit = pos if pos > 0 else 0
	return max(file.rfind(separator, 0, limit) for separator in ('\n', ' '))
def next_from(pos, file, end_replace = True):
	"""Return the index of the first space or newline after *pos*.

	When there is none, ``len(file)`` is returned if *end_replace* is true and
	-1 otherwise.
	"""
	hits = [idx for idx in (file.find(' ', pos + 1), file.find('\n', pos + 1)) if idx != -1]
	if hits:
		return min(hits)
	return len(file) if end_replace else -1


# Main loop: walk through every page file word by word, offer each run of caps
# words to the user as a possible title and store the confirmed ones
for filename in filenames:
	print()
	print("################################ " + filename + " ################################")
	with codecs.open(PAGES_DIR + filename, 'r', 'utf-8') as f:
		file = f.read()

	# The current word is file[word_bound_l+1:word_bound_r]; both bounds point at
	# separators (space / newline), at -1 before the text or at len(file) after it
	word_bound_l = -1
	word_bound_r = next_from(word_bound_l, file, end_replace=False)
	EOF_reached = False

	while not EOF_reached:
		if word_bound_r == -1:
			# No separator is left, so the rest of the file is the last word
			word_bound_r = len(file)
			EOF_reached = True

		word = file[word_bound_l+1:word_bound_r]
		if check_caps(word) < 2 or check_initials(word):
			# Not a title start: go to the next word
			word_bound_l = word_bound_r
			word_bound_r = next_from(word_bound_l, file, end_replace=False)

		else: # Possibly found a title
			# Left border of a title is already known
			start_title = word_bound_l

			# Define right border of a title
			defined_end = False
			end_title = word_bound_r
			while not defined_end:
				word_bound_l = word_bound_r
				word_bound_r = next_from(word_bound_l, file)
				word = file[word_bound_l+1:word_bound_r]

				if word_bound_l == len(file):
					defined_end = True
				elif not check_caps(word) and count_letters(word) < 2:
					# word[:1] is '' for an empty word. Indexing file[word_bound_l+1]
					# instead raised IndexError when the candidate was followed by a
					# trailing separator at the very end of the file.
					if re.match(r"[A-ZА-Яa-zа-я]", word[:1]) is not None:
						# Most possibly belongs to title
						end_title = word_bound_r
					# Otherwise most possibly does NOT belong to the title: keep looking
				elif check_caps(word):
					end_title = word_bound_r
				else:
					defined_end = True

			next_title = False
			while not next_title:
				# Update root in case it's been changed
				root = parse_xml()
				num = len(root) + 1

				# Console output for further user actions
				segment_start = start_title
				segment_end = end_title
				for i in range(LEAD_WORDS):
					segment_start = prev_from(segment_start, file)
				for i in range(AFT_WORDS):
					segment_end = next_from(segment_end, file)

				# Newlines are shown as "$" so that the fragment stays on one line
				out_str = file[segment_start+1:segment_end].replace('\n', '$')
				# Second line: "^" marks under the proposed title
				out_str = f"{num})\n" + out_str + '\n' + ' ' * (start_title - segment_start) + '^' * (end_title - start_title - 1)
				print(out_str)

				# User actions
				response = input()
				try:
					if response == '':
						# Add article
						article = Article()
						article.start_title = start_title
						article.end_title = end_title
						article.filename = filename
						add_artice(article, root, num)
						next_title = True
						word_bound_l = end_title
						word_bound_r = next_from(word_bound_l, file, end_replace=False)
						print(f'Adding article, n="{num}", title="{file[start_title+1:end_title]}"\n\n')
					elif response == 'n' or response == 'т':
						# Do not add this one
						next_title = True
						print("Not an article, skipping\n\n")
					elif response[0] == '.':
						# Shift the right border by a number of characters
						end_title += int(response[1:])
						print("Changing title right border\n\n")
					else:
						# Change title borders by a number of words
						corrections = response.split(' ')
						corrections[0] = int(corrections[0])
						corrections[1] = int(corrections[1])
						if corrections[0] > 0:
							for i in range(abs(corrections[0])):
								start_title = prev_from(start_title, file)
						if corrections[0] < 0:
							for i in range(abs(corrections[0])):
								start_title = next_from(start_title, file)
						if corrections[1] < 0:
							for i in range(abs(corrections[1])):
								end_title = prev_from(end_title, file)
						if corrections[1] > 0:
							for i in range(abs(corrections[1])):
								end_title = next_from(end_title, file)
						print("Changing title borders\n\n")
				except (ValueError, IndexError):
					# Only malformed input is retried (not a number / second number missing).
					# A bare "except" also hid failures to write the xml and Ctrl+C.
					print("########## !!! Failed on input, try again !!! ##########\n\n")


# End reached
print('###########################################################################################')
print('Last requested page processd. Press "Enter" to close this window.')
response = input()

# 1.1. Добавление заголовков по одному

В разделе параметров указать номер страницы и ТОЧНУЮ формулировку заголовка из сырого текста, после чего запустить ячейку

In [None]:
from os import walk
import xml.etree.ElementTree as ET
from xml.dom import minidom
import codecs


############################ VARS ################################
# Directory with the page files, and the output xml (EXIT_DIR + EXIT_FILE, concatenated as-is)
PAGES_DIR = "./matphys/rpages/"
EXIT_DIR = "./matphys/"
EXIT_FILE = "FMEv2.xml"
# Page number that selects the "rp-<PAGE>_*.mmd" file, and the exact title text to look for in it
PAGE = 224
TITLE = 'ИНФИНИТЕЗИМАЛЬНАЯ СВЯЗНОСТЬ'
##################################################################



class Article:
	# Plain record describing one title: the name of the page file and the
	# borders of the title inside the text of that file.
	filename = ''
	start_title = end_title = 0



# Write xml tree to file
def prettify(elem):
	"""Return *elem* serialized as an XML document string indented by two spaces."""
	serialized = ET.tostring(elem, 'utf-8')
	return minidom.parseString(serialized).toprettyxml(indent="  ")
def xml_write(root):
	"""Serialize *root* with ``prettify`` and overwrite EXIT_DIR + EXIT_FILE with it.

	The document is rendered before the file is opened. Opening with mode 'w'
	truncates the file at once, so rendering afterwards would leave an empty
	file behind (and lose every article stored so far) if ``prettify`` raised,
	e.g. on a character that is not allowed in XML.
	"""
	document = prettify(root)
	with codecs.open(EXIT_DIR + EXIT_FILE, 'w', 'utf-8') as f:
		f.write(document)


# Collect the files of the requested page ("rp-<PAGE>_*.mmd")
filenames_raw = next(walk(PAGES_DIR), (None, None, []))[2]  # [] if the directory cannot be listed
filenames = []
for i in range(PAGE, PAGE + 1):
	beginning = "rp-" + str(i) + "_"
	for filename in filenames_raw:
		if filename.startswith(beginning) and filename.endswith(".mmd"):
			filenames.append(filename)


# Create an empty output xml if there is none yet
filenames_raw = next(walk(EXIT_DIR), (None, None, []))[2]  # [] if the directory cannot be listed
if EXIT_FILE not in filenames_raw:
	root = ET.Element('data')
	xml_write(root)


def remove_xml_spaces(elem):
	"""Strip formatting residue from *elem* and all of its descendants.

	Every tail is cleared, and a text made only of space characters (the empty
	string included) is replaced by None. The tree is changed in place and
	*elem* itself is returned.
	"""
	elem.tail = None
	content = elem.text
	if content is not None and all(char == ' ' for char in content):
		elem.text = None
	for child in elem:
		remove_xml_spaces(child)
	return elem

# Parse existing xml. The file is glued into one string without line breaks so that the
# newlines added by prettify() do not pile up; line breaks inside texts are removed as well.
# splitlines() removes only real line terminators: cutting the last character of every line
# destroyed the closing ">" of the root element when the file did not end with a newline.
with codecs.open(EXIT_DIR + EXIT_FILE, 'r', 'utf-8') as f:
	exit_string = ''.join(f.read().splitlines())
root = ET.fromstring(exit_string)
# Remove empty tails and texts
root = remove_xml_spaces(root)


# Add article title and metadata to xml tree
def add_artice(elem, root, num):
	"""Append an <article n="num"> record for *elem* to *root* and save the tree.

	The title text is cut out of the module-level ``file`` string, not taken
	from *elem*. <title-start> stores the index of the first title character
	(``elem.start_title + 1``), <title-end> stores ``elem.end_title``. The
	whole tree is written out through ``xml_write``.
	"""
	first_char = elem.start_title + 1
	article = ET.SubElement(root, 'article', {'n': str(num)})
	ET.SubElement(article, 'title').text = file[first_char:elem.end_title]
	title_meta = ET.SubElement(article, 'title-meta')
	for tag, value in (
		('title-file', elem.filename),
		('title-start', str(first_char)),
		('title-end', str(elem.end_title)),
	):
		ET.SubElement(title_meta, tag).text = value
	xml_write(root)

# Read requested file (only the first file found for the page is searched)
if not filenames:
	raise FileNotFoundError(f'No "rp-{PAGE}_*.mmd" file found in {PAGES_DIR}')
if not TITLE:
	raise ValueError('TITLE must not be empty')
with codecs.open(PAGES_DIR + filenames[0], 'r', 'utf-8') as f:
	file = f.read()

# Find every occurrence of the title and add it as a separate article
search_from = 0
num = len(root) + 1
while True:
	found = file.find(TITLE, search_from)
	if found == -1:
		break

	article = Article()
	# add_artice() expects the index of the character BEFORE the title (-1 when the
	# title opens the file); clamping it to 0 cut off the first letter of such a title
	article.start_title = found - 1
	article.end_title = found + len(TITLE)
	article.filename = filenames[0]
	add_artice(article, root, num)

	# Every article gets its own number, and the search goes on AFTER this occurrence:
	# restarting it before the occurrence found the same place again and never finished
	num += 1
	search_from = article.end_title

# 2. Исправление ошибок в заголовках

Состоит из двух частей: "составитель пар" и "подстановщик".

Сначала "составитель" формирует xml-список всех заголовков с возможными автоматическими исправлениями (в формате было / стало):
1. замена латиницы на аналогичную кириллицу;
2. замена заданных буквосочетаний (см. параметры);
3. удаление обрамляющих знаков препинания;
4. замена всех букв на заглавные (в том числе это избавляет от дальнейшей необходимости исправлять имена);
5. слияние разорванных на отдельные буквы слов (если рядом оказываются несколько таких слов, то они оказываются слиты вместе).

Этот список необходимо просмотреть и исправить оставшиеся ошибки.

Затем запустить "подстановщик", который заменит все заголовки на исправленные.

## 2.1. Составитель пар "оригинальный - исправленный" для заголовков:

In [None]:
from os import walk
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
import codecs


############################ VARS ################################
# WORK_DIR is concatenated with the file names as-is, so it must end with "/"
WORK_DIR = "./matphys/"
# Titles are read from INPUT_FILE; the "original / corrected" pairs are written to CORRECTION_FILE
INPUT_FILE = "FMEv2.xml"
CORRECTION_FILE = "FMEcorr.xml"
# Substring replacements applied to every title after the Latin-to-Cyrillic letter pass.
# The entries are processed in the order given, each one until the substring no longer occurs.
COMBINATIONS_CORR = {
	'U' : 'И',
	'r' : 'г',
	'n' : 'п',
	'Y' : 'У',
	' -' : '-',
	'- ' : '-',
	'S' : 'Я',
	'ХК' : 'Ж',
	'0' : 'О',
	'3' : 'З',
	'6' : 'б'
}
##################################################################



# Write xml tree to file
def prettify(elem):
	"""Return *elem* serialized as an XML document string indented by two spaces."""
	serialized = ET.tostring(elem, 'utf-8')
	return minidom.parseString(serialized).toprettyxml(indent="  ")
def xml_write(root):
	"""Serialize *root* with ``prettify`` and overwrite WORK_DIR + CORRECTION_FILE with it.

	The document is rendered before the file is opened. Opening with mode 'w'
	truncates the file at once, so rendering afterwards would leave an empty
	file behind if ``prettify`` raised, e.g. on a character that is not
	allowed in XML.
	"""
	document = prettify(root)
	with codecs.open(WORK_DIR + CORRECTION_FILE, 'w', 'utf-8') as f:
		f.write(document)
						

# The input file is required. Fail right away with a clear message and leave the corrections
# file alone: the former check overwrote it with an empty tree and then crashed on open() anyway.
filenames_raw = next(walk(WORK_DIR), (None, None, []))[2]  # [] if the directory cannot be listed
if INPUT_FILE not in filenames_raw:
	raise FileNotFoundError('Input file not found: ' + WORK_DIR + INPUT_FILE)


# Parse input xml. splitlines() removes only real line terminators: cutting the last character
# of every line destroyed the closing ">" of the root element when the file did not end with a newline.
with codecs.open(WORK_DIR + INPUT_FILE, 'r', 'utf-8') as f:
	exit_string = ''.join(f.read().splitlines())
root = ET.fromstring(exit_string)


# Get all the titles into a dict (original title -> title being corrected) and remember their pages
titles_dict = {}
pages_dict = {}
for article in root:
	for elem in article:
		if elem.tag == 'title':
			title = elem.text
			# An empty <title/> has text None: keep None as the key, correct an empty string
			titles_dict[title] = title or ''
		if elem.tag == 'title-meta':
			for eelem in elem:
				if eelem.tag == 'title-file':
					# Page number = the part of the file name between "-" and "_"
					pages_dict[title] = eelem.text[eelem.text.find('-')+1:eelem.text.find('_')]


# Latin letters (keys) that look like Cyrillic ones (values).
# NOTE: the 'x' entry appeared to map the Latin letter to itself; it now maps to Cyrillic "х".
letter_corr = {'A':'А', 'a':'а', 'B':'В', 'b':'Ь', 'E':'Е', 'e':'е', 'H':'Н', 'K':'К', 'M':'М', 'O':'О', 'P':'Р', 'p':'р', 'T':'Т', 'X':'Х', 'x':'х', 'y':'у'}
letter_table = str.maketrans(letter_corr)
# Symbols removed from both ends of a title. Same set as the former regex class, whose "+-."
# part was a range that also covered "," and "-".
bounding_symbols = "!#%&'*+,-.^_`|~:;"

for title in titles_dict:
	title_new = titles_dict[title]

	# Correct latin letters
	title_new = title_new.translate(letter_table)

	# Correct preferred combinations (one occurrence at a time, until none is left)
	for comb, replacement in COMBINATIONS_CORR.items():
		while comb in title_new:
			title_new = title_new.replace(comb, replacement, 1)

	# Remove bounding symbols; unlike indexing title_new[0], strip() copes with a title
	# that consists of such symbols only
	title_new = title_new.strip(bounding_symbols)

	# CAPS
	title_new = title_new.upper()

	# Merge single-lettered words: in " a b " the space between two one-character words
	# is marked with "№"; all the marks are removed afterwards
	chars = list(' ' + title_new + ' ')
	for i in range(len(chars) - 4):
		if chars[i] in (' ', '№') and chars[i + 2] == ' ' and chars[i + 4] == ' ':
			chars[i + 2] = '№'
	title_new = ''.join(chars).replace('№', '').strip(' ')

	titles_dict[title] = title_new


# Write corrections xml
root = ET.Element('data')
for old, new in titles_dict.items():
	pair = ET.SubElement(root, 'pair')
	ET.SubElement(pair, 'title_old').text = old
	ET.SubElement(pair, 'title_new').text = new
	# .get(): a title without a <title-file> gets an empty <page/> instead of a KeyError
	ET.SubElement(pair, 'page').text = pages_dict.get(old)
xml_write(root)

## 2.2. Подстановщик исправленных заголовков:

In [None]:
import xml.etree.ElementTree as ET
from xml.dom import minidom
import codecs


############################ VARS ################################
# WORK_DIR is concatenated with the file names as-is, so it must end with "/"
WORK_DIR = "./matphys/"
# Articles are read from INPUT_FILE, the "original / corrected" pairs from CORRECTION_FILE;
# the articles with replaced titles are written to EXIT_FILE
INPUT_FILE = "FMEv2.xml"
CORRECTION_FILE = "FMEcorr.xml"
EXIT_FILE = "FMEtitles.xml"
##################################################################



# Write xml tree to file
def prettify(elem):
	"""Return *elem* serialized as an XML document string indented by two spaces."""
	serialized = ET.tostring(elem, 'utf-8')
	return minidom.parseString(serialized).toprettyxml(indent="  ")
def xml_write(root):
	"""Serialize *root* with ``prettify`` and overwrite WORK_DIR + EXIT_FILE with it.

	The document is rendered before the file is opened. Opening with mode 'w'
	truncates the file at once, so rendering afterwards would leave an empty
	file behind if ``prettify`` raised, e.g. on a character that is not
	allowed in XML.
	"""
	document = prettify(root)
	with codecs.open(WORK_DIR + EXIT_FILE, 'w', 'utf-8') as f:
		f.write(document)


# Parse the corrections xml. splitlines() removes only real line terminators: cutting the last
# character of every line destroyed the closing ">" of the root element when the file did not
# end with a newline (easy to get, since this file is meant to be edited by hand).
with codecs.open(WORK_DIR + CORRECTION_FILE, 'r', 'utf-8') as f:
	exit_string = ''.join(f.read().splitlines())
root = ET.fromstring(exit_string)


# Get all the corrections into a dict: original title -> corrected title
titles_dict = {}
for pair in root:
	old_elem = pair.find('title_old')
	new_elem = pair.find('title_new')
	if old_elem is None or new_elem is None:
		# An incomplete pair is skipped; it used to silently reuse the values of the previous pair
		continue
	titles_dict[old_elem.text] = new_elem.text


def remove_xml_spaces(elem):
	"""Strip formatting residue from *elem* and all of its descendants.

	Every tail is cleared, and a text made only of space characters (the empty
	string included) is replaced by None. The tree is changed in place and
	*elem* itself is returned.
	"""
	elem.tail = None
	content = elem.text
	if content is not None and all(char == ' ' for char in content):
		elem.text = None
	for child in elem:
		remove_xml_spaces(child)
	return elem
# Parse the xml with the articles. The file is glued into one string without line breaks so that
# the newlines added by prettify() do not pile up. splitlines() removes only real line terminators:
# cutting the last character of every line destroyed the closing ">" of the root element when the
# file did not end with a newline.
with codecs.open(WORK_DIR + INPUT_FILE, 'r', 'utf-8') as f:
	exit_string = ''.join(f.read().splitlines())
root = ET.fromstring(exit_string)
# Remove empty tails and texts
root = remove_xml_spaces(root)


# Replace titles
for article in root:
	for elem in article:
		if elem.tag == 'title':
			if elem.text in titles_dict:
				elem.text = titles_dict[elem.text]
			else:
				# E.g. an article added after the corrections file was built: keep its title
				# and report it instead of aborting the whole run with a bare KeyError
				print(f'No correction found for title "{elem.text}", leaving it unchanged')
xml_write(root)

# 3. Сортировщик / сливщик файлов с заголовками

Сортирует статьи в файлах из данного списка в порядке страница-расположение, т.е. (если не сказано иного) в алфавитном порядке и выводит в один выходной файл. Также порядковый номер заменяется uri формата "http://libmeta.ru/fme/article/1_Kraevaya"

In [None]:
import xml.etree.ElementTree as ET
from xml.dom import minidom
import codecs
from transliterate import translit, get_available_language_codes


############################ VARS ################################
# WORK_DIR is concatenated with the file names as-is, so it must end with "/"
WORK_DIR = "./results/"
# Title files to merge (read from WORK_DIR) and the merged file written to WORK_DIR
INPUT_FILES = ["FMEtitles-p5-100.xml", "FMEtitles-p101-200.xml", "FMEtitles-p201-300.xml"]
EXIT_FILE = "FMEtitles-merged.xml"
##################################################################



class Article:
	# Plain record for one article of the merged list. Every field is kept
	# as the string that was read from the input xml.
	title = start_title = end_title = filename = ''



# Write xml tree to file
def prettify(elem):
	"""Return *elem* serialized as an XML document string indented by two spaces."""
	serialized = ET.tostring(elem, 'utf-8')
	return minidom.parseString(serialized).toprettyxml(indent="  ")
def xml_write(root):
	"""Serialize *root* with ``prettify`` and overwrite WORK_DIR + EXIT_FILE with it.

	The document is rendered before the file is opened. Opening with mode 'w'
	truncates the file at once, so rendering afterwards would leave an empty
	file behind if ``prettify`` raised, e.g. on a character that is not
	allowed in XML.
	"""
	document = prettify(root)
	with codecs.open(WORK_DIR + EXIT_FILE, 'w', 'utf-8') as f:
		f.write(document)


# Add article title and metadata to xml tree
def add_artice(elem, root, num):
	"""Append an <article> record for *elem* to *root*.

	The article is identified by the uri
	"http://libmeta.ru/fme/article/<num>_<first word of the title, transliterated>".
	<title-start> is written as ``int(elem.start_title) + 1``; <title-end> and
	the file name are copied unchanged. Nothing is written to disk here.
	"""
	# split() keeps a single-word title whole; slicing up to find(' ') == -1
	# chopped off the last letter of such a title
	first_word = elem.title.split(' ', 1)[0]
	uri = "http://libmeta.ru/fme/article/" + str(num) + "_" + translit(first_word, 'ru', True)
	article = ET.SubElement(root, 'article', {'uri': uri})
	title = ET.SubElement(article, 'title')
	title.text = elem.title
	title_meta = ET.SubElement(article, 'title-meta')
	title_file = ET.SubElement(title_meta, 'title-file')
	title_file.text = elem.filename
	title_start = ET.SubElement(title_meta, 'title-start')
	title_start.text = str(int(elem.start_title) + 1)
	title_end = ET.SubElement(title_meta, 'title-end')
	title_end.text = elem.end_title


def remove_xml_spaces(elem):
	"""Strip formatting residue from *elem* and all of its descendants.

	Every tail is cleared, and a text made only of space characters (the empty
	string included) is replaced by None. The tree is changed in place and
	*elem* itself is returned.
	"""
	elem.tail = None
	content = elem.text
	if content is not None and all(char == ' ' for char in content):
		elem.text = None
	for child in elem:
		remove_xml_spaces(child)
	return elem
def parse_xml(filename):
	"""Load WORK_DIR + *filename* and return its root without formatting whitespace.

	The file is glued into one string with all line breaks removed, so that the
	newlines added by ``prettify`` do not pile up on every read-write cycle.
	NOTE: a line break inside an element's text is removed the same way.

	``splitlines`` removes only real line terminators. Cutting the last
	character of every line (``line[:-1]``) destroyed the closing ">" of the
	root element whenever the file did not end with a newline.
	"""
	with codecs.open(WORK_DIR + filename, 'r', 'utf-8') as f:
		exit_string = ''.join(f.read().splitlines())
	root = ET.fromstring(exit_string)
	root = remove_xml_spaces(root)
	return root


# Collect all the articles, keyed by (page number, title start position)
articles_dict = {}
for filename in INPUT_FILES:
	root = parse_xml(filename)
	for article in root:
		num = ()
		page = pos = title = start = end = file = ''
		for tag in article:
			if tag.tag == 'title':
				title += tag.text
			if tag.tag == 'title-meta':
				# Page number = the part of the file name between "-" and "_"
				for taag in tag:
					if taag.tag == 'title-file':
						page += taag.text[taag.text.find('-')+1:taag.text.find('_')]
				for taag in tag:
					if taag.tag == 'title-start':
						pos += taag.text
						start += taag.text
					elif taag.tag == 'title-end':
						end += taag.text
					elif taag.tag == 'title-file':
						file += taag.text
				num = (int(page), int(pos))
		# An article found later under the same key replaces the earlier one
		articles_dict[num] = {'title':title, 'file':file, 'start':start, 'end':end}


# Sort the keys and write the articles in that order, numbering them from 1
root = ET.Element('data')
nums_list = sorted(articles_dict)
for num, key in enumerate(nums_list):
	record = articles_dict[key]
	article = Article()
	article.title = record['title']
	article.start_title = record['start']
	article.end_title = record['end']
	article.filename = record['file']
	add_artice(article, root, num + 1)
xml_write(root)

# 4. Парсер текстов статей

По информации из указанного файла с заголовками вытаскивает в сыром виде тексты статей. Каждая статья помещается в свой .xml-файл, имя которого содержит номер статьи и первое слово из заголовка.

In [56]:
import xml.etree.ElementTree as ET
from xml.dom import minidom
import codecs


############################ VARS ################################
# Merged titles file to read
TITLES_FILE = "./results/FMEtitles-merged.xml"
# Directory with the page files; concatenated with the file names as-is, so it must end with "/"
PAGES_DIR = "./matphys/rpages/"
# Directory for the article files; concatenated with the file names as-is, so it must end with "/"
EXIT_DIR = "./results/FMEarticles/"
##################################################################


class Article:
	"""One article: where its text lies in the page files and its xml form.

	The text starts at ``start_pos`` of ``start_file`` and ends at ``end_pos``
	of ``end_file``. ``make_xml`` fills ``text`` and ``xml``.
	"""
	start_file = ''
	start_pos = 0
	end_file = ''
	end_pos = 0
	text = ''
	uri = ''
	title = ''
	xml = ''

	def get_text(self):
		"""Read the article text from the page file(s) into ``self.text``.

		The text is assigned, not appended, so a repeated call does not
		duplicate it. NOTE: only ``start_file`` and ``end_file`` are read;
		page files lying between them are not included.
		"""
		with codecs.open(PAGES_DIR + self.start_file, 'r', 'utf-8') as f:
			first_page = f.read()
		if self.start_file == self.end_file:
			self.text = first_page[self.start_pos:self.end_pos]
			return
		with codecs.open(PAGES_DIR + self.end_file, 'r', 'utf-8') as f:
			last_page = f.read()
		# A space between the two parts prevents word merging
		self.text = first_page[self.start_pos:] + ' ' + last_page[:self.end_pos]

	@staticmethod
	def prettify(elem):
		"""Return *elem* serialized as an XML document string indented by two spaces.

		Declared static: it was written without ``self`` and called as a bare
		name, which works only if a module-level ``prettify`` happens to exist.
		"""
		rough_string = ET.tostring(elem, 'utf-8')
		reparsed = minidom.parseString(rough_string)
		return reparsed.toprettyxml(indent="  ")

	def make_xml(self):
		"""Read the text and build the article document into ``self.xml``.

		<start> and <end> hold the page numbers, i.e. the part of the file
		names between position 3 and the next "_". The text is stored twice,
		in <text> and in <text_orig>; the remaining elements are left empty.
		"""
		self.get_text()

		article = ET.Element("article", {'uri':self.uri})
		title = ET.SubElement(article, 'title')
		title.text = self.title
		author = ET.SubElement(article, 'author')
		title_short = ET.SubElement(article, 'title_short')
		pages = ET.SubElement(article, 'pages')
		start = ET.SubElement(pages, 'start')
		start.text = self.start_file[3:self.start_file.find('_', 3)]
		end = ET.SubElement(pages, 'end')
		end.text = self.end_file[3:self.end_file.find('_', 3)]
		literature = ET.SubElement(article, 'literature')
		literature_orig = ET.SubElement(literature, 'literature_orig')
		formulas_remote = ET.SubElement(article, 'formulas_main')
		formulas_inline = ET.SubElement(article, 'formulas_aux')
		relations = ET.SubElement(article, 'relations')
		text = ET.SubElement(article, 'text')
		text.text = self.text
		text_orig = ET.SubElement(article, 'text_orig')
		text_orig.text = self.text

		self.xml = self.prettify(article)
	
	

class Title:
	# Plain record for one entry of the titles file: the title text, the page
	# file it was found in, its borders inside that file and the article uri.
	text = file = uri = ''
	start_pos = end_pos = 0


def remove_xml_spaces(elem):
	"""Strip formatting residue from *elem* and all of its descendants.

	Every tail is cleared, and a text made only of space characters (the empty
	string included) is replaced by None. The tree is changed in place and
	*elem* itself is returned.
	"""
	elem.tail = None
	content = elem.text
	if content is not None and all(char == ' ' for char in content):
		elem.text = None
	for child in elem:
		remove_xml_spaces(child)
	return elem

def parse_xml(filename):
	"""Load the xml file *filename* and return its root without formatting whitespace.

	The file is glued into one string with all line breaks removed, so that the
	newlines added by ``prettify`` do not show up in the parsed tree.
	NOTE: a line break inside an element's text is removed the same way.

	``splitlines`` removes only real line terminators. Cutting the last
	character of every line (``line[:-1]``) destroyed the closing ">" of the
	root element whenever the file did not end with a newline.
	"""
	with codecs.open(filename, 'r', 'utf-8') as f:
		exit_string = ''.join(f.read().splitlines())
	root = ET.fromstring(exit_string)
	# Remove empty tails and texts
	root = remove_xml_spaces(root)
	return root

def get_title(n, root):
	"""Return a ``Title`` filled from the entry of *root* whose uri carries number *n*.

	The number is read from the uri between position 30 and the next "_".
	If several entries carry that number, the values of the later ones win;
	if none does, a ``Title`` with the default values is returned.
	"""
	otitle = Title()
	for entry in root:
		uri = entry.attrib['uri']
		if int(uri[30:uri.find('_', 30)]) != n:
			continue
		otitle.uri = uri
		for tag in entry:
			if tag.tag == 'title':
				otitle.text = tag.text
			elif tag.tag == 'title-meta':
				for ttag in tag:
					if ttag.tag == 'title-file':
						otitle.file = ttag.text
					elif ttag.tag == 'title-start':
						otitle.start_pos = int(ttag.text)
					elif ttag.tag == 'title-end':
						otitle.end_pos = int(ttag.text)
	return otitle


root = parse_xml(TITLES_FILE)

# Create articles list: an article starts where its title ends and lasts until the next title
articles_list = []
title = Title()
for i in range(len(root)):
	title = get_title(i + 1, root)
	if i:
		articles_list[-1].end_file = title.file
		# The stored title start is shifted, hence "- 2". The result is clamped at 0: for a
		# title at the very start of a page it was negative, and a negative slice end made
		# the previous article swallow almost the whole next page.
		articles_list[-1].end_pos = max(title.start_pos - 2, 0)
	articles_list.append(Article())
	articles_list[-1].uri = title.uri
	articles_list[-1].title = title.text
	articles_list[-1].start_file = title.file
	articles_list[-1].start_pos = title.end_pos
	# Until the next title is known, the article lasts to the end of its own page file
	articles_list[-1].end_file = title.file
	with codecs.open(PAGES_DIR + title.file, 'r', 'utf-8') as f:
		articles_list[-1].end_pos = len(f.read())

# Parse texts themselves and write each article to "<number>_<Word>.xml"
for i in range(len(articles_list)):
	articles_list[i].make_xml()
	with codecs.open(EXIT_DIR + '' + articles_list[i].uri[30:] + '.xml', 'w', 'utf-8') as f:
		f.write(articles_list[i].xml)

# 5. Парсер формул

Выносит из текстов ранее подготовленных xml-файлов статей сначала выносные, а затем строчные формулы, оставляя на их месте ссылку внутри их математического окружения. 

Минимальная длина в символах, которой должна обладать строчная формула, настраивается.

In [58]:
from os import walk
import xml.etree.ElementTree as ET
from xml.dom import minidom
import codecs


############################ VARS ################################
# Directory with the article xml files; every file in it is processed and rewritten in place
ARTICLES_DIR = "./results/FMEarticles/"
# Inline formulas shorter than this number of characters are left in the text (0 = extract all)
MIN_INLINE_LEN = 0
##################################################################


# Write xml tree to file
def prettify(elem):
	"""Return *elem* serialized as an XML document string indented by two spaces."""
	serialized = ET.tostring(elem, 'utf-8')
	return minidom.parseString(serialized).toprettyxml(indent="  ")


def remove_xml_spaces(elem):
	"""Strip formatting residue from *elem* and all of its descendants.

	Every tail is cleared, and a text made only of space characters (the empty
	string included) is replaced by None. The tree is changed in place and
	*elem* itself is returned.
	"""
	elem.tail = None
	content = elem.text
	if content is not None and all(char == ' ' for char in content):
		elem.text = None
	for child in elem:
		remove_xml_spaces(child)
	return elem

def parse_xml(filename):
	"""Load the xml file *filename* and return its root without formatting whitespace.

	The document is parsed as it is, so line breaks inside the texts survive.
	Gluing the lines of the file together before parsing removed them, and it
	also cut the closing ">" of the root element whenever the file did not end
	with a newline.

	The whitespace added by ``prettify`` is dropped afterwards: every tail is
	cleared and every text consisting of whitespace only is replaced by None.
	"""
	with codecs.open(filename, 'r', 'utf-8') as f:
		root = ET.fromstring(f.read())
	for elem in root.iter():
		elem.tail = None
		if elem.text is not None and not elem.text.strip():
			elem.text = None
	return root


# Get filenames needed
filenames = next(walk(ARTICLES_DIR), (None, None, []))[2]  # [] if the directory cannot be listed

for filename in filenames:
	with codecs.open(ARTICLES_DIR + filename, 'r', 'utf-8') as f:
		file = f.read()
	article = parse_xml(ARTICLES_DIR + filename)
	# The texts are taken from a direct parse of the raw file. That keeps their line breaks
	# whatever parse_xml() does with them, and decodes the XML entities: copying the raw
	# substring between the tags left "&amp;", "&lt;" etc. undecoded, so they were escaped
	# a second time when the file was written back.
	raw_article = ET.fromstring(file)
	# Reset per file, so that nothing is carried over from the previous one
	text = formulas_main = formulas_aux = None
	for subelem in article:
		if subelem.tag == 'text':
			subelem.text = raw_article.findtext('text')
			text = subelem
		if subelem.tag == 'text_orig':
			subelem.text = raw_article.findtext('text_orig')
		if subelem.tag == 'formulas_main':
			formulas_main = subelem
		if subelem.tag == 'formulas_aux':
			formulas_aux = subelem
	if text is None or formulas_main is None or formulas_aux is None:
		print('Skipping ' + filename + ': <text>, <formulas_main> or <formulas_aux> is missing')
		continue
	if text.text is None:
		text.text = ''

	# Formula uri = ".../formula/<kind>" + "/<article number>_" + <formula number> + "_<Word>"
	article_uri = article.attrib['uri']
	separator = article_uri.find('_')
	uri_head = article_uri[article_uri.rfind('/', 0, separator):separator+1]
	uri_tail = article_uri[separator:]

	# Get main formulas: the content of every \[ ... \] is replaced by a reference
	pos_find = 0
	n = 1
	while True:
		opening = text.text.find('\\[', pos_find)
		if opening == -1:
			break
		pos_start = opening + 2
		pos_end = text.text.find('\\]', pos_start)
		if pos_end == -1:
			# Unclosed "\[": there is no formula to cut out
			break
		# Leave the newlines that surround the formula inside the environment
		while pos_start < pos_end and text.text[pos_start] == '\n':
			pos_start += 1
		while pos_end > pos_start and text.text[pos_end-1] == '\n':
			pos_end -= 1
		pos_find = pos_start
		uri = 'http://libmeta.ru/fme/formula/main' + uri_head + str(n) + uri_tail
		n += 1
		formula = ET.SubElement(formulas_main, 'formula', {'uri':uri})
		formula.text = text.text[pos_start:pos_end]
		text.text = text.text[:pos_start] + 'URI[[' + uri + ']]/URI' + text.text[pos_end:]

	# Get auxilary formulas: the content of every $ ... $ is replaced by a reference
	pos_find = 0
	n = 1
	# If the number of dollar symbols is not even assume that first one is garbage from title
	if text.text.count('$') % 2:
		pos_find = text.text.find('$')
		text.text = text.text[:pos_find] + '#' + text.text[pos_find+1:]
	while text.text.find('$', pos_find) != -1:
		pos_start = text.text.find('$', pos_find) + 1
		pos_end = text.text.find('$', pos_start)
		if pos_end == -1:
			break
		while pos_start < pos_end and text.text[pos_start] == '\n':
			pos_start += 1
		while pos_end > pos_start and text.text[pos_end-1] == '\n':
			pos_end -= 1
		pos_find = pos_start
		if pos_end - pos_start >= MIN_INLINE_LEN:
			uri = 'http://libmeta.ru/fme/formula/aux' + uri_head + str(n) + uri_tail
			n += 1
			formula = ET.SubElement(formulas_aux, 'formula', {'uri':uri})
			formula.text = text.text[pos_start:pos_end]
			text.text = text.text[:pos_start] + 'URI[[' + uri + ']]/URI' + text.text[pos_end:]
		# Continue after the closing "$"
		pos_find = text.text.find('$', pos_find) + 1

	# Render first, open second: mode 'w' truncates the article file at once
	document = prettify(article)
	with codecs.open(ARTICLES_DIR + filename, 'w', 'utf-8') as f:
		f.write(document)

# 5.1. Вынос формул

Выносит все формулы в отдельный файл с указанием типа для возможной последующей обработки.

In [2]:
from os import walk
import xml.etree.ElementTree as ET
from xml.dom import minidom
import codecs


############################ VARS ################################
# Directory with the article xml files; every file in it is read
ARTICLES_DIR = "./results/FMEarticles/"
# Full path of the file that receives all the formulas
EXIT_FILE = "./results/FMEformulas.xml"
##################################################################


# Write xml tree to file
def prettify(elem):
	"""Return *elem* serialized as an XML document string indented by two spaces."""
	serialized = ET.tostring(elem, 'utf-8')
	return minidom.parseString(serialized).toprettyxml(indent="  ")


def remove_xml_spaces(elem):
	"""Strip formatting residue from *elem* and all of its descendants.

	Every tail is cleared, and a text made only of space characters (the empty
	string included) is replaced by None. The tree is changed in place and
	*elem* itself is returned.
	"""
	elem.tail = None
	content = elem.text
	if content is not None and all(char == ' ' for char in content):
		elem.text = None
	for child in elem:
		remove_xml_spaces(child)
	return elem

def parse_xml(filename):
	"""Load the xml file *filename* and return its root without formatting whitespace.

	The document is parsed as it is, so line breaks inside formulas and texts
	survive. Gluing the lines of the file together before parsing removed
	them, which joins e.g. a LaTeX command at the end of a line with the
	first word of the next line; it also cut the closing ">" of the root
	element whenever the file did not end with a newline.

	The whitespace added by ``prettify`` is dropped afterwards: every tail is
	cleared and every text consisting of whitespace only is replaced by None.
	"""
	with codecs.open(filename, 'r', 'utf-8') as f:
		root = ET.fromstring(f.read())
	for elem in root.iter():
		elem.tail = None
		if elem.text is not None and not elem.text.strip():
			elem.text = None
	return root


# Get filenames needed
filenames = next(walk(ARTICLES_DIR), (None, None, []))[2]  # [] if the directory cannot be listed


formulas = ET.Element('formulas')

for filename in filenames:
	root = parse_xml(ARTICLES_DIR + filename)
	# The containers are looked up anew for every file. Variables kept from the previous file
	# re-added its formulas for an article without containers, and were undefined if the very
	# first file had none.
	for container_tag, formula_type in (('formulas_main', 'main'), ('formulas_aux', 'aux')):
		for container in root.findall(container_tag):
			for formula in container:
				formula.attrib['type'] = formula_type
				formulas.append(formula)

# Render first, open second: mode 'w' truncates the output file at once
document = prettify(formulas)
with codecs.open(EXIT_FILE, 'w', 'utf-8') as f:
	f.write(document)