In [74]:
import re
import os
import json
from pathlib import Path

class TexProcesser():
    '''
    Latex Preprocessing 
    '''
    
    def __init__(self, latex_dir_path, output_path):
        self.latex_folder = Path(latex_dir_path)
        self.output_path = output_path
        
    def process_tex_files(self,):
        '''
        Latex Preprocessing
        '''
        
        for file_path in self.latex_folder.glob('**/*.tex'):
            with open(file_path, 'r', encoding='utf-8') as file:
                latex_content = file.read()
        
            matches = self._extract_info(latex_content=latex_content)
            print(matches)
            json_str = json.dumps(matches, ensure_ascii=False)
            
            with open(self.output_path, 'a', encoding='utf-8') as file:
                file.write(json_str + '\n')
        
    def _extract_info(self, latex_content=None):
        
        patterns = {
            'title': re.compile(r'\\title\{(.*?)\}', re.DOTALL),
            'etitle': re.compile(r'\\etitle\{(.*?)\}', re.DOTALL),
            'jabstract': re.compile(r'\\jabstract\{(.*?)\}', re.DOTALL),
            'eabstract': re.compile(r'\\eabstract\{(.*?)\}', re.DOTALL),                      
        }
        
        matches = {key: [] for key in patterns.keys()}
        
        for key, pattern in patterns.items():
            matches[key] = pattern.findall(latex_content)
            
        for key, values in matches.items():
            for idx, value in enumerate(values):
                if key == 'title' or key == 'etitle':
                    matches[key][idx] = re.sub(r'\s*\\\\\n\s*', ' ', value).strip()
                elif key == 'jabstract' or key == 'eabstract':
                    matches[key][idx] = re.sub(r'\s*\n\s*', ' ', value).strip()
        
        return matches
        
        
    # def _strip_equation(self,):
    #     pass
    
    # def _extract_title(self, latex_content=None):
    #     '''
    #     Extract title from latex file
    #     '''
    #     title = self._match_title(latex_content=latex_content, title_type='title')
    #     title = re.sub(r'\s*\\\\\n\s*', ' ', title)
        
    #     etitle = self._match_title(latex_content=latex_content, title_type='etitle')
    #     etitle = re.sub(r'\s*\\\\\n\s*', ' ', etitle)
        
    #     return title, etitle
        
    # def _match_title(self, latex_content=None, title_type=None,):
    #     '''
    #     re matching for title/etitle
    #     '''
    #     if title_type == 'title':
    #         pattern = re.compile(r'\\title\{(.*?)\}', re.DOTALL)
    #     elif title_type == 'etitle':
    #         pattern = re.compile(r'\\etitle\{(.*?)\}', re.DOTALL)
    #     match = pattern.search(latex_content)
        
    #     if match:
    #         title = match.group(1).strip()
    #     else:
    #         title = None
            
    #     return title
    

In [76]:
tex_processer = TexProcesser(latex_dir_path='./NLP_LATEX_CORPUS/V01/', output_path='./data/processed.json')
tex_processer.process_tex_files()


{'title': ['表層表現中の情報に基づく文章構造の自動抽出'], 'etitle': ['Automatic Detection of Discourse Structure by Checking Surface Information in Sentences'], 'jabstract': ['テキストや談話を理解するためには，まずその文章構造を理解する必要があ る．文章構造に関する従来の多くの研究では，解析に用いられる知識の問題 に重点がおかれていた．しかし，量的/質的に十分な計算機用の知識が作成さ れることはしばらくの間期待できない．本論文では，知識に基づく文理解と いう処理を行なわずに，表層表現中の種々の情報を用いることにより科学技 術文の文章構造を自動的に推定する方法を示す．表層表現中の情報としては， 種々の手がかり表現，同一/同義の語/句の出現，2文間の類似性，の3つのも のに着目した．実験の結果これらの情報を組み合わせて利用することにより 科学技術文の文章構造のかなりの部分が自動的に推定可能であることがわかっ た．'], 'eabstract': ['To understand a text or dialogue, one must track the discourse structure. While work on discourse structure has mainly focused on knowledge employed in the analysis, detailed knowledge with broad coverage availability to computers is unlikely to be constructed for the present. In this paper, we propose an automatic method for detecting discourse structure by a variety of keys existing in the surface information of sentences. We have considered three types of clue information: clue expressions, occur