# In [119]:
import re
import os
import json
from pathlib import Path

class TexProcesser():
    r'''
    Latex preprocessing for a directory tree of ``.tex`` files.

    Two independent passes are offered:

    * ``extract_tex_infor`` collects the arguments of ``\title``, ``\etitle``,
      ``\jabstract`` and ``\eabstract`` from every ``.tex`` file below
      ``latex_dir_path`` and appends one JSON object per file to
      ``output_path``.
    * ``_preprocess`` strips figures, inline math, citations and a list of
      preamble/bookkeeping commands from a single file and writes the cleaned
      text to another file.
    '''

    # Collapses a LaTeX line break (``\\`` followed by a newline) in a title.
    _TITLE_BREAK = re.compile(r'\s*\\\\\n\s*')
    # Collapses a plain newline (plus surrounding whitespace) in an abstract.
    _NEWLINE = re.compile(r'\s*\n\s*')

    # Command name -> pattern used to flatten its argument onto one line.
    # The insertion order is also the key order of the extracted dict.
    _FIELD_CLEANERS = {
        'title': _TITLE_BREAK,
        'etitle': _TITLE_BREAK,
        'jabstract': _NEWLINE,
        'eabstract': _NEWLINE,
    }

    # Patterns deleted by _strip_markup, applied strictly in this order.
    _STRIP_PATTERNS = (
        # figure* environments (may span several lines)
        re.compile(r'\\begin{figure\*}.*?\\end{figure\*}', re.DOTALL),
        # inline math between two dollar signs on the same line
        re.compile(r'\$.*?\$'),
        # citations
        re.compile(r'\\cite\{.*?\}'),
        # labels
        re.compile(r'\\label\{.*?\}'),
        # subsection headings
        re.compile(r'\\subsection\{.*?\}'),
        # author biography block and bibliography directives
        re.compile(r'\\begin{biography}.*?\\end{biography}', re.DOTALL),
        re.compile(r'\\bibliographystyle\{.*?\}'),
        re.compile(r'\\bibliography\{.*?\}'),
        # \documentstyle with an optional [...] argument
        re.compile(r'\\documentstyle(?:\[[^\]]*\])?\{[^\}]*\}'),
        # \setcounter{...}{...}
        re.compile(r'\\setcounter\{.*?\}\{.*?\}'),
        # \newcommand{...}{...} and \newcommand{...}[...]{...}
        re.compile(r'\\newcommand\{.*?\}\{.*?\}'),
        re.compile(r'\\newcommand\{.*?\}\[.*?\]\{.*?\}'),
        # \newtheorem{...}{...}
        re.compile(r'\\newtheorem\{.*?\}\{.*?\}'),
        # \newcounter{...}
        re.compile(r'\\newcounter\{.*?\}'),
        # three-argument journal date commands
        re.compile(r'\\受付\{.*?\}\{.*?\}\{.*?\}'),
        re.compile(r'\\採録\{.*?\}\{.*?\}\{.*?\}'),
        re.compile(r'\\再受付\{.*?\}\{.*?\}\{.*?\}'),
    )

    def __init__(self, latex_dir_path, output_path):
        '''
        latex_dir_path: directory searched recursively for ``.tex`` files.
        output_path: file that ``extract_tex_infor`` appends JSON lines to.
        '''
        self.latex_folder = Path(latex_dir_path)
        self.output_path = output_path

    def extract_tex_infor(self,):
        '''
        extract title, etitle, jabstract, eabstract ... from latex files

        Every ``.tex`` file below the latex folder is read as UTF-8, and the
        extracted dict is printed and appended to ``output_path`` as one JSON
        object per line. Files are visited in sorted path order so repeated
        runs emit the lines in the same order.
        '''
        output_file = Path(self.output_path)
        # Opening in append mode does not create missing directories.
        output_file.parent.mkdir(parents=True, exist_ok=True)

        with open(output_file, 'a', encoding='utf-8') as sink:
            for file_path in sorted(self.latex_folder.glob('**/*.tex')):
                with open(file_path, 'r', encoding='utf-8') as file:
                    latex_content = file.read()

                matches = self._extract_info(latex_content=latex_content)
                print(matches)
                sink.write(json.dumps(matches, ensure_ascii=False) + '\n')

    def _extract_info(self, latex_content=None):
        '''
        Return a dict mapping 'title', 'etitle', 'jabstract' and 'eabstract'
        to the list of arguments found for that command in latex_content.

        Arguments are matched with balanced braces, so nested groups and
        escaped braces inside a title or abstract are kept intact. Titles get
        a "double backslash + newline" break replaced by one space; abstracts
        get every newline replaced by one space; both are stripped.
        ``None`` is treated as empty text and yields empty lists.
        '''
        if latex_content is None:
            latex_content = ''

        matches = {}
        for key, cleaner in self._FIELD_CLEANERS.items():
            matches[key] = [
                cleaner.sub(' ', value).strip()
                for value in self._braced_arguments(key, latex_content)
            ]
        return matches

    @staticmethod
    def _braced_arguments(command, text):
        '''
        Return the argument of every occurrence of the given backslash
        command that is immediately followed by an opening brace.

        The argument ends at the brace that balances the opening one. A
        backslash escapes the character after it, so escaped braces do not
        affect nesting. An occurrence whose braces never balance is skipped.
        '''
        opener = '\\' + command + '{'
        arguments = []
        start = text.find(opener)
        while start != -1:
            body_start = start + len(opener)
            depth = 1
            pos = body_start
            while pos < len(text):
                char = text[pos]
                if char == '\\':
                    # Skip the escaped character along with the backslash.
                    pos += 2
                    continue
                if char == '{':
                    depth += 1
                elif char == '}':
                    depth -= 1
                    if depth == 0:
                        break
                pos += 1

            if depth == 0:
                arguments.append(text[body_start:pos])
                resume = pos + 1
            else:
                resume = body_start
            start = text.find(opener, resume)
        return arguments

    def _strip_markup(self, latex_content):
        '''
        Return latex_content with every pattern in _STRIP_PATTERNS removed.
        '''
        for pattern in self._STRIP_PATTERNS:
            latex_content = pattern.sub('', latex_content)
        return latex_content

    def _preprocess(self,
                    latex_path='./NLP_LATEX_CORPUS/V04/V04N01-08.tex',
                    output_file='./test.tex'):
        '''
        Remove some extra figures, formulas and similar markup from one file.

        latex_path: UTF-8 ``.tex`` file to clean.
        output_file: destination of the cleaned text (overwritten).

        Returns None; the result is only written to output_file.
        '''
        with open(latex_path, 'r', encoding='utf-8') as file:
            latex_content = file.read()

        latex_content = self._strip_markup(latex_content)

        # Save the processed content to a new file.
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(latex_content)
    

# In [120]:
# Build a processor over the V01 corpus directory; extracted metadata would be
# appended to ./data/processed.json.
tex_processer = TexProcesser(
    latex_dir_path='./NLP_LATEX_CORPUS/V01/',
    output_path='./data/processed.json',
)

# Run the markup-stripping pass.
tex_processer._preprocess()

# NOTE: the metadata extraction pass is TexProcesser.extract_tex_infor(); it is
# intentionally not run here.
