In [1]:
import sys
import os
import re
import argparse
import subprocess
import shutil
import pandas as pd

In [2]:
# Files to be processed: the raw CSV that is read and the normalized CSV that is written
input_csv = './examples/img2latex_100k_raw_test.csv'
output_csv = './examples/img2latex_100k_normalize_test.csv'

# Temporary files: the formulas as plain text, one per line, before and after normalization
unclean_txt = './tmp_data/unclean.txt'
clean_txt = './tmp_data/clean.txt'

# Convert CSV to TXT

In [3]:
# Load the raw table; its `latex` column holds the formulas processed by the cells below
data = pd.read_csv(input_csv)
data.head()

Unnamed: 0,image_filename,latex
0,482d5e4808.png,( \Delta z ) ^ { 2 } \ge { \frac { \epsilon } ...
1,54f1d6dbee.png,"5 [ D _ { \mu } , [ D _ { \nu } , G _ { \mu \n..."
2,10e00a9cf4.png,"T r \, T _ { a } = \sqrt N \delta _ { a , N ^ ..."
3,4cb0ae54f8.png,"W _ { \mathrm { X } } \tilde { \Sigma } = 0 \,..."
4,51ffa53ddb.png,"R ( C _ { L } , D _ { L } ) \; Q ( K _ { \star..."


In [4]:
# Write every formula of data['latex'] to `unclean_txt`, one per line, after
# rewriting the starred align/gather environments (and $...$ wrappers) to a
# plain `align` environment.

# open(..., 'w') does not create missing parent directories, so make sure the
# temporary directory exists before writing into it.
os.makedirs(os.path.dirname(unclean_txt) or '.', exist_ok=True)

# (old, new) pairs, applied in order to every formula.
env_replacements = [
    ('\\begin{align*}', '\\begin{align}'),
    # The original code used the literal '\begin{align*}', in which '\b' is the
    # backspace escape. That match is kept, spelled explicitly as \x08.
    # NOTE(review): presumably guards against input whose "\b" was already
    # decoded as a backspace -- confirm whether this case is still needed.
    ('\x08egin{align*}', '\\begin{align}'),
    ('\\end{align*}', '\\end{align}'),
    ('\\begin{gather*}', '\\begin{align}'),
    ('\x08egin{gather*}', '\\begin{align}'),
    ('\\end{gather*}', '\\end{align}'),
]

with open(unclean_txt, 'w') as unclean_file:
    for raw_latex in data['latex']:
        tmp_str = raw_latex
        for old, new in env_replacements:
            tmp_str = tmp_str.replace(old, new)
        # A formula wrapped in $...$ loses the dollars and gets an align wrapper
        if tmp_str.startswith('$') and tmp_str.endswith('$'):
            tmp_str = tmp_str[1:-1]
            tmp_str = '\\begin{align} ' + tmp_str + ' \\end{align}'
        unclean_file.write(tmp_str + '\n')
print('Done!')

Done!


# Normalize data

In [5]:
# Normalize the formulas of `unclean_txt` into `clean_txt`, one per line:
#   1. regex pre-pass that unifies environment names,
#   2. external `preprocess_latex.js normalize` pass run through node,
#   3. post-pass that rewrites \operatorname{...} and marks short lines as 'ERROR!'.
assert os.path.exists(unclean_txt), unclean_txt
shutil.copy(unclean_txt, clean_txt)

operator_names = ['arccos', 'arcsin', 'arctan', 'arg', 'cos', 'cosh', 'cot', 'coth', 'csc', 'deg', 'det', 'dim', 'exp', 'gcd', 'hom', 'inf',
                  'injlim', 'ker', 'lg', 'lim', 'liminf', 'limsup', 'ln', 'log', 'max', 'min', 'Pr', 'projlim', 'sec', 'sin', 'sinh', 'sup', 'tan', 'tanh']
# Joining the characters of the "a|b|c" alternation with \s? allows one optional
# whitespace between any two characters, so "s i n" matches as well as "sin".
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
operators = r'\s?'.join('|'.join(operator_names))
ops = re.compile(r'\\operatorname {(%s)}' % operators)

# --- 1. regex pre-pass, written to a temporary file -------------------------
temp_file = clean_txt + '.tmp'
with open(clean_txt, 'r') as fin:
    prepre = fin.read().replace('\r', ' ')  # delete \r
# replace split, align, alignedat, alignat, eqnarray with aligned
prepre = re.sub(r'\\begin{(split|align|alignedat|alignat|eqnarray)\*?}(.+?)\\end{\1\*?}', r'\\begin{aligned}\2\\end{aligned}', prepre, flags=re.S)
# replace smallmatrix with matrix
prepre = re.sub(r'\\begin{(smallmatrix)\*?}(.+?)\\end{\1\*?}', r'\\begin{matrix}\2\\end{matrix}', prepre, flags=re.S)
with open(temp_file, 'w') as fout:
    fout.write(prepre)

# --- 2. external normalization ----------------------------------------------
# os.path.dirname('__file__') is applied to the *string* '__file__' and yields '',
# so the script is looked up relative to the current working directory.
cmd = r"cat %s | node %s %s > %s " % (temp_file, os.path.join(os.path.dirname('__file__'), 'preprocess_latex.js'), 'normalize', clean_txt)
ret = subprocess.call(cmd, shell=True)
os.remove(temp_file)
if ret != 0:
    # A failure is only reported; the post-pass below still runs on whatever
    # the command wrote to `clean_txt`.
    print('FAILED: %s' % cmd)

# --- 3. post-pass ------------------------------------------------------------
shutil.move(clean_txt, temp_file)
with open(temp_file, 'r') as fin, open(clean_txt, 'w') as fout:
    for line in fin:
        tokens = line.strip().split()
        if len(tokens) > 5:
            post = ' '.join(tokens)
            # use \sin instead of \operatorname{sin}
            post = ops.sub(lambda match: '\\' + match.group(1).replace(' ', ''), post)
            post = post.replace(r'\\ \end{array}', r'\end{array}')
            fout.write(post + '\n')
        else:
            # Lines with 5 tokens or fewer are replaced by a marker line so the
            # output keeps one line per input line.
            fout.write('ERROR!\n')
os.remove(temp_file)
print('Done!')

Done!


# Convert TXT to CSV

In [6]:
# Build `new_data`: a copy of `data` whose `latex` column holds the normalized
# formula from the matching line of `clean_txt`. Rows whose line is the
# 'ERROR!' marker keep their original formula.
new_data = data.copy()
with open(clean_txt, 'r') as clean_data_file:
    # Lines and rows are paired in order; zip stops at the shorter of the two.
    for row_label, line in zip(new_data.index, clean_data_file):
        normalized = line.rstrip('\n')
        if normalized != 'ERROR!':
            # Single .loc assignment instead of new_data['latex'][i] = ...:
            # chained assignment may write to a temporary copy and leave
            # new_data unchanged.
            new_data.loc[row_label, 'latex'] = normalized

# Save to CSV

In [7]:
# Write the normalized table as UTF-8; index=False leaves the DataFrame index out of the file
new_data.to_csv(output_csv, encoding='utf-8', index=False)