# Cleaning up $\LaTeX$

There's a bit of a mismatch between the tools we're using and the task at hand. The `abstracts` (and the titles) contain $\LaTeX$, which is not strictly what the NLP tools we're using are designed to handle.

### Load the data

Process the `abstracts` and `title` with `detex`.

In [1]:
from sqlalchemy import create_engine, Column, String, Integer, DATE, BOOLEAN
from sqlalchemy.orm import sessionmaker

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import func

import pandas as pd
import numpy as np
import json
import os
import subprocess


import pypandoc

In [2]:
# Read the Postgres connection settings (user, password, host) from the
# JSON credentials file that lives two directories above this notebook.
with open('../../postgres.json') as pg_info:
    pg_json = json.load(pg_info)

# The parsed dict is all we need from here on; the file is already closed.
pg_username = pg_json['pg_username']
pg_password = pg_json['pg_password']
pg_ip = pg_json['pg_ip']


In [3]:
# Shared declarative base: the table classes in this notebook subclass it, and
# Base.metadata is what create_all() is called on to emit their tables.
Base = declarative_base()


# the article class is how sqlalchemy ORM 
# interacts with tables

class articles_raw(Base):
    """ORM mapping for the ``arxiv_raw`` table.

    ``query_tex`` iterates over rows of this class and feeds their ``title``
    and ``abstract`` through ``change_tex``; nothing in this notebook writes
    to this table.
    """
    __tablename__ = 'arxiv_raw'
    
    id = Column(String, primary_key=True)  # string primary key
    created = Column(DATE)
    setspec = Column(String)  # presumably the arXiv/OAI set spec -- TODO confirm
    title = Column(String)  # title text before any conversion
    abstract = Column(String)  # abstract text before any conversion
    
class articles_detex(Base):
    """ORM mapping for the ``arxiv_detex`` table.

    This is the default ``article_class`` of ``query_tex``; instances are
    built by ``change_tex`` from an ``articles_raw`` row, with ``title`` and
    ``abstract`` replaced by the output of ``str_fix``.
    """
    __tablename__ = 'arxiv_detex'
    
    id = Column(String, primary_key=True)  # copied from the source row
    created = Column(DATE)  # copied from the source row
    setspec = Column(String)  # copied from the source row
    
    title = Column(String)  # title text returned by str_fix
    title_converted = Column(BOOLEAN)  # success flag str_fix returned for the title
    
    abstract = Column(String)  # abstract text returned by str_fix
    abstract_converted = Column(BOOLEAN)  # success flag str_fix returned for the abstract
    
# Connect to the Postgres server named in the credentials file and emit the
# tables declared on Base so far.
# NOTE: the dialect name has to be "postgresql". SQLAlchemy 1.4 removed the
# old "postgres" alias, so a "postgres://" URL fails there with
# NoSuchModuleError; "postgresql://" is accepted by older releases as well.
engine = create_engine(f'postgresql://{pg_username}:{pg_password}@{pg_ip}:5432')
Base.metadata.create_all(engine)


# the article class is how sqlalchemy treats the objects of a row
class articles_pandoc(Base):
    """ORM mapping for the ``arxiv_pandoc`` table.

    Same column layout as ``articles_detex``; it is passed as
    ``article_class`` to ``query_tex`` together with ``use_pandoc=True``, so
    its rows hold the text returned by the pandoc branch of ``str_fix``.
    """
    __tablename__ = 'arxiv_pandoc'
    
    id = Column(String, primary_key=True)  # copied from the source row
    created = Column(DATE)  # copied from the source row
    setspec = Column(String)  # copied from the source row
    
    title = Column(String)  # title text returned by str_fix
    title_converted = Column(BOOLEAN)  # success flag str_fix returned for the title
    
    abstract = Column(String)  # abstract text returned by str_fix
    abstract_converted = Column(BOOLEAN)  # success flag str_fix returned for the abstract
    
# Run create_all again so the table for articles_pandoc, which is declared
# after the first create_all call, is emitted too.
# NOTE: the dialect name has to be "postgresql". SQLAlchemy 1.4 removed the
# old "postgres" alias, so a "postgres://" URL fails there with
# NoSuchModuleError; "postgresql://" is accepted by older releases as well.
engine = create_engine(f'postgresql://{pg_username}:{pg_password}@{pg_ip}:5432')
Base.metadata.create_all(engine)


In [4]:
def str_fix(str_to_fix, detex_path, use_pandoc=None):
    """Strip LaTeX markup from a string with ``detex`` or ``pandoc``.

    Conversion is best-effort: when the converter cannot be run, exits with
    a non-zero status, or produces output that cannot be decoded, the input
    is handed back unchanged together with a ``False`` flag so the caller
    can record the failure instead of aborting a long batch.

    Args:
        str_to_fix: Text to convert. ``None`` is returned as-is and reported
            as not converted.
        detex_path: Command line used to invoke ``detex``. It is split on
            whitespace, so it may carry flags but the executable path itself
            must not contain spaces. Ignored when ``use_pandoc`` is truthy.
        use_pandoc: When truthy, convert with ``pypandoc`` (LaTeX -> plain
            text) instead of piping the text through ``detex``.

    Returns:
        A ``(text, converted)`` tuple: the converted text and ``True`` on
        success, or the original input and ``False`` on failure.
    """
    # Nothing to convert; both back ends report this the same way.
    if str_to_fix is None:
        return str_to_fix, False

    if use_pandoc:
        try:
            converted = pypandoc.convert_text(str_to_fix, to='plain', format='latex')
        except Exception:
            # Deliberately broad: any pandoc failure on one record should be
            # recorded as "not converted", not stop the whole run.
            return str_to_fix, False
        return converted, True

    try:
        completed = subprocess.run(
            detex_path.split(),
            input=str_to_fix.encode('utf-8'),
            stdout=subprocess.PIPE,
            # Without check=True a failing detex run would be reported as a
            # successful conversion with truncated or empty output.
            check=True,
        )
        converted = completed.stdout.decode('utf-8')
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: executable missing or not runnable.
        # SubprocessError: non-zero exit status (CalledProcessError).
        # ValueError: covers UnicodeError from encoding/decoding the text.
        return str_to_fix, False
    return converted, True


In [5]:
def change_tex(record, detex_path, use_pandoc, article_class):
    """Build an ``article_class`` row holding the converted text of ``record``.

    Args:
        record: Source row; its ``id``, ``created``, ``setspec``, ``title``
            and ``abstract`` attributes are read.
        detex_path: Forwarded to ``str_fix``.
        use_pandoc: Forwarded to ``str_fix``.
        article_class: Callable invoked with the keyword arguments ``id``,
            ``created``, ``setspec``, ``title``, ``abstract``,
            ``abstract_converted`` and ``title_converted``.

    Returns:
        The object returned by ``article_class``.
    """
    # Convert the abstract, then the title; each call also reports whether
    # the conversion succeeded.
    clean_abstract, abstract_ok = str_fix(record.abstract, detex_path, use_pandoc)
    clean_title, title_ok = str_fix(record.title, detex_path, use_pandoc)

    # Identifying fields are carried over untouched; the text fields are
    # replaced by their converted versions.
    return article_class(
        id=record.id,
        created=record.created,
        setspec=record.setspec,
        title=clean_title,
        abstract=clean_abstract,
        abstract_converted=abstract_ok,
        title_converted=title_ok,
    )

In [6]:
def query_tex(limit_num=None, detex_path='/usr/bin/detex', batch_size=10000,
              commit_size=1000, use_pandoc=None, article_class=articles_detex):
    """Convert every ``articles_raw`` row and store the results.

    Rows are read through one session and written through a second one, so
    committing finished batches does not go through the session that is
    still iterating the read query.

    Args:
        limit_num: If truthy, only this many rows are queried; otherwise the
            whole table is iterated with ``yield_per(batch_size)``.
        detex_path: Command line for ``detex``, forwarded to ``change_tex``.
        batch_size: Argument to ``yield_per`` for the unlimited query.
        commit_size: Number of converted rows to accumulate before each
            commit.
        use_pandoc: Forwarded to ``change_tex``; truthy selects pandoc.
        article_class: Mapped class used to build the converted rows.

    Returns:
        The zero-based index of the last row processed, or ``-1`` when the
        query returned no rows.
    """
    # "postgresql" is the dialect name; the "postgres" alias was removed in
    # SQLAlchemy 1.4 and fails there with NoSuchModuleError.
    engine = create_engine(f'postgresql://{pg_username}:{pg_password}@{pg_ip}:5432')
    Session = sessionmaker(engine)

    # One session to query, one to commit.
    query_session = Session()
    commit_session = Session()

    # Pre-set so an empty result does not leave row_num unbound at return.
    row_num = -1
    try:
        if limit_num:
            query = query_session.query(articles_raw).limit(limit_num)
        else:
            query = query_session.query(articles_raw).yield_per(batch_size)

        new_records = []
        for row_num, record in enumerate(query):
            new_records.append(
                change_tex(record, detex_path=detex_path,
                           use_pandoc=use_pandoc, article_class=article_class)
            )

            # Flush once a full batch has accumulated. Testing the batch
            # length (rather than row_num % commit_size == 0) avoids a
            # one-record commit on the very first row.
            if len(new_records) >= commit_size:
                commit_session.add_all(new_records)
                commit_session.commit()
                new_records = []

        # Write whatever is left over from the last, partial batch.
        if new_records:
            commit_session.add_all(new_records)
            commit_session.commit()
    finally:
        # Release connections even if conversion or a commit raised.
        commit_session.close()
        query_session.close()
        engine.dispose()

    return row_num
    


### `detex` everything

In [None]:
## This is... kind of brittle, to be honest.
# I could only really get this to work
# on a t2.2xlarge, in spite of the fact that
# I never actually use all of the memory!

row_nums = query_tex()

### `pandoc` everything

In [None]:
# Now try to convert everything with
# pandoc, recording failures.

row_nums = query_tex(use_pandoc=True, article_class=articles_pandoc)