In [1]:
from sqlalchemy import create_engine, Column, String, Integer, DATE, BOOLEAN
from sqlalchemy.orm import sessionmaker

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import func

import pandas as pd
import json

## Set up `sqlalchemy`

Load the server credentials so we can actually look at our data. We'll use a `pandas` DataFrame object to hold the information locally and do some exploration.

In [2]:
# Parse the Postgres connection settings from `../../postgres.json`; the file
# handle is only needed while the JSON is being decoded.
with open('../../postgres.json') as pg_info:
    pg_json = json.load(pg_info)

# Pull out the three settings used to build the database URL.
pg_username, pg_password, pg_ip = (
    pg_json[field] for field in ('pg_username', 'pg_password', 'pg_ip')
)

In [3]:
# Declarative base shared by the ORM models in this notebook; each subclass
# registers its table on `Base.metadata`.
Base = declarative_base()
class articles_detex(Base):
    """ORM mapping for the `arxiv_detex` table.

    One row per article, keyed by the string `id`. Judging by the table name,
    the text columns presumably hold the `detex` output -- TODO confirm against
    the ingestion code.
    """
    __tablename__ = 'arxiv_detex'
    
    # Article identifier (primary key), creation date and set specification.
    id = Column(String, primary_key=True)
    created = Column(DATE)
    setspec = Column(String)
    
    # Title text plus a boolean flag; presumably True when the title
    # conversion succeeded -- verify against the writer of this table.
    title = Column(String)
    title_converted = Column(BOOLEAN)
    
    # Abstract text plus the matching boolean conversion flag.
    abstract = Column(String)
    abstract_converted = Column(BOOLEAN)
    
class articles_pandoc(Base):
    """ORM mapping for the `arxiv_pandoc` table.

    Declares the same columns as `articles_detex`: a string primary key `id`,
    a `created` date, a `setspec` string, and the `title` / `abstract` text
    columns, each paired with a boolean `*_converted` flag.
    """
    __tablename__ = 'arxiv_pandoc'

    id = Column(String, primary_key=True)
    created = Column(DATE)
    setspec = Column(String)

    title = Column(String)
    title_converted = Column(BOOLEAN)

    abstract = Column(String)
    abstract_converted = Column(BOOLEAN)


# One engine for the whole notebook. The scheme must be spelled `postgresql`:
# the old `postgres` alias is deprecated and is rejected by SQLAlchemy 1.4+.
engine = create_engine(f'postgresql://{pg_username}:{pg_password}@{pg_ip}:5432')

# Run create_all only once every model has been declared, so that both
# `arxiv_detex` and `arxiv_pandoc` are registered on Base.metadata. Tables that
# already exist are left untouched (create_all checks first by default).
Base.metadata.create_all(engine)


In [4]:
# Build the session factory first, then attach the engine to it; the result is
# the same factory that `sessionmaker(bind=engine)` would produce.
Session = sessionmaker()
Session.configure(bind=engine)

# Session used for the queries in the rest of the notebook.
session = Session()

In [5]:
# Query the `articles_pandoc` model and let pandas execute the statement over
# the connection the session is bound to, loading the result into a DataFrame.
pandoc_query = session.query(articles_pandoc)
pandoc = pd.read_sql(
    sql=pandoc_query.statement,
    con=pandoc_query.session.bind,
)
pandoc.head()

Unnamed: 0,id,created,setspec,title,title_converted,abstract,abstract_converted
0,physics/0208086,2002-08-23,physics:physics,Wakefield Band Partitioning In Linac Structures\n,True,In the NLC project multiple bunches of electro...,True
1,physics/0208087,2002-08-26,physics:physics,Measuring the Earth with Traceroute\n,True,The traceroute utility on any computer connect...,True
2,physics/0208088,2002-08-26,physics:physics,Precision measurement of the metastable 3P2 li...,True,The lifetime of the metastable 3P2 state of ne...,True
3,physics/0208089,2002-08-26,physics:physics,Dynamical Monte Carlo method for stochastic ep...,True,In this work we introduce a new approach to Dy...,True
4,physics/0208090,2002-08-26,physics:astro-ph,Proton Irradiation Experiment for the X-ray Ch...,True,We have investigated the radiation damage effe...,True


## Conversion Failures

We went through all the trouble of trying to remove the $\LaTeX$, and we at least recorded when our methods failed. Let's check what happened, i.e., how often did `detex` and `pandoc` fail?

In [6]:
def conversion_rates(series):
    """Print the success and failure percentages of a boolean flag column.

    Parameters
    ----------
    series : pandas.Series
        Conversion flags, True for a successful conversion and False for a
        failed one. Null entries are not counted, because `value_counts`
        drops them by default.

    Returns
    -------
    None
        The two percentages are printed, not returned.
    """
    counts = series.value_counts()
    # A column with no failures (or no successes) has no False (or True) entry
    # in `counts`; treat the missing label as zero instead of raising KeyError.
    successes = counts.get(True, 0)
    failures = counts.get(False, 0)
    total = successes + failures
    if total == 0:
        # Empty or all-null input: there is no rate to compute.
        print('No conversion results to report.')
        return
    print(f'Conversion Rate: {(100*successes/total):.2f}%')
    print(f'Failure Rate: {100*failures/total:.2f}%')

In [7]:
# Success/failure percentages for the title flag of the pandoc table.
conversion_rates(pandoc['title_converted'])

Conversion Rate: 98.99%
Failure Rate: 1.01%


In [8]:
# Success/failure percentages for the abstract flag of the pandoc table.
conversion_rates(pandoc['abstract_converted'])

Conversion Rate: 93.35%
Failure Rate: 6.65%
