# Exploratory Data Analysis

### Load the data

In [1]:
from sqlalchemy import create_engine, Column, String, Integer, DATE, BOOLEAN
from sqlalchemy.orm import sessionmaker

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import func


from itertools import product
from collections import defaultdict

import pandas as pd
import numpy as np
import json

import spacy

import sys

import pypandoc

In [2]:
# Read the PostgreSQL connection settings from a JSON file located two
# directories above the working directory. The file must provide the keys
# 'pg_username', 'pg_password' and 'pg_ip'.
with open('../../postgres.json') as pg_info:
    pg_json = json.load(pg_info)

pg_username = pg_json['pg_username']
pg_password = pg_json['pg_password']
pg_ip = pg_json['pg_ip']

# Use the canonical 'postgresql' dialect name: the legacy 'postgres' alias was
# deprecated and then removed in SQLAlchemy 1.4, while 'postgresql' is accepted
# by every SQLAlchemy version.
engine = create_engine(f'postgresql://{pg_username}:{pg_password}@{pg_ip}:5432')

In [3]:
# Declarative base class; the ORM table classes in this notebook inherit from it
# and are registered on Base.metadata.
Base = declarative_base()




# ORM mapping: each instance of this class represents one row of 'arxiv_raw'
class Articles_raw(Base):
    """Row of the ``arxiv_raw`` table.

    These rows are the input of ``query_tex``: their ``title`` and
    ``abstract`` are passed to pandoc as LaTeX by ``change_tex``.
    """
    __tablename__ = 'arxiv_raw'
    
    # primary key (string identifier of the article)
    id = Column(String, primary_key=True)
    # presumably the article's creation date -- confirm against the loader
    created = Column(DATE)
    # presumably the arXiv OAI "setSpec" (subject set) -- confirm
    setspec = Column(String)
    # title text as stored; treated as LaTeX by the conversion step
    title = Column(String)
    # abstract text as stored; treated as LaTeX by the conversion step
    abstract = Column(String)


# ORM mapping: each instance of this class represents one row of 'arxiv_pandoc'
class Articles_pandoc(Base):
    """Row of the ``arxiv_pandoc`` table, produced by ``change_tex``.

    Carries the same identifying columns as ``Articles_raw`` plus, for the
    title and the abstract, a flag telling whether the pandoc conversion
    succeeded.
    """
    __tablename__ = 'arxiv_pandoc'
    
    # primary key, copied from the source row
    id = Column(String, primary_key=True)
    # copied unchanged from the source row
    created = Column(DATE)
    # copied unchanged from the source row
    setspec = Column(String)
    
    # plain-text title if conversion succeeded, otherwise the original title
    title = Column(String)
    # True when pandoc converted the title without raising
    title_converted = Column(BOOLEAN)
    
    # plain-text abstract if conversion succeeded, otherwise the original abstract
    abstract = Column(String)
    # True when pandoc converted the abstract without raising
    abstract_converted = Column(BOOLEAN)

    
    
    
# Create the tables declared on Base in the target database. Per the SQLAlchemy
# documentation, create_all checks for existence first by default, so tables
# that already exist are left untouched.
Base.metadata.create_all(bind=engine)
    
    
# Session factory bound to the engine; call Session() to open a new session.
Session = sessionmaker(bind=engine)


In [4]:
def commit_session_run(commit_session):
    """Commit *commit_session*, rolling back if the commit fails.

    Args:
        commit_session: an object exposing ``commit()`` and ``rollback()``
            (in this notebook, a SQLAlchemy session).

    Returns:
        True if the commit succeeded, False if it raised an ordinary
        exception and the session was rolled back.

    Raises:
        BaseException: interpreter-level exceptions such as
            ``KeyboardInterrupt`` or ``SystemExit`` are re-raised after the
            rollback instead of being reported as a failed commit.
    """
    import logging

    try:
        commit_session.commit()
    except Exception:
        # Best-effort commit: record why it failed, undo the pending work so
        # the session stays usable, and report the failure to the caller.
        logging.getLogger(__name__).exception('commit failed; rolling back')
        commit_session.rollback()
        return False
    except BaseException:
        # Do not swallow Ctrl-C / interpreter shutdown; still leave the
        # session in a clean state before propagating.
        commit_session.rollback()
        raise
    return True


In [5]:
def str_fix(str_to_fix):
    """Convert a LaTeX string to plain text with pandoc, best effort.

    Args:
        str_to_fix: the text to convert, interpreted as LaTeX.

    Returns:
        A ``(text, converted)`` tuple: the plain-text rendering and ``True``
        when pandoc succeeded, or the unchanged input and ``False`` when the
        conversion raised.
    """
    try:
        return pypandoc.convert_text(str_to_fix, to='plain', format='latex'), True
    except Exception:
        # Deliberately broad: any conversion failure falls back to the
        # original text. ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit able to stop a long-running batch.
        return str_to_fix, False


In [6]:
def change_tex(record):
    """Build the ``Articles_pandoc`` counterpart of a source row.

    The ``id``, ``created`` and ``setspec`` values are copied from *record*;
    the abstract and the title are each run through ``str_fix`` and stored
    together with the flag reporting whether the conversion succeeded.

    Args:
        record: a row object exposing ``id``, ``created``, ``setspec``,
            ``title`` and ``abstract``.

    Returns:
        A new, not yet persisted ``Articles_pandoc`` instance.
    """
    # Abstract first, then title -- same call order as before.
    abstract_text, abstract_ok = str_fix(record.abstract)
    title_text, title_ok = str_fix(record.title)

    return Articles_pandoc(
        id=record.id,
        created=record.created,
        setspec=record.setspec,
        title=title_text,
        abstract=abstract_text,
        abstract_converted=abstract_ok,
        title_converted=title_ok,
    )

In [7]:
def query_tex(limit_num=None, Session=None, batch_size=1000):
    """Convert ``arxiv_raw`` rows with pandoc and store them as ``arxiv_pandoc`` rows.

    Two sessions are opened from *Session*: one streams the source rows, the
    other accumulates the converted rows and is committed every *batch_size*
    rows and once more at the end. Both sessions are closed before returning.

    Args:
        limit_num: maximum number of source rows to process; ``None`` (the
            default) processes the whole table, streamed with
            ``yield_per(batch_size)``.
        Session: session factory (e.g. a ``sessionmaker``); required.
        batch_size: number of rows per streamed chunk and per commit.

    Returns:
        The zero-based index of the last processed row (rows processed minus
        one), or ``-1`` when the query returned no rows.

    Raises:
        TypeError: if *Session* is not provided.
        ValueError: if *batch_size* is smaller than 1.
    """
    import logging

    if Session is None:
        # Same exception type as calling ``None()``, with a usable message.
        raise TypeError('query_tex() requires a session factory: pass Session=...')
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    logger = logging.getLogger(__name__)

    # get new sessions into the database:
    # one to query and one to commit
    query_session, commit_session = Session(), Session()

    def _commit_batch(last_row):
        # A failed commit is rolled back by commit_session_run, which discards
        # the pending rows; make that loss visible instead of silent.
        if not commit_session_run(commit_session):
            logger.warning(
                'commit failed; rows pending up to index %d were rolled back',
                last_row,
            )

    # Stays -1 when the query yields nothing, so the return below is defined.
    row_num = -1
    try:
        query = query_session.query(Articles_raw)
        if limit_num is not None:
            # ``is not None`` so that limit_num=0 means "no rows", not "all rows".
            query = query.limit(limit_num)
        else:
            query = query.yield_per(batch_size)

        for row_num, record in enumerate(query):
            commit_session.add(change_tex(record))

            # Commit after every full batch (row_num is zero-based).
            if (row_num + 1) % batch_size == 0:
                _commit_batch(row_num)

        # Commit whatever is left from the last, partial batch.
        _commit_batch(row_num)
    finally:
        # Release both connections even if conversion or iteration fails.
        query_session.close()
        commit_session.close()

    return row_num
    

In [8]:
query_tex(limit_num=100, Session=Session)

99