In [1]:
from sqlalchemy import create_engine, Column, String, Integer, DATE, BOOLEAN
from sqlalchemy.orm import sessionmaker

from sqlalchemy.ext.declarative import declarative_base

import json

In [2]:
# Read the PostgreSQL connection settings from the JSON file two directories up.
with open('../../postgres.json') as pg_info:
    pg_json = json.load(pg_info)

# Pull out the individual settings once the file has been parsed and closed.
pg_username = pg_json['pg_username']
pg_password = pg_json['pg_password']
pg_ip = pg_json['pg_ip']

In [3]:
# Declarative base class that the ORM table mappings below inherit from.
Base = declarative_base()

class articles_detex(Base):
    """ORM mapping for the `arxiv_detex` table.

    Each row carries an identifier, a creation date, a set spec, and the
    title and abstract text, each paired with a boolean `*_converted` flag.
    The flags presumably record whether the detex conversion of that field
    succeeded -- TODO confirm against the code that populates the table.
    """
    __tablename__ = 'arxiv_detex'
    
    # Primary key of the table (string identifier).
    id = Column(String, primary_key=True)
    # Date stored in the `created` column.
    created = Column(DATE)
    # String stored in the `setspec` column.
    setspec = Column(String)
    
    # Title text and its conversion flag.
    title = Column(String)
    title_converted = Column(BOOLEAN)
    
    # Abstract text and its conversion flag.
    abstract = Column(String)
    abstract_converted = Column(BOOLEAN)

class articles_pandoc(Base):
    """ORM mapping for the `arxiv_pandoc` table.

    Declares the same columns as `articles_detex`: an identifier, a creation
    date, a set spec, and the title and abstract text, each paired with a
    boolean `*_converted` flag. The flags presumably record whether the
    pandoc conversion of that field succeeded -- TODO confirm against the
    code that populates the table.
    """
    __tablename__ = 'arxiv_pandoc'
    
    # Primary key of the table (string identifier).
    id = Column(String, primary_key=True)
    # Date stored in the `created` column.
    created = Column(DATE)
    # String stored in the `setspec` column.
    setspec = Column(String)
    
    # Title text and its conversion flag.
    title = Column(String)
    title_converted = Column(BOOLEAN)
    
    # Abstract text and its conversion flag.
    abstract = Column(String)
    abstract_converted = Column(BOOLEAN)
    


In [4]:
# Engine for the PostgreSQL server at `pg_ip`, port 5432.
# The scheme must be spelled `postgresql://`: the shorter `postgres://` alias
# is rejected by SQLAlchemy 1.4 and later, while `postgresql://` is accepted
# by every version.
# NOTE: the credentials are interpolated verbatim, so a password containing
# URL-reserved characters (e.g. `@`, `/`, `:`) must already be percent-encoded.
engine = create_engine(f'postgresql://{pg_username}:{pg_password}@{pg_ip}:5432')

# Session factory bound to the engine; call `Session()` to open a session.
Session = sessionmaker(bind=engine)


## Set up the query

In [5]:
import pandas as pd

In [6]:
def loop_over_detex(processing_function, saving_function):
    """Load the detex abstracts whose pandoc abstract is flagged as converted.

    Opens a session, selects `articles_detex.abstract` for every row whose
    `id` matches an `articles_pandoc` row with `abstract_converted` set to
    true, and reads the result into a pandas DataFrame. The session is closed
    before returning, including when the query or the read raises.

    Parameters
    ----------
    processing_function : callable
        Accepted but not used yet.
        TODO: apply it to each abstract once its call signature is settled.
    saving_function : callable
        Accepted but not used yet.
        TODO: use it to persist the processed abstracts.

    Returns
    -------
    pandas.DataFrame
        The rows produced by the query (the selected `abstract` column).
    """
    session = Session()
    try:
        query = (
            session.query(articles_detex.abstract)
            .filter(articles_detex.id == articles_pandoc.id)
            # `== True` builds a SQL comparison here; it is not a Python truth test.
            .filter(articles_pandoc.abstract_converted == True)  # noqa: E712
            # Ask the ORM to fetch rows in batches of 1000.
            # NOTE(review): pandas executes `query.statement` itself, so confirm
            # that this batching hint still applies on that path.
            .yield_per(1000)
        )
        return pd.read_sql(query.statement, session.bind)
    finally:
        # Always release the connection, even if the query fails.
        session.close()

## Iterate over the query to process the abstracts

In [5]:
import spacy

In [6]:
# Load the large English spaCy pipeline.
nlp = spacy.load('en_core_web_lg')

# Explicitly flag every default stop word as a stop word in the vocabulary,
# so that `token.is_stop` is set for each of them.
for stop_word in nlp.Defaults.stop_words:
    nlp.vocab[stop_word].is_stop = True
    


In [7]:
def process_abstract(abstract, nlp):
    """Return the abstract reduced to its informative words.

    The text is lower-cased and run through `nlp`; only tokens that are
    alphabetic, are not stop words, and are not exactly one character long
    are kept. The surviving token texts are joined with single spaces.

    Parameters
    ----------
    abstract : str
        Raw abstract text.
    nlp : callable
        Callable that turns a string into an iterable of tokens exposing
        `text`, `is_alpha` and `is_stop` (e.g. a loaded spaCy pipeline).

    Returns
    -------
    str
        Space-separated text of the tokens that passed the filter.
    """
    tokens = nlp(abstract.lower())
    kept_words = [
        tok.text
        for tok in tokens
        if tok.is_alpha and not tok.is_stop and len(tok.text) != 1
    ]
    return ' '.join(kept_words)