# GloVe Vectors

## Try it twice

In [2]:
from sqlalchemy import create_engine, Column, String, Integer, DATE, BOOLEAN
from sqlalchemy.orm import sessionmaker

from sqlalchemy.ext.declarative import declarative_base

import json

In [3]:
# Read the PostgreSQL connection settings from the JSON file two levels up.
with open('../../postgres.json') as pg_info:
    pg_json = json.load(pg_info)

# Unpack the three settings that the connection URL is built from.
pg_username, pg_password, pg_ip = (
    pg_json[key] for key in ('pg_username', 'pg_password', 'pg_ip')
)

In [4]:
# Declarative base class that the ORM table classes in this notebook inherit from.
Base = declarative_base()


class articles_raw(Base):
    """ORM mapping for the ``arxiv_raw`` table."""
    # NOTE(review): presumably the titles/abstracts as harvested, before any
    # LaTeX clean-up -- confirm against the loader that fills this table.
    __tablename__ = 'arxiv_raw'

    # String primary key of the row.
    id = Column(String, primary_key=True)
    # Date column; presumably the article's creation date -- confirm.
    created = Column(DATE)
    # Presumably the OAI-PMH setSpec the record belongs to -- confirm.
    setspec = Column(String)

    # Title text plus a boolean flag; presumably the flag records whether a
    # conversion step succeeded for the title -- confirm.
    title = Column(String)
    title_converted = Column(BOOLEAN)

    # Abstract text plus the matching boolean flag (same caveat as the title).
    abstract = Column(String)
    abstract_converted = Column(BOOLEAN)

class articles_detex(Base):
    """ORM mapping for the ``arxiv_detex`` table (same columns as ``arxiv_raw``)."""
    # NOTE(review): presumably the text after being run through detex -- confirm.
    __tablename__ = 'arxiv_detex'

    # String primary key of the row.
    id = Column(String, primary_key=True)
    # Date column; presumably the article's creation date -- confirm.
    created = Column(DATE)
    # Presumably the OAI-PMH setSpec the record belongs to -- confirm.
    setspec = Column(String)

    # Title text plus a boolean flag; presumably the flag records whether the
    # conversion succeeded for the title -- confirm.
    title = Column(String)
    title_converted = Column(BOOLEAN)

    # Abstract text plus the matching boolean flag (same caveat as the title).
    abstract = Column(String)
    abstract_converted = Column(BOOLEAN)

class articles_pandoc(Base):
    """ORM mapping for the ``arxiv_pandoc`` table (same columns as ``arxiv_raw``)."""
    # NOTE(review): presumably the text after being run through pandoc -- confirm.
    __tablename__ = 'arxiv_pandoc'

    # String primary key of the row.
    id = Column(String, primary_key=True)
    # Date column; presumably the article's creation date -- confirm.
    created = Column(DATE)
    # Presumably the OAI-PMH setSpec the record belongs to -- confirm.
    setspec = Column(String)

    # Title text plus a boolean flag; presumably the flag records whether the
    # conversion succeeded for the title -- confirm.
    title = Column(String)
    title_converted = Column(BOOLEAN)

    # Abstract text plus the matching boolean flag (same caveat as the title).
    abstract = Column(String)
    abstract_converted = Column(BOOLEAN)
    


In [5]:
# Create the engine for the PostgreSQL server on port 5432 and a session
# factory bound to it.
# The dialect name must be ``postgresql``: the ``postgres`` alias was deprecated
# and then removed in SQLAlchemy 1.4, where it raises NoSuchModuleError.
# NOTE(review): the credentials are interpolated verbatim, so a password that
# contains characters such as '@' or '/' would need URL-escaping first.
engine = create_engine(f'postgresql://{pg_username}:{pg_password}@{pg_ip}:5432')
Session = sessionmaker(bind=engine)


## Iterate over the query to process the abstracts

In [6]:
import spacy

In [6]:
# Load the large English spaCy model with its full default pipeline.
nlp = spacy.load('en_core_web_lg')

# Set the stop-word flag on the vocabulary entry of every default stop word.
for word in nlp.Defaults.stop_words:
    nlp.vocab[word].is_stop = True
    


In [7]:
def process_abstract(abstract, nlp, white_space=None):
    """Lower-case an abstract and drop its punctuation and stop-word tokens.

    Args:
        abstract: Text to clean. ``None`` or an empty string yields ``''``.
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``text``, ``is_punct`` and ``is_stop`` (for example a loaded spaCy
            pipeline).
        white_space: Compiled regex; every match is replaced by one space
            before tokenizing. Optional so that two-argument callers such as
            ``loop_over_table`` work; defaults to runs of whitespace.

    Returns:
        The texts of the surviving tokens joined by single spaces.
    """
    # A missing abstract has nothing to tokenize.
    if not abstract:
        return ''

    if white_space is None:
        # Local import: the notebook only imports `re` in a later cell.
        import re
        white_space = re.compile(r'\s+')

    normalized = white_space.sub(' ', abstract.lower())
    return ' '.join(
        token.text
        for token in nlp(normalized)
        if not (token.is_punct or token.is_stop)
    )

## Set up the query

In [16]:
import pandas as pd

In [17]:
def loop_over_table(table_query, corpus_file, text_processer, nlp):
    """Process every abstract produced by a query and write them to one file.

    Args:
        table_query: Query object; it is iterated through ``yield_per(1000)``
            and every row must expose an ``abstract`` attribute.
        corpus_file: Path of the text file to create or overwrite.
        text_processer: Callable invoked as ``text_processer(abstract, nlp)``
            that returns the processed text as a string.
        nlp: Passed through unchanged to ``text_processer``.
    """
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(corpus_file, 'w', encoding='utf-8') as file:
        for text_object in table_query.yield_per(1000):
            doc = text_processer(text_object.abstract, nlp)
            # Pad with spaces (the same convention arxiv_corpus uses) so the
            # last token of one document cannot fuse with the first token of
            # the next one.
            file.write(f' {doc} ')


In [18]:
def match_replace(match_obj):
    r"""Return the replacement text for one regex match: pad it or blank it.

    Meant to be used as the ``repl`` callable of ``re.sub``.

    Args:
        match_obj: ``re.Match`` object for the text being replaced.

    Returns:
        A single space when the matched text is exactly ``.``, ``,`` or ``:``.
        Otherwise the matched text with one space on each side; when the text
        starts with a ``\begin{...}`` or ``\end{...}`` delimiter, only that
        delimiter is kept between the spaces.
    """
    matched = match_obj.group(0)

    # One pattern covers both delimiters; re.match anchors at the start of the
    # text, exactly like the two separate checks it replaces.
    env_delimiter = re.match(r'\\(?:begin|end)\{.*?\}', matched)
    if env_delimiter:
        return f' {env_delimiter.group(0)} '

    # Exact membership test: a substring test against an escaped string would
    # also treat a lone backslash as punctuation.
    if matched in ('.', ',', ':'):
        return ' '

    return f' {matched} '

In [20]:
# Open a session and pull a ten-row sample of raw abstracts into a DataFrame.
session = Session()
table_query = (
    session
    .query(articles_raw.abstract)
    .limit(10)
)
# %time loop_over_table(table_query, './test.txt', process_abstract, nlp)
df = pd.read_sql(
    sql=table_query.statement,
    con=table_query.session.bind,
)

In [21]:
# Take the abstract of the row labelled 2 as a sample string to try the regex on.
abstract = df.loc[2,'abstract']


In [22]:
import re

In [23]:
# Regex fragments for the LaTeX markup and punctuation that match_replace
# handles. Every fragment is a raw string so that backslashes reach the regex
# engine untouched and Python does not warn about invalid escape sequences
# such as '\-'.
math_modes = r'\$'

begins = r'\\begin\{.*?\}'
ends = r'\\end\{.*?\}'

simple_math = r'[+=\-/]'
functions = r'\\[a-zA-Z]+'
punct = r'[.,:]'

# Alternatives are tried left to right, so the \begin{...} / \end{...}
# fragments must come before the generic backslash-command fragment; otherwise
# only the bare "\begin" / "\end" would be matched.
pattern = f'{math_modes}|{begins}|{ends}|{simple_math}|{functions}|{punct}'

finder = re.compile(pattern)

# Notebook cell output: the sample abstract after substitution.
finder.sub(match_replace, abstract)





'  The kappa - invariant and supersymmetric actions of D1 and D5 - branes in AdS_3 x\nS^3 are investigated  as well as the action of a D5 - brane in an AdS_5 x S^5\nbackground  The action of a D5 - brane lying totally in an AdS_3 x S^3 background\nis found  Some progress was made towards finding the action for the D5 - brane\nfree to move in the whole AdS_3 x S^3 x T^4 space  however the supersymmetric\naction found here is not kappa - invariant and the reasons the method used did\nnot find a kappa - invariant solution are discussed \n'

In [45]:
def arxiv_corpus(session, filepath, processer_function, compiled_regex, spacy_stopwords):
    """Write the cleaned abstracts of ``articles_raw`` to a single text file.

    Each abstract is lower-cased, its newlines are replaced by spaces, and
    ``compiled_regex.sub(processer_function, ...)`` is applied. The result is
    run through the module-level ``nlp`` pipeline; the stripped lemmas of all
    tokens not flagged as stop words are joined by spaces and written out,
    padded with one space on each side.

    Args:
        session: Session used to query ``articles_raw.abstract``; rows are
            fetched through ``yield_per(1000)``.
        filepath: Path of the corpus file to create or overwrite.
        processer_function: Replacement callable handed to ``compiled_regex.sub``.
        compiled_regex: Compiled pattern applied to every abstract.
        spacy_stopwords: Accepted for interface compatibility; not used here
            (stop words are taken from ``token.is_stop``).
    """
    table_query = session.query(articles_raw.abstract).yield_per(1000)

    # Open the path that was passed in (not a global named `file_path`), with
    # an explicit encoding so the output does not depend on the platform.
    with open(filepath, 'w', encoding='utf-8') as file:
        for record in table_query:
            # A NULL abstract has no text to process.
            if record.abstract is None:
                continue

            abstract_string = record.abstract.lower().replace('\n', ' ')
            abstract_string = compiled_regex.sub(processer_function, abstract_string)

            doc = nlp(abstract_string)
            abstract_string = ' '.join(
                token.lemma_.strip() for token in doc if not token.is_stop
            )

            # Space padding keeps neighbouring abstracts from fusing together.
            file.write(f' {abstract_string} ')



In [46]:
# Reload the large English model with the 'parser' pipeline component disabled.
nlp = spacy.load('en_core_web_lg', disable=['parser'])

# Workaround: a bug in the current spaCy models leaves stop words incorrectly
# flagged, so set the flag on every default stop word's vocabulary entry.
for word in nlp.Defaults.stop_words:
    nlp.vocab[word].is_stop = True

# Notebook cell output: how many default stop words there are.
len(nlp.Defaults.stop_words)


305

In [50]:
# Full corpus build, left commented out; uncomment to run it.
# session = Session()
# file_path = '../../vectors/arxiv_raw/corpus.txt'
# processer_function = match_replace
# compiled_regex = finder

# %time arxiv_corpus(session, file_path, processer_function, compiled_regex, nlp.Defaults.stop_words)

# Close the session that is currently open.
session.close()