# GloVe Vectors

## Try it twice

In [1]:
from sqlalchemy import create_engine, Column, String, Integer, DATE, BOOLEAN
from sqlalchemy.orm import sessionmaker

from sqlalchemy.ext.declarative import declarative_base

import json

In [2]:
# Read the PostgreSQL connection settings from the JSON file two directories up.
with open('../../postgres.json') as pg_info:
    pg_json = json.load(pg_info)

# Pull out the three fields used later to build the connection URL.
pg_username, pg_password, pg_ip = (
    pg_json['pg_username'],
    pg_json['pg_password'],
    pg_json['pg_ip'],
)

In [3]:
# Declarative base class that the ORM model below inherits from.
Base = declarative_base()


class articles_raw(Base):
    """ORM mapping for the ``arxiv_raw`` table.

    Each row carries a string primary key, a creation date, a ``setspec``
    string, and a title and an abstract that are each paired with a boolean
    ``*_converted`` flag.
    """
    __tablename__ = 'arxiv_raw'
    
    # String primary key of the row.
    id = Column(String, primary_key=True)
    # Date column. NOTE(review): presumably the article's creation date - confirm.
    created = Column(DATE)
    # NOTE(review): presumably the OAI-PMH set the record belongs to - confirm.
    setspec = Column(String)
    
    # Article title text.
    title = Column(String)
    # NOTE(review): meaning of "converted" is not shown in this file - confirm.
    title_converted = Column(BOOLEAN)
    
    # Article abstract text; this is the column the corpus builders read.
    abstract = Column(String)
    # NOTE(review): meaning of "converted" is not shown in this file - confirm.
    abstract_converted = Column(BOOLEAN)


In [4]:
# Use the ``postgresql://`` dialect name: SQLAlchemy 1.4 removed the legacy
# ``postgres://`` alias, so the old spelling raises NoSuchModuleError on
# current releases, while ``postgresql://`` is accepted by every version.
# NOTE(review): the URL names no database, so the server-side default for this
# user is used - confirm that is intended.
engine = create_engine(f'postgresql://{pg_username}:{pg_password}@{pg_ip}:5432')

# Session factory bound to the engine; call ``Session()`` to open a session.
Session = sessionmaker(bind=engine)


## Iterate over the query to process the abstracts

In [5]:
import spacy

In [6]:
# Load the large English spaCy model with its full default pipeline.
nlp = spacy.load('en_core_web_lg')

# Mark the vocabulary entry of every default stop word as a stop word.
for stop_word in nlp.Defaults.stop_words:
    nlp.vocab[stop_word].is_stop = True
    


In [7]:
def process_abstract(abstract, nlp, white_space):
    """Lower-case an abstract, normalise its whitespace and drop stop words.

    Args:
        abstract: Raw abstract text.
        nlp: Callable that turns a string into an iterable of tokens, each
            exposing ``is_stop`` and ``text``.
        white_space: Compiled pattern; every match is replaced by one space.

    Returns:
        The texts of the non-stop-word tokens joined by single spaces.
    """
    normalised = white_space.sub(' ', abstract.lower())
    kept_texts = (token.text for token in nlp(normalised) if not token.is_stop)
    return ' '.join(kept_texts)

## Set up the query

In [8]:
import pandas as pd
import re

In [9]:
def loop_over_table(table_query, corpus_file, text_processer, nlp):
    """Write one processed abstract per line to ``corpus_file``.

    Args:
        table_query: Query-like object. ``yield_per(1000)`` is called on it and
            the result is iterated; every item must expose ``abstract``.
        corpus_file: Path of the output file. It is overwritten.
        text_processer: Callable ``(abstract, nlp, whitespace_pattern) -> str``
            that turns one abstract into one corpus line.
        nlp: Passed through unchanged to ``text_processer``.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence, which
    # newer Python versions warn about and intend to reject.
    white_space_processor = re.compile(r'\s')

    # Explicit UTF-8 so non-ASCII abstracts are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(corpus_file, 'w', encoding='utf-8') as file:
        for text_object in table_query.yield_per(1000):
            doc = text_processer(text_object.abstract, nlp, white_space_processor)
            file.write(f'{doc}\n')


In [10]:
# Matches a LaTeX environment delimiter such as \begin{equation} or \end{align}.
_ENV_DELIMITER = re.compile(r'\\(?:begin|end)\{.*?\}')

# Characters that are removed (replaced by a single space) rather than padded.
# '/' is deliberately included, so a slash is dropped like other punctuation.
_PUNCTUATION_CHARS = frozenset('\'";:/?.,`')


def match_replace(match_obj):
    """Return the replacement text for one regex match in an abstract.

    Written to be used as the ``repl`` callback of ``re.sub``.

    Args:
        match_obj: Match object; only ``group(0)`` is read.

    Returns:
        * ``' \\begin{...} '`` / ``' \\end{...} '`` when the matched text starts
          with a LaTeX environment delimiter (only the delimiter is kept),
        * a single space when the matched text is a punctuation character,
        * otherwise the matched text padded with one space on each side.
    """
    matched_text = match_obj.group(0)

    env_match = _ENV_DELIMITER.match(matched_text)
    if env_match:
        return f' {env_match.group(0)} '

    # Test membership in an explicit character set. The previous substring
    # test against the regex source "[...]" also classified the class
    # delimiters '[' and ']' as punctuation and silently deleted them.
    if matched_text in _PUNCTUATION_CHARS:
        return ' '

    return f' {matched_text} '

In [11]:
def arxiv_corpus(session, file_path, processer_function, compiled_regex, nlp):
    """Write every abstract in ``arxiv_raw`` to a text corpus, one per line.

    Each abstract is lower-cased, has its newlines replaced by spaces, is
    rewritten with ``compiled_regex.sub(processer_function, ...)``, tokenised
    with ``nlp``, stripped of stop words and written as space-separated lemmas.

    Args:
        session: SQLAlchemy session used to query ``articles_raw.abstract``.
        file_path: Path of the output file. It is overwritten.
        processer_function: Replacement callback handed to ``compiled_regex.sub``.
        compiled_regex: Compiled pattern selecting the spans to rewrite.
        nlp: Callable turning a string into tokens exposing ``lemma_`` and
            ``is_stop``.
    """
    # Stream rows in batches of 1000 instead of loading the whole table.
    table_query = session.query(articles_raw.abstract).yield_per(1000)

    # Explicit UTF-8 so non-ASCII abstracts are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        for record in table_query:
            abstract_string = record.abstract.lower().replace('\n', ' ')
            abstract_string = compiled_regex.sub(processer_function, abstract_string)

            lemmas = (
                token.lemma_.strip()
                for token in nlp(abstract_string)
                if not token.is_stop
            )
            # Padding added by the regex pass yields whitespace-only tokens;
            # after strip() they are empty, so drop them rather than joining
            # them into runs of spaces.
            abstract_string = ' '.join(lemma for lemma in lemmas if lemma)

            file.write(f'{abstract_string}\n')



In [12]:
# Pieces of the tokenisation pattern. All regex sources are raw strings so that
# sequences such as \$ or \- are not treated as (invalid) string escapes.

# Math-mode delimiters: one of $ ( ) [ ], optionally preceded by a backslash
# (covers $...$, \(...\) and \[...\]). This must be a character class; written
# as a plain sequence it only matched the literal five-character text "$()[]".
math_modes = r'\\?[$()\[\]]'

# LaTeX environment delimiters, e.g. \begin{equation} / \end{equation}.
begins = r'\\begin\{.*?\}'
ends = r'\\end\{.*?\}'

# Single-character arithmetic operators.
simple_math = r'[+=\-/]'
# LaTeX commands such as \alpha or \frac.
functions = r'\\[a-zA-Z]+'
# Single punctuation characters.
punct = '[\'";:/?.,`]'

# Alternatives are tried left to right, so the more specific \begin/\end forms
# are listed before the generic command pattern.
pattern = '|'.join((math_modes, begins, ends, simple_math, functions, punct))

finder = re.compile(pattern)




In [15]:
# Reload the model with the parser, tagger and NER components disabled.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner'])

# Workaround: a bug in the current spaCy models leaves the stop-word flag
# unset, so flag the vocabulary entry of every default stop word by hand.
for stop_word in nlp.Defaults.stop_words:
    nlp.vocab[stop_word].is_stop = True

# Displayed as the cell output: how many default stop words there are.
len(nlp.Defaults.stop_words)


305

In [17]:
session = Session()
%time arxiv_corpus(session, './test.txt', match_replace, finder, nlp)
session.close()

CPU times: user 17min 50s, sys: 6.66 s, total: 17min 57s
Wall time: 18min 15s
