# Upload full text to DB

In [1]:
import os
import yaml
from sqlalchemy import create_engine, text

import pandas as pd
import dask.dataframe as dd
import sys
import csv
import janitor # to clean df column names to snake case
import numpy as np

import warnings
warnings.filterwarnings('ignore')

def _read_yaml(path):
    """Parse the YAML file at `path` and return the loaded object."""
    # NOTE(review): yaml.load with FullLoader is kept as-is; if these files only
    # hold plain mappings/scalars, yaml.safe_load would be the stricter choice.
    with open(path, "r") as f:
        return yaml.load(f, Loader=yaml.FullLoader)


# Project settings (paths, database host) and secrets (database password).
config = _read_yaml("/home/ubuntu/work/therapeutic_accelerator/config/main.yaml")
keys = _read_yaml("../config/keys.yaml")

# Location of the mounted bucket, built from the configured root and mount point.
bucket_path = os.path.join(config['paths']['root'], config['paths']['mount'])

# Create engine to connect to database
engine = create_engine(f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres')

In [2]:
# Parse text into sections
import json
import re

def get_section_metadata(text, annotations):
    """Resolve each annotation span to the substring of `text` it covers.

    Parameters
    ----------
    text : str
        The full text the annotation offsets refer to.
    annotations : iterable of dict
        Each item provides 'start' and 'end' offsets (anything `int()` accepts).

    Returns
    -------
    list of dict
        One dict per annotation with keys 'name' (the sliced text), 'start'
        and 'end' (the offsets exactly as they were passed in).
    """
    return [
        {
            'name': text[int(span['start']):int(span['end'])],
            'start': span['start'],
            'end': span['end'],
        }
        for span in annotations
    ]

def find_sections(text_df):
    """List every section header of the paper stored in row 0 of `text_df`.

    Reads the JSON-encoded header spans from 'annotations_sectionheader',
    resolves them against the 'text' column, and returns a DataFrame with the
    columns 'section', 'start', 'end' and 'corpusid'.
    """
    # Header spans are stored as a JSON string of start/end offsets.
    header_spans = json.loads(text_df['annotations_sectionheader'][0])
    header_records = get_section_metadata(text_df['text'][0], header_spans)

    return (
        pd.DataFrame(header_records)
        # maintain corpus id as Primary Key in DB
        .assign(corpusid=text_df.corpusid[0])
        # rename column for clarity
        .rename(columns={'name': 'section'})
    )

def refine_sections(section_df, text_length=None):
    """Keep only the major sections and recode their offsets to cover the body text.

    A header is "major" when its name contains one of: introduction, methods,
    results, discussion, conclusion (case-insensitive).

    For each major section the returned 'start' is the end of its own header.
    The returned 'end' is the start of the next major header, or, for the last
    major section, the start of whatever header follows it.

    Parameters
    ----------
    section_df : pandas.DataFrame
        Columns 'section', 'start', 'end' (as produced by `find_sections`),
        indexed 0..n-1 in document order.
    text_length : int, optional
        Length of the full text. Used as the end offset when the last major
        section is also the last header of the paper. When omitted, an
        open-ended sentinel (`sys.maxsize`) is used, which a slice such as
        `text[start:end]` clamps to the end of the text.

    Returns
    -------
    pandas.DataFrame or bool
        The refined copy of the matching rows with integer 'start'/'end', or
        `True` when no major section was found. `section_df` is not modified.
    """
    # find relevant sections based on pattern(s)
    pattern = "introduction|methods|results|discussion|conclusion"

    # na=False: a header with a missing name never counts as a match, so the
    # mask is strictly boolean and safe to index with.
    section_filter = section_df.section.str.contains(
        pat=pattern, regex=True, flags=re.IGNORECASE, na=False
    )

    # Nothing matched: signal "no results" to the caller.
    if not section_filter.any():
        return True

    # only major sections; copy so the writes below never touch section_df
    sections_df_refined = section_df[section_filter].copy()

    # Get indices of sections
    indices = sections_df_refined.index.tolist()

    # Recode values to reflect text location rather than section header
    for position, label in enumerate(indices):
        if position < len(indices) - 1:
            # body runs up to the next major header
            next_label = indices[position + 1]
        else:
            # last major section: body runs up to whatever header comes next
            next_label = label + 1

        # Offsets are always read from the untouched original frame.
        body_start = section_df.loc[label, 'end']
        if next_label in section_df.index:
            body_end = section_df.loc[next_label, 'start']
        elif text_length is not None:
            body_end = text_length
        else:
            # No header follows: leave the section open-ended.
            body_end = sys.maxsize

        sections_df_refined.loc[label, 'start'] = body_start
        sections_df_refined.loc[label, 'end'] = body_end

    sections_df_refined[['start', 'end']] = sections_df_refined[['start', 'end']].astype("int")

    return sections_df_refined

def convert_sections(text, sections_df_refined):
    """Attach the body text of each section and return the final three-column frame.

    Parameters
    ----------
    text : str
        Full text of the paper.
    sections_df_refined : pandas.DataFrame
        Columns 'corpusid', 'section', 'start', 'end', where 'start'/'end' are
        offsets into `text`.

    Returns
    -------
    pandas.DataFrame or str
        A new frame with columns 'corpusid', 'section', 'text'. The input frame
        is left untouched. If a section cannot be sliced out of `text` (for
        example because `text` is not a string), a message is printed and the
        empty string is returned.
    """
    section_bodies = []
    for start, end in zip(sections_df_refined['start'], sections_df_refined['end']):
        try:
            # pull section text to next major section. Remove any new line characters and white space on ends.
            body = text[start:end].replace('\n', ' ').strip()
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            print("could not extract text")
            return ""
        section_bodies.append(body)

    # flatten dataframe into final form; building it from a copy keeps the
    # caller's frame unchanged and also works when there are zero sections.
    sections_cleaned = sections_df_refined[['corpusid', 'section']].copy()
    sections_cleaned['text'] = section_bodies

    return sections_cleaned

def extract_sections(row):
    """Extract the major sections of one paper.

    Parameters
    ----------
    row : namedtuple
        One row as yielded by `DataFrame.itertuples()`, with at least the
        fields 'corpusid', 'text' and 'annotations_sectionheader'.

    Returns
    -------
    pandas.DataFrame or float
        A frame with columns 'corpusid', 'section', 'text', or `np.nan` when
        the row has no section annotations, no major section is found, or any
        step of the extraction fails.
    """
    if pd.isnull(row.annotations_sectionheader):
        return np.nan

    # itertuples() yields namedtuples; wrap the row in a one-row DataFrame so
    # the helpers can address its fields by column name.
    text_df = pd.DataFrame([dict(row._asdict())])

    try:
        # get all sections (parses the JSON annotations, which may be malformed)
        sections_df = find_sections(text_df)
        # refine only major sections
        sections_df_refined = refine_sections(sections_df)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C still stops a long run.
        return np.nan

    if isinstance(sections_df_refined, bool):
        # no results found
        return np.nan

    try:
        # one row per major section, keyed by corpusid
        sections_cleaned = convert_sections(text_df.text[0], sections_df_refined)
    except Exception:
        return np.nan

    # convert_sections reports a slicing failure with a non-DataFrame value;
    # normalise it so every failure path yields NaN.
    if not isinstance(sections_cleaned, pd.DataFrame):
        return np.nan

    return sections_cleaned

In [3]:
# Raise the csv module's per-field size limit as far as the platform allows,
# so rows holding very large text fields can be read.
maxInt = sys.maxsize
limit_applied = False

while not limit_applied:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # Value too large for this platform: shrink by a factor of 10 and retry.
        maxInt = int(maxInt / 10)
    else:
        limit_applied = True

In [4]:
csv_file = '/home/ubuntu/work/bucket/fulltext/final_full_text.csv'

# newline='' hands line-ending handling to the csv module, as its documentation
# requires for file objects passed to a reader.
with open(csv_file, 'r', newline='') as f:
    d_reader = csv.DictReader(f)

    # get fieldnames from DictReader object and store in list
    # (this reads only the header row, not the whole file)
    headers = d_reader.fieldnames

In [19]:
# Show the column names read from the CSV header row.
headers

['',
 'Unnamed: 0',
 'corpusid',
 'text',
 'source.pdfurls',
 'source.pdfsha',
 'source.oainfo',
 'annotations.abstract',
 'annotations.author',
 'annotations.authoraffiliation',
 'annotations.authorfirstname',
 'annotations.authorlastname',
 'annotations.bibauthor',
 'annotations.bibauthorfirstname',
 'annotations.bibauthorlastname',
 'annotations.bibentry',
 'annotations.bibref',
 'annotations.bibtitle',
 'annotations.bibvenue',
 'annotations.figure',
 'annotations.figurecaption',
 'annotations.figureref',
 'annotations.formula',
 'annotations.paragraph',
 'annotations.publisher',
 'annotations.sectionheader',
 'annotations.table',
 'annotations.tableref',
 'annotations.title',
 'annotations.venue',
 'source.oainfo.license',
 'source.oainfo.openaccessurl',
 'source.oainfo.status']

In [5]:
# Read the csv file lazily with dask, since it is too large to load in one go.
# - usecols starts at position 2, skipping the first two header entries
#   ('' and 'Unnamed: 0' in the header listing).
# - dtype=str reads every column as text.
# - on_bad_lines='skip' drops rows the parser rejects, without reporting them.
# NOTE(review): quoting=csv.QUOTE_NONE makes the parser treat quote characters
# as ordinary data, so a comma inside a quoted text field would be read as a
# column separator. That may be related to the very wide object arrays in the
# MemoryError reported by the upload cell - confirm against the raw file.
# NOTE(review): sample=1000 is presumably the number of bytes dask samples to
# infer the layout - confirm against the dask.dataframe.read_csv docs.
df = dd.read_csv(csv_file, engine = 'python',
                 usecols=range(2,len(headers)),
                 quoting=csv.QUOTE_NONE,
                 on_bad_lines='skip',
                 dtype = str,
                 sample=1000) 

# Change column names to snakecase to follow postgres conventions. Otherwise the column names will have to be in quotes during queries
df = janitor.clean_names(df)

# Keep only the columns needed for the upload and for section parsing.
main_cols = ['corpusid', 'text', 'annotations_abstract', 'annotations_sectionheader']

df = df[main_cols]
df.columns.tolist()

['corpusid', 'text', 'annotations_abstract', 'annotations_sectionheader']

In [6]:
# Print each retained column name on its own line.
for column_name in df.columns.tolist():
    print(column_name)

corpusid
text
annotations_abstract
annotations_sectionheader


In [32]:
# DDL for the raw full-text table: one column per CSV header entry.
# IF NOT EXISTS makes the statement safe to re-run when the notebook is
# executed again against a database that already has the table.
# NOTE(review): the upload cell calls to_sql(..., if_exists='replace') with only
# the snake_case columns, which would replace this table - confirm whether this
# schema is still needed.
sql = ''' 
CREATE TABLE IF NOT EXISTS fulltext (
    empty TEXT,
    "Unnamed: 0" TEXT,
    corpusid                            TEXT,
    text                                TEXT,
    "source.pdfurls"                      VARCHAR,
    "source.pdfsha"                       VARCHAR,
    "source.oainfo"                       VARCHAR,
    "annotations.abstract"                VARCHAR,
    "annotations.author"                  VARCHAR,
    "annotations.authoraffiliation"       VARCHAR,
    "annotations.authorfirstname"         VARCHAR,
    "annotations.authorlastname"          VARCHAR,
    "annotations.bibauthor"               VARCHAR,
    "annotations.bibauthorfirstname"      VARCHAR,
    "annotations.bibauthorlastname"       VARCHAR,
    "annotations.bibentry"                VARCHAR,
    "annotations.bibref"                  VARCHAR,
    "annotations.bibtitle"                VARCHAR,
    "annotations.bibvenue"                VARCHAR,
    "annotations.figure"                  VARCHAR,
    "annotations.figurecaption"           VARCHAR,
    "annotations.figureref"               VARCHAR,
    "annotations.formula"                 VARCHAR,
    "annotations.paragraph"               VARCHAR,
    "annotations.publisher"               VARCHAR,
    "annotations.sectionheader"           VARCHAR,
    "annotations.table"                   VARCHAR,
    "annotations.tableref"                VARCHAR,
    "annotations.title"                   VARCHAR,
    "annotations.venue"                   VARCHAR,
    "source.oainfo.license"               VARCHAR,
    "source.oainfo.openaccessurl"         VARCHAR,
    "source.oainfo.status"                VARCHAR
);
'''


In [27]:
# delete_table = ''' DROP TABLE IF EXISTS fulltext;'''

In [33]:
# engine.begin() runs the statement inside a transaction that is committed when
# the block exits without error. A plain engine.connect() does not commit on its
# own under SQLAlchemy 2.x, so the CREATE TABLE would be rolled back.
with engine.begin() as conn:
    query = conn.execute(text(sql))

In [8]:
# Total memory footprint of the frame. dask evaluates lazily, so this displays
# an unevaluated dd.Scalar rather than a number; call .compute() on it to get
# the value (which requires reading the data).
df.memory_usage(deep=True).sum()

dd.Scalar<series-..., dtype=int64>

In [9]:
# import dask.dataframe as dd
# from dask_sql import Context

# c = Context()
# df = dd.read_csv("s3://nyc-tlc/trip data/yellow_tripdata_2019-01.csv")

# c.create_table("my_data", df)

In [11]:
# dto_sql = dask.delayed(pd.DataFrame.to_sql)
# out = [dto_sql(d, 'fulltext', str(engine.url), if_exists='replace', index=True)
#        for d in df.to_delayed()]
# dask.compute(*out)

MemoryError: Unable to allocate 44.0 GiB for an array with shape (204262, 28885) and data type object

In [6]:
# from dask.diagnostics import ProgressBar
# import dask.bag as db

# from dask.distributed import Client, progress
# client = Client(n_workers = 8, threads_per_worker=4)
# client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 32,Total memory: 7.65 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:45563,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 32
Started: Just now,Total memory: 7.65 GiB

0,1
Comm: tcp://127.0.0.1:34557,Total threads: 4
Dashboard: http://127.0.0.1:35727/status,Memory: 0.96 GiB
Nanny: tcp://127.0.0.1:43917,
Local directory: /tmp/dask-scratch-space/worker-4t0bhlph,Local directory: /tmp/dask-scratch-space/worker-4t0bhlph

0,1
Comm: tcp://127.0.0.1:33151,Total threads: 4
Dashboard: http://127.0.0.1:34753/status,Memory: 0.96 GiB
Nanny: tcp://127.0.0.1:45103,
Local directory: /tmp/dask-scratch-space/worker-9tnx6axi,Local directory: /tmp/dask-scratch-space/worker-9tnx6axi

0,1
Comm: tcp://127.0.0.1:34683,Total threads: 4
Dashboard: http://127.0.0.1:43905/status,Memory: 0.96 GiB
Nanny: tcp://127.0.0.1:34671,
Local directory: /tmp/dask-scratch-space/worker-bfa6ch2c,Local directory: /tmp/dask-scratch-space/worker-bfa6ch2c

0,1
Comm: tcp://127.0.0.1:40973,Total threads: 4
Dashboard: http://127.0.0.1:41503/status,Memory: 0.96 GiB
Nanny: tcp://127.0.0.1:44127,
Local directory: /tmp/dask-scratch-space/worker-j093r7z2,Local directory: /tmp/dask-scratch-space/worker-j093r7z2

0,1
Comm: tcp://127.0.0.1:44287,Total threads: 4
Dashboard: http://127.0.0.1:34321/status,Memory: 0.96 GiB
Nanny: tcp://127.0.0.1:46455,
Local directory: /tmp/dask-scratch-space/worker-rf7grxbf,Local directory: /tmp/dask-scratch-space/worker-rf7grxbf

0,1
Comm: tcp://127.0.0.1:45941,Total threads: 4
Dashboard: http://127.0.0.1:43025/status,Memory: 0.96 GiB
Nanny: tcp://127.0.0.1:34507,
Local directory: /tmp/dask-scratch-space/worker-mxppq5z9,Local directory: /tmp/dask-scratch-space/worker-mxppq5z9

0,1
Comm: tcp://127.0.0.1:35837,Total threads: 4
Dashboard: http://127.0.0.1:41305/status,Memory: 0.96 GiB
Nanny: tcp://127.0.0.1:41409,
Local directory: /tmp/dask-scratch-space/worker-21do9ngm,Local directory: /tmp/dask-scratch-space/worker-21do9ngm

0,1
Comm: tcp://127.0.0.1:33553,Total threads: 4
Dashboard: http://127.0.0.1:45539/status,Memory: 0.96 GiB
Nanny: tcp://127.0.0.1:41001,
Local directory: /tmp/dask-scratch-space/worker-2x4e86fc,Local directory: /tmp/dask-scratch-space/worker-2x4e86fc


In [None]:
# Display the dask task graph behind df (no data is read by this).
df.dask

In [38]:
# dask opens its own connections from a URI string, so no connection is opened
# here. str(engine.url) can mask the password (as '***') depending on the
# SQLAlchemy version, so render the URL explicitly with the password included.
db_uri = engine.url.render_as_string(hide_password=False)

# to_sql(compute=True) performs the upload itself, one partition at a time.
# Its return value is not a callable, so it must not be handed to client.map;
# if a dask.distributed Client has been created it is used automatically.
sql_upload = df.to_sql('fulltext', uri=db_uri, index=False, if_exists='replace', chunksize=10, compute=True)

Key:       ('getitem-6490f642dceeebc2c1892515e5eebae3', 2)
Function:  execute_task
args:      ((subgraph_callable-1a4bcfb4-cf70-4af2-ab30-a52d5f0ce375, ['corpusid', 'text', 'annotations_abstract', 'annotations_sectionheader'], None, 'rename-80e21c38cc6c47834e58cede6aff337d', 'rename-2cc3f0be3fefaf894a2db6daa15f65d2', 'rename-a933a54e2f7f96e0ebc3f59889bdc1ed', 'rename-a040475da04c2fe1962adf34de4503f7', 'rename-1f14b7a522c85dec37b506e4ea561820', 'rename-f32843f4b3c9639141b0ea35fed8564c', 'read-csv-875d28554c6978ef669a1b48b871d16c', [(<function read_block_from_file at 0x7fbd10f33100>, <OpenFile '/home/ubuntu/work/bucket/fulltext/final_full_text.csv'>, 128479787, 64239893, b'\n'), None, False, False]))
kwargs:    {}
Exception: "MemoryError((202553, 32531), dtype('O'))"

Key:       ('getitem-6490f642dceeebc2c1892515e5eebae3', 0)
Function:  execute_task
args:      ((subgraph_callable-1a4bcfb4-cf70-4af2-ab30-a52d5f0ce375, ['corpusid', 'text', 'annotations_abstract', 'annotations_sectionheader

MemoryError: Unable to allocate 49.1 GiB for an array with shape (202553, 32531) and data type object

Key:       ('getitem-6490f642dceeebc2c1892515e5eebae3', 3)
Function:  execute_task
args:      ((subgraph_callable-1a4bcfb4-cf70-4af2-ab30-a52d5f0ce375, ['corpusid', 'text', 'annotations_abstract', 'annotations_sectionheader'], None, 'rename-80e21c38cc6c47834e58cede6aff337d', 'rename-2cc3f0be3fefaf894a2db6daa15f65d2', 'rename-a933a54e2f7f96e0ebc3f59889bdc1ed', 'rename-a040475da04c2fe1962adf34de4503f7', 'rename-1f14b7a522c85dec37b506e4ea561820', 'rename-f32843f4b3c9639141b0ea35fed8564c', 'read-csv-875d28554c6978ef669a1b48b871d16c', [(<function read_block_from_file at 0x7fbd10f33100>, <OpenFile '/home/ubuntu/work/bucket/fulltext/final_full_text.csv'>, 192719680, 64239894, b'\n'), None, False, False]))
kwargs:    {}
Exception: "MemoryError((202651, 23914), dtype('O'))"



# Parse Sections

In [None]:
# One entry per row that carries section-header annotations; each entry is
# whatever extract_sections returns for that row.
extracted_text = []

for row in df.itertuples():
    # Rows without section-header annotations are skipped.
    if pd.isnull(row.annotations_sectionheader):
        continue
    extracted_text.append(extract_sections(row))

In [None]:
# Quick peek at the data: materialize only the first partition as a pandas
# DataFrame and show its first rows.
df_temp = df.partitions[0].compute()
df_temp.head()