# Upload full text to DB

In [1]:
import os
import yaml
from sqlalchemy import create_engine

import pandas as pd
import dask.dataframe as dd
import sys
import csv
import janitor # to clean df column names to snake case


# Project-wide settings (filesystem paths, database host).
# NOTE(review): yaml.FullLoader is used for both files; if they only hold plain
# mappings/scalars, yaml.safe_load would be the narrower choice - confirm.
with open("/home/ubuntu/work/therapeutic_accelerator/config/main.yaml") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

# Secrets live in a separate file; this path is resolved relative to the
# kernel's current working directory.
with open("../config/keys.yaml") as f:
    keys = yaml.load(f, Loader=yaml.FullLoader)

# Location of the mounted bucket: <root>/<mount> from the main config.
bucket_path = os.path.join(config["paths"]["root"], config["paths"]["mount"])

# SQLAlchemy engine for the "postgres" database, connecting as user "postgres"
# on port 5432; password comes from keys.yaml, host from main.yaml.
engine = create_engine(
    f'postgresql://postgres:{keys["postgres"]}'
    f'@{config["database"]["host"]}:5432/postgres'
)

In [23]:
# Raise the csv module's per-field size limit as far as it will go, so that
# rows/columns holding a very large number of bytes can be parsed.
maxInt = sys.maxsize

while True:
    # csv.field_size_limit raises OverflowError for a value it cannot accept,
    # so shrink the candidate by a factor of 10 until one is accepted.
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        # Floor division keeps the value an exact int; int(maxInt / 10) would
        # round-trip through float and lose precision at this magnitude.
        maxInt //= 10

In [4]:
csv_file = '/home/ubuntu/work/bucket/fulltext/final_full_text.csv'

# Read only the header row to learn the column names of the CSV.
# newline='' is what the csv module documentation requires for files handed to
# its readers, so that line endings are handled by the csv parser itself.
with open(csv_file, 'r', newline='') as f:
    d_reader = csv.DictReader(f)

    # Accessing .fieldnames makes DictReader consume the first row of the file
    # and return it as the list of column names (None if the file is empty).
    headers = d_reader.fieldnames

# Fail here with a clear message instead of a TypeError on len(headers) later.
if headers is None:
    raise ValueError(f"No header row found in {csv_file}")

In [40]:
# Load the CSV through dask so that a file this large can be processed in
# partitions. Every column is read as a string, and usecols starts at index 2,
# so the first two columns of the file are skipped.
# NOTE(review): with a blocksize, dask cuts the file into byte ranges at line
# breaks; if quoted fields contain embedded newlines, rows could be split
# incorrectly - confirm against the dask read_csv documentation.
df = dd.read_csv(
    csv_file,
    engine='python',
    usecols=range(2, len(headers)),
    dtype=str,
    sample=100000,
    blocksize=18e6,
)

# Normalise column names to snake_case, following Postgres conventions;
# otherwise the names would have to be quoted in every query.
df = janitor.clean_names(df)
df.columns.tolist()

Index(['corpusid', 'text', 'source_pdfurls', 'source_pdfsha', 'source_oainfo',
       'annotations_abstract', 'annotations_author',
       'annotations_authoraffiliation', 'annotations_authorfirstname',
       'annotations_authorlastname', 'annotations_bibauthor',
       'annotations_bibauthorfirstname', 'annotations_bibauthorlastname',
       'annotations_bibentry', 'annotations_bibref', 'annotations_bibtitle',
       'annotations_bibvenue', 'annotations_figure',
       'annotations_figurecaption', 'annotations_figureref',
       'annotations_formula', 'annotations_paragraph', 'annotations_publisher',
       'annotations_sectionheader', 'annotations_table',
       'annotations_tableref', 'annotations_title', 'annotations_venue',
       'source_oainfo_license', 'source_oainfo_openaccessurl',
       'source_oainfo_status'],
      dtype='object')

In [41]:
# Quick peek at the data: compute only the first partition of the dask
# DataFrame (not the whole file) and display its first rows.
df_temp = df.partitions[0].compute()
df_temp.head()

Unnamed: 0,corpusid,text,source_pdfurls,source_pdfsha,source_oainfo,annotations_abstract,annotations_author,annotations_authoraffiliation,annotations_authorfirstname,annotations_authorlastname,...,annotations_paragraph,annotations_publisher,annotations_sectionheader,annotations_table,annotations_tableref,annotations_title,annotations_venue,source_oainfo_license,source_oainfo_openaccessurl,source_oainfo_status
0,250150147,\nNext-Generation Metagenome Sequencing Shows ...,,a3d95f615d674c56efc6cc0e134a773951328c79,,"[{""end"":4475,""start"":2322}]","[{""end"":254,""start"":245},{""end"":275,""start"":25...","[{""end"":494,""start"":385},{""end"":615,""start"":50...","[{""end"":248,""start"":245},{""end"":262,""start"":25...","[{""end"":253,""start"":249},{""end"":274,""start"":26...",...,"[{""end"":5260,""start"":4491},{""end"":6886,""start""...",,"[{""end"":4489,""start"":4477},{""end"":9898,""start""...","[{""end"":37053,""start"":33349},{""end"":38156,""sta...","[{""attributes"":{""ref_id"":""tab_0""},""end"":15910,...","[{""end"":195,""start"":1},{""end"":1738,""start"":1544}]","[{""end"":1756,""start"":1740}]",,,
1,251969342,\nProtective roles of cytoplasmic p21 Cip1/Waf...,,77d34fefc0ce5bbf0e130f621720f9490e07a1d4,,"[{""end"":3064,""start"":1441}]","[{""end"":270,""start"":99},{""end"":371,""start"":271...","[{""end"":198,""start"":115},{""end"":269,""start"":20...","[{""end"":104,""start"":99},{""end"":278,""start"":271...","[{""end"":113,""start"":105},{""end"":285,""start"":27...",...,"[{""end"":5305,""start"":3066},{""end"":5893,""start""...",,"[{""attributes"":{""n"":""2.1""},""end"":5333,""start"":...",,,"[{""end"":96,""start"":1},{""end"":1271,""start"":1176}]",,,,
2,7063556,\nThe Impact of Nutritional Status and Longitu...,,fcd362c9e4615d887d1600faad9b6e319422a57d,,"[{""end"":2815,""start"":1706}]","[{""end"":402,""start"":136},{""end"":672,""start"":40...","[{""end"":312,""start"":161},{""end"":401,""start"":31...","[{""end"":140,""start"":136},{""end"":409,""start"":40...","[{""end"":145,""start"":141},{""end"":415,""start"":41...",...,"[{""end"":4127,""start"":2831},{""end"":4959,""start""...",,"[{""attributes"":{""n"":""1.""},""end"":2829,""start"":2...",,"[{""end"":14302,""start"":14295}]","[{""end"":129,""start"":1},{""end"":1380,""start"":1252}]","[{""end"":1417,""start"":1382}]",CCBY,https://www.mdpi.com/1660-4601/8/1/105/pdf,GOLD
3,14778566,\nARTICLE Coarse-grained simulation reveals ke...,,4c80183d0ae3364fc2db2a9a024222cd2888d662,,"[{""end"":2734,""start"":1751}]","[{""end"":284,""start"":109},{""end"":458,""start"":28...","[{""end"":283,""start"":125},{""end"":457,""start"":29...","[{""end"":113,""start"":109},{""end"":117,""start"":11...","[{""end"":123,""start"":118},{""end"":297,""start"":29...",...,"[{""end"":3226,""start"":2736},{""end"":4255,""start""...",,"[{""end"":8028,""start"":8021},{""end"":23211,""start...","[{""end"":42963,""start"":42373},{""end"":45178,""sta...","[{""attributes"":{""ref_id"":""tab_0""},""end"":8877,""...","[{""end"":85,""start"":1},{""end"":1558,""start"":1474}]",,CCBY,https://doi.org/10.1038/ncomms11568,GOLD
4,475102,\nPhysiological relation between respiration a...,,a6c6bd1e378da5c7a0561f755808a38f282d8cbd,,"[{""end"":3427,""start"":436}]","[{""end"":169,""start"":153},{""end"":172,""start"":17...",,"[{""end"":159,""start"":153},{""end"":161,""start"":16...","[{""end"":168,""start"":162},{""end"":186,""start"":17...",...,"[{""end"":4323,""start"":3441},{""end"":5816,""start""...",,"[{""end"":3439,""start"":3429},{""end"":7292,""start""...","[{""end"":38677,""start"":38670},{""end"":39006,""sta...","[{""attributes"":{""ref_id"":""tab_0""},""end"":7981,""...","[{""end"":150,""start"":1},{""end"":407,""start"":258}]",,CCBY,https://microbialcellfactories.biomedcentral.c...,GOLD


In [42]:
# (rows, columns) of the first partition only - not the size of the full dataset.
df_temp.shape

(214, 31)