In [2]:
# Base code
# NOTE(review): `engine` is used by the cells below but is never defined in this
# notebook, so it presumably comes from this script — confirm.
%run /home/ubuntu/work/therapeutic_accelerator/scripts/base.py

In [29]:
from sqlalchemy import MetaData, text
import pandas as pd

In [30]:
# create function to submit query to database with sqlalchemy
def query_db(query):
    """Execute a raw SQL string on the shared ``engine`` and return the result.

    The statement runs inside ``engine.begin()``, so it is committed when the
    block exits normally and rolled back if it raises. A plain
    ``engine.connect()`` does not commit on its own under SQLAlchemy 2.x, which
    would discard DDL/DML such as ``CREATE INDEX``, ``DELETE`` or
    ``ALTER TABLE`` issued through this helper.

    Parameters
    ----------
    query : str
        SQL text; it is wrapped in ``sqlalchemy.text`` before execution.

    Returns
    -------
    The result object produced by ``Connection.execute``.
        NOTE(review): it is handed back after the connection context has
        exited, so callers should read from it right away — confirm it stays
        readable with the driver in use.
    """
    with engine.begin() as con:
        return con.execute(text(query))

In [4]:
# Index attributes.corpusid.
# IF NOT EXISTS keeps this cell safe to re-run: a plain CREATE INDEX raises
# once the index is already present.
index_sql = '''
    CREATE INDEX IF NOT EXISTS idx_attr_corpusid
    ON attributes(corpusid);
'''

query_db(index_sql)

In [6]:
# get metadata for table in postgresql database
def get_metadata(table_name):
    """Reflect the database behind ``engine`` and return a single table.

    Args:
        table_name: key of the wanted table in the reflected metadata.

    Returns:
        The entry stored under ``table_name`` in ``MetaData.tables``.
    """
    reflected = MetaData()
    reflected.reflect(engine)
    return reflected.tables[table_name]

In [8]:
# list the column names of a table in the postgresql database
def display_metadata(table_name):
    """Return the column names of the table that ``get_metadata`` reflects for ``table_name``."""
    return get_metadata(table_name).columns.keys()

In [9]:
# Show the column names of the `attributes` table.
display_metadata('attributes')

['index',
 'corpusid',
 'externalids',
 'url',
 'title',
 'authors',
 'venue',
 'publicationvenueid',
 'year',
 'referencecount',
 'citationcount',
 'influentialcitationcount',
 'isopenaccess',
 's2fieldsofstudy',
 'publicationtypes',
 'publicationdate',
 'journal',
 'updated',
 'id']

In [10]:
# Keep the reflected `attributes` table object.
# NOTE(review): `res` is not referenced again in the visible cells.
res = get_metadata('attributes')

In [13]:
# get all table names in postgresql database
def get_table_names():
    """Reflect the database behind ``engine`` and return the keys of ``MetaData.tables``."""
    reflected = MetaData()
    reflected.reflect(engine)
    table_names = reflected.tables.keys()
    return table_names

get_table_names()

dict_keys(['attributes', 'authors', 'fulltext', 'abstracts', 'abstracts_encodings'])

In [14]:
# reflect every table in the postgresql database
def get_column_names():
    """Return the reflected ``MetaData.tables`` mapping.

    Despite the name, the return value is the whole mapping from table name to
    table object; callers read ``.columns.keys()`` on each value to get the
    column names.
    """
    reflected = MetaData()
    reflected.reflect(engine)
    return reflected.tables

In [17]:
# Print each table name followed by its column names.
# get_column_names() reflects the whole database on every call, so it is called
# once here and the (name, table) pairs are reused instead of calling it again
# for every table.
for k, reflected_table in get_column_names().items():
    print(k)
    print(reflected_table.columns.keys())

attributes
['index', 'corpusid', 'externalids', 'url', 'title', 'authors', 'venue', 'publicationvenueid', 'year', 'referencecount', 'citationcount', 'influentialcitationcount', 'isopenaccess', 's2fieldsofstudy', 'publicationtypes', 'publicationdate', 'journal', 'updated', 'id']
authors
['authorId', 'name']
fulltext
['empty', 'Unnamed: 0', 'corpusid', 'text', 'source.pdfurls', 'source.pdfsha', 'source.oainfo', 'annotations.abstract', 'annotations.author', 'annotations.authoraffiliation', 'annotations.authorfirstname', 'annotations.authorlastname', 'annotations.bibauthor', 'annotations.bibauthorfirstname', 'annotations.bibauthorlastname', 'annotations.bibentry', 'annotations.bibref', 'annotations.bibtitle', 'annotations.bibvenue', 'annotations.figure', 'annotations.figurecaption', 'annotations.figureref', 'annotations.formula', 'annotations.paragraph', 'annotations.publisher', 'annotations.sectionheader', 'annotations.table', 'annotations.tableref', 'annotations.title', 'annotations.venu

In [19]:
# Create indexes on the corpus id column of abstracts_encodings and fulltext.
# IF NOT EXISTS keeps this cell safe to re-run: a plain CREATE INDEX raises
# once the index is already present.
index_sql = '''
    CREATE INDEX IF NOT EXISTS idx_absend_corpusid
    ON abstracts_encodings("corpusId");

    CREATE INDEX IF NOT EXISTS idx_fulltext_corpusid
    ON fulltext(corpusid);
'''

query_db(index_sql)

In [20]:
# drop empty columns from postgresql database tables
def drop_empty_columns(table_name):
    """Drop every column of ``table_name`` that holds no non-NULL value.

    The previous implementation called ``table.count().loc[...]``, which is a
    pandas idiom and not something a reflected SQLAlchemy table offers, so it
    could not run. The non-NULL counts are now computed by the database.

    Parameters
    ----------
    table_name : str
        Table to clean; it must be a key of the metadata reflected by
        ``get_metadata``.

    Returns
    -------
    None
        One line is printed per dropped column.

    Notes
    -----
    A table with zero rows is left untouched: every column would count as
    empty and the whole table definition would be dropped.
    """
    table = get_metadata(table_name)
    column_names = [c.name for c in table.columns]
    if not column_names:
        return

    # Quote identifiers where needed: this schema has column names such as
    # "Unnamed: 0" and "source.pdfurls" that are invalid SQL when left bare.
    quote = engine.dialect.identifier_preparer.quote
    quoted_table = quote(table_name)

    # COUNT(col) ignores NULLs, so one statement yields the row total followed
    # by the non-NULL count of each column, in column order.
    count_exprs = ', '.join(f'COUNT({quote(name)})' for name in column_names)
    counts = list(query_db(f'SELECT COUNT(*), {count_exprs} FROM {quoted_table}').fetchone())
    total_rows, non_null_counts = counts[0], counts[1:]

    if total_rows == 0:
        return

    for name, non_null in zip(column_names, non_null_counts):
        if non_null == 0:
            query_db(f'ALTER TABLE {quoted_table} DROP COLUMN {quote(name)}')
            print(f'Dropped column {name} from table {table_name}')

In [40]:
# drop row from postgresql database table if column value is null
def drop_null_rows(table_name, column_name):
    """Delete every row of ``table_name`` whose ``column_name`` is NULL.

    Both identifiers are quoted by the engine's dialect when they need it, so
    columns such as ``Unnamed: 0``, ``source.pdfurls`` or ``corpusId`` can be
    targeted; interpolated bare they are a syntax error or get case-folded to a
    different name. Names are therefore matched exactly as given.

    The earlier ``get_metadata`` call was removed: its result was never used
    and it reflected the whole database on each call. An unknown table or
    column is now reported by the database when the statement runs.

    Parameters
    ----------
    table_name : str
        Table to delete from.
    column_name : str
        Column tested with ``IS NULL``.

    Returns
    -------
    None
        A confirmation line is printed after the statement has run.
    """
    quote = engine.dialect.identifier_preparer.quote
    query = f'DELETE FROM {quote(table_name)} WHERE {quote(column_name)} IS NULL'
    query_db(query)
    print(f'Dropped null rows from table {table_name}')

In [41]:
# Delete the rows of `abstracts` whose `abstract` column is NULL.
drop_null_rows('abstracts', 'abstract')

Dropped null rows from table abstracts


In [42]:
def query_to_df(query):
    """Run ``query`` through ``pd.read_sql`` on the shared ``engine`` and return the resulting frame."""
    return pd.read_sql(query, engine)

# Select the rows of abstracts whose abstract is NULL or a single space.
table_name, column_name = 'abstracts', 'abstract'
null_sql = f''' SELECT * FROM {table_name} WHERE {column_name} IS NULL OR {column_name} = ' ';'''

query_to_df(null_sql)

Unnamed: 0,id,index,paperId,corpusId,abstract


In [44]:
# Load the whole table named by `table_name` into a DataFrame.
# `table_name` is the global set in an earlier cell ('abstracts' there), so this
# cell depends on that cell having run last.
abstract_sql = f''' SELECT * FROM {table_name};'''

abstract_df = query_to_df(abstract_sql)

In [50]:
# Print each table name followed by its column names.
# get_column_names() reflects the whole database on every call, so it is called
# once here and the (name, table) pairs are reused instead of calling it again
# for every table.
for k, reflected_table in get_column_names().items():
    print(k)
    print(reflected_table.columns.keys())

attributes
['index', 'corpusid', 'externalids', 'url', 'title', 'authors', 'venue', 'publicationvenueid', 'year', 'referencecount', 'citationcount', 'influentialcitationcount', 'isopenaccess', 's2fieldsofstudy', 'publicationtypes', 'publicationdate', 'journal', 'updated', 'id']
authors
['authorId', 'name']
fulltext
['empty', 'Unnamed: 0', 'corpusid', 'text', 'source.pdfurls', 'source.pdfsha', 'source.oainfo', 'annotations.abstract', 'annotations.author', 'annotations.authoraffiliation', 'annotations.authorfirstname', 'annotations.authorlastname', 'annotations.bibauthor', 'annotations.bibauthorfirstname', 'annotations.bibauthorlastname', 'annotations.bibentry', 'annotations.bibref', 'annotations.bibtitle', 'annotations.bibvenue', 'annotations.figure', 'annotations.figurecaption', 'annotations.figureref', 'annotations.formula', 'annotations.paragraph', 'annotations.publisher', 'annotations.sectionheader', 'annotations.table', 'annotations.tableref', 'annotations.title', 'annotations.venu

In [54]:
# Select the rows of fulltext whose "Unnamed: 0" column is NULL or a single space.
# The column name contains a colon and a space, hence the double quotes in the SQL.
table_name = 'fulltext'
column_name = 'Unnamed: 0'
null_sql = f''' SELECT * FROM {table_name} WHERE "{column_name}" IS NULL OR "{column_name}" = ' ';'''

query_to_df(null_sql)

Unnamed: 0.1,empty,Unnamed: 0,corpusid,text,source.pdfurls,source.pdfsha,source.oainfo,annotations.abstract,annotations.author,annotations.authoraffiliation,...,annotations.publisher,annotations.sectionheader,annotations.table,annotations.tableref,annotations.title,annotations.venue,source.oainfo.license,source.oainfo.openaccessurl,source.oainfo.status,id


In [52]:
# Delete the rows of `fulltext` whose `empty` column is NULL.
drop_null_rows('fulltext', 'empty')

Dropped null rows from table fulltext


In [58]:
# get count of all null values in postgresql database table from column
def get_null_count(table_name, column_name):
    """Return how many rows of ``table_name`` have NULL in ``column_name``.

    Both identifiers are quoted by the engine's dialect when they need it;
    previously only the column was quoted.

    The earlier ``get_metadata`` call was removed: its result was never used,
    yet it reflected the whole database on every call, and this function is
    called once per column when scanning all tables. An unknown table or column
    is now reported by the database when the statement runs.

    Parameters
    ----------
    table_name : str
        Table to inspect.
    column_name : str
        Column tested with ``IS NULL``.

    Returns
    -------
    The first field of the single row returned by the ``COUNT(*)`` query.
    """
    quote = engine.dialect.identifier_preparer.quote
    query = f'SELECT COUNT(*) FROM {quote(table_name)} WHERE {quote(column_name)} IS NULL'
    return query_db(query).fetchone()[0]

In [59]:
# Print the NULL count of every column, grouped by table.
# get_column_names() reflects the whole database on every call, so it is called
# once here and the (name, table) pairs are reused instead of calling it again
# for every table.
for k, reflected_table in get_column_names().items():
    print(k)
    for c in reflected_table.columns.keys():
        print("\t", c, get_null_count(k, c))
    print("-" * 50)

attributes
	 index 0
	 corpusid 0
	 externalids 0
	 url 0
	 title 0
	 authors 0
	 venue 209481
	 publicationvenueid 233902
	 year 77
	 referencecount 0
	 citationcount 0
	 influentialcitationcount 0
	 isopenaccess 0
	 s2fieldsofstudy 0
	 publicationtypes 212494
	 publicationdate 71461
	 journal 291
	 updated 0
	 id 0
--------------------------------------------------
authors
	 authorId 20650
	 name 0
--------------------------------------------------
fulltext
	 empty 0
	 Unnamed: 0 0
	 corpusid 0
	 text 1367
	 source.pdfurls 148681
	 source.pdfsha 1367
	 source.oainfo 158844
	 annotations.abstract 27060
	 annotations.author 6665
	 annotations.authoraffiliation 41333
	 annotations.authorfirstname 13712
	 annotations.authorlastname 13680
	 annotations.bibauthor 7642
	 annotations.bibauthorfirstname 7787
	 annotations.bibauthorlastname 7712
	 annotations.bibentry 7243
	 annotations.bibref 7419
	 annotations.bibtitle 9658
	 annotations.bibvenue 7296
	 annotations.figure 8401
	 annotations.

In [60]:
# drop column from postgresql database table
def drop_column(table_name, column_name):
    """Drop ``column_name`` from ``table_name``.

    Both identifiers are quoted by the engine's dialect when they need it, so
    columns such as ``Unnamed: 0``, ``source.pdfurls`` or ``corpusId`` can be
    dropped; interpolated bare they are a syntax error or get case-folded to a
    different name. Names are therefore matched exactly as given.

    The earlier ``get_metadata`` call was removed: its result was never used
    and it reflected the whole database on each call. An unknown table or
    column is now reported by the database when the statement runs.

    Parameters
    ----------
    table_name : str
        Table that owns the column.
    column_name : str
        Column to remove.

    Returns
    -------
    None
        A confirmation line is printed after the statement has run.
    """
    quote = engine.dialect.identifier_preparer.quote
    query = f'ALTER TABLE {quote(table_name)} DROP COLUMN {quote(column_name)}'
    query_db(query)
    print(f'Dropped column {column_name} from table {table_name}')

In [61]:
# Drop the `index` column from `abstracts`.
# NOTE(review): not re-runnable as written — the statement has no IF EXISTS, so
# a second run should fail once the column is gone.
drop_column('abstracts', 'index')

Dropped column index from table abstracts
