# Setup

## Packages

In [2]:
import os
import yaml
import pandas as pd
import dask.dataframe as dd
import numpy as np
# import dask

# set up
with open("/home/ubuntu/work/therapeutic_accelerator/config/main.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
    
with open("../config/keys.yaml", "r") as f:
    keys = yaml.load(f, Loader=yaml.FullLoader)

In [3]:
from sqlalchemy.engine import URL
from sqlalchemy import create_engine, text

url_object = URL.create(
    'postgresql', 
    username='postgres',
    password=keys["postgres"], 
    host=config["database"]["host"],
    database='postgres',
    port=5432
)

# Create engine to connect to database
# psql_string = f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres'
engine = create_engine(url_object)

## Tokenizers and Models

In [4]:
# import pytorch
max_sequence_length = 1200
embedding_size = 200

from transformers import T5Tokenizer # AutoModel, AutoTokenizer, BertTokenizer,BioGptModel, BioGptConfig, BioGptTokenizer
T5tokens = T5Tokenizer.from_pretrained('t5-base', model_max_length = max_sequence_length)

In [5]:
# bio_bert_model = AutoModel.from_pretrained("gsarti/biobert-nli")
# bio_bert_tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
# original_bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# T5Abstract_model = TFT5ForConditionalGeneration.from_pretrained('t5-base')
# biogpttokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
# biogptmodel = BioGptModel.from_pretrained("microsoft/biogpt")

# Pilot Test: Encodings

In [6]:
# # pull all the abstracts
# sql = '''SELECT * FROM abstracts LIMIT 5'''
# with engine.connect() as conn: 
#     results = conn.execute(text(sql))

# # Turn into dataframe    
# abstracts = pd.DataFrame(results.fetchall())

# # Remove empty abstracts
# print("Shape before: ", abstracts.shape)
# abstracts = abstracts.dropna(how = 'all', subset='abstract').reset_index(drop = True)
# print("Shape after: ", abstracts.shape)

# # Distribution of abstract lengths
# ab_lens = abstracts.abstract.apply(lambda x: len(x.split()))

# # Lengths of Abstracts
# display(ab_lens.describe())

# from seaborn import displot
# displot(ab_lens)

In [7]:
# # create new columns for DB table
# abstracts[['input_ids', 'attention_mask']] = abstracts.abstract.apply(T5tokens).apply(pd.Series)

# # Cleanup
# abstracts.drop(['index', 'id'], axis = 1, inplace = True)

# # QA check
# abstracts

# Create Encodings using Dask

In [8]:
ddf = dd.read_sql_table('abstracts', 
                        con = f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres',
                        index_col = 'id',
                        head_rows = 5,
                        npartitions = 200)

# Remove unnecessary columns
ddf = ddf.drop(columns = ['index'])

# Remove empty abstract rows
ddf = ddf.dropna(how = 'all', subset='abstract').reset_index(drop = True)

In [9]:
# create new columns for DB table
ddf2 = ddf.abstract.apply(T5tokens, meta=('abstract', 'string')).apply(pd.Series, meta=({'input_ids':'object', 'attention_mask':'object'}))

# concatenate two dataframes
ddf = dd.concat([ddf, ddf2], axis = 1) # create divisions in ddfs? 
# QA check
# results

We're assuming that the indices of each dataframes are 
 aligned. This assumption is not generally safe.


In [10]:
ddf

Unnamed: 0_level_0,paperId,corpusId,abstract,input_ids,attention_mask
npartitions=200,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,object,object,object,object,object
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [43]:
# Rename columns for easier reading later
ddf.columns

Index(['paperId', 'corpusId', 'abstract', 'input_ids', 'attention_mask'], dtype='object')

In [12]:
# ddf3.dask

In [13]:
# Save backup
# name_function = lambda x: f"abstracts-{x}.parquet"
# ddf3.to_parquet('/home/ubuntu/work/backup/', name_function = name_function)

# Upload to Postgresql DB

In [14]:
# sql = text(''' 
#     SELECT EXISTS (
#         SELECT FROM information_schema.tables 
#         WHERE    table_name   = 'abstracts'
#     );
# ''')

# with engine.connect() as conn: 
#     conn.execute(sql)

Create array columns to store encoding and mask

In [53]:
sql = text(''' 
    DROP TABLE IF EXISTS abstracts_encodings;
''')

with engine.connect() as conn: 
    query = conn.execute(sql)

In [54]:
table_name = 'abstracts_encodings'

In [55]:
# create table
# Create Table in DB first before uploading
from sqlalchemy import MetaData, Table, Column, Integer, String, ARRAY

metadata_obj = MetaData()

# Create abstracts metadata
abstracts = Table(
    table_name,
    metadata_obj,
    Column("paperId", String, nullable = True),
    Column("corpusId", String, nullable=True),
    Column("abstract", String, nullable = True),
    Column("input_ids", String, nullable=True),
    Column("attention_mask", String, nullable=True),
)

metadata_obj.create_all(engine)

In [56]:
# Takes about 13 minutes
# ddf.to_csv("/home/ubuntu/work/backup/abstract_encodings/abstract_encodings-*.csv",name_function = lambda x: str(x), index = False)

In [57]:
from glob import glob
csv_files = glob("/home/ubuntu/work/backup/abstract_encodings/*.csv")
# csv_files

In [58]:
import chardet

for csv_file_path in csv_files: 
    with open(csv_file_path, 'r') as f: 
        df = pd.read_csv(f)
        with engine.connect() as conn: 
            df.to_sql(table_name, con = conn, index = False, if_exists='append')

In [None]:
# Upload dask dataframe to psql
# ddf = ddf.to_sql(name = table_name, uri = str(url_object), if_exists = 'replace', index = False, chunksize = 10000, method = 'multi')

In [59]:
# Check if it worked
import pandas as pd

sql = text(f''' 
    SELECT * FROM {table_name} LIMIT 5;
''')

with engine.connect() as conn: 
    query = conn.execute(sql)

test = pd.DataFrame(query.fetchall())
test.head()

Unnamed: 0,paperId,corpusId,abstract,input_ids,attention_mask
0,859c91de1ab22aeff85558dcf676ee5ffc4981a5,33235381,Summary This work aims at applying concepts of...,"[20698, 100, 161, 3, 8345, 44, 6247, 6085, 13,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,2460a37a3305b3bb072770e5bb57ed95496ecf80,72970529,Summary Objectives: To diagnose the hospital i...,"[20698, 27919, 7, 10, 304, 18730, 8, 2833, 251...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,e92f481f3be6f0956e0cd6c160a2a384c4eacd76,26375252,"Holdoff et al.1 described a retrospective, mon...","[8470, 1647, 3, 15, 17, 491, 5, 536, 3028, 3, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,d51f04cfdc8fe907e4a66e948028bf09f3a6af7a,11026954,1. In forty‐one out of forty‐seven dogs under ...,"[1300, 86, 19662, 2, 782, 91, 13, 19662, 2, 7,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,111d6c65ad374861a3c9c70b210996b76b0a7080,95308098,Calculated and observed excited singlet state ...,"[18555, 920, 11, 6970, 2787, 712, 17, 538, 703...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
