In [1]:
# Show the version string of the interpreter this notebook is running on.
import sys
sys.version

'2.7.15 | packaged by conda-forge | (default, Feb 28 2019, 04:00:11) \n[GCC 7.3.0]'

In [2]:
import yaml, psycopg2
from psycopg2.extensions import AsIs

# Read the database credentials and the notebook configuration (both YAML files
# in the working directory), parsed with the safe loader.
with open('./credentials', 'r') as credentials_file:
    credentials = yaml.load(credentials_file, Loader=yaml.SafeLoader)

with open('./config', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.SafeLoader)

# Connect to Postgres using the 'snorkel_postgres' section of the credentials.
snorkel_settings = credentials['snorkel_postgres']
snorkel_connection = psycopg2.connect(
    dbname=snorkel_settings['database'],
    user=snorkel_settings['user'],
    password=snorkel_settings['password'],
    host=snorkel_settings['host'],
    port=snorkel_settings['port'],
)
snorkel_cursor = snorkel_connection.cursor()


In [3]:

# CLEANING THE SLATE FOR SNORKEL
# Every table listed here is dropped (with CASCADE) if it exists, in this order.
SNORKEL_TABLES = (
    'candidate',
    'context',
    'document',
    'feature',
    'feature_key',
    'gold_label',
    'gold_label_key',
    'label',
    'label_key',
    'marginal',
    'prediction',
    'prediction_key',
    'sentence',
    'span',
    'spouse',
    'stable_label',
    'mineral',
    'strat',
    'strat_age',
)

try:
    for table_name in SNORKEL_TABLES:
        # Identifiers cannot be bound as ordinary query parameters; AsIs splices
        # the name in unquoted. This is safe only because the names are the
        # constants from SNORKEL_TABLES, never external input.
        snorkel_cursor.execute(
            "DROP TABLE IF EXISTS %s CASCADE;", (AsIs(table_name),))
    # All drops become visible together.
    snorkel_connection.commit()
finally:
    # Release the connection even when a DROP fails; closing without a commit
    # discards the unfinished transaction.
    snorkel_connection.close()

In [4]:
# STARTING SNORKEL PROGRAM
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
## POSTGRES DATABASE CONN FOR SNORKEL 
os.environ['SNORKELDB']="postgres://postgres:password123@localhost:5432/snorkel"
from snorkel import SnorkelSession
session = SnorkelSession()

In [5]:
from datetime import datetime 
from tqdm import tqdm 
# Start the wall-clock timer for the whole import.
start_time = datetime.now()


def _open_connection(section):
    """Open a psycopg2 connection from the `section` entry of `credentials`."""
    settings = credentials[section]
    return psycopg2.connect(
        dbname=settings['database'],
        user=settings['user'],
        password=settings['password'],
        host=settings['host'],
        port=settings['port'])


# Connection to the database holding the existing sentences.
connection = _open_connection('postgres')
cursor = connection.cursor()

# Connection to the Snorkel database.
snorkel_connection = _open_connection('snorkel_postgres')
snorkel_cursor = snorkel_connection.cursor()

# Register every distinct docid as a 'document' context row plus a document row.
cursor.execute("""
    SELECT DISTINCT(docid) FROM final_merged_sentences;
""")

for (docid,) in cursor:
    # The context row draws a fresh id from context_id_seq.
    snorkel_cursor.execute(
        "INSERT INTO context (id, type, stable_id) VALUES (nextval('context_id_seq'), 'document', %(stable_id)s)",
        {"stable_id": docid + "::document:0:0"})
    # currval() is the id drawn just above in this session, so the document row
    # shares the id of its context row.
    snorkel_cursor.execute(
        "INSERT INTO document (id, name) VALUES (currval('context_id_seq'), %(docid)s)",
        {"docid": docid})

# Commit once after the loop: either all documents are loaded or none are, so a
# failure midway does not leave a partial set behind.
snorkel_connection.commit()

# IMPORT THE SENTENCES DUMP
cursor.execute("""
            SELECT docid, sentid, words, poses, ners, lemmas, dep_paths, dep_parents FROM final_merged_sentences ORDER BY docid, sentid;
            """)

# Resolve document names to ids with one query instead of one SELECT per
# sentence. A docid missing from the document table raises KeyError below.
snorkel_cursor.execute("SELECT name, id FROM document")
document_ids = dict(snorkel_cursor.fetchall())

# Per document: the document-level character offset at which its next sentence
# starts. Sentences are laid out as if joined by a single separator character.
doc_char_counts = {}

# Number of sentences written between commits. Each context row and its
# sentence row are always committed together.
COMMIT_EVERY = 10000

CONTEXT_INSERT = (
    "INSERT INTO context (id, type, stable_id) "
    "VALUES (nextval('context_id_seq'), 'sentence', %(stable_id)s)")

# currval() is the id drawn by CONTEXT_INSERT immediately before, so the
# sentence row shares the id of its context row.
SENTENCE_INSERT = """
    INSERT INTO sentence
        (id, document_id, position, words, pos_tags, ner_tags, lemmas,
         dep_labels, dep_parents, char_offsets, abs_char_offsets, text)
    VALUES
        (currval('context_id_seq'),
         %(document_id)s,
         %(position)s,
         %(words)s,
         %(pos_tags)s,
         %(ner_tags)s,
         %(lemmas)s,
         %(dep_labels)s,
         %(dep_parents)s,
         %(char_offsets)s,
         %(abs_char_offsets)s,
         %(text)s);
"""

# cursor.rowcount is the number of rows the SELECT above returned, so the
# progress bar total follows the data instead of a hard-coded figure.
for row_number, sent in enumerate(tqdm(cursor, total=cursor.rowcount), 1):
    docid = sent[0]
    words = sent[2]
    text = " ".join(words)

    # Offset of each word inside `text` (words are separated by one space).
    char_offsets = []
    running_offset = 0
    for word in words:
        char_offsets.append(running_offset)
        running_offset += len(word) + 1

    # Document-level span of this sentence: it starts where the previous one
    # left off and covers exactly len(text) characters; one extra character is
    # reserved as the separator before the next sentence.
    # NOTE(review): stable ids are now start:start+len(text) for every
    # sentence -- confirm this matches the convention expected downstream.
    sentence_start = doc_char_counts.get(docid, 0)
    sentence_end = sentence_start + len(text)
    doc_char_counts[docid] = sentence_end + 1

    parsed_sent = {
        "document_id": document_ids[docid],
        "position": sent[1],
        "words": words,
        "pos_tags": sent[3],
        "ner_tags": sent[4],
        "lemmas": sent[5],
        "dep_labels": sent[6],
        "dep_parents": sent[7],
        "text": text,
        "char_offsets": char_offsets,
        # Document-level word offsets: sentence start plus in-sentence offset.
        "abs_char_offsets": [sentence_start + offset for offset in char_offsets],
    }

    # The sentence-level stable_id carries the document-level span.
    snorkel_cursor.execute(
        CONTEXT_INSERT,
        {"stable_id": docid + "::sentence:%s:%s" % (sentence_start, sentence_end)})
    snorkel_cursor.execute(SENTENCE_INSERT, parsed_sent)

    if row_number % COMMIT_EVERY == 0:
        snorkel_connection.commit()

# Flush the final, possibly partial, batch.
snorkel_connection.commit()

# Release both cursors and both connections, Snorkel side first.
for handle in (snorkel_cursor, snorkel_connection, cursor, connection):
    handle.close()

# Report how long the import took since start_time was taken.
time_elapsed = datetime.now() - start_time
print('Time elapsed (hh:mm:ss.ms) {}'.format(time_elapsed))

100%|██████████| 1235485/1235485 [1:00:41<00:00, 339.24it/s]

Time elapsed (hh:mm:ss.ms) 1:01:00.191888





In [6]:
# SOME NUMBERS TO CHECK THE IMPORT WAS PROPERLY CONFIGURED
from snorkel.models import Document, Sentence

# Format into a single string: on Python 2, print("label", value) prints a
# tuple such as ('Documents:', 922L) rather than a readable line.
print("Documents: {}".format(session.query(Document).count()))
print("Sentences: {}".format(session.query(Sentence).count()))

('Documents:', 922L)
('Sentences:', 1235485L)
