In [1]:
from datetime import datetime as dt

def time_func(func):
    """
    Decorator that reports how long each call to ``func`` takes.

    The wrapper forwards any positional and keyword arguments to ``func``,
    prints the elapsed wall-clock time, and returns ``func``'s result
    unchanged. If ``func`` raises, the exception propagates and nothing is
    printed.

    :param func: callable to be timed
    :return: wrapped callable that keeps the name and docstring of ``func``
    """
    from functools import wraps

    @wraps(func)
    def inner(*args, **kwargs):
        stime = dt.now()
        result = func(*args, **kwargs)
        ftime = dt.now()
        print (f"Completed in {(ftime - stime).total_seconds()} seconds")
        return result
    return inner

# SQLite database file that this notebook deletes, recreates and fills
db_path = './n100.db'

# Glob pattern of the input paper JSON files. Alternative datasets are kept
# commented out below; enable exactly one `path` assignment.
#path = './2020-07-15/document_parses/pdf_json/*.json'
#path = './n10k_pdfs/*.json'
path = './n100_pdfs/*.json'

In [2]:
# Record the wall-clock start time so the final cell can report the total run time
gstart = dt.now()

In [3]:
from pyspark.sql import SparkSession

# Create the Spark session (or reuse an existing one) against the master at
# spark-master:7077, with 2048m of memory per executor
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2048m")
    .getOrCreate()
)

sc = spark.sparkContext

sc

In [4]:
import json
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize

# Languages available in the NLTK stopwords corpus, and the stopword list of
# each language keyed by language name
sw_fileids = stopwords.fileids()
sw_dict = dict(zip(sw_fileids, map(stopwords.words, sw_fileids)))

In [5]:
import os
import sqlite3

# Start from an empty database on every run
if os.path.exists(db_path):
    os.remove(db_path)

create_papers_table = '''
    CREATE TABLE papers (
        paper_id text PRIMARY KEY NOT NULL,
        title text NOT NULL
    );
'''

# NOTE: SQLite enforces the foreign key only on connections that run
# "PRAGMA foreign_keys = ON"; without it the clause is declarative.
create_sentences_table = '''
    CREATE TABLE sentences (
        paper_id text NOT NULL,
        sentence_number integer NOT NULL,
        sentence text,
        PRIMARY KEY (paper_id, sentence_number),
        FOREIGN KEY (paper_id)
            REFERENCES papers (paper_id)
    );
'''

conn = None
try:
    conn = sqlite3.connect(db_path)

    # The connection context manager commits on success and rolls back on error
    with conn:
        conn.execute(create_papers_table)
        conn.execute(create_sentences_table)

    print("OK!")

except sqlite3.Error as error:
    print(error)

finally:
    # conn is still None when connect() itself failed
    if conn is not None:
        conn.close()


OK!


In [6]:
def load_paper_json(raw_json):
    """
    Parses the raw text of one paper file into a Python object.

    Input that cannot be decoded is mapped to an empty string so callers can
    filter it out instead of failing.

    :param raw_json: JSON document as str (or bytes)
    :return: decoded JSON value, or '' if raw_json cannot be decoded
    """
    try:
        return json.loads(raw_json)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError and undecodable bytes,
        # TypeError covers non-string input such as None, RecursionError
        # covers pathologically nested documents. Anything else (for example
        # KeyboardInterrupt) is deliberately not swallowed.
        return ''

def parse_paper_json(loaded_json):
    """
    Extracts the fields needed downstream from one decoded paper.

    :param loaded_json: decoded paper JSON; must provide 'paper_id',
        'metadata' -> 'title', and a 'body_text' list whose entries have a
        'text' key
    :return: dict with keys 'paper_id', 'title' and 'body_text', where
        'body_text' is the text of all entries joined by single spaces
    """
    paper_id = loaded_json['paper_id']
    title = loaded_json['metadata']['title']
    section_texts = (section['text'] for section in loaded_json['body_text'])

    return dict(paper_id=paper_id, title=title, body_text=" ".join(section_texts))

def text_lang_likely(text):
    """
    Guesses the language of a text from the stopwords it contains.

    The text is tokenized and lower-cased, then scored against the stopword
    list of every language in the NLTK stopwords corpus. A language's score is
    the number of distinct tokens that are also stopwords in that language.

    Adapted from http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/

    :param text: body of text
    :return: name of the highest-scoring language; on a tie the language
        listed first in sw_fileids wins
    """
    tokens = {token.lower() for token in wordpunct_tokenize(text)}
    overlap_by_lang = {
        lang: len(tokens.intersection(sw_dict[lang]))
        for lang in sw_fileids
    }
    # max() keeps the first key it meets among equal scores
    return max(overlap_by_lang, key=overlap_by_lang.get)

def lang_likely_wrapper(data):
    """
    Tags a parsed paper with its most likely language.

    :param data: dict with a 'body_text' entry; modified in place
    :return: the same dict with a 'lang' entry added
    """
    # Not referenced below; presumably forces nltk to load on the worker --
    # confirm before removing.
    import nltk
    detected = text_lang_likely(data['body_text'])
    data['lang'] = detected
    return data

def process_sentences(data):
    """
    Splits a paper's body text into numbered sentence records.

    :param data: dict with 'paper_id' and 'body_text' entries
    :return: list of dicts with keys 'paper_id', 'sentence_number' (0-based
        position of the sentence within the paper) and 'sentence'
    """
    from nltk.tokenize import sent_tokenize

    records = []
    for number, sentence in enumerate(sent_tokenize(data['body_text'])):
        records.append({
            'paper_id': data['paper_id'],
            'sentence_number': number,
            'sentence': sentence,
        })
    return records
            

In [7]:
@time_func
def get_rdd():
    """
    Builds the RDD of parsed, English-language papers.

    Spark transformations are lazy, so the time reported for this function
    covers only the construction of the pipeline; the files are read and
    processed when an action first runs on the RDD. The result is marked for
    caching because more than one action is run on it later in the notebook,
    which would otherwise repeat the file reads, JSON parsing and language
    detection for every action.

    :return: cached RDD of dicts with keys 'paper_id', 'title', 'body_text'
        and 'lang'
    """
    return sc \
            .wholeTextFiles(path).values() \
            .map(load_paper_json) \
            .filter(lambda x: x != '') \
            .map(parse_paper_json) \
            .map(lang_likely_wrapper) \
            .filter(lambda x: x['lang'] == 'english') \
            .cache()

rdd = get_rdd()

Completed in 1.697108 seconds


In [8]:
add_paper = "INSERT INTO papers (paper_id, title) VALUES (?, ?);"

@time_func
def process_papers():
    """
    Collects the id and title of every paper in ``rdd`` and inserts them
    into the papers table of the database at ``db_path``.

    All rows are written in one transaction: either every paper is stored
    or, if an insert fails, none are and the error propagates.

    :return: completion message
    """
    # Bring only the two stored fields to the driver, not the body text
    paper_rows = rdd.map(lambda x: (x['paper_id'], x['title'])).collect()

    conn = sqlite3.connect(db_path)
    try:
        # Commits once every row is inserted, rolls back if any insert fails
        with conn:
            conn.executemany(add_paper, paper_rows)
    finally:
        # Release the database file even when an insert raises
        conn.close()

    return "Done with papers!"

process_papers()

Completed in 4.606717 seconds


'Done with papers!'

In [9]:
add_sentence = "INSERT INTO sentences (paper_id, sentence_number, sentence) VALUES (?, ?, ?)"

# flatMap + collect yields one flat list of sentence records in RDD order and
# returns [] for an empty RDD, where reduce() would raise. It also avoids
# re-copying an ever-growing list on every merge.
sentence_table_data = rdd.flatMap(process_sentences).collect()

# Named write_sentences so that it does not replace the process_sentences
# tokenizer used by the flatMap above, which would break a re-run of this cell.
@time_func
def write_sentences():
    """
    Inserts every record of ``sentence_table_data`` into the sentences table
    of the database at ``db_path``.

    All rows are written in one transaction: either every sentence is stored
    or, if an insert fails, none are and the error propagates.

    :return: completion message
    """
    sentence_rows = (
        (record['paper_id'], record['sentence_number'], record['sentence'])
        for record in sentence_table_data
    )

    conn = sqlite3.connect(db_path)
    try:
        # Commits once every row is inserted, rolls back if any insert fails
        with conn:
            conn.executemany(add_sentence, sentence_rows)
    finally:
        # Release the database file even when an insert raises
        conn.close()

    return "Done with sentences!"

write_sentences()

Completed in 0.14097 seconds


'Done with sentences!'

In [10]:
# Report global run time: wall-clock seconds elapsed since gstart was recorded
gfinish = dt.now()

print(f"Notebook took: {(gfinish - gstart).total_seconds()} seconds to run.")

Notebook took: 15.508026 seconds to run.
