In [None]:
# Take all json files, extract abstracts and body, lemmatize terms
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [1]:
# NOTE(review): `globals` is presumably a project-local settings/helper module
# (it supplies the blob account, container and directory settings used below).
# Importing it under this name shadows the built-in globals() in this notebook.
import globals

import os

import json
import requests
from pprint import pprint

import spacy
# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
# (the parser and NER pipeline components are disabled at load time).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

import nltk.data
from nltk import sent_tokenize, tokenize, word_tokenize
# Fetch the NLTK data packages backing word_tokenize, nltk.pos_tag and the
# WordNet lemmatizer; the calls run every time this cell is executed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer  
# Single module-level lemmatizer instance, used by convertTextToTerms.
lemmatizer = WordNetLemmatizer() 


# Client class for the Azure blob container that holds the source JSON files.
from azure.storage.blob import BlockBlobService

# Used by the final cell to fan processFile out over several workers.
from joblib import Parallel, delayed

[nltk_data] Downloading package punkt to /home/liamca/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/liamca/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /home/liamca/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# Characters that remover() leaves untouched: ASCII letters, digits, space,
# apostrophe and hyphen.
valid_characters = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 '-")
def remover(my_string = ""):
    """Replace every character that is not in ``valid_characters`` with a space.

    Args:
        my_string: Text to sanitise. Defaults to the empty string.

    Returns:
        A string of the same length as ``my_string`` in which each character
        outside ``valid_characters`` has been substituted by a single space.
    """
    # Build the lookup set per call so later edits to valid_characters are
    # still honoured, then rewrite the string in a single pass instead of
    # running str.replace over the whole text once per offending character.
    allowed = set(valid_characters)
    return "".join(ch if ch in allowed else " " for ch in my_string)

def processFile(blob_name):
    """Download one JSON blob and write its lemmatized text to a local .txt file.

    Blobs whose name is already listed in the module-level ``processed_files``
    are skipped. Otherwise the blob is fetched through the module-level
    ``block_blob_service`` and parsed as JSON, and the ``text`` of every entry
    under ``abstract`` and then ``body_text`` (when those keys are present) is
    passed through ``convertTextToTerms``. Each result becomes one
    '\\r\\n'-terminated line of the output file, whose path is
    ``globals.processed_text_dir`` joined with the blob name, with '.json'
    replaced by '.txt'.

    Args:
        blob_name: Name of the blob inside ``globals.blob_container_name``.

    Returns:
        None. Directory-creation, parsing, conversion and file-writing errors
        are printed together with the blob name instead of being raised;
        errors raised by the download itself propagate to the caller.
    """
    if blob_name in processed_files:
        print ('Skipping already processed file: ', blob_name)
        return

    # Fetched outside the try block: a failed download is raised to the caller.
    content = block_blob_service.get_blob_to_text(globals.blob_container_name, blob_name).content
    try:
        output_path = os.path.join(globals.processed_text_dir, blob_name)
        output_dir = os.path.dirname(output_path)
        if output_dir:
            # exist_ok keeps this safe when several parallel workers try to
            # create the same directory at the same moment.
            os.makedirs(output_dir, exist_ok=True)

        # Parse and convert everything before opening the output file, so a
        # malformed document does not leave an empty or truncated .txt behind.
        json_content = json.loads(content)
        lines = []
        for section in ("abstract", "body_text"):
            if section in json_content:
                for c in json_content[section]:
                    lines.append(convertTextToTerms(c["text"].strip()) + '\r\n')

        with open(output_path.replace('.json','.txt'), 'w', encoding='utf-8') as f:
            f.writelines(lines)

    except Exception as ex:
        print (blob_name, " - Error:", str(ex))
        

def convertTextToTerms(content):
    """Reduce a passage of text to a space-separated string of lowercase terms.

    The text is split with ``word_tokenize`` and tagged with ``nltk.pos_tag``.
    Only tokens whose tag contains 'JJ', 'NN' or 'VB' are kept. Kept tokens
    are lowercased, and those at least ``globals.minPhraseLen`` characters
    long are additionally run through the module-level ``lemmatizer``;
    shorter ones are kept in lowercase form as they are.

    NOTE(review): ``lemmatize`` is called without a ``pos`` argument, so it
    uses the lemmatizer's default part of speech (noun, per the NLTK docs --
    confirm) even for tokens tagged as verbs or adjectives.

    Args:
        content: The raw text to convert.

    Returns:
        The kept terms joined by single spaces, without leading or trailing
        whitespace.
    """
    wanted_tag_markers = ('JJ', 'NN', 'VB')
    kept_terms = []
    for token, pos in nltk.pos_tag(word_tokenize(content)):
        if not any(marker in pos for marker in wanted_tag_markers):
            continue
        term = token.lower()
        if len(term) >= globals.minPhraseLen:
            term = lemmatizer.lemmatize(term)
        kept_terms.append(term)

    # Collapse any run of spaces down to one and trim both ends.
    pieces = ' '.join(kept_terms).split(' ')
    return ' '.join(piece for piece in pieces if piece).strip()

In [None]:
# Connect to blob storage and enumerate every blob to be processed
block_blob_service = BlockBlobService(
    account_name=globals.blob_account_name,
    account_key=globals.blob_account_key,
)
generator = block_blob_service.list_blobs(globals.blob_container_name, globals.blob_container_path)

# Collect the name of each listed blob
blobs = [blob.name for blob in generator]

In [None]:
# print ("Resetting dir: ", os.path.join(globals.processed_text_dir, globals.blob_container_path))
# globals.resetDir(os.path.join(globals.processed_text_dir, globals.blob_container_path))
# Record what already exists under the output directory; processFile skips any
# blob whose name appears in this collection.
# NOTE(review): the format returned by globals.getFilesInDir is not visible
# here -- confirm its entries are comparable to blob names, given that outputs
# are written with '.json' replaced by '.txt'.
processed_files = globals.getFilesInDir(globals.processed_text_dir)


In [None]:
# convertTextToTerms("The quick brown foxes jumped over the lazy dogs.")

In [None]:
print ("Downloading text files and processing (lemmas format)...")
# Build one lazily-evaluated processFile task per blob and hand them to a
# joblib pool sized by globals.max_parallelism.
tasks = (delayed(processFile)(blob_name) for blob_name in blobs)
worker_pool = Parallel(n_jobs=globals.max_parallelism)
processed_list = worker_pool(tasks)

# blob_counter = 0
# for blob_name in blobs:
#     processFile(blob_name)
#     blob_counter += 1
#     if blob_counter % 100 == 0:
#         print ("Processed file count:", blob_counter)