# CORE Processing Articles for Downstream Use in Modeling

* The ingestion step stored the data from the CORE API in S3 as JSON files, with up to 100 search results in each file. 
Per [BlazingText Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext.html), the algorithm requires that each line of the input file contain a single sentence of space-separated tokens. The raw data will need to be processed to accommodate the training format. 
* Before processing the raw data, a summary sheet will be created to catalog the data

## Initial Prep

Imports

In [21]:
import time
import boto3
import pandas as pd
import json
import ast
from io import StringIO
from nltk import tokenize
import nltk
nltk.download('punkt')
import re, string
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer as netlem
lem = netlem()

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


Declarations

In [22]:
# Bucket that is listed and read for the raw CORE API JSON files.
core_bucket_name = 'core0823'

# Staging bucket, followed by two "bucket/prefix" strings derived from it.
stg_bucket = 'core0823-stg'
stg_catalog_bucket = '{}/Catalog'.format(stg_bucket)
stg_bt_bucket = '{}/BT_STG'.format(stg_bucket)

Prep

In [23]:
# Shared S3 client used by every cell and helper below.
s3_client = boto3.client(service_name='s3')

In [24]:
# Collect every object key in the source bucket.
# A single list call returns at most 1000 keys, so page through the listing;
# pages without a 'Contents' entry (e.g. an empty bucket) contribute nothing.
json_list = [
    obj['Key']
    for page in s3_client.get_paginator('list_objects_v2').paginate(Bucket=core_bucket_name)
    for obj in page.get('Contents', [])
]

Generic Functions

In [25]:
def s3_file_location(f_bucket, f_file):
    """
    Build the S3 URI for an object.

    f_bucket: bucket name.
    f_file: object key inside the bucket.
    Returns the string 's3://<f_bucket>/<f_file>'.
    """
    return 's3://{}/{}'.format(f_bucket, f_file)

def json_file_to_dict(f_bucket, f_json_file):
    """
    Download one object from S3 and parse its body into a Python object.

    f_bucket: name of the S3 bucket.
    f_json_file: key of the object to read.

    The body is decoded as UTF-8 and parsed as strict JSON first. If that
    fails, it is parsed as a Python literal with ast.literal_eval, so
    payloads that were written as the repr of a dict still load.

    Returns the parsed object, or None when the object cannot be fetched or
    parsed. A message naming the file and the error is printed in that case.
    """
    try:
        json_s3_obj = s3_client.get_object(Bucket=f_bucket, Key=f_json_file)
        raw_text = json_s3_obj['Body'].read().decode('utf-8')
        try:
            return json.loads(raw_text)
        except ValueError:
            # Not strict JSON (e.g. single-quoted keys); fall back to the
            # literal parser.
            return ast.literal_eval(raw_text)
    except Exception as err:
        # Best effort: callers treat None as "skip this file". Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop a long run.
        print('Fail importing json file {}/{}: {!r}'.format(f_bucket, f_json_file, err))
        return None

## Text Data Processing

Functions

In [26]:
def json_text_parse(f_bucket, f_file_name):
    """
    Extract sentences from a single CORE API JSON file stored in S3.

    f_bucket: name of the S3 bucket.
    f_file_name: key of the JSON file.

    For every entry under the top-level 'data' key, the 'description' and
    'fullText' values found under '_source' are split into sentences with
    NLTK's sentence tokenizer. Entries or fields that are missing or None are
    skipped instead of raising.

    Returns a flat list of sentence strings (empty when the file could not be
    loaded or holds no usable text).
    """
    results_list = []
    tmp_file = json_file_to_dict(f_bucket, f_file_name)
    if not isinstance(tmp_file, dict):
        # json_file_to_dict returns None on failure; nothing to parse.
        return results_list

    for item in tmp_file.get('data') or []:
        if not isinstance(item, dict):
            continue
        source = item.get('_source') or {}
        # Description first, then full text, matching the original ordering.
        for field in ('description', 'fullText'):
            text = source.get(field)
            if text is not None:
                results_list.extend(tokenize.sent_tokenize(text))

    return results_list


def json_extract_text(f_bucket, f_file_list):
    """
    Extract sentences from a list of CORE API JSON files stored in S3.

    f_bucket: name of the S3 bucket.
    f_file_list: iterable of JSON file keys; each one is handed to
        json_text_parse, which handles a single file.

    Sentences of one character or less are discarded. Start time, end time
    and elapsed seconds are printed.

    Returns a flat list of sentence strings.
    """
    t0 = time.time()
    print('JSON extract text starting at: {}'.format(t0))

    results_list = []
    for file in f_file_list:
        # Filter per file so short fragments never accumulate in memory.
        results_list.extend(
            sent for sent in json_text_parse(f_bucket, file) if len(sent) > 1
        )

    t1 = time.time()
    print('JSON extract text complete at: {}'.format(t1))
    print('JSON extract text took: {} seconds'.format(t1 - t0))
    return results_list


def prep_sent_list(f_sent_list, f_lemmatize=None, f_stopwords=None):
    """
    Clean a list of sentences so each one is a line of space-separated tokens.

    f_sent_list: list of sentence strings.
    f_lemmatize: optional callable mapping a word to its lemma; defaults to
        the module-level WordNet lemmatizer (lem.lemmatize).
    f_stopwords: optional collection of words to drop after lemmatization;
        defaults to the module-level STOPWORDS set.

    Steps, in order:
      1. lowercase and strip punctuation;
      2. remove any word containing a digit, dropping sentences left empty;
      3. lemmatize each word, drop stop words, and re-join with single
         spaces, dropping sentences left empty.
    Timing for each step is printed.

    Returns a new list of cleaned sentence strings; the input is not modified.
    """
    if f_lemmatize is None:
        f_lemmatize = lem.lemmatize
    if f_stopwords is None:
        f_stopwords = STOPWORDS

    # Compile once instead of rebuilding the patterns for every sentence.
    punct_pattern = re.compile(r'[%s]' % re.escape(string.punctuation))
    digit_word_pattern = re.compile(r'\w*\d\w*')

    t0 = time.time()
    print('Preparing sentences started at: {}'.format(t0))
    sent_list = [punct_pattern.sub('', sent.lower()) for sent in f_sent_list]
    t1 = time.time()
    print('Lowercase and punctuation removal completed at {}, taking {} seconds.'.format(t1, t1 - t0))

    # Operate on the output of step 1 (not the raw input) so the lowercase and
    # punctuation work is kept; substitute once per sentence.
    sent_list = [
        stripped
        for stripped in (digit_word_pattern.sub('', sent) for sent in sent_list)
        if len(stripped) > 0
    ]
    t2 = time.time()
    print('Words with numbers and zero-length removal completed at {}, taking {} seconds.'.format(t2, t2 - t1))

    # Lemmatize and remove stop words. split() with no argument collapses the
    # runs of whitespace left behind by the removals above, so the output has
    # exactly one space between tokens.
    cleaned_list = []
    for sent in sent_list:
        lemmas = (f_lemmatize(word) for word in sent.split())
        cleaned = ' '.join(lemma for lemma in lemmas if lemma not in f_stopwords)
        if cleaned:
            cleaned_list.append(cleaned)

    t3 = time.time()
    print('Word lemmatization and stop word removal completed at {}, taking {} seconds.'.format(t3, t3 - t2))

    return cleaned_list


def extract_and_clean(f_bucket, f_file_list):
    """
    Run the full pipeline: pull sentences out of the given JSON files with
    json_extract_text, then clean them with prep_sent_list.

    f_bucket: name of the S3 bucket.
    f_file_list: list of JSON file keys in that bucket.
    Returns the list of prepared sentences.
    """
    return prep_sent_list(json_extract_text(f_bucket, f_file_list))

In [27]:
# Extract the raw (uncleaned) sentences from every listed JSON file.
sentences = json_extract_text(f_bucket=core_bucket_name, f_file_list=json_list)

JSON extract text starting at: 1599555295.939915
JSON extract text complete at: 1599555684.5175335
JSON extract text tool: 388.577618598938


In [29]:
import pickle
# Serialize the extracted sentence list to bytes so it can be uploaded to S3
# as a single object.
# NOTE(review): these are pickle bytes, but the upload cell stores them under
# a key ending in '.txt'; readers must use pickle.loads, not read it as text.
# Only unpickle this object from a trusted bucket.
sentences_serialized = pickle.dumps(sentences)

In [30]:
# Upload the serialized sentences to the staging bucket. Uses the stg_bucket
# constant from the declarations cell rather than repeating the bucket name.
s3_client.put_object(
    Body=sentences_serialized,
    Bucket=stg_bucket,
    Key='BT_STG/sentences.txt',
)

{'ResponseMetadata': {'RequestId': 'FAD19FE322C66D48',
  'HostId': 'o7Lms5X051vFIPXEmxLqSiwtYh/Bm60C54D0aoxx1wICAcpsKCRyBBS42+dcXpoEgIv/EPkQtK8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'o7Lms5X051vFIPXEmxLqSiwtYh/Bm60C54D0aoxx1wICAcpsKCRyBBS42+dcXpoEgIv/EPkQtK8=',
   'x-amz-request-id': 'FAD19FE322C66D48',
   'date': 'Tue, 08 Sep 2020 09:11:15 GMT',
   'etag': '"8fd77685d3b62325d08ab023f7658441"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'ETag': '"8fd77685d3b62325d08ab023f7658441"'}

In [None]:
# Run extraction and cleaning end to end and report the wall-clock duration.
t0 = time.time()
cleaned_text = extract_and_clean(f_bucket=core_bucket_name, f_file_list=json_list)
t1 = time.time()
print('Time to extract and prep text: {} seconds'.format(t1 - t0))

In [11]:
# Scratch check of the nested-comprehension form on toy data: split each
# sentence on spaces, drop the words listed in `notin`, upper-case the rest,
# and re-join with spaces.
test = ['a b c d e','f g h i j k','l m n o p']
notin = ['a','f','l']
[' '.join([word.upper() for word in words if word not in notin]) for words in [sent.split(' ') for sent in test]]

['B C D E', 'G H I J K', 'M N O P']

In [12]:
import timeit

In [16]:
# Benchmark the nested-comprehension form. The statement is passed to timeit
# as source text, and no `number` argument is given, so timeit applies its
# default repetition count and the printed value is the total across all
# repetitions rather than the cost of one run.
stmt_txt = """
test = ['a b c d e','f g h i j k','l m n o p']
notin = ['a','f','l']
[' '.join([word.upper() for word in words if word not in notin]) for words in [sent.split(' ') for sent in test]]
"""
print("The time taken is ",timeit.timeit(stmt=stmt_txt))

The time taken is  6.071857801000078


In [19]:
# Benchmark the enumerate-loop form on the same toy data, for comparison with
# the nested-comprehension timing. The statement rebuilds `test` on every
# repetition, so the in-place assignment inside the loop does not carry over
# between repetitions. As before, no `number` argument is given, so the
# printed value is the total across timeit's default repetition count.
stmt_txt = """
test = ['a b c d e','f g h i j k','l m n o p']
notin = ['a','f','l']
for i,sent in enumerate(test):
    tmp = sent.split(' ')
    test[i] = ' '.join([word.upper() for word in tmp if word not in notin])
"""
print("The time taken is ",timeit.timeit(stmt=stmt_txt))

The time taken is  6.2610089179997885


In [20]:
# Scratch check of the enumerate-loop form: each element of `test` is
# overwritten in place with its filtered, upper-cased version, then the
# mutated list is displayed as the cell output.
test = ['a b c d e','f g h i j k','l m n o p']
notin = ['a','f','l']
for i,sent in enumerate(test):
    tmp = sent.split(' ')
    # Keep only words not listed in `notin`, upper-cased and re-joined.
    test[i] = ' '.join([word.upper() for word in tmp if word not in notin])
test

['B C D E', 'G H I J K', 'M N O P']