# CORE Processing Articles for Downstream Use in Modeling

* Data ingested from the CORE API is stored in S3 as JSON files, each holding up to 100 search results. 
Per the [BlazingText Documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext.html), the algorithm requires that each line of the input file contain a single sentence of space-separated tokens. The raw data will need to be processed to accommodate this training format. 
* Before processing the raw data, a summary sheet will be created to catalog the data

## Initial Prep

Imports

In [13]:
import boto3
import pandas as pd
import json
import ast
from io import StringIO
from nltk import tokenize
import nltk
nltk.download('punkt')
import re, string
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer as netlem
lem = netlem()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Declarations

In [2]:
# Bucket whose objects are listed and parsed by the cells below.
core_bucket_name = 'core0823'

# Staging bucket, followed by two locations derived from its name.
stg_bucket = 'core0823-stg'
stg_catalog_bucket = f'{stg_bucket}/Catalog'
stg_bt_bucket = f'{stg_bucket}/BT_STG'

Prep

In [3]:
# Module-level boto3 S3 client; the bucket listing and json_file_to_dict
# below both use this global rather than creating their own client.
s3_client = boto3.client('s3')

In [4]:
# Keys of every object in the source bucket.
# A single list_objects call returns at most 1000 keys, so page through the
# listing instead; .get('Contents', []) tolerates an empty bucket, whose
# response carries no 'Contents' entry.
json_list = [
    obj['Key']
    for page in s3_client.get_paginator('list_objects_v2').paginate(Bucket=core_bucket_name)
    for obj in page.get('Contents', [])
]

Generic Functions

In [5]:
def s3_file_location(f_bucket, f_file):
    """
    Build the 's3://<bucket>/<file>' location string for an object.

    f_bucket is the bucket name and f_file the object key; the formatted
    string is returned and nothing is looked up in S3.
    """
    return f's3://{f_bucket}/{f_file}'

def json_file_to_dict(f_bucket, f_json_file):
    """
    Download one object from S3 and parse its text into a Python object.

    The object body is decoded as UTF-8 and parsed as strict JSON first.
    If that fails, it is parsed with ast.literal_eval (the original parsing
    path), so content written as a Python literal still loads.

    Parameters:
        f_bucket: name of the S3 bucket to read from.
        f_json_file: key of the object inside that bucket.

    Returns:
        The parsed object (callers in this file index it as a dictionary),
        or None when the download, decoding or parsing fails. A message
        naming the key and the error is printed on failure.
    """
    try:
        json_s3_obj = s3_client.get_object(Bucket=f_bucket, Key=f_json_file)
        tmp_str_json = json_s3_obj['Body'].read().decode('utf-8')
        try:
            return json.loads(tmp_str_json)
        except ValueError:
            # Not strict JSON (json.JSONDecodeError is a ValueError): fall
            # back to the literal parser, which accepts Python-style content
            # such as single-quoted strings and None.
            return ast.literal_eval(tmp_str_json)
    except Exception as err:
        # Best effort: report the failing key and let the caller skip it.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        print('Fail importing json file/', f_json_file, repr(err))
        return None

## Text Data Processing

Functions

In [21]:
def json_text_parse(f_bucket, f_file_name):
    """
    Load one CORE API result file from S3 and split its text into sentences.

    For each entry under the file's 'data' key, the '_source' fields
    'description' and 'fullText' (in that order, skipping None values) are
    passed through NLTK's sent_tokenize.

    Returns a flat list of sentence strings. The list is empty when the file
    could not be loaded or its 'data' value is None.
    """
    parsed = json_file_to_dict(f_bucket, f_file_name)
    if parsed is None or parsed['data'] is None:
        return []

    sentences = []
    for record in parsed['data']:
        source = record['_source']
        for field in ('description', 'fullText'):
            text = source[field]
            if text is not None:
                sentences.extend(tokenize.sent_tokenize(text))

    return sentences


def json_extract_text(f_bucket, f_file_list):
    """
    Collect sentences from several CORE API JSON files in one bucket.

    Calls json_text_parse for every file name in f_file_list, in order, and
    concatenates the results, keeping only sentences longer than one
    character.

    Returns a flat list of sentence strings.
    """
    return [
        sentence
        for file_name in f_file_list
        for sentence in json_text_parse(f_bucket, file_name)
        if len(sentence) > 1
    ]


def prep_sent_list(f_sent_list):
    """
    Normalise raw sentences into lines of space-separated tokens.

    Each sentence goes through these steps in order:
      1. lowercase it and strip every character in string.punctuation;
      2. remove any word that contains a digit;
      3. split on whitespace and lemmatize each token with the module-level
         WordNet lemmatizer `lem`;
      4. drop tokens found in the module-level STOPWORDS set.

    Parameters:
        f_sent_list: iterable of sentence strings.

    Returns:
        A list of cleaned sentences, each a single-space-joined string of
        lemmas. Sentences with no tokens left after cleaning are omitted,
        so the result may be shorter than the input.
    """
    # Compile once; both patterns are applied to every sentence.
    punct_re = re.compile(r'[%s]' % re.escape(string.punctuation))
    digit_word_re = re.compile(r'\w*\d\w*')

    prepped = []
    for raw in f_sent_list:
        # Chain the two substitutions on the SAME string. The previous code
        # ran the digit-word removal on the raw input again, which discarded
        # the lowercasing and punctuation stripping.
        text = punct_re.sub('', raw.lower())
        text = digit_word_re.sub('', text)

        tokens = []
        # split() without an argument collapses the runs of whitespace left
        # behind by the removals, so no empty tokens are produced.
        for word in text.split():
            lemma = lem.lemmatize(word)
            # Check the raw token as well as its lemma, so a stop word whose
            # lemma differs from it is still removed.
            if word not in STOPWORDS and lemma not in STOPWORDS:
                tokens.append(lemma)

        if tokens:
            prepped.append(' '.join(tokens))

    return prepped


def extract_and_clean(f_bucket, f_file_list):
    """
    Run the full pipeline: pull sentences out of the listed JSON files with
    json_extract_text, then clean them with prep_sent_list.

    Returns the list of prepared sentences.
    """
    return prep_sent_list(json_extract_text(f_bucket, f_file_list))

In [None]:
# `time` is not among the notebook's imports, so import it here; without it
# this cell raises NameError before any work is done.
import time

# Time the full extract-and-clean pass over every file listed in json_list.
t0 = time.time()
cleaned_text = extract_and_clean(core_bucket_name, json_list)
t1 = time.time()
print('Time to extract and prep text: {} seconds'.format(t1-t0))