In [3]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import smart_open
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the project's local helper modules importable by putting their folder
# on sys.path. The folder is looked up in two places relative to this
# script/notebook: ../01_modules (development layout) and ./01_modules
# (production layout). Whichever of them exists is added.
if '__file__' in globals():
    # Running as a script: resolve relative to the script's own location.
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    # Running in a notebook kernel, where __file__ is undefined: fall back to
    # the current working directory.
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    # The membership check keeps this cell idempotent: re-running it must not
    # pile up duplicate sys.path entries.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Python caches imported modules, so within a running kernel only the first
# `import utils` actually reads utils.py. Re-running the import later does
# not pick up edits made to the file in the meantime.
# As a workaround, import each local module and then force a re-read with
# importlib.reload, so that re-running this cell always loads the current
# source. The modules are used through their namespace (e.g. config.NAME),
# so no `from ... import *` is involved.
import utils
# The reload result is bound to `_` instead of being left as a bare
# expression (presumably to keep the module repr out of the cell output).
_ = importlib.reload(utils)
import config
_ = importlib.reload(config)

utils.py loaded: v0.2.12
config.py loaded: v0.1


In [25]:
# Count how often each distinct line occurs across all files stored directly
# under the 01_data/01_raw/inspec/keys/ prefix of the default bucket, then
# print the counts as JSON, most frequent first.
keys = {}
s3_client = boto3.client('s3')

# A single list_objects call returns at most 1,000 objects regardless of the
# requested MaxKeys, so walk every result page with a paginator instead.
# A page without matches has no 'Contents' entry, hence the .get() default.
paginator = s3_client.get_paginator('list_objects_v2')
pages = paginator.paginate(
    Bucket=config.DEFAULT_S3_BUCKET_NAME,
    Prefix='01_data/01_raw/inspec/keys/',
    Delimiter='/',  # only objects directly under the prefix, no sub-"folders"
)
# Kept in the shape of one listing response ({'Contents': [...]}) so that any
# code reading files['Contents'] keeps working; the list may now be empty.
files = {'Contents': [file_ref for page in pages for file_ref in page.get('Contents', [])]}

for file_ref in files['Contents']:
    source_filepath = file_ref['Key']
    with smart_open.open(f's3://{config.DEFAULT_S3_BUCKET_NAME}/{source_filepath}') as file_source:
        for raw_line in file_source:
            # NOTE(review): tabs inside a line are turned into newlines and the
            # result is counted as ONE key. If a tab is meant to separate
            # several keys, this should split instead -- confirm against the data.
            line = raw_line.strip().replace('\t', '\n')
            if not line:
                # Blank lines are not keys; do not count them under "".
                continue
            keys[line] = keys.get(line, 0) + 1

# sorted() is stable, so keys with equal counts keep their first-seen order.
sorted_keys = dict(sorted(keys.items(), key=lambda item: item[1], reverse=True))
print(json.dumps(sorted_keys, indent=2, default=str))

{
  "Internet": 95,
  "information resources": 46,
  "optimisation": 36,
  "computational complexity": 32,
  "stability": 28,
  "human factors": 28,
  "polynomials": 28,
  "matrix algebra": 27,
  "electronic commerce": 27,
  "robust control": 26,
  "probability": 26,
  "control system synthesis": 22,
  "computational geometry": 22,
  "artificial intelligence": 21,
  "differential equations": 21,
  "closed loop systems": 20,
  "medical image processing": 20,
  "statistical analysis": 20,
  "nonlinear control systems": 19,
  "feedback": 19,
  "parallel programming": 19,
  "iterative methods": 19,
  "interpolation": 19,
  "process control": 18,
  "business data processing": 18,
  "graph theory": 18,
  "neural nets": 17,
  "decision theory": 17,
  "data mining": 17,
  "learning (artificial intelligence)": 16,
  "image reconstruction": 16,
  "software agents": 16,
  "approximation theory": 16,
  "formal logic": 16,
  "telemedicine": 16,
  "hypermedia markup languages": 15,
  "human resource