# Console

In this notebook, we debug our DAGs without running them through Apache Airflow.

In [1]:
import sys
sys.path.append('airflow/dags/lib')
import emrspark_lib as emrs
import configparser
import time

import logging
import os
import json

# Raise the root logger to INFO so that INFO-level records (such as the
# "INFO:root:..." lines shown in the cell outputs) are displayed.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. An empty result therefore means the path is
# wrong (for example the notebook was started from another working directory);
# fail here with a clear message instead of a KeyError on the first lookup.
CONFIG_PATH = 'airflow/config.cfg'
config = configparser.ConfigParser()
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        "Could not read config file '{}' (current directory: {})".format(
            CONFIG_PATH, os.getcwd()))

CLUSTER_NAME = config['AWS']['CLUSTER_NAME']
VPC_ID = config['AWS']['VPC_ID']
SUBNET_ID = config['AWS']['SUBNET_ID']

# STOCKS is an optional list literal in the config. Single quotes are swapped
# for double quotes so that a Python-style list such as ['A', 'B'] parses as
# JSON; an empty value means "no explicit stock list".
stocks_raw = config['App']['STOCKS']
if stocks_raw == '':
    STOCKS = []
else:
    STOCKS = json.loads(stocks_raw.replace("'", '"'))

ec2, emr, iam = emrs.get_boto_clients(config['AWS']['REGION_NAME'], config=config)

# When the config leaves the VPC or subnet blank, fall back to whatever the
# emrspark_lib lookup helpers return for this account.
if VPC_ID == '':
    VPC_ID = emrs.get_first_available_vpc(ec2)

if SUBNET_ID == '':
    SUBNET_ID = emrs.get_first_available_subnet(ec2, VPC_ID)

In [2]:
# Security groups for the master and slave nodes, both named after the cluster.
master_sg_name = '{}SG'.format(CLUSTER_NAME)
master_sg_description = 'Master SG for {}'.format(CLUSTER_NAME)
master_sg_id = emrs.create_security_group(
    ec2, master_sg_name, master_sg_description, VPC_ID)

slave_sg_name = '{}SlaveSG'.format(CLUSTER_NAME)
slave_sg_description = 'Slave SG for {}'.format(CLUSTER_NAME)
slave_sg_id = emrs.create_security_group(
    ec2, slave_sg_name, slave_sg_description, VPC_ID)

# Key pair and IAM roles are (re)created before the cluster is requested.
keypair_name = '{}_pem'.format(CLUSTER_NAME)
keypair = emrs.recreate_key_pair(ec2, keypair_name)

emrs.recreate_default_roles(iam)

# Request the cluster, then look up its DNS name; the later cells address the
# cluster through cluster_dns and terminate it through cluster_id.
cluster_id = emrs.create_emr_cluster(
    emr,
    CLUSTER_NAME,
    master_sg_id,
    slave_sg_id,
    keypair['KeyName'],
    SUBNET_ID,
    release_label='emr-5.28.1',
)
cluster_dns = emrs.get_cluster_dns(emr, cluster_id)

INFO:root:Found Security Group: sg-0d32ef88aafc53667 in vpc vpc-d4fb5aaf (us-east-1).
INFO:root:Found Security Group: sg-0e715d65ff15edd5d in vpc vpc-d4fb5aaf (us-east-1).
INFO:root:keypair ShortInterestEffectDL_pem created:
{'KeyFingerprint': 'f7:9c:a0:7a:fb:fb:44:43:51:cc:82:7e:dd:e3:11:90:12:f3:f7:f5', 'KeyMaterial': '-----BEGIN RSA PRIVATE KEY-----\nMIIEowIBAAKCAQEArwftCKZcfsfB679dWvEw7WKhqHg8ZG6TnbjWfrqlQuugq6wvF3NiDc51DkHp\n5p7yc04FmBPY1YXRyRZkuRaeMtLsuzbh1O+UVHeT2imtuRIU8fVg0VtTlmmi7mK0Vu9CIZOyG+VB\nnbcaoxbh9PkUBwBKIHExUSSRYSaROVHsqnPNRqHuxDgkwNGyqBPMOsddaSFOyqEdKOZkXxTwQicx\nP8dgaEDjz+nzhCDb2Aa/oPpBFlwbegULZWqgXIrYyAwGVGg5PNBUHciGq6kXNQfbfDjVTkfzL1mA\nu2wZ3irhOFLzOqiqM/fDXnKCr7hyM799PpkwCSUSiuw+MxJFhAe3cQIDAQABAoIBAFc9MxbBnI6Q\nUAaJziUa3FElfP+0Vh2Uw7y0OSzXKLHPMbj/TEV6/B93jP57OfL5vExeUgl3svbSCTsaDz/1lwor\n+VRiyXLRqvPi2e2+IlmBOrFSpVwUEfiBVHVO+yXtgN+tdvPSc78BWaR6LktkrmfC7C95W6Re/psW\nlwGj5Lsbw+rbJdPBziNCVU384uegnTSzvGdyYXQcLcE4n/qN6YT11PMeTX0N4a8qHQyqIcxUPz7d\nyTf2YfVbylfxDip88Rcj

## Pull Stock Info

In [3]:
# Arguments handed to the pull_stock_info job.
# DB_HOST and the stock-info table paths are read from the [App] config
# section, exactly as the later cells do, so that this job writes to the same
# location the downstream jobs read from.
args = {
    'AWS_ACCESS_KEY_ID': config['AWS']['AWS_ACCESS_KEY_ID'],
    'AWS_SECRET_ACCESS_KEY': config['AWS']['AWS_SECRET_ACCESS_KEY'],
    'START_DATE': config['App']['START_DATE'],
    'URL_NASDAQ': 'https://old.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nasdaq&render=download',
    'URL_NYSE': 'https://old.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nyse&render=download',
    'DB_HOST': config['App']['DB_HOST'],
    'TABLE_STOCK_INFO_NASDAQ': config['App']['TABLE_STOCK_INFO_NASDAQ'],
    'TABLE_STOCK_INFO_NYSE': config['App']['TABLE_STOCK_INFO_NYSE'],
}

In [6]:
# Run the pull_stock_info ETL script on the cluster in a fresh Spark session.
emrs.kill_all_inactive_spark_sessions(cluster_dns)
session_headers = emrs.create_spark_session(cluster_dns)
try:
    emrs.wait_for_spark(cluster_dns, session_headers)
    job_response_headers = emrs.submit_spark_job_from_file(
            cluster_dns, session_headers,
            'airflow/dags/etl/pull_stock_info.py',
            args=args,
            commonpath='airflow/dags/etl/common.py',
            helperspath='airflow/dags/etl/helpers.py'
    )
    # NOTE(review): final_status is not inspected below; consider checking it
    # too once the values track_spark_job can return are confirmed.
    final_status, logs = emrs.track_spark_job(cluster_dns, job_response_headers)
finally:
    # Always release the session, even when waiting, submitting or tracking
    # raises, so a failed run does not leave a session behind on the cluster.
    emrs.kill_spark_session(cluster_dns, session_headers)

# Echo the job log; any line containing '(FAIL)' is treated as a failed step.
for line in logs:
    logging.info(line)
    if '(FAIL)' in str(line):
        logging.error(line)
        raise Exception("ETL process fails.")

INFO:root:Killing all inactive spark sessions
INFO:root:Killed idle spark session id 4
INFO:root:Sent spark session creation command to http://ec2-100-26-146-164.compute-1.amazonaws.com:8998/sessions
INFO:root:Response headers: {'Date': 'Sun, 26 Jan 2020 21:32:47 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/5', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:{'id': 5, 'name': None, 'appId': None, 'owner': None, 'proxyUser': None, 'state': 'starting', 'kind': 'pyspark', 'appInfo': {'driverLogUrl': None, 'sparkUiUrl': None}, 'log': ['stdout: ', '\nstderr: ', '\nYARN Diagnostics: ']}
INFO:root:Session headers: {'Date': 'Sun, 26 Jan 2020 21:32:47 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/5', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:Spark session status: starting
INFO:root:Spark session status: sta

## Pull Short Interests

In [7]:
# Arguments handed to the pull_short_interests job.
app_cfg = config['App']
aws_cfg = config['AWS']

args_si = {
    'START_DATE': app_cfg['START_DATE'],
    'QUANDL_API_KEY': config['Quandl']['API_KEY'],
    'YESTERDAY_DATE': '2020-12-10',
    'LIMIT': app_cfg['STOCK_LIMITS'],
    'STOCKS': STOCKS,
}
# AWS credentials, copied under the same key names they have in the config.
args_si.update({
    key: aws_cfg[key]
    for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
})
# Storage host and table locations, likewise copied under their config names.
args_si.update({
    key: app_cfg[key]
    for key in (
        'DB_HOST',
        'TABLE_STOCK_INFO_NASDAQ',
        'TABLE_STOCK_INFO_NYSE',
        'TABLE_SHORT_INTERESTS_NASDAQ',
        'TABLE_SHORT_INTERESTS_NYSE',
    )
})

In [8]:
# Run the pull_short_interests ETL script on the cluster in a fresh Spark session.
emrs.kill_all_inactive_spark_sessions(cluster_dns)
session_headers = emrs.create_spark_session(cluster_dns)
try:
    emrs.wait_for_spark(cluster_dns, session_headers)
    job_response_headers = emrs.submit_spark_job_from_file(
            cluster_dns, session_headers,
            'airflow/dags/etl/pull_short_interests.py',
            args=args_si,
            commonpath='airflow/dags/etl/common.py',
            helperspath='airflow/dags/etl/helpers.py'
    )
    # NOTE(review): final_status is not inspected below; consider checking it
    # too once the values track_spark_job can return are confirmed.
    final_status, logs = emrs.track_spark_job(cluster_dns, job_response_headers)
finally:
    # Always release the session, even when waiting, submitting or tracking
    # raises, so a failed run does not leave a session behind on the cluster.
    emrs.kill_spark_session(cluster_dns, session_headers)

# Echo the job log; any line containing '(FAIL)' is treated as a failed step.
for line in logs:
    logging.info(line)
    if '(FAIL)' in str(line):
        logging.error(line)
        raise Exception("ETL process fails.")

INFO:root:Killing all inactive spark sessions
INFO:root:Sent spark session creation command to http://ec2-100-26-146-164.compute-1.amazonaws.com:8998/sessions
INFO:root:Response headers: {'Date': 'Sun, 26 Jan 2020 21:35:32 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/6', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:{'id': 6, 'name': None, 'appId': None, 'owner': None, 'proxyUser': None, 'state': 'starting', 'kind': 'pyspark', 'appInfo': {'driverLogUrl': None, 'sparkUiUrl': None}, 'log': ['stdout: ', '\nstderr: ', '\nYARN Diagnostics: ']}
INFO:root:Session headers: {'Date': 'Sun, 26 Jan 2020 21:35:32 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/6', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:Spark session status: starting
INFO:root:Spark session status: starting
INFO:root:Spark session status: sta

## Pull Prices

In [9]:
# Arguments handed to the pull_prices job.
app_cfg = config['App']
aws_cfg = config['AWS']

# Price-history download URL; the {placeholders} are left unformatted here.
price_url_template = "http://app.quotemedia.com/quotetools/getHistoryDownload.csv?&webmasterId=501&startDay={sd}&startMonth={sm}&startYear={sy}&endDay={ed}&endMonth={em}&endYear={ey}&isRanged=true&symbol={sym}"

args_p = dict(
    START_DATE=app_cfg['START_DATE'],
    QUANDL_API_KEY=config['Quandl']['API_KEY'],
    YESTERDAY_DATE='2020-12-10',
    LIMIT=app_cfg['STOCK_LIMITS'],
    STOCKS=STOCKS,
    AWS_ACCESS_KEY_ID=aws_cfg['AWS_ACCESS_KEY_ID'],
    AWS_SECRET_ACCESS_KEY=aws_cfg['AWS_SECRET_ACCESS_KEY'],
    DB_HOST=app_cfg['DB_HOST'],
    TABLE_STOCK_INFO_NASDAQ=app_cfg['TABLE_STOCK_INFO_NASDAQ'],
    TABLE_STOCK_INFO_NYSE=app_cfg['TABLE_STOCK_INFO_NYSE'],
    TABLE_STOCK_PRICES=app_cfg['TABLE_STOCK_PRICES'],
    URL=price_url_template,
)

In [10]:
# Run the pull_prices ETL script on the cluster in a fresh Spark session.
emrs.kill_all_inactive_spark_sessions(cluster_dns)
session_headers = emrs.create_spark_session(cluster_dns)
try:
    emrs.wait_for_spark(cluster_dns, session_headers)
    job_response_headers = emrs.submit_spark_job_from_file(
            cluster_dns, session_headers,
            'airflow/dags/etl/pull_prices.py',
            args=args_p,
            commonpath='airflow/dags/etl/common.py',
            helperspath='airflow/dags/etl/helpers.py'
    )
    # NOTE(review): final_status is not inspected below; consider checking it
    # too once the values track_spark_job can return are confirmed.
    final_status, logs = emrs.track_spark_job(cluster_dns, job_response_headers)
finally:
    # Always release the session, even when waiting, submitting or tracking
    # raises, so a failed run does not leave a session behind on the cluster.
    emrs.kill_spark_session(cluster_dns, session_headers)

# Echo the job log; any line containing '(FAIL)' is treated as a failed step.
for line in logs:
    logging.info(line)
    if '(FAIL)' in str(line):
        logging.error(line)
        raise Exception("ETL process fails.")

INFO:root:Killing all inactive spark sessions
INFO:root:Sent spark session creation command to http://ec2-100-26-146-164.compute-1.amazonaws.com:8998/sessions
INFO:root:Response headers: {'Date': 'Sun, 26 Jan 2020 21:39:32 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/7', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:{'id': 7, 'name': None, 'appId': None, 'owner': None, 'proxyUser': None, 'state': 'starting', 'kind': 'pyspark', 'appInfo': {'driverLogUrl': None, 'sparkUiUrl': None}, 'log': ['stdout: ', '\nstderr: ', '\nYARN Diagnostics: ']}
INFO:root:Session headers: {'Date': 'Sun, 26 Jan 2020 21:39:32 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/7', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:Spark session status: starting
INFO:root:Spark session status: starting
INFO:root:Spark session status: idl

## Quality-check

In [11]:
# Arguments handed to the pull_prices_quality job.
app_cfg = config['App']
aws_cfg = config['AWS']

args_q = dict(
    AWS_ACCESS_KEY_ID=aws_cfg['AWS_ACCESS_KEY_ID'],
    AWS_SECRET_ACCESS_KEY=aws_cfg['AWS_SECRET_ACCESS_KEY'],
    YESTERDAY_DATE='2020-12-10',
    STOCKS=STOCKS,
    DB_HOST=app_cfg['DB_HOST'],
    TABLE_STOCK_INFO_NASDAQ=app_cfg['TABLE_STOCK_INFO_NASDAQ'],
    TABLE_STOCK_INFO_NYSE=app_cfg['TABLE_STOCK_INFO_NYSE'],
    TABLE_STOCK_PRICES=app_cfg['TABLE_STOCK_PRICES'],
)

In [12]:
# Run the pull_prices_quality ETL script on the cluster in a fresh Spark session.
emrs.kill_all_inactive_spark_sessions(cluster_dns)
session_headers = emrs.create_spark_session(cluster_dns)
try:
    emrs.wait_for_spark(cluster_dns, session_headers)
    job_response_headers = emrs.submit_spark_job_from_file(
            cluster_dns, session_headers,
            'airflow/dags/etl/pull_prices_quality.py',
            args=args_q,
            commonpath='airflow/dags/etl/common.py',
            helperspath='airflow/dags/etl/helpers.py'
    )
    # NOTE(review): final_status is not inspected below; consider checking it
    # too once the values track_spark_job can return are confirmed.
    final_status, logs = emrs.track_spark_job(cluster_dns, job_response_headers)
finally:
    # Always release the session, even when waiting, submitting or tracking
    # raises, so a failed run does not leave a session behind on the cluster.
    emrs.kill_spark_session(cluster_dns, session_headers)

# Echo the job log; any line containing '(FAIL)' is treated as a failed step.
for line in logs:
    logging.info(line)
    if '(FAIL)' in str(line):
        logging.error(line)
        raise Exception("ETL process fails.")

INFO:root:Killing all inactive spark sessions
INFO:root:Sent spark session creation command to http://ec2-100-26-146-164.compute-1.amazonaws.com:8998/sessions
INFO:root:Response headers: {'Date': 'Sun, 26 Jan 2020 21:41:01 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/8', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:{'id': 8, 'name': None, 'appId': None, 'owner': None, 'proxyUser': None, 'state': 'starting', 'kind': 'pyspark', 'appInfo': {'driverLogUrl': None, 'sparkUiUrl': None}, 'log': ['stdout: ', '\nstderr: ', '\nYARN Diagnostics: ']}
INFO:root:Session headers: {'Date': 'Sun, 26 Jan 2020 21:41:01 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/8', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:Spark session status: starting
INFO:root:Spark session status: starting
INFO:root:Spark session status: sta

## Combine Data

In [13]:
# Arguments handed to the combine job.
app_cfg = config['App']
aws_cfg = config['AWS']

args_c = {'YESTERDAY_DATE': '2020-12-10'}
# AWS credentials, copied under the same key names they have in the config.
args_c.update({
    key: aws_cfg[key]
    for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
})
# Storage host and table locations, likewise copied under their config names.
args_c.update({
    key: app_cfg[key]
    for key in (
        'DB_HOST',
        'TABLE_STOCK_PRICES',
        'TABLE_SHORT_INTERESTS_NASDAQ',
        'TABLE_SHORT_INTERESTS_NYSE',
        'TABLE_SHORT_ANALYSIS',
    )
})

In [14]:
# Run the combine ETL script on the cluster in a fresh Spark session.
logger.setLevel(logging.INFO)
emrs.kill_all_inactive_spark_sessions(cluster_dns)
session_headers = emrs.create_spark_session(cluster_dns)
try:
    emrs.wait_for_spark(cluster_dns, session_headers)
    job_response_headers = emrs.submit_spark_job_from_file(
            cluster_dns, session_headers,
            'airflow/dags/etl/combine.py',
            args=args_c,
            commonpath='airflow/dags/etl/common.py',
            helperspath='airflow/dags/etl/helpers.py'
    )
    # NOTE(review): final_status is not inspected below; consider checking it
    # too once the values track_spark_job can return are confirmed.
    final_status, logs = emrs.track_spark_job(cluster_dns, job_response_headers)
finally:
    # Always release the session, even when waiting, submitting or tracking
    # raises, so a failed run does not leave a session behind on the cluster.
    emrs.kill_spark_session(cluster_dns, session_headers)

# Echo the job log; any line containing '(FAIL)' is treated as a failed step.
for line in logs:
    logging.info(line)
    if '(FAIL)' in str(line):
        logging.error(line)
        raise Exception("ETL process fails.")

INFO:root:Killing all inactive spark sessions
INFO:root:Sent spark session creation command to http://ec2-100-26-146-164.compute-1.amazonaws.com:8998/sessions
INFO:root:Response headers: {'Date': 'Sun, 26 Jan 2020 21:41:56 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/9', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:{'id': 9, 'name': None, 'appId': None, 'owner': None, 'proxyUser': None, 'state': 'starting', 'kind': 'pyspark', 'appInfo': {'driverLogUrl': None, 'sparkUiUrl': None}, 'log': ['stdout: ', '\nstderr: ', '\nYARN Diagnostics: ']}
INFO:root:Session headers: {'Date': 'Sun, 26 Jan 2020 21:41:56 GMT', 'Content-Type': 'application/json;charset=utf-8', 'Content-Encoding': 'gzip', 'Location': '/sessions/9', 'Transfer-Encoding': 'chunked', 'Server': 'Jetty(9.3.24.v20180605)'}
INFO:root:Spark session status: starting
INFO:root:Spark session status: starting
INFO:root:Spark session status: sta

## Delete Cluster

In [15]:
# Terminate the EMR cluster identified by cluster_id. Judging by the logged
# output, the call keeps polling until the cluster state is TERMINATED.
emrs.delete_cluster(emr, cluster_id)

INFO:botocore.vendored.requests.packages.urllib3.connectionpool:Resetting dropped connection: elasticmapreduce.us-east-1.amazonaws.com
INFO:root:Cluster j-3KRMX1NGNM7ZU has not been terminated (Current cluster state: TERMINATING). waiting until the status is TERMINATED...
INFO:botocore.vendored.requests.packages.urllib3.connectionpool:Resetting dropped connection: elasticmapreduce.us-east-1.amazonaws.com
INFO:root:Cluster j-3KRMX1NGNM7ZU has not been terminated (Current cluster state: TERMINATING). waiting until the status is TERMINATED...
INFO:botocore.vendored.requests.packages.urllib3.connectionpool:Resetting dropped connection: elasticmapreduce.us-east-1.amazonaws.com
INFO:root:Cluster j-3KRMX1NGNM7ZU has not been terminated (Current cluster state: TERMINATING). waiting until the status is TERMINATED...
INFO:botocore.vendored.requests.packages.urllib3.connectionpool:Resetting dropped connection: elasticmapreduce.us-east-1.amazonaws.com
INFO:root:Cluster j-3KRMX1NGNM7ZU has not been

Cluster j-3KRMX1NGNM7ZU Deleted
