In [1]:
import boto3
import pandas as pd
from botocore.exceptions import ClientError
from spdynamodb import DynamoTable
import json
from decimal import Decimal
from datetime import datetime, timedelta
import time
import random

In [2]:
dt=DynamoTable()
try:
    dt.select_table('LogFileScan')
    print(dt)
except:
    dt.create_table(
        table_name='LogFileScan',
        partition_key='PK',
        partition_key_type='S',
        rcu=5000,
        wcu=5000
    )

Table created successfully!


In [12]:
dt.create_global_secondary_index(
    att_name="GSI_1_PK",
    att_type="S",
    sort_index="GSI_1_SK",
    sort_type="S",
    proj_type="KEYS_ONLY",
    i_rcu=3000,
    i_wcu=5000,
    i_name="GSI_1"
)

status = dt.check_status_gsi()
if status == 'CREATING':
    print("Global secondary index is being created, this may take a few minutes...")
    start = time.time()
    while status == 'CREATING':
        status = dt.check_status_gsi()
        time.sleep(30)
end = time.time()
minute = (end - start) / 60
print("Global secondary index created. Time elapsed: {0:.2f} minute".format(minute))

Global secondary index is being created, this may take a few minutes...
Global secondary index created. Time elapsed: 6.53 minute


In [84]:
df_1 = pd.read_csv('workshop-data/data/logfile_medium1.csv', header=None)
df_2 = pd.read_csv('workshop-data/data/logfile_medium2.csv', header=None)
df_3 = pd.read_csv('workshop-data/data/logfile_stream.csv', header=None)
df_4 = pd.read_csv('workshop-data/data/logfile_small1.csv', header=None)
df = pd.concat([df_1, df_2, df_3, df_4])
df.reset_index(inplace=True, drop=True)
df.columns = ['PK', 'host', 'date', 'hourofday', 'timezone', 'method', 'url', 'responsecode', 'bytessent', 'useragent']
df['GSI_1_PK'] = df['host'].apply(lambda x: "host#{}".format(x))
df['GSI_1_SK'] = df['responsecode'].astype(str) + "#" + df['date'] + "#" + df['hourofday'].astype(str)
df.reset_index(inplace=True)
df['PK'] = df['index']
df.drop(labels=['index'], axis=1, inplace=True)
df['PK'] = df['PK'].apply(lambda x: "id#{}".format(x))

In [86]:
# Add data to DynamoDB table
dt.batch_pandas(df)

In [196]:
import threading
from multiprocessing import Queue
queue = Queue()

scannedItems = 0
totalbytessent = 0
pageSize = 10000
totalsegments = 2
threadsegment = 1

fe = "responsecode <> :f"
eav = {":f": 200}

For a parallel Scan request, TotalSegments represents the total number of segments into which the Scan operation will be divided. The value of TotalSegments corresponds to the number of application workers that will perform the parallel scan. For example, if you want to use four application threads to scan a table or an index, specify a TotalSegments value of 4.

In [197]:
response = dt.table.scan(
    FilterExpression=fe,
    ExpressionAttributeValues=eav,
    Limit=pageSize,
    TotalSegments=totalsegments,
    Segment=threadsegment,
    ProjectionExpression='bytessent',
    ReturnConsumedCapacity='TOTAL'
    )
scannedItems += len(response['Items'])
for i in response['Items']:
        totalbytessent += i['bytessent']
print("Consumed capacity: ", response['ConsumedCapacity']['CapacityUnits'])

while 'LastEvaluatedKey' in response:
    key_value = response['LastEvaluatedKey']
    response = dt.table.scan(
        FilterExpression=fe,
        ExpressionAttributeValues=eav,
        Limit=pageSize,
        TotalSegments=totalsegments,
        Segment=threadsegment,
        ExclusiveStartKey=key_value,
        ProjectionExpression='bytessent'
        )
    scannedItems += len(response['Items'])
    for i in response['Items']:
        totalbytessent += i['bytessent']

print("Total scanned items:", scannedItems)

Consumed capacity:  119.0
Total scanned items: 454
