## DynamoDB Batch Operations

Let us understand how to perform batch inserts into a DynamoDB table using the batch writer.
* We can use `batch_writer` to load data into a DynamoDB table in batches.
* It can be used for batch deletes as well.

In [1]:
import requests

In [2]:
import json

In [3]:
def list_repos(token, since='333255899', timeout=30):
    """List public GitHub repositories created after a given repository id.

    Args:
        token: GitHub personal access token, sent in the Authorization header.
        since: Repository id after which the listing starts.
        timeout: Seconds to wait for GitHub before giving up.

    Returns:
        The decoded JSON body of the response (a list of repository
        summaries on success).

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status
            (bad credentials, rate limiting, ...).
        requests.RequestException: On connection problems or timeouts.
    """
    res = requests.get(
        'https://api.github.com/repositories',
        params={'since': since},
        headers={'Authorization': f'token {token}'},
        timeout=timeout,
    )
    # Without this check an error payload such as {'message': 'Bad credentials'}
    # would be returned as if it were the repository listing.
    res.raise_for_status()
    return res.json()

In [4]:
def get_repo_details(owner, name, token, timeout=30):
    """Fetch the full metadata of a single GitHub repository.

    Args:
        owner: Login of the account that owns the repository.
        name: Repository name.
        token: GitHub personal access token, sent in the Authorization header.
        timeout: Seconds to wait for GitHub before giving up.

    Returns:
        The decoded JSON body describing the repository.

    Raises:
        requests.HTTPError: If GitHub answers with a 4xx/5xx status
            (repository not found, rate limiting, ...).
        requests.RequestException: On connection problems or timeouts.
    """
    res = requests.get(
        f'https://api.github.com/repos/{owner}/{name}',
        headers={'Authorization': f'token {token}'},
        timeout=timeout,
    )
    # Surface HTTP failures here instead of handing an error payload to the
    # caller, where it would only show up later as a missing key.
    res.raise_for_status()
    return res.json()

In [5]:
def extract_repo_fields(repo_details):
    """Project a repository payload onto the subset of fields that is kept.

    Copies the top-level keys id, node_id, name, full_name, html_url,
    description, fork and created_at, plus a trimmed ``owner`` mapping
    holding login, id, node_id, type and site_admin.

    Args:
        repo_details: Mapping with the keys listed above (extra keys are
            ignored).

    Returns:
        A new dict with only the selected fields.

    Raises:
        KeyError: If any of the selected keys is missing.
    """
    owner_keys = ('login', 'id', 'node_id', 'type', 'site_admin')
    leading_keys = ('id', 'node_id', 'name', 'full_name')
    trailing_keys = ('html_url', 'description', 'fork', 'created_at')

    # Keys are copied in the same order in which the result dict lists them.
    trimmed = {key: repo_details[key] for key in leading_keys}
    trimmed['owner'] = {key: repo_details['owner'][key] for key in owner_keys}
    trimmed.update((key, repo_details[key]) for key in trailing_keys)
    return trimmed

In [6]:
def get_repos(repos, token):
    """Fetch and trim the details of every repository in a listing.

    Each entry is looked up with ``get_repo_details`` and reduced with
    ``extract_repo_fields``. The collection is best-effort: an entry that
    cannot be fetched or lacks an expected field is reported and skipped,
    and processing continues with the next entry.

    Args:
        repos: Iterable of repository summaries, each providing
            ``repo['owner']['login']`` and ``repo['name']``.
        token: GitHub personal access token passed on to the lookup.

    Returns:
        List with the trimmed details of the repositories that were
        processed successfully, in input order.
    """
    repos_details = []
    for repo in repos:
        try:
            owner = repo['owner']['login']
            name = repo['name']
            repo_details = get_repo_details(owner, name, token)
            repos_details.append(extract_repo_fields(repo_details))
        except (KeyError, TypeError, ValueError, requests.RequestException) as exc:
            # Only expected per-repo failures are skipped (malformed entry,
            # missing field, undecodable body, HTTP/network error). Anything
            # else, including KeyboardInterrupt, propagates to the caller.
            if isinstance(repo, dict) and 'full_name' in repo:
                label = repo['full_name']
            else:
                label = repo
            print(f'Skipping repo {label!r}: {type(exc).__name__}: {exc}')
    return repos_details

In [7]:
import os

# Read the GitHub token from the environment instead of embedding it in the
# notebook. The token that used to be hardcoded here has been exposed and
# should be revoked.
repos = list_repos(os.environ['GITHUB_TOKEN'])

In [8]:
import os

# Read the GitHub token from the environment instead of embedding it in the
# notebook. The token that used to be hardcoded here has been exposed and
# should be revoked.
repos_details = get_repos(repos, os.environ['GITHUB_TOKEN'])

In [9]:
import boto3

In [10]:
import os

In [11]:
# Use the 'itvgithub' AWS profile unless AWS_PROFILE is already set in the environment.
os.environ.setdefault('AWS_PROFILE', 'itvgithub')

'itvgithub'

In [12]:
# Default to the us-east-1 region unless AWS_DEFAULT_REGION is already set in the environment.
os.environ.setdefault('AWS_DEFAULT_REGION', 'us-east-1')

'us-east-1'

In [13]:
# DynamoDB service resource.
# NOTE(review): presumably picks up the AWS_PROFILE / AWS_DEFAULT_REGION environment variables — confirm.
dynamodb = boto3.resource('dynamodb')

In [14]:
# Handle to the 'ghrepos' table that the load and delete cells operate on.
ghrepos_table = dynamodb.Table('ghrepos')

In [None]:
ghrepos_table.delete_item?

In [None]:
%%time

# Delete every item one request at a time (baseline for the batch version).
# A single scan() call returns only one page of results, so keep scanning
# from LastEvaluatedKey until the table has been read completely.
scan_kwargs = {}
while True:
    page = ghrepos_table.scan(**scan_kwargs)
    for repo in page['Items']:
        print(f'Deleting entry with repo id {repo["id"]}')
        ghrepos_table.delete_item(Key={'id': repo['id']})
    if 'LastEvaluatedKey' not in page:
        break
    scan_kwargs['ExclusiveStartKey'] = page['LastEvaluatedKey']

In [17]:
batch_writer = ghrepos_table.batch_writer()

In [18]:
type(batch_writer)

boto3.dynamodb.table.BatchWriter

In [35]:
help(batch_writer.put_item)

Help on method put_item in module boto3.dynamodb.table:

put_item(Item) method of boto3.dynamodb.table.BatchWriter instance



In [36]:
def load_repos(repos_details, ghrepos_table, batch_size=50):
    """Write every item of ``repos_details`` through the table's batch writer.

    The items are walked in slices of ``batch_size`` and one progress line
    is printed per slice. All puts go through a single batch writer opened
    for the duration of the call.

    Args:
        repos_details: Sequence of items to store.
        ghrepos_table: Table object exposing ``batch_writer()``.
        batch_size: Number of items per progress slice.
            NOTE(review): the batch writer presumably does its own
            buffering, so this likely affects only the progress output —
            confirm.

    Returns:
        None.
    """
    with ghrepos_table.batch_writer() as writer:
        total = len(repos_details)
        for start in range(0, total, batch_size):
            stop = start + batch_size
            print(f'Processing from {start} to {stop}')
            chunk = repos_details[start:stop]
            for item in chunk:
                writer.put_item(Item=item)

In [37]:
list(range(0, 100, 50))

[0, 50]

In [38]:
%%time
load_repos(repos_details, ghrepos_table)

Processing from 0 to 50
Processing from 50 to 100
CPU times: user 58.4 ms, sys: 3.36 ms, total: 61.8 ms
Wall time: 3.08 s


In [39]:
# Read the table back to check what was loaded.
# NOTE(review): a single scan() call may return only the first page for larger
# tables — check the response for 'LastEvaluatedKey' before trusting the count.
rs = ghrepos_table.scan()

In [40]:
len(rs['Items'])

96

In [41]:
rs['Items'][0]

{'created_at': '2021-01-27T00:28:37Z',
 'owner': {'site_admin': False,
  'id': Decimal('57299679'),
  'login': 'blorin948',
  'type': 'User',
  'node_id': 'MDQ6VXNlcjU3Mjk5Njc5'},
 'full_name': 'blorin948/triche',
 'html_url': 'https://github.com/blorin948/triche',
 'description': None,
 'id': Decimal('333255974'),
 'fork': False,
 'name': 'triche',
 'node_id': 'MDEwOlJlcG9zaXRvcnkzMzMyNTU5NzQ='}

In [42]:
def delete_repos(repos_details, ghrepos_table, batch_size=50):
    """Delete the given items from the table through its batch writer.

    The items are walked in slices of ``batch_size`` and one progress line
    is printed per slice. Each item is deleted by a key built from its
    ``'id'`` value.

    Args:
        repos_details: Sequence of items, each providing an ``'id'`` entry.
        ghrepos_table: Table object exposing ``batch_writer()``.
        batch_size: Number of items per progress slice.

    Returns:
        None.

    Raises:
        KeyError: If an item has no ``'id'`` entry.
    """
    with ghrepos_table.batch_writer() as writer:
        for lower in range(0, len(repos_details), batch_size):
            upper = lower + batch_size
            print(f'Processing from {lower} to {upper}')
            for entry in repos_details[lower:upper]:
                writer.delete_item(Key={'id': entry['id']})

In [43]:
%%time
delete_repos(rs['Items'], ghrepos_table)

Processing from 0 to 50
Processing from 50 to 100
CPU times: user 22.5 ms, sys: 2.01 ms, total: 24.5 ms
Wall time: 1.07 s


In [44]:
ghrepos_table.scan()

{'Items': [],
 'Count': 0,
 'ScannedCount': 0,
 'ResponseMetadata': {'RequestId': '2DTSOOIG4MA04FBRKD735HQ5M7VV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Sun, 31 Jan 2021 23:27:33 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '39',
   'connection': 'keep-alive',
   'x-amzn-requestid': '2DTSOOIG4MA04FBRKD735HQ5M7VV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '3413411624'},
  'RetryAttempts': 0}}