Initial commit. Basic cli.

hmalphettes committed Nov 19, 2017
0 parents commit 74828730fa29119d69666bd9eccdc78b92353a25
Showing with 332 additions and 0 deletions.
  1. +34 −0 .README.md
  2. +6 −0 .gitignore
  3. +6 −0 .travis.yml
  4. +24 −0 LICENSE
  5. +5 −0 Makefile
  6. +35 −0 README.rst
  7. +3 −0 requirements-dev.txt
  8. +2 −0 requirements.txt
  9. +123 −0 s3_storage_analyser.py
  10. +23 −0 setup.py
  11. +71 −0 test_s3_storage_analyzer.py
@@ -0,0 +1,34 @@
Quick notes while I am ~~developing~~ learning Python
Project structure
=================
http://docs.python-guide.org/en/latest/writing/structure/
Testing:
========
https://github.com/spulec/moto
http://echorand.me/replacing-boto-s3-mocks-using-moto-in-python.html
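A minimal sketch of how moto intercepts boto3, assuming only that both libraries are installed (the test name and assertion are made up):

```python
import boto3
from moto import mock_s3

@mock_s3
def test_list_buckets_empty():
    # no real AWS account is touched: moto answers the API calls in-process
    client = boto3.client('s3', region_name='us-east-1')
    assert client.list_buckets()['Buckets'] == []
```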
Libraries
=========
Tabulate and align: https://pypi.python.org/pypi/tabulate/
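A quick sketch of the tabulate call used further down in this commit, with made-up data:

```python
import tabulate

rows = [{'Name': 'hm.samples', 'total_files': 4}]
# headers='keys' derives the column names from the dict keys
print(tabulate.tabulate(rows, headers='keys', tablefmt='plain'))
```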
Don't forget to add
==================
- A docker file
- A travis file for CI
- A public docker image in docker hub
- An AWS instance where all this is installed
More indexes than the one per bucket:
=====================================
- By storage type
- By storage type and by region
- By storage type and by bucket
A sketch of this kind of roll-up is below.
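A minimal sketch of such a roll-up over the per-bucket dicts, assuming each dict carries the `bucket_location`, `total_bytes` and `total_files` keys used elsewhere in this commit:

```python
from collections import defaultdict

def totals_by_region(buckets):
    """Aggregate the per-bucket stats into one entry per region."""
    totals = defaultdict(lambda: {'total_bytes': 0, 'total_files': 0})
    for bucket in buckets:
        region = totals[bucket['bucket_location']]
        region['total_bytes'] += bucket['total_bytes']
        region['total_files'] += bucket['total_files']
    return dict(totals)
```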
Async to process buckets in parallel
====================================
https://github.com/aio-libs/aiobotocore
Problem: asyncio support for AWS seems flaky and low-level, and the mock lib moto won't work with it.
This looks more mainstream:
https://www.ploggingdev.com/2017/01/multiprocessing-and-multithreading-in-python-3/
https://docs.python.org/3/library/multiprocessing.html
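A minimal sketch of that multiprocessing pattern; `fetch_stats` is a made-up stand-in for the real per-bucket traversal:

```python
import multiprocessing

def fetch_stats(bucket_name):
    # placeholder for the real S3 calls
    return {'Name': bucket_name, 'total_files': 0}

if __name__ == '__main__':
    # one worker per CPU; map blocks until every bucket is processed
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        print(pool.map(fetch_stats, ['bucket-a', 'bucket-b']))
```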
@@ -0,0 +1,6 @@
*.egg-info
__pycache__
.cache
.vscode
*.pyc
.env
@@ -0,0 +1,6 @@
language: python
python:
- "3.6"
install: "pip install -r requirements-dev.txt"
script: pytest -s
@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to [http://unlicense.org]
@@ -0,0 +1,5 @@
test:
	pytest -s

install:
	pip install -r requirements-dev.txt
@@ -0,0 +1,35 @@
S3 Storage Analyser - WIP
=========================
A command line tool to display the objects stored in your AWS S3 account.
Requirements
-------------
python-3.x
Development
-----------
For now, a development install only.

::

    git clone --depth 1 https://github.com/hmalphettes/s3_storage_analyser
    cd s3_storage_analyser
    pip install -r requirements.txt
Usage - Command Line
--------------------
::

    python s3_storage_analyser.py
    python3 s3_storage_analyser.py --unit TB --prefix s3://hm
    Name                  CreationDate               bucket_location    total_bytes  total_files  last_modified
    hm.many01             2017-11-18 08:13:58+00:00  ap-southeast-1           60000        10000  2017-11-18 08:37:59+00:00
    hm.many02             2017-11-18 08:14:14+00:00  ap-southeast-1           60000        10000  2017-11-18 08:50:51+00:00
    hm.many03             2017-11-18 08:14:25+00:00  ap-southeast-1          132006        22001  2017-11-18 09:30:26+00:00
    hm.samples            2017-11-16 08:13:39+00:00  ap-southeast-1         2259547            4  2017-11-16 08:47:39+00:00
    hm.samples.encrypted  2017-11-16 08:15:17+00:00  ap-southeast-1         3428897            1  2017-11-16 08:47:05+00:00
    hm.samples.eu-west1   2017-11-18 08:12:38+00:00  eu-west-1               108160            1  2017-11-18 08:13:32+00:00
    hm.samples.versioned  2017-11-16 08:16:19+00:00  ap-southeast-1               0            0  0001-01-01 00:00:00+00:00
License
-------
Public domain.
@@ -0,0 +1,3 @@
-r requirements.txt
moto>=1.1.24
pytest>=3.2.5
@@ -0,0 +1,2 @@
boto3>=1.4.7
tabulate>=0.8.1
@@ -0,0 +1,123 @@
"""
S3 Storage Analysis Tool
"""
import argparse
import re
import multiprocessing as multi
from operator import itemgetter
from datetime import datetime
import pytz
import boto3
import tabulate
def parse_args():
"""cli parser"""
parser = argparse.ArgumentParser(description='Analyse the S3 Buckets of an Amazon AWS account.')
    parser.add_argument('--unit',
                        choices=['B', 'KB', 'MB', 'GB', 'TB'],
                        help='file size unit B|KB|MB|GB|TB', default='MB')
    parser.add_argument('--prefix', help='Filter the keys by prefix')
    parser.add_argument('--pool-size', type=int, help='Number of parallel workers')
return parser.parse_args()
UNIT_DEFS = {'B': 1, 'KB': 1024, 'MB': 1024**2, 'GB': 1024**3, 'TB': 1024**4}
def convert_bytes(nbytes, unit='MB', append_unit=False):
"""Converts a number of bytes into a specific unit"""
# Credit: https://stackoverflow.com/a/39284216/1273401
formatted = ('%.2f' % (nbytes/UNIT_DEFS[unit])).rstrip('0').rstrip('.')
return f'{formatted}{unit}' if append_unit else formatted
def _get_s3_client():
"""Return the s3 connection."""
return boto3.client('s3')
def _list_buckets(prefix=None):
"""Return the list of buckets {'Name','CreationDate'} """
    client = _get_s3_client()
    # list_buckets takes no filtering argument; the prefix is applied client-side below
    resp = client.list_buckets()
    if 'Buckets' not in resp:
        return []
buckets = resp['Buckets']
if prefix is not None:
_m = re.match(r'^s3://([^\/]+).*$', prefix)
if _m is not None:
buckets = filter(lambda x: x['Name'].startswith(_m.group(1)), buckets)
else:
raise Exception(f'Invalid prefix "{prefix}"; expected "s3://bucket_name[/blah]"')
return sorted(buckets, key=itemgetter('Name'))
def fetch_bucket_info(bucket):
"""Fetches some extra info about the bucket {'Name':bucket_name}"""
name = bucket['Name']
bucket_location = _get_s3_client().get_bucket_location(Bucket=name)['LocationConstraint']
bucket.update({'bucket_location': bucket_location})
return bucket
def _analyse_bucket(bucket, prefix=None):
bucket = fetch_bucket_info(bucket)
stats = traverse_bucket(bucket['Name'], prefix=prefix)
bucket.update(stats)
return bucket
def _analyse_buckets(prefix=None, pool_size=None):
    """Traverse all the buckets in parallel and collect the info"""
    buckets = _list_buckets(prefix=prefix)
    # one worker per CPU unless --pool-size was given on the command line
    pool = multi.Pool(pool_size or multi.cpu_count())
    buckets = list(pool.map(_analyse_bucket, buckets))
    pool.close()
    return buckets
def traverse_bucket(bucket, prefix=None, max_keys=None):
"""Paginates through the objects in the bucket
keep track of the number of files
sum the size of each file"""
total_bytes = 0
total_files = 0
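    # timezone-aware minimum as the sentinel: any real LastModified compares greater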
last_modified = pytz.utc.localize(datetime.min)
kwargs = {'Bucket': bucket}
if prefix is not None:
kwargs['Prefix'] = prefix
if max_keys is not None:
kwargs['MaxKeys'] = max_keys
for obj in _list_objects(**kwargs):
total_bytes += obj['Size']
total_files += 1
if obj['LastModified'] > last_modified:
last_modified = obj['LastModified']
return {'total_bytes': total_bytes, 'total_files': total_files, 'last_modified': last_modified}
def _list_objects(**kwargs):
"""Generator to iterate the objects found in a bucket.
yield one object at a time
bucket, prefix=None, max_keys=1000, Marker=None"""
    objects = _get_s3_client().list_objects_v2(**kwargs)
    if 'Contents' not in objects:
        return
contents = objects['Contents']
for content in contents:
yield content
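    # list_objects_v2 returns at most MaxKeys entries (1000 by default) per call;
    # when the listing is truncated, recurse with the continuation token,
    # falling back to StartAfter when no token is returned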
if objects['IsTruncated'] is True:
if 'ContinuationToken' in objects:
kwargs['ContinuationToken'] = objects['NextContinuationToken']
else:
kwargs['StartAfter'] = contents[-1]['Key']
for i in _list_objects(**kwargs):
yield i
def _format_buckets(buckets, args):
    """Convert the total_bytes of each bucket into the unit requested on the CLI"""
    return [dict(b, total_bytes=convert_bytes(b['total_bytes'], args.unit, True))
            for b in buckets]
def main():
    """CLI entry point"""
    args = parse_args()
    buckets = _analyse_buckets(prefix=args.prefix, pool_size=args.pool_size)
    print(tabulate.tabulate(_format_buckets(buckets, args), headers='keys', tablefmt='plain'))
if __name__ == "__main__":
main()
@@ -0,0 +1,23 @@
#!/usr/bin/env python
"""
===============
s3_storage_analyser setup
===============
"""
from setuptools import setup

with open("README.rst", "rb") as f:
    LONG_DESCRIPTION = f.read().decode("utf-8")

setup(
    name="s3_storage_analyser",
    # a single top-level module, not a package directory
    py_modules=["s3_storage_analyser"],
    entry_points={
        "console_scripts": ['s3_storage_analyser = s3_storage_analyser:main']
    },
    version="0.1",
    description="S3 Storage Analyser",
    long_description=LONG_DESCRIPTION,
    author="Hugues MALPHETTES",
    author_email="hmalphettes@gmail.com",
    url="https://github.com/hmalphettes/s3_storage_analyser",
)
@@ -0,0 +1,71 @@
"""
Test indeed
"""
from pprint import pprint
from s3_storage_analyser import _get_s3_client, _list_buckets
from s3_storage_analyser import convert_bytes, traverse_bucket, fetch_bucket_info
from moto import mock_s3
def test_convert_bytes():
"""Test convert bytes to a unit"""
assert convert_bytes(1048576, 'MB', True) == '1MB'
assert convert_bytes(1048576, 'KB', True) == '1024KB'
assert convert_bytes(1073741824, 'GB') == '1'
def _setup_s3():
client = _get_s3_client()
client.create_bucket(Bucket='hm.samples')
for i in range(0, 3):
client.put_object(Bucket='hm.samples', Body=b'abcdef', Key=f'{i}.txt')
@mock_s3
def test_traverse_bucket():
"""Traverse bucket. single internal call"""
_setup_s3()
bucket_descr = traverse_bucket('hm.samples')
assert bucket_descr['total_files'] == 3
assert bucket_descr['total_bytes'] == 18
@mock_s3
def test_traverse_bucket_2():
"""Traverse bucket. multiple s3 calls as there are more resources than the max_keys"""
_setup_s3()
bucket_descr = traverse_bucket('hm.samples', max_keys=2)
assert bucket_descr['total_files'] == 3
assert bucket_descr['total_bytes'] == 18
@mock_s3
def test_bucket_xinfo():
"""Test loading the extra info of a bucket"""
client = _get_s3_client()
client.create_bucket(Bucket='hm.samples.encrypted')
client.put_bucket_encryption(
Bucket='hm.samples.encrypted',
ServerSideEncryptionConfiguration={
'Rules': [
{
'ApplyServerSideEncryptionByDefault': {
'SSEAlgorithm': 'AES256',
'KMSMasterKeyID': 'foo'
}
},
]
}
)
bucket_info = fetch_bucket_info({'Name':'hm.samples.encrypted'})
assert bucket_info['bucket_location'] == 'us-east-1'
@mock_s3
def test_buckets_filter():
"""Test listing the buckets"""
client = _get_s3_client()
    # map(lambda n: client.create_bucket(Bucket=n), ['aa', 'a', 'b']) does not work
    # because map() is lazy in python 3: the calls only run once the iterator is consumed
for name in ['c', 'a', 'aa']:
client.create_bucket(Bucket=name)
bucket_list = _list_buckets()
assert len(bucket_list) == 3
assert bucket_list[0]['Name'] == 'a'
bucket_list = _list_buckets(prefix='s3://a')
assert len(bucket_list) == 2
