In [1]:
import boto3
import pandas as pd
import io
import os
from concurrent import futures

### connect to s3

In [13]:
# Credentials are taken from the 'default' named profile.
session = boto3.Session(profile_name='default')

# Low-level S3 client pinned to ap-southeast-1; every later cell talks to S3 through it.
s3b = session.client(service_name='s3', region_name='ap-southeast-1')

### compute the size of each data folder and their combined size

In [8]:
# Bucket that every size measurement below is run against.
BUCKET_NAME = 'yara-sh-dads-scd-stage'

# Shared paginator for the list_objects_v2 operation; get_size() iterates its pages.
s3_paginator = s3b.get_paginator(operation_name='list_objects_v2')

def get_size(bucket_name, prefix, delimiter='/', start_after='', *, paginator=None):
    """Return the total size in bytes of every object listed under ``prefix``.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Key prefix to sum up. One leading ``delimiter`` is stripped
            before the prefix is sent to the listing call.
        delimiter: Separator used only for the string handling of ``prefix``
            in this function; it is not passed to the listing call.
        start_after: Value passed as ``StartAfter``. When it is empty and
            ``prefix`` ends with ``delimiter``, the prefix itself is used, so
            the listing starts after the key equal to the prefix.
        paginator: Optional object exposing ``paginate(**kwargs)`` that yields
            ``list_objects_v2``-style page dicts. Defaults to the module-level
            ``s3_paginator``.

    Returns:
        The sum of the ``Size`` field of every entry found in the pages'
        ``Contents`` lists; ``0`` when no page has any ``Contents``.
    """
    if paginator is None:
        # Looked up at call time so existing callers keep using the shared paginator.
        paginator = s3_paginator

    # Strip the whole delimiter rather than a single character, and skip the
    # delimiter handling entirely when it is empty ('' matches every string).
    if delimiter and prefix.startswith(delimiter):
        prefix = prefix[len(delimiter):]
    if delimiter and prefix.endswith(delimiter):
        start_after = start_after or prefix

    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after)

    # Pages without a 'Contents' entry contribute nothing to the total.
    return sum(
        content['Size']
        for page in pages
        for content in page.get('Contents', ())
    )

def _print_size(label, size_bytes):
    """Print ``size_bytes`` for ``label`` in bytes and divided by 1024**3.

    The second figure is labelled "GB" in the output but uses the binary
    divisor 1024**3.
    """
    print(f'size of {label}: {size_bytes} bytes, {size_bytes/(1024*1024*1024)} GB')


## size of clean sample folder
folder_size_CS = get_size(BUCKET_NAME, 'clean_sample/')
_print_size('clean sample folder', folder_size_CS)

## size of soil health cards folder
folder_size_SHC = get_size(BUCKET_NAME, 'soil_health_cards/')
_print_size('soil health cards folder', folder_size_SHC)

## size of state soil tests folder
folder_size_SST = get_size(BUCKET_NAME, 'state_soil_tests/')
_print_size('state soil tests folder', folder_size_SST)

## total bucket size
# This is the sum of the three prefixes measured above; objects stored under
# any other prefix are not counted.
bucket_size = sum((folder_size_CS, folder_size_SHC, folder_size_SST))
_print_size('bucket', bucket_size)

size of clean sample folder: 361949251 bytes, 0.33709150832146406 GB
size of soil health cards folder: 124998373077 bytes, 116.41380663681775 GB
size of state soil tests folder: 9200667480 bytes, 8.568789325654507 GB
size of bucket: 134560989808 bytes, 125.31968747079372 GB


### generate small sample (using Bihar)

In [9]:
# Source object for the small sample: the Bihar file in the state soil tests folder.
BUCKET_NAME = 'yara-sh-dads-scd-stage'
KEY = 'state_soil_tests/Bihar.csv'

# Fetch the object; its 'Body' entry is read in the next cell.
obj = s3b.get_object(Bucket=BUCKET_NAME, Key=KEY)

In [10]:
# Read the whole object body into memory, then parse it. The file is
# tab-separated despite its .csv extension.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests
# the file carries a saved index column -- confirm before relying on it.
csv_bytes = obj['Body'].read()
df = pd.read_csv(io.BytesIO(csv_bytes), sep='\t', encoding='utf8')
df

Unnamed: 0,state,state_id,district,district_id,block,block_id,village,village_id,sample_no,year,...,zn,zn_qual,fe,fe_qual,cu,cu_qual,mn,mn_qual,b,b_qual
0,Bihar,10,Araria,209,Araria,1116,Azmatpur,221846,BR221846/2016-17/52923857,2016-17,...,0.0,D,0.0,D,0.0,D,0.0,D,0.0,D
1,Bihar,10,Araria,209,Araria,1116,Azmatpur,221846,BR221846/2016-17/52923863,2016-17,...,0.0,D,0.0,D,0.0,D,0.0,D,0.0,D
2,Bihar,10,Araria,209,Araria,1116,Azmatpur,221846,BR221846/2016-17/52923880,2016-17,...,0.0,D,0.0,D,0.0,D,0.0,D,0.0,D
3,Bihar,10,Araria,209,Araria,1116,Azmatpur,221846,BR221846/2016-17/52923894,2016-17,...,0.0,D,0.0,D,0.0,D,0.0,D,0.0,D
4,Bihar,10,Araria,209,Araria,1116,Azmatpur,221846,BR221846/2016-17/52923912,2016-17,...,0.0,D,0.0,D,0.0,D,0.0,D,0.0,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216311,Bihar,10,Sitamarhi,206,Sursand,1068,Sursand Asli,219244,BR219244/2016-17/53834333,2016-17,...,0.0,D,0.0,D,0.0,D,0.0,D,0.0,D
216312,Bihar,10,Sitamarhi,206,Sursand,1068,Sursand Asli,219244,BR219244/2016-17/53834338,2016-17,...,0.0,D,0.0,D,0.0,D,0.0,D,0.0,D
216313,Bihar,10,Sitamarhi,206,Sursand,1068,Sursand Asli,219244,BR219244/2016-17/53834342,2016-17,...,0.0,D,0.0,D,0.0,D,0.0,D,0.0,D
216314,Bihar,10,Sitamarhi,206,Sursand,1068,Sursand Asli,219244,BR219244/2016-17/53834355,2016-17,...,0.0,D,0.0,D,0.0,D,0.0,D,0.0,D


In [11]:
# Keep only the first 100 rows and write them to a local tab-separated file,
# without the DataFrame index.
sample = df.head(n=100)
sample.to_csv('testsample_shc.csv', sep='\t', index=False)

### upload sample to s3

In [14]:
# Upload the local sample file into the state_soil_tests/ folder of the bucket.
file_name = 'testsample_shc.csv'
bucket_name = BUCKET_NAME
key = 'state_soil_tests/' + file_name
local_file = os.path.abspath(file_name)

try:
    s3b.upload_file(Filename=local_file, Bucket=bucket_name, Key=key)
except FileNotFoundError:
    # Only a missing local file is handled here; any other error propagates.
    print("The file was not found")
else:
    print("Upload Successful")

Upload Successful


### download sample from s3

In [24]:
# Download the sample object from the state_soil_tests/ folder into the
# current working directory, under the same file name.
file_name = 'testsample_shc.csv'
bucket_name = BUCKET_NAME
key = f'state_soil_tests/{file_name}'

try:
    s3b.download_file(bucket_name, key, file_name)
except s3b.exceptions.ClientError as e:
    # Report errors returned by the S3 service (e.g. missing key, access
    # denied). Anything else -- programming errors, local filesystem
    # problems -- is left to propagate instead of being silently printed.
    print(e)
else:
    print("Download Successful")

Download Successful
