<a href="https://colab.research.google.com/github/kartoch/colab-eda/blob/master/98%20-%20Loading%20and%20Saving%20from%20GCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.oauth2 import service_account
from google.cloud.storage import client
import io
import pandas as pd
from io import BytesIO
import json
import os.path
import logging
from zipfile import ZipFile

In [0]:
START_YEAR = 2007
END_YEAR = 2019
CACHE_DIRECTORY = "/tmp/"
LOG_LEVEL = "DEBUG"

In [0]:
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(LOG_LEVEL)

In [0]:
SERVICE_ACCOUNT = json.loads(r"""{
  "type": "service_account",
  "project_id": "...",
  "private_key_id": "...",
  "private_key": "...",
  "client_email": "...",
  "client_id": "...",
  "auth_uri": "...",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "...",
}""")
BUCKET_DATASETS = "[TO BE FILLED]"
BUCKET_PERSONAL = "[TO BE FILLED]"

In [0]:
credentials = service_account.Credentials.from_service_account_info(
    SERVICE_ACCOUNT,
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

client_gcs = client.Client(
    credentials=credentials,
    project=credentials.project_id,
)

In [0]:
def save_file(local_filename, remote_filename, bucket):
    blob = bucket.blob(remote_filename)
    blob.upload_from_filename(local_filename)

In [0]:
def download_file(local_filename, remote_filename, bucket):
    if os.path.isfile(local_filename):
      logger.info("Already donwloaded: %s", local_filename)
    blob = bucket.blob(remote_filename)
    blob.download_to_filename(local_filename)

In [0]:
def download_datasets(blob_pathname, bucket):
    blob = bucket.blob(blob_pathname)
    return io.BytesIO(blob.download_as_string())

def generator_zip_file(client):
    bucket = client_gcs.bucket(BUCKET_DATASETS)
    for year in range(START_YEAR,END_YEAR+1):
        blob_pathname = "PrixCarburants_annuel_" + str(year) + ".zip"
        local_filename = CACHE_DIRECTORY + blob_pathname
        download_file(local_filename,blob_pathname,bucket)
        zip_ref = ZipFile(local_filename)
        [xml_filename] = zip_ref.namelist()
        yield (zip_ref.open(xml_filename),year)
        zip_ref.close()

In [15]:
for f,year in generator_zip_file(client):
    print(f,year)

INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2007.zip
DEBUG:urllib3.util.retry:Converted retries value: 3 -> Retry(total=3, connect=None, read=None, redirect=None, status=None)
DEBUG:google.auth.transport.requests:Making request: POST https://oauth2.googleapis.com/token
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): oauth2.googleapis.com:443
DEBUG:urllib3.connectionpool:https://oauth2.googleapis.com:443 "POST /token HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.googleapis.com:443
DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2007.zip?alt=media HTTP/1.1" 200 9348185
INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2008.zip


<zipfile.ZipExtFile name='PrixCarburants_annuel_2007.xml' mode='r' compress_type=deflate> 2007


DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2008.zip?alt=media HTTP/1.1" 200 15525824
INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2009.zip
DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2009.zip?alt=media HTTP/1.1" 200 14962469


<zipfile.ZipExtFile name='PrixCarburants_annuel_2008.xml' mode='r' compress_type=deflate> 2008


INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2010.zip
DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2010.zip?alt=media HTTP/1.1" 200 15388391


<zipfile.ZipExtFile name='PrixCarburants_annuel_2009.xml' mode='r' compress_type=deflate> 2009


INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2011.zip
DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2011.zip?alt=media HTTP/1.1" 200 17050489


<zipfile.ZipExtFile name='PrixCarburants_annuel_2010.xml' mode='r' compress_type=deflate> 2010


INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2012.zip
DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2012.zip?alt=media HTTP/1.1" 200 20668580


<zipfile.ZipExtFile name='PrixCarburants_annuel_2011.xml' mode='r' compress_type=deflate> 2011


INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2013.zip


<zipfile.ZipExtFile name='PrixCarburants_annuel_2012.xml' mode='r' compress_type=deflate> 2012


DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2013.zip?alt=media HTTP/1.1" 200 21811631
INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2014.zip


<zipfile.ZipExtFile name='PrixCarburants_annuel_2013.xml' mode='r' compress_type=deflate> 2013


DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2014.zip?alt=media HTTP/1.1" 200 16034299
INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2015.zip
DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2015.zip?alt=media HTTP/1.1" 200 21240234


<zipfile.ZipExtFile name='PrixCarburants_annuel_2014.xml' mode='r' compress_type=deflate> 2014


INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2016.zip
DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2016.zip?alt=media HTTP/1.1" 200 24267384


<zipfile.ZipExtFile name='PrixCarburants_annuel_2015.xml' mode='r' compress_type=deflate> 2015


INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2017.zip
DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2017.zip?alt=media HTTP/1.1" 200 20902664


<zipfile.ZipExtFile name='PrixCarburants_annuel_2016.xml' mode='r' compress_type=deflate> 2016


INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2018.zip


<zipfile.ZipExtFile name='PrixCarburants_annuel_2017.xml' mode='r' compress_type=deflate> 2017


DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2018.zip?alt=media HTTP/1.1" 200 23033869
INFO:root:Already donwloaded: /tmp/PrixCarburants_annuel_2019.zip
DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/essence-dataset-eda/o/PrixCarburants_annuel_2019.zip?alt=media HTTP/1.1" 200 24511070


<zipfile.ZipExtFile name='PrixCarburants_annuel_2018.xml' mode='r' compress_type=deflate> 2018
<zipfile.ZipExtFile name='PrixCarburants_annuel_2019.xml' mode='r' compress_type=deflate> 2019


In [0]:
df_test = pd.DataFrame(
    {"col1": [1,2,3],
     "col2": [4,5,6]}
).to_csv(path_or_buf="/tmp/test.csv")

In [17]:
save_file("/tmp/test.csv","test.csv", client_gcs.bucket(BUCKET_PERSONAL))

DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "POST /upload/storage/v1/b/eda-essence-vincent/o?uploadType=multipart HTTP/1.1" 200 743


In [18]:
download_file("/tmp/test2.csv","test.csv", client_gcs.bucket(BUCKET_PERSONAL))

DEBUG:urllib3.connectionpool:https://www.googleapis.com:443 "GET /download/storage/v1/b/eda-essence-vincent/o/test.csv?alt=media HTTP/1.1" 200 29


In [0]:
import filecmp
assert filecmp.cmp('/tmp/test.csv', '/tmp/test2.csv')