<a href="https://colab.research.google.com/github/kartoch/colab-eda/blob/master/50%20-%20Annexe%20-%20Loading%20and%20Saving%20from%20GCP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.oauth2 import service_account
from google.cloud.storage import client
import io
import pandas as pd
from io import BytesIO
import json
import os.path
import logging
from zipfile import ZipFile

In [0]:
# Level name handed to the root logger's setLevel().
LOG_LEVEL = "DEBUG"

# Local directory in which the downloaded yearly archives are stored.
CACHE_DIRECTORY = "/tmp/"

# First and last year of the yearly archives to process; both bounds are
# inclusive (the archive loop iterates up to END_YEAR + 1).
START_YEAR, END_YEAR = 2007, 2019

In [0]:
# Use the root logger for the whole notebook. basicConfig() installs a default
# handler on it when none is present yet; the verbosity is then set explicitly
# from LOG_LEVEL.
logger = logging.getLogger()
logging.basicConfig()
logger.setLevel(LOG_LEVEL)

In [0]:
# Service-account key, pasted as a JSON document and parsed into a dict.
# A raw string is used so that backslash escapes inside the JSON text are
# decoded by json.loads() rather than by the Python parser.
# The document must be strict JSON: in particular there must be NO trailing
# comma after the last member, otherwise json.loads() raises JSONDecodeError.
# NOTE(security): once the placeholders are filled in, this cell contains a
# private key -- do not commit or share the notebook in that state.
SERVICE_ACCOUNT = json.loads(r"""{
  "type": "service_account",
  "project_id": "...",
  "private_key_id": "...",
  "private_key": "...",
  "client_email": "...",
  "client_id": "...",
  "auth_uri": "...",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "..."
}""")
# Bucket holding the yearly "PrixCarburants_annuel_<year>.zip" archives.
BUCKET_DATASETS = "[TO BE FILLED]"
# Bucket used for the upload/download round-trip of test.csv.
BUCKET_PERSONAL = "[TO BE FILLED]"

In [0]:
# Turn the parsed service-account info into OAuth2 credentials restricted to
# the cloud-platform scope.
credentials = service_account.Credentials.from_service_account_info(
    SERVICE_ACCOUNT, scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

# Storage client bound to the project id carried by the credentials.
client_gcs = client.Client(project=credentials.project_id, credentials=credentials)

In [0]:
def save_file(local_filename, remote_filename, bucket):
    """Upload a local file to ``bucket`` under the name ``remote_filename``.

    Args:
        local_filename: Path of the file to upload.
        remote_filename: Object name to create inside the bucket.
        bucket: Object exposing ``blob(name)``; the blob it returns must
            provide ``upload_from_filename(path)``.
    """
    bucket.blob(remote_filename).upload_from_filename(local_filename)

In [0]:
def download_file(local_filename, remote_filename, bucket):
    """Download ``remote_filename`` from ``bucket`` unless it is already on disk.

    If ``local_filename`` already exists as a regular file, the download is
    skipped and the existing file is reused as-is (its content is not
    checked against the remote object).

    Args:
        local_filename: Destination path on the local filesystem.
        remote_filename: Object name inside the bucket.
        bucket: Object exposing ``blob(name)``; the blob it returns must
            provide ``download_to_filename(path)``.
    """
    if os.path.isfile(local_filename):
        logger.info("Already donwloaded: %s", local_filename)
        # Stop here: without this return the cache check had no effect and
        # the file was fetched again on every call.
        return
    blob = bucket.blob(remote_filename)
    blob.download_to_filename(local_filename)

In [0]:
def download_datasets(blob_pathname, bucket):
    """Fetch an object from ``bucket`` and return its content as a byte stream.

    Args:
        blob_pathname: Object name inside the bucket.
        bucket: Object exposing ``blob(name)``; the blob it returns must
            provide ``download_as_string()``.

    Returns:
        io.BytesIO: In-memory stream wrapping the downloaded payload.
    """
    payload = bucket.blob(blob_pathname).download_as_string()
    return io.BytesIO(payload)

def generator_zip_file(client):
    """Yield ``(xml_stream, year)`` for each yearly archive.

    For every year from START_YEAR to END_YEAR (inclusive), the archive
    ``PrixCarburants_annuel_<year>.zip`` is fetched from BUCKET_DATASETS into
    CACHE_DIRECTORY via ``download_file``, opened, and its single member is
    handed to the caller as an open binary stream.

    Args:
        client: Currently unused -- the module-level ``client_gcs`` is always
            used to reach the bucket. The parameter is kept so that existing
            callers keep working.

    Yields:
        tuple: ``(stream, year)`` where ``stream`` is the object returned by
        ``ZipFile.open`` for the archive's only member and ``year`` is an
        int. The stream is not closed here; the consumer should close it.

    Raises:
        ValueError: If an archive does not contain exactly one member.
    """
    bucket = client_gcs.bucket(BUCKET_DATASETS)
    for year in range(START_YEAR, END_YEAR + 1):
        blob_pathname = "PrixCarburants_annuel_{}.zip".format(year)
        # os.path.join works whether or not CACHE_DIRECTORY ends with a slash.
        local_filename = os.path.join(CACHE_DIRECTORY, blob_pathname)
        download_file(local_filename, blob_pathname, bucket)
        # The context manager guarantees the archive is closed even when the
        # consumer stops iterating early or an error is raised; a close()
        # placed after the yield would be skipped in those cases.
        with ZipFile(local_filename) as zip_ref:
            # Exactly one member is expected; unpacking fails loudly otherwise.
            [xml_filename] = zip_ref.namelist()
            yield (zip_ref.open(xml_filename), year)

In [0]:
# Smoke test: walk through every yearly archive and show the stream/year pair.
# Pass the storage client instance (client_gcs); the bare name `client` is the
# imported google.cloud.storage.client module, not a client object.
for f, year in generator_zip_file(client_gcs):
    # Close each member stream once it has been displayed so that file
    # handles do not pile up across iterations.
    with f:
        print(f, year)

In [0]:
# Small fixture frame used for the upload/download round-trip.
# Keep the DataFrame itself in df_test: to_csv() returns None when it is given
# a path, so chaining it onto the constructor would bind df_test to None.
df_test = pd.DataFrame(
    {"col1": [1, 2, 3],
     "col2": [4, 5, 6]}
)
df_test.to_csv(path_or_buf="/tmp/test.csv")

In [0]:
# Upload /tmp/test.csv to the BUCKET_PERSONAL bucket as object "test.csv".
save_file("/tmp/test.csv","test.csv", client_gcs.bucket(BUCKET_PERSONAL))

In [0]:
# Fetch object "test.csv" from the BUCKET_PERSONAL bucket into /tmp/test2.csv.
download_file("/tmp/test2.csv","test.csv", client_gcs.bucket(BUCKET_PERSONAL))

In [0]:
import filecmp

# Round-trip check: the downloaded copy must match the file that was uploaded.
# shallow=False forces a comparison of the file contents; the default shallow
# mode accepts two files as equal as soon as their os.stat() signatures match.
assert filecmp.cmp('/tmp/test.csv', '/tmp/test2.csv', shallow=False), \
    "downloaded copy differs from the uploaded file"