<a href="https://colab.research.google.com/github/kartoch/colab-eda/blob/master/02%20-%20Load%20CSV%20and%20type%20data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Instructions

This is the second notebook of the series. It loads the zipped CSV produced by the first notebook, assigns a type to the data in each dataframe, and saves the result in CSV format.

*Note*: it would be better to save the data as Parquet or HDF, but the Pandas library does not support the nullable integer dtypes (`Int8`, `Int16`, `Int32`, etc.) when writing to these formats.

The code to handle the load and save from Google Cloud Storage (GCS) is included.

In [0]:
from google.oauth2 import service_account
from google.cloud.storage import client
import io
import pandas as pd
from io import BytesIO
import json
import os.path
import logging
from zipfile import ZipFile

# Constants

* `LOG_LEVEL` : log level used by the logging instance logger

In [0]:
# Verbosity of the notebook's logger, given as a standard logging level name;
# it is handed to logger.setLevel() in the logging set-up cell.
LOG_LEVEL = "DEBUG"

In [0]:
# getLogger() without a name returns the root logger, so the level set here
# applies to every logger that does not override it.
logger = logging.getLogger()
# Attach the default stream handler to the root logger; basicConfig() does
# nothing if the root logger already has handlers.
logging.basicConfig()
logger.setLevel(LOG_LEVEL)

# GCS configuration

* `SERVICE_ACCOUNT` : copy/paste your service account here
* `BUCKET_PERSONAL` : bucket where you can read/write to save/load files between notebooks



In [0]:
# Service-account key, parsed from the JSON text pasted between the triple quotes.
# The raw string (r"""...""") leaves backslash sequences in the pasted JSON
# untouched so that json.loads, not the Python parser, interprets them.
# NOTE(review): a real key pasted here is stored in the saved notebook; do not
# commit or share the notebook with real values filled in.
SERVICE_ACCOUNT = json.loads(r"""{
  "type": "service_account",
  "project_id": "...",
  "private_key_id": "...",
  "private_key": "...",
  "client_email": "...",
  "client_id": "...",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "..."
}""")

# Name of the GCS bucket used for both the download and the upload below
# (passed to client_gcs.bucket(...)).
BUCKET_PERSONAL = "eda-essence-student_firstname"

In [0]:
# Turn the parsed service-account key into OAuth2 credentials that request the
# cloud-platform scope.
credentials = service_account.Credentials.from_service_account_info(
    SERVICE_ACCOUNT, scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

# Storage client authenticated with those credentials and bound to the
# project named in the key.
client_gcs = client.Client(project=credentials.project_id, credentials=credentials)

# Init and functions for GCS

In [0]:
def download_file(local_filename, remote_filename, bucket):
    """Copy one object from a bucket to the local filesystem.

    Args:
        local_filename: Path on the local disk where the object is written.
        remote_filename: Name of the object inside the bucket.
        bucket: Bucket object exposing ``blob(name)``; the returned blob must
            provide ``download_to_filename(path)``.

    Returns:
        None.
    """
    bucket.blob(remote_filename).download_to_filename(local_filename)


In [0]:
# Fetch the zipped CSV from the personal bucket into /tmp.
download_file(
    local_filename="/tmp/test.csv.zip",
    remote_filename="test.csv.zip",
    bucket=client_gcs.bucket(BUCKET_PERSONAL),
)

In [0]:
# Load the downloaded archive, reading col1 and col2 as pandas' nullable
# 8-bit integer dtype ("Int8").
df = pd.read_csv(
    "/tmp/test.csv.zip",
    compression="zip",
    dtype={"col1": "Int8", "col2": "Int8"},
)

In [0]:
# Bare last expression: the notebook renders the loaded DataFrame as the cell output.
df

In [0]:
def save_file(local_filename, remote_filename, bucket):
    """Upload one local file to a bucket.

    Args:
        local_filename: Path of the file on the local disk to send.
        remote_filename: Name the object receives inside the bucket.
        bucket: Bucket object exposing ``blob(name)``; the returned blob must
            provide ``upload_from_filename(path)``.

    Returns:
        None.
    """
    bucket.blob(remote_filename).upload_from_filename(local_filename)

In [0]:
# Write the typed frame back out as a zipped CSV.
# index=False leaves out the DataFrame's row index: to_csv writes it by default
# as an extra unnamed first column, which would come back as a spurious data
# column the next time the file is loaded with read_csv.
df.to_csv("/tmp/test2.csv.zip", index=False, compression="zip")

In [0]:
# Push the re-saved archive from /tmp to the personal bucket.
save_file(
    local_filename="/tmp/test2.csv.zip",
    remote_filename="test2.csv.zip",
    bucket=client_gcs.bucket(BUCKET_PERSONAL),
)