# Google Cloud Storage

Using INPUT:

 - Recursive?
 - Run checks?
 - Download incomplete?

> gcs_source
 - List of Files (can be a list with one blob)
    .Do these files exist (with a specific extension)?
      .Check that all file paths are well formed (start with gs:// and end with .parquet)
         .Exception: Malformed filepath
      .Run .exists to guarantee they exist
         .Exception: Path does not exist
 - List of Folders (Paths)
    .Path exists?
    .List all files from path
      .Are there files with the specified extension?
 - Union[path + folder]


> gcs_destination
 - Path to an empty folder in GCS

> local_destination
 - Path to local folder to write output files generated by NVTabular


### What I need to check:
Local path: exists and has .parquet files
GCS: 

gs://workshop-datasets/criteo-parque/

https://stackoverflow.com/questions/9532499/check-whether-a-path-is-valid-in-python-without-creating-a-file-at-the-paths-ta
https://engdoc.corp.google.com/eng/doc/devguide/py/totw/035.md?cl=head

## Exploring google.cloud.storage API

In [1]:
from google.cloud import storage
# Create a Cloud Storage client; no explicit project or credentials are passed.
client = storage.Client()

In [22]:
# Case 1: Single file
file_format = 'parquet'
gcs_source = 'gs://renatoleite-criteo-partial/data/day_0.parquet'

# Substring test: true whenever 'parquet' appears anywhere in the URI,
# not only as the file suffix.
if isinstance(gcs_source, str) and file_format in gcs_source:
    # Splitting on '/' gives ['gs:', '', <bucket>, ...], so index 2 is the bucket name.
    bucket = client.get_bucket(gcs_source.split(sep='/')[2])
    # NOTE(review): unfinished -- the assignment below has no right-hand side,
    # so this cell does not parse as written.
    object_name = 
    
# NOTE(review): called with no arguments here; a later cell passes (name, bucket).
storage.Blob()

google.cloud.storage.bucket.Bucket

In [24]:
gcs_source = 'gs://renatoleite-criteo-partial/data/day_0.parquet'
# Split on '/' to see which index holds the bucket and which hold the object path.
gcs_source.split(sep='/')

['gs:', '', 'renatoleite-criteo-partial', 'data', 'day_0.parquet']

In [7]:
client = storage.Client()

bucket = client.get_bucket('renatoleite-criteo-partial')

# NOTE(review): delimiter is set to the same string as the prefix ('data')
# rather than '/'; confirm this is intended.
blobs = client.list_blobs(bucket, prefix='data', delimiter='data')
# Collect the listing into a list so it can be indexed and iterated below.
l_blobs = list(blobs)
l_blobs[0]

# Print the names of the listed blobs that contain '.parquet' (substring match).
print([blob.name for blob in l_blobs if '.parquet' in blob.name])

['data/day_0.parquet', 'data/day_1.parquet', 'data/day_2.parquet', 'data/day_3.parquet', 'data/training/day_3_1.parquet']


In [43]:
# Rebind `bucket` to the 'rl-gcloud-alpha' bucket.
bucket = client.get_bucket('rl-gcloud-alpha')

In [46]:
# Check whether the object 'sub1/renato.txt' exists in `bucket`.
storage.Blob('sub1/renato.txt', bucket).exists()

True

## Exploring the gcloud alpha storage command

In [4]:
# I can build the command as
# {path}*.{extension} for a single level
# {path}**.{extension} when recursive

# What if I pass a list of paths or a list of files?
# How do I pass that to the gcloud command? Will I need a loop? Create threads?
# How will I handle errors?

# cat filelist | gsutil -m cp -I ./download_dir

In [5]:
import subprocess
from typing import Union
from typing import List, Dict

In [3]:
def _compose_gcloud_download_cmd(gcs_paths: Union[str, List[str]],
                                 local_destination: str,
                                 extension: str = 'parquet',
                                 recursive: bool = False) -> List[str]:
    """Build the ``gcloud alpha storage cp`` argv that downloads files by extension.

    Each source path is turned into a pattern that selects files ending in
    ``.{extension}``:

    * a path that already ends in ``.{extension}`` is used unchanged;
    * a path ending in ``/*`` or ``/**`` just gets ``.{extension}`` appended;
    * any other path is treated as a folder and gets ``*.{extension}``
      (or ``**.{extension}`` when ``recursive`` is true) appended, with a
      ``/`` separator inserted if the path does not already end in one.

    Args:
        gcs_paths: Source paths. A single string is accepted and treated as a
            one-element list (iterating a bare string would otherwise produce
            one bogus pattern per character).
        local_destination: Destination passed as the last ``cp`` argument.
        extension: File extension to select. A leading dot is tolerated, so
            ``'parquet'`` and ``'.parquet'`` are equivalent.
        recursive: Use the ``**`` wildcard instead of ``*`` for folder paths.

    Returns:
        The command as an argument list; nothing is executed here.
    """
    if isinstance(gcs_paths, str):
        gcs_paths = [gcs_paths]

    # Normalise '.parquet' -> 'parquet' so patterns never end in '..parquet'.
    suffix = f".{extension.lstrip('.')}"
    wildcard = '**' if recursive else '*'

    sources = []
    for path in gcs_paths:
        if path.endswith(suffix):
            # Already an explicit file (or an explicit pattern) for this extension.
            sources.append(path)
        elif path.endswith(('/*', '/**')):
            # Caller supplied the wildcard; only the extension is missing.
            sources.append(f'{path}{suffix}')
        elif path.endswith('/'):
            sources.append(f'{path}{wildcard}{suffix}')
        else:
            sources.append(f'{path}/{wildcard}{suffix}')

    return ['gcloud', 'alpha', 'storage', 'cp', *sources, local_destination]

In [13]:
# One folder-style path (trailing '/') and one single-file path, selecting
# 'txt' files with recursive=True.
paths = ['gs://rl-gcloud-alpha/sub1/', 'gs://rl-gcloud-alpha/test.txt']
d_cmd = _compose_gcloud_download_cmd(paths, '/home/renatoleite/output', 'txt', True)

In [20]:
def _compose_gcloud_upload_cmd(local_path: str, 
                               gcs_destination: str) -> List[str]:
    """Build the ``gcloud alpha storage cp -r`` argv for an upload.

    Args:
        local_path: Source path, passed through unchanged.
        gcs_destination: Destination path, passed through unchanged.

    Returns:
        The command as an argument list; nothing is executed here.
    """
    copy_prefix = ['gcloud', 'alpha', 'storage', 'cp', '-r']
    return copy_prefix + [local_path, gcs_destination]

In [31]:
# Build (but do not run) the upload command for the local output folder.
up = _compose_gcloud_upload_cmd('/home/renatoleite/output/', 'gs://rl-gcloud-alpha/')

In [11]:
def _execute_gcloud_cmd(gcloud_cmd: List) -> Dict[str,str]:
    """Run a command and report its exit status and captured output.

    Args:
        gcloud_cmd: The command as an argument list, handed to
            ``subprocess.run`` as-is.

    Returns:
        A dict with keys ``'returncode'``, ``'stdout'`` and ``'stderr'``.
        stdout/stderr are captured as text; ``returncode`` is the process's
        integer exit status (so not every value is a ``str``, despite the
        annotation). ``check`` is not set, so a non-zero exit status is
        reported in the dict rather than raised.
    """
    completed = subprocess.run(gcloud_cmd, capture_output=True, text=True)
    reported_fields = ('returncode', 'stdout', 'stderr')
    return {field: getattr(completed, field) for field in reported_fields}

In [32]:
# Run the upload command; in the recorded result stdout is empty and the
# per-file "Copying ..." progress lines arrive on stderr.
_execute_gcloud_cmd(up)

{'returncode': 0,
 'stdout': '',
 'stderr': 'Copying file:///home/renatoleite/output/s2/b.txt to gs://rl-gcloud-alpha/output/s2/b.txt\nCopying file:///home/renatoleite/output/s3/c.txt to gs://rl-gcloud-alpha/output/s3/c.txt\n  \nCopying file:///home/renatoleite/output/s1/a.txt to gs://rl-gcloud-alpha/output/s1/a.txt\nCopying file:///home/renatoleite/output/opa.txt to gs://rl-gcloud-alpha/output/opa.txt\n.\n\nAverage throughput: 1.5kiB/s\n'}

# PathInterface and Implementation

In [1]:
from src.path import Path

In [2]:
a = Path()

In [5]:
a.path

10

In [33]:
from .path import Path

class GenericPath:
    """Base class for Path definition.

    Stores the path and extension it is given plus an initially empty
    ``path_metadata`` dict. Every check below is a placeholder that raises
    ``NotImplementedError``.
    """

    def __init__(self, path, extension):
        self.path, self.extension = path, extension
        self.path_metadata = {}

    def path_check(self):
        """Placeholder for path validation; always raises here."""
        raise NotImplementedError("Check if it is a valid path")

    def _path_exists(self):
        """Placeholder for the existence check; always raises here."""
        raise NotImplementedError("Check if path exists")

    def _is_file_directory(self):
        """Placeholder for the file-vs-directory check; always raises here."""
        raise NotImplementedError("Is directory or file")

    def _count_files(self):
        """Placeholder for counting files by extension; always raises here."""
        raise NotImplementedError(
            "Count number of files with extension in directory"
        )

In [4]:
from dataclasses import dataclass

@dataclass()
class Path:
    """Dataclass holding a single path string."""

    # The path as given by the caller; no validation happens here.
    path_name: str

In [5]:
Path('opa')

Path(path_name='opa')

In [6]:
from dataclasses import dataclass

@dataclass()
class PathMetadata:
    """Dataclass describing a path: directory flag, file count and protocol."""

    # Annotated as a flag for "this path is a directory".
    is_directory: bool
    # Annotated as a file count.
    num_files: int
    # Protocol name, e.g. 'gs'.
    protocol: str

In [10]:
# The values do not match PathMetadata's annotations: num_files is a bool
# and is_directory is an int.
a = {'num_files':False,'is_directory':10, 'protocol': 'gs'}

In [12]:
# Unpack the dict as keyword arguments. Dataclasses do not validate annotated
# types at runtime, so the mismatched values are accepted.
b = PathMetadata(**a)

In [21]:
# Sample inputs for the fsspec experiments below.
a = 'gs://rl-gcloud-alpha/test.txt'  # single GCS object
b = 'gs://rl-gcloud-alpha/'  # bucket root
c = 'gs://workshop-datasets/criteo-parque/*'  # wildcard under a folder
d = 'gs://workshop-datasets/criteo-parque/'  # folder-style path
e = 'gs://oloko/test/'  # presumably a non-existent bucket -- confirm
f = '/home/renatoleite/data'  # local filesystem path

In [22]:
# Check if it is folder
import fsspec
from fsspec.core import get_fs_token_paths

import gcsfs

In [25]:
# Keep only the filesystem object; the other two returned values are discarded.
fs, _, _ = get_fs_token_paths(b)

In [26]:
# Fetch metadata for one object; the recorded result includes 'size' and 'type': 'file'.
fs.info(a)

{'kind': 'storage#object',
 'id': 'rl-gcloud-alpha/test.txt/1631475631454574',
 'selfLink': 'https://www.googleapis.com/storage/v1/b/rl-gcloud-alpha/o/test.txt',
 'mediaLink': 'https://storage.googleapis.com/download/storage/v1/b/rl-gcloud-alpha/o/test.txt?generation=1631475631454574&alt=media',
 'name': 'rl-gcloud-alpha/test.txt',
 'bucket': 'rl-gcloud-alpha',
 'generation': '1631475631454574',
 'metageneration': '1',
 'contentType': 'text/plain',
 'storageClass': 'STANDARD',
 'size': 2,
 'md5Hash': 'ouY+4BQBquyni+Aj37uMWQ==',
 'crc32c': 'j/AQhw==',
 'etag': 'CO7y46uY+vICEAE=',
 'timeCreated': '2021-09-12T19:40:31.455Z',
 'updated': '2021-09-12T19:40:31.455Z',
 'timeStorageClassUpdated': '2021-09-12T19:40:31.455Z',
 'type': 'file'}

In [None]:
# List the entries under the folder-style path `d` (no output was recorded for this cell).
fs.ls(d)

In [27]:
# Recursively expand the bucket root; the recorded result mixes directory
# entries and file paths.
fs.expand_path(b, recursive=True)

['rl-gcloud-alpha/',
 'rl-gcloud-alpha/output',
 'rl-gcloud-alpha/output/opa.txt',
 'rl-gcloud-alpha/output/s1',
 'rl-gcloud-alpha/output/s1/a.txt',
 'rl-gcloud-alpha/output/s2',
 'rl-gcloud-alpha/output/s2/b.txt',
 'rl-gcloud-alpha/output/s3',
 'rl-gcloud-alpha/output/s3/c.txt',
 'rl-gcloud-alpha/sub1',
 'rl-gcloud-alpha/sub1/',
 'rl-gcloud-alpha/sub1/renato.txt',
 'rl-gcloud-alpha/test.txt']

In [30]:
# Glob for '.txt' paths at any depth under `b`; the recorded result contains
# only file paths, no directory entries.
fs.glob(f'{b}**.txt')

['rl-gcloud-alpha/output/opa.txt',
 'rl-gcloud-alpha/output/s1/a.txt',
 'rl-gcloud-alpha/output/s2/b.txt',
 'rl-gcloud-alpha/output/s3/c.txt',
 'rl-gcloud-alpha/sub1/renato.txt',
 'rl-gcloud-alpha/test.txt']

In [106]:
# Directory check on `f`, the local path sample.
fs.isdir(f)

True

In [107]:
fs.protocol

'file'

In [91]:
# Membership test on the filesystem's protocol attribute.
# NOTE(review): if protocol is a plain str this is a substring match -- confirm its type.
'gs' in fs_spec.protocol

True

In [93]:
'gs' in 'gs'

True

In [69]:
#def _path_exists(self):

# Build an fsspec file handle for `d`; it is only assigned here, never read.
of = fsspec.open(d)

In [1]:
# Needs `import fsspec` to have run first; the recorded NameError presumably
# comes from running this cell in a fresh kernel before the import cell.
fs_spec = fsspec.filesystem('gs')

NameError: name 'fsspec' is not defined

In [None]:
# NOTE(review): called without a path argument; no output was recorded for this cell.
fs_spec.exists()

In [114]:
# Open every path matched by the wildcard `c`; the next cell reports 27 entries.
test = fsspec.open_files(c)

In [115]:
len(test)

27

In [None]:
test.