In [None]:
# Download the NLTK "stopwords" corpus so it is available locally.
import nltk
nltk.download('stopwords')

In [None]:
# Install a particular version of `google-cloud-storage` because (oddly enough)
# the  version on Colab and GCP is old. A dependency error below is okay.
!pip install -q google-cloud-storage==1.43.0

In [None]:
# Authenticate the current Colab user; needed (when not already authenticated)
# before Google Cloud Storage can be accessed.
from google.colab import auth
auth.authenticate_user()

In [None]:
from google.cloud import storage
import csv
import gzip
import io


In [None]:
# Name of the Google Cloud Storage bucket used throughout this notebook.
bucket_name = "bucket_ir_100"
# Object path (inside the bucket) of the gzip-compressed CSV part file to load.
file_name = "pr/part-00000-bba051bd-4ed5-42d2-ac51-b81e7da0af95-c000.csv.gz"

In [None]:
def check_bucket_existence(bucket_name):
    """Return True if the bucket ``bucket_name`` can be found, else False.

    Args:
        bucket_name: Name of the Google Cloud Storage bucket to look up.

    Returns:
        True when the lookup returns a bucket. False when the bucket is not
        found, or when the lookup fails for any other reason (in which case a
        ``RuntimeWarning`` describing the failure is emitted so that e.g. a
        credentials or permission problem is not mistaken for a missing
        bucket).
    """
    import warnings

    storage_client = storage.Client()
    try:
        # lookup_bucket() returns None for a missing bucket instead of raising,
        # so "not found" no longer has to be inferred from a caught exception.
        return storage_client.lookup_bucket(bucket_name) is not None
    except Exception as exc:
        # Keep the best-effort False result, but surface the real cause.
        warnings.warn(
            f"Could not verify bucket '{bucket_name}': {exc!r}",
            RuntimeWarning,
        )
        return False


# Report whether the configured bucket could be found.
bucket_found = check_bucket_existence(bucket_name)
print("Bucket exists." if bucket_found else "Bucket does not exist.")

In [None]:
def list_folder_contents(bucket_name, folder_name):
    """Print the names of all objects in a bucket under a given prefix.

    Args:
        bucket_name: Name of the Google Cloud Storage bucket to list.
        folder_name: Object-name prefix to filter on (e.g. ``"pr/"``).

    Returns:
        None. Either a header line followed by one object name per line is
        printed, or a single message saying nothing matched the prefix.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    # list_blobs() hands back a lazy iterator object, and truth-testing that
    # object says nothing about whether it will yield anything. Materialise
    # the listing so the emptiness check below is meaningful.
    blobs = list(bucket.list_blobs(prefix=folder_name))

    if not blobs:
        print(f"Folder '{folder_name}' does not exist or is empty.")
        return

    print(f"Contents of folder '{folder_name}':")
    for blob in blobs:
        print(blob.name)

# Show everything stored under the "pr/" prefix of the bucket.
folder_name = "pr/"
list_folder_contents(bucket_name=bucket_name, folder_name=folder_name)

In [None]:
def check_file_existence(bucket_name, file_name):
    """Return whether the object ``file_name`` exists in bucket ``bucket_name``.

    Args:
        bucket_name: Name of the Google Cloud Storage bucket.
        file_name: Full object name (path) inside the bucket.

    Returns:
        The result of the blob's ``exists()`` check.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(file_name)
    return target_blob.exists()


# Report whether the configured object is present in the bucket.
file_found = check_file_existence(bucket_name, file_name)
print("File exists." if file_found else "File does not exist.")

In [None]:
def read_csv_gzip_to_dict(bucket_name, file_name):
    """Download a gzip-compressed CSV object and load it into a dict.

    The first row is treated as a header and discarded. For every following
    non-empty row, column 0 is converted with ``int`` and used as the key and
    column 1 is converted with ``float`` and used as the value.

    Args:
        bucket_name: Name of the Google Cloud Storage bucket.
        file_name: Object name (path) of the ``.csv.gz`` file in the bucket.

    Returns:
        dict mapping ``int`` keys to ``float`` values; empty if the file has
        no rows after the header (or no rows at all).

    Raises:
        ValueError: If a field cannot be converted to ``int`` / ``float``.
        IndexError: If a non-empty row has fewer than two columns.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(file_name)

    # download_as_string() is a deprecated alias of download_as_bytes().
    content = blob.download_as_bytes()

    # Decompress in memory. The encoding is pinned so decoding does not depend
    # on the machine's locale, and newline="" is what the csv module expects.
    with gzip.open(io.BytesIO(content), "rt", encoding="utf-8", newline="") as gzip_file:
        csv_reader = csv.reader(gzip_file)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(csv_reader, None)
        # csv.reader yields [] for blank lines; skip those instead of failing
        # with IndexError on row[0].
        data = {int(row[0]): float(row[1]) for row in csv_reader if row}

    return data

In [None]:
# Load the whole CSV into memory as a dict (int key -> float value).
data = read_csv_gzip_to_dict(bucket_name, file_name)

In [None]:
# Show at most the first 20 key/value pairs of `data`, one per line.
count = 0
for count, (key, value) in enumerate(data.items(), start=1):
    print(f"{key}: {value}")
    if count == 20:
        break

In [None]:
# Sanity-check how long it takes to iterate over the first 100 items of `data`.
import time

# Start the timer. perf_counter() is a monotonic, high-resolution clock meant
# for measuring short intervals; time.time() is wall-clock time and is not.
start_time = time.perf_counter()

count = 0
for key, value in data.items():
    count += 1
    if count == 100:
        break

# Calculate and print the elapsed time
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time: {elapsed_time} seconds")

Elapsed time: 0.00025010108947753906 seconds
