In [1]:
from google.cloud import storage
import os
import sqlalchemy
import sqlite3
import pandas as pd

In [2]:
def list_blobs_pd(bucket_name, folder_name):
    """Return a DataFrame describing every blob under a Google Cloud Storage prefix.

    Args:
        bucket_name: Name of the bucket to list.
        folder_name: Prefix handed to ``list_blobs``; it selects which blobs
            are listed.

    Returns:
        A DataFrame with one row per listed blob and the columns ``Name``,
        ``Size`` and ``Time_Stamp``, filled from each blob's ``name``,
        ``size`` and ``time_created`` attributes.
    """
    bucket = storage.Client().bucket(bucket_name)

    # One (name, size, created) tuple per blob, in listing order.
    rows = [
        (item.name, item.size, item.time_created)
        for item in bucket.list_blobs(prefix=folder_name)
    ]

    return pd.DataFrame(rows, columns=['Name', 'Size', 'Time_Stamp'])


def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Save one object from a Google Cloud Storage bucket to a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Name of the object inside the bucket.
        destination_file_name: Local path the object's content is written to.
    """
    source = storage.Client().bucket(bucket_name).blob(source_blob_name)
    source.download_to_filename(destination_file_name)

In [3]:
def connect_to_sql() -> sqlalchemy.engine.base.Engine:
    """Create a SQLAlchemy engine (connection pool) for the MySQL instance over TCP.

    Connection settings are read from the environment variables ``DB_HOST``,
    ``DB_USER``, ``DB_PASS``, ``DB_NAME`` and ``DB_PORT`` so that credentials
    do not have to be stored in the notebook. When a variable is unset, the
    value this notebook previously hard-coded is used instead.

    Returns:
        The engine built by ``sqlalchemy.create_engine`` for the
        ``mysql+pymysql`` driver.

    Raises:
        ValueError: If ``DB_PORT`` is set but is not an integer.
    """
    # NOTE(review): the fallbacks below are the formerly hard-coded values and
    # are kept only so existing runs keep working. Set the environment
    # variables and remove the literal credentials before sharing the notebook.
    db_host = os.environ.get('DB_HOST', '35.193.112.203')
    db_user = os.environ.get('DB_USER', 'root')
    db_pass = os.environ.get('DB_PASS', 'password')
    db_name = os.environ.get('DB_NAME', 'mysql')
    # The original used port 0, which only worked because a falsy port falls
    # through to the driver default; state the standard MySQL port explicitly.
    db_port = int(os.environ.get('DB_PORT', '3306'))

    # Equivalent URL:
    # mysql+pymysql://<db_user>:<db_pass>@<db_host>:<db_port>/<db_name>
    connection_url = sqlalchemy.engine.url.URL.create(
        drivername="mysql+pymysql",
        username=db_user,
        password=db_pass,
        host=db_host,
        port=db_port,
        database=db_name,
    )
    return sqlalchemy.create_engine(connection_url)

In [None]:
def _split_sql_statements(script):
    """Split SQL text into statements on semicolons outside quotes and ``--`` comments."""
    statements = []
    buffer = []
    open_quote = None  # the quote character we are currently inside, if any
    pos = 0
    end = len(script)

    while pos < end:
        char = script[pos]
        if open_quote is not None:
            buffer.append(char)
            # Inside '...' or "..." a backslash escapes the next character,
            # so an escaped quote must not close the literal.
            if char == '\\' and open_quote != '`' and pos + 1 < end:
                buffer.append(script[pos + 1])
                pos += 2
                continue
            if char == open_quote:
                open_quote = None
        elif char in ('"', "'", '`'):
            open_quote = char
            buffer.append(char)
        elif script.startswith('--', pos):
            # Drop the line comment; the newline itself is kept as whitespace.
            line_end = script.find('\n', pos)
            pos = end if line_end == -1 else line_end
            continue
        elif char == ';':
            statement = ''.join(buffer).strip()
            if statement:
                statements.append(statement)
            buffer = []
        else:
            buffer.append(char)
        pos += 1

    tail = ''.join(buffer).strip()
    if tail:
        statements.append(tail)
    return statements


def executeQueriesFromFile(file_name, connection=None):
    """Execute every SQL statement in a file, one at a time, in file order.

    The file is read as UTF-8 and split on semicolons that are outside quoted
    text; ``--`` line comments and empty statements are skipped. ``/* */``
    block comments are not treated specially.

    Args:
        file_name: Path of the SQL script to run.
        connection: Connection exposing ``execute``; when omitted, the
            notebook-level ``conn_mysql`` connection is used.

    Raises:
        OSError: If the file cannot be opened.
    """
    if connection is None:
        connection = conn_mysql

    with open(file_name, encoding='utf-8') as sql_file:
        script = sql_file.read()

    for statement in _split_sql_statements(script):
        # NOTE(review): sqlalchemy.text() reads ":name" as a bind parameter, so
        # a statement containing such a token verbatim would need escaping.
        connection.execute(sqlalchemy.text(statement))

    # Connections that expose commit() need an explicit commit for inserted
    # rows to persist; connections without that method are left untouched.
    commit = getattr(connection, 'commit', None)
    if callable(commit):
        commit()

In [6]:
# Build the engine once, then open a single connection that the loading cell
# below reuses for its CREATE SCHEMA / USE statements.
engine_mysql = connect_to_sql()
# NOTE(review): no visible cell closes this connection — confirm it is closed
# (or the engine disposed) once loading is finished.
conn_mysql = engine_mysql.connect()

In [56]:
# List every object under spider/database and derive, from each object name:
#   schema           - the third path component
#   schema_last_file - the final path component (the file name)
#   extension        - the text after the last '.' of the file name
# Names with fewer than three path components, and file names without a dot
# (for example folder placeholder objects whose name ends in '/'), get an
# empty string instead of raising IndexError.
blobs = list_blobs_pd(bucket_name="data_tql", folder_name="spider/database")
blobs['schema'] = blobs['Name'].apply(
    lambda name: name.split('/')[2] if name.count('/') >= 2 else ''
)
blobs['schema_last_file'] = blobs['Name'].apply(lambda name: name.split('/')[-1])
# Use the part after the LAST dot so that 'a.b.sqlite' yields 'sqlite'.
blobs['extension'] = blobs['schema_last_file'].apply(
    lambda file_name: file_name.rpartition('.')[2] if '.' in file_name else ''
)
# blobs[['extension']].value_counts()

In [8]:
# Sort each schema into one of three buckets according to the files it has:
#   sql_dict    - schema -> name of its first '.sql' object (preferred)
#   sqlite_dict - schema -> name of its first '.sqlite' object (no '.sql' found)
#   no_db_file  - schemas that have neither kind of file
sql_dict = {}
sqlite_dict = {}
no_db_file = []

for schema in blobs['schema'].unique():
    df_schema = blobs[blobs['schema'] == schema]
    # Try the extensions in order of preference; the first hit wins.
    for wanted_ext, bucket_for_ext in (('sql', sql_dict), ('sqlite', sqlite_dict)):
        matching_names = df_schema[df_schema['extension'] == wanted_ext]['Name'].values
        if len(matching_names) > 0:
            bucket_for_ext[schema] = matching_names[0]
            break
    else:
        # Neither extension was present for this schema.
        no_db_file.append(schema)

In [55]:
# For each schema that has a .sql file: download the file into the working
# directory, create and select the MySQL schema, run the file's statements,
# then delete the local copy.
for schema, blob_path in sql_dict.items():
    file_name = blob_path.split('/')[-1]
    # Backtick-quote the schema name (doubling any embedded backtick) so that
    # names containing special characters or reserved words stay valid.
    quoted_schema = '`' + schema.replace('`', '``') + '`'
    try:
        download_blob(bucket_name='data_tql', source_blob_name=blob_path, destination_file_name=file_name)
        conn_mysql.execute(sqlalchemy.text(f'CREATE SCHEMA IF NOT EXISTS {quoted_schema}'))
        conn_mysql.execute(sqlalchemy.text(f'USE {quoted_schema}'))
        executeQueriesFromFile(file_name)
    finally:
        # Always remove the local copy, even when a step above fails.
        if os.path.exists(file_name):
            os.remove(file_name)
    # NOTE(review): this break stops after the first schema, so only one file
    # is ever loaded. It looks like a debugging leftover — confirm and remove
    # it to process every entry in sql_dict.
    break