In [56]:
import sqlite3
from google.cloud import storage
import pandas as pd

In [57]:
def list_blobs(bucket_name, folder_name):
    """List all files in given COS directory."""    
    blob_names = []
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=folder_name))
    for blob in blobs:
        blob_names.append(blob.name)
    return blob_names
    
def list_blobs_pd(bucket_name, folder_name):
    """List all files in given COS directory."""       
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blobs = list(bucket.list_blobs(prefix=folder_name))

    blob_name = []
    blob_size = []
    blob_time = []
    
    for blob in blobs:
        blob_name.append(blob.name)
        blob_size.append(blob.size)
        blob_time.append(blob.time_created)

    blobs_df = pd.DataFrame(list(zip(blob_name, blob_size, blob_time)), columns=['filePath', 'size', 'timeStamp'])    
    return blobs_df

def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from COS bucket."""
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""    
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)

In [31]:
blobs = list_blobs_pd(BUCKET_NAME, FOLDER_NAME)

In [32]:
list_blobs(BUCKET_NAME, FOLDER_NAME)

['wikisqliteDB/test/']

In [58]:
dataset_type = 'test'

In [45]:
DB_NAME = dataset_type + ".db"
BUCKET_NAME = "data_tql"
FOLDER_NAME = "WikiSQL/data"

storage_client = storage.Client()

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(f"{FOLDER_NAME}/{DB_NAME}")

# Download the SQLite file to a local file
local_file_name = DB_NAME
blob.download_to_filename(local_file_name)


In [46]:
FOLDER_NAME = "wikisqliteDB/"+dataset_type

# connect to the database file
conn = sqlite3.connect(local_file_name)
cursor = conn.cursor()

# extract the table names from the database
table_names = [row[0] for row in cursor.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()]

In [47]:
# create a separate SQLite file for each table
failed_schemas = []
for table_name in table_names:
    try:
        # extract the schema for the table
        schema = "".join(row[0] + "\n" for row in cursor.execute(f"SELECT sql FROM sqlite_master WHERE type='table' AND name='{table_name}'").fetchall())

        # extract the rows for the table
        rows = cursor.execute(f"SELECT * FROM {table_name}").fetchall()

        # create a new SQLite file for the table
        new_db_name = f"{table_name}.sqlite"
        new_conn = sqlite3.connect(new_db_name)
        new_cursor = new_conn.cursor()

        # create the table in the new SQLite file using the extracted schema
        new_cursor.execute(schema)

        # insert the rows into the new SQLite file
        for row in rows:
            values = ",".join([f"'{str(val)}'" for val in row])
            new_cursor.execute(f"INSERT INTO {table_name} VALUES ({values})")

        # save and close the new SQLite file
        new_conn.commit()
        new_conn.close()

        # upload the new SQLite file to the GCP bucket
        storage_client = storage.Client()
        bucket = storage_client.bucket(BUCKET_NAME)
        blob = bucket.blob(f"{FOLDER_NAME}/{new_db_name}")
        blob.upload_from_filename(new_db_name)
    except:
        failed_schemas.append(table_name)
    
# close the connection to the original database file
conn.close()


In [49]:
import os

# specify the folder path where the SQLite files are located
folder_path = "/home/jupyter/TQL/databaseDesign/WikiSQL"

# loop through all files in the folder
for filename in os.listdir(folder_path):
    # check if the file is a SQLite file (i.e., has .db or .sqlite extension)
    if filename.endswith(".db") or filename.endswith(".sqlite"):
        # delete the file
        os.remove(os.path.join(folder_path, filename))


In [48]:
len(failed_schemas)

874

In [51]:
import csv

with open("failed_schemas.csv", "w", newline="") as csvfile:
    fieldnames = ["failed_schema"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for schema in failed_schemas:
        writer.writerow({"failed_schema": schema})

In [54]:
file_name = "failed_schemas.csv"
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(f"{FOLDER_NAME}/{file_name}")
blob.upload_from_filename("failed_schemas.csv")

In [55]:
for filename in os.listdir(folder_path):
    # check if the file is a SQLite file (i.e., has .db or .sqlite extension)
    if filename.endswith(".csv"):
        # delete the file
        os.remove(os.path.join(folder_path, filename))