In [5]:
import os
import zipfile

# Define the path to the Zipped-Data folder
zipped_data_path = os.path.join('..', 'data', 'Zipped-Data')

# Define the root folder that receives one sub-folder per archive
extracted_root_path = os.path.join('..', 'data', 'Extracted-Data')


def extract_archives(source_dir, destination_root):
    """Extract every zip archive found directly inside ``source_dir``.

    Each archive ``<name>.zip`` is unpacked into ``destination_root/<name>``.
    The extension check is case-insensitive. Entries that are not regular
    files are ignored, and archives that ``zipfile`` rejects are reported
    and skipped so the remaining archives are still processed.

    Args:
        source_dir: Folder that contains the ``.zip`` files.
        destination_root: Folder under which one sub-folder per archive
            is created.

    Returns:
        The list of destination folders that were extracted, in sorted
        filename order.
    """
    extracted_folders = []

    # Sort the listing so the processing order does not depend on the filesystem
    for filename in sorted(os.listdir(source_dir)):
        zip_file_path = os.path.join(source_dir, filename)
        archive_name, extension = os.path.splitext(filename)

        # Only regular files with a .zip extension (any letter case) qualify
        if extension.lower() != '.zip' or not os.path.isfile(zip_file_path):
            continue

        extracted_folder_path = os.path.join(destination_root, archive_name)

        try:
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                # Create the target folder only once the archive opened cleanly
                os.makedirs(extracted_folder_path, exist_ok=True)
                zip_ref.extractall(extracted_folder_path)
        except zipfile.BadZipFile as error:
            print(f"Skipping invalid zip file {zip_file_path}: {error}")
            continue

        extracted_folders.append(extracted_folder_path)

    return extracted_folders


# Ensure the Zipped-Data folder exists (and really is a folder)
if not os.path.isdir(zipped_data_path):
    print("Zipped-Data folder does not exist.")
else:
    extract_archives(zipped_data_path, extracted_root_path)
    print("Extraction completed.")

Extraction completed.


In [4]:
import os
import pandas as pd

# Define the path to the Extracted-Data folder
extracted_data_path = os.path.join('..', 'data', 'Extracted-Data')

# Define the path to the Schema folder
schema_folder_path = os.path.join('..', 'data', 'Schema')

# Define the path to the schema.sql file
schema_file_path = os.path.join(schema_folder_path, 'schema.sql')


def sql_type_for_dtype(dtype):
    """Map a pandas dtype to a SQL column type.

    pandas dtype names such as ``int64`` or ``object`` are not SQL types,
    so each dtype family is translated to a widely supported SQL type.
    Anything that is not boolean, integer, float or datetime becomes TEXT.
    """
    # Test bool first so boolean columns get their own SQL type
    if pd.api.types.is_bool_dtype(dtype):
        return "BOOLEAN"
    if pd.api.types.is_integer_dtype(dtype):
        return "BIGINT"
    if pd.api.types.is_float_dtype(dtype):
        return "DOUBLE PRECISION"
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return "TIMESTAMP"
    return "TEXT"


def quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier.

    Quoting keeps names that contain spaces, hyphens or reserved words
    usable as identifiers; embedded double quotes are doubled.
    """
    escaped = str(name).replace('"', '""')
    return f'"{escaped}"'


def build_create_table(table_name, df):
    """Build a CREATE TABLE statement describing the columns of ``df``.

    Args:
        table_name: Name of the table to create.
        df: DataFrame whose column names and dtypes define the table.

    Returns:
        The CREATE TABLE statement as a string, one column per line.
    """
    # Iterate over df.dtypes so duplicated column labels are handled too
    column_lines = [
        f"    {quote_identifier(column)} {sql_type_for_dtype(dtype)}"
        for column, dtype in df.dtypes.items()
    ]
    header = f"CREATE TABLE {quote_identifier(table_name)} (\n"
    return header + ",\n".join(column_lines) + "\n);"


# Holds the SQL schema for all tables
all_schemas = ""

# Ensure the Extracted-Data folder exists before doing any work
if not os.path.isdir(extracted_data_path):
    print("Extracted-Data folder does not exist.")
else:
    # Ensure the Schema folder exists, if not, create it
    os.makedirs(schema_folder_path, exist_ok=True)

    table_schemas = []

    # Iterate over all folders in the Extracted-Data folder (sorted for stable output)
    for folder_name in sorted(os.listdir(extracted_data_path)):
        # Construct the path to the Chart Data.csv file
        csv_file_path = os.path.join(extracted_data_path, folder_name, 'Chart Data.csv')

        # Entries that are not folders, or folders without the CSV, are skipped
        if not os.path.isfile(csv_file_path):
            continue

        try:
            df = pd.read_csv(csv_file_path)
        except pd.errors.EmptyDataError:
            # A CSV without any columns cannot describe a table
            print(f"Schema DataFrame is empty for: {csv_file_path}") # Debugging statement
            continue
        print(f"Read CSV file: {csv_file_path}") # Debugging statement

        if len(df.columns) == 0:
            print(f"Schema DataFrame is empty for: {csv_file_path}") # Debugging statement
            continue

        # Use the folder name as the table name
        table_schemas.append(build_create_table(folder_name, df))

    # Two newlines after every statement separate the tables
    all_schemas = "".join(table_schema + "\n\n" for table_schema in table_schemas)

    # Write the combined SQL schema to the schema.sql file
    try:
        with open(schema_file_path, 'w', encoding='utf-8') as file:
            file.write(all_schemas)
        print(f"Combined schema saved to: {schema_file_path}") # Debugging statement
    except OSError as e:
        print(f"Failed to save combined schema to {schema_file_path}: {e}") # Debugging statement

    print("Schema generation completed.")

Combined schema saved to: ../data/Schema/schema.sql
Schema generation completed.
