# Main_DDL
---
## This notebook reads the raw data, cleans it, and converts it into SQL files to be loaded on the server that hosts the database

### Import Statements

In [1]:
import pandas as pd
import os
import openpyxl
from myMethods import (
    iter_file_paths,
    extract_dataframe_from_txt,
    extract_dataframe_from_excel,
    parse_FC_data, ## may be deprecated
    save_dataframe_to_excel,
    generate_ddl_from_file, ## may be deprecated
    process_txt_file ## will become deprecated
)

### Reading the Data
#### Data from DATA FILES/Source_Data is extracted into DATA FILES/Extracted_Data

In [3]:

## Input and output locations for the extraction step
DATA_dir = './DATA FILES/Source_Data/'
extracted_data_dir = './DATA FILES/Extracted_Data/'

## Ensure the output folder is present before any file is written
os.makedirs(extracted_data_dir, exist_ok=True)

## One (DataFrame, lower-cased extension, base file name) tuple per extracted file
dataframe_infos = []

## Pass 1: read every source file into a DataFrame
for file_path in iter_file_paths(DATA_dir):
    file_name = os.path.basename(file_path)
    base_name, ext = os.path.splitext(file_name)
    ext_lower = ext.lower()

    ## Anything that is neither a text nor an Excel file is reported and skipped
    if ext_lower not in ('.txt', '.xls', '.xlsx'):
        print(f"Skipping unsupported file: {file_name}")
        continue

    try:
        ## Dispatch to the extractor that matches the file type
        if ext_lower == '.txt':
            df = extract_dataframe_from_txt(file_path)
            print(f"Successfully extracted from text file: {file_name}")
        else:
            df = extract_dataframe_from_excel(file_path)
            print(f"Successfully extracted from Excel file: {file_name}")
    except Exception as e:
        ## A failing file is reported and the loop carries on with the next one
        print(f"Error processing {file_name}: {e}")
    else:
        ## Keep the DataFrame together with its metadata for the save pass
        dataframe_infos.append((df, ext_lower, base_name))

## Pass 2: write each extracted DataFrame to the Extracted_Data directory as .xlsx
for df, ext, base_name in dataframe_infos:
    output_path = os.path.join(extracted_data_dir, f"{base_name}.xlsx")
    try:
        save_dataframe_to_excel(df, output_path)
        print(f"Successfully saved: {base_name}.xlsx to Extracted_Data folder")
    except Exception as e:
        ## A failed save is reported; the remaining DataFrames are still written
        print(f"Error saving {base_name}.xlsx: {e}")


Successfully extracted from text file: Ar76c.txt
Successfully extracted from text file: Ar76f.txt
Successfully extracted from text file: AR79C.TXT
Successfully extracted from text file: AR79F.TXT
Successfully extracted from text file: AR80C.TXT
Successfully extracted from text file: AR80F.TXT
Successfully extracted from text file: AR83C.TXT
Successfully extracted from text file: AR83F.TXT
Successfully extracted from text file: AR85C.TXT
Successfully extracted from text file: AR85F.TXT
Successfully extracted from text file: AR87C.TXT
Successfully extracted from text file: AR87F.TXT
Successfully extracted from text file: AR91C.TXT
Successfully extracted from text file: AR91F.TXT
Successfully extracted from text file: AR95C.TXT
Successfully extracted from text file: AR95F.TXT
Successfully extracted from Excel file: ar_1975_constituinte.xls
Successfully extracted from Excel file: ar_1976.xls
Successfully extracted from Excel file: ar_1979_intercalar.xls
Successfully extracted from Excel fi

### Cleaning the Data
#### Data from DATA FILES/Extracted_Data is cleaned and the clean data is saved into DATA FILES/Cleaned_Data

In [None]:
## TODO: cleaning step not implemented yet (planned: read DATA FILES/Extracted_Data, write DATA FILES/Cleaned_Data)

### Creating SQL DDL/DML scripts to transform the raw Excel data into SQL tables

In [None]:
# Input directory with the data files and output directory for the generated SQL scripts
data_dir = './Data_Selected'
output_dir = './sql_scripts/ddl-dml/'

# Create the output directory if it doesn't exist; exist_ok=True replaces the
# separate existence check and matches how the extraction cell creates its folder
os.makedirs(output_dir, exist_ok=True)

# Loop through every file in the data directory using the iter_file_paths function
for file_path in iter_file_paths(data_dir):
    file_name = os.path.basename(file_path)

    # Generate the SQL for this file; only the second element of the returned
    # pair (the script text) is used here
    try:
        _, sql_script = generate_ddl_from_file(file_path)
    except Exception as e:
        print(f"Error processing {file_name}: {e}")
        continue

    # Build the output file name based on the original file's name
    base_name = os.path.splitext(file_name)[0]
    output_file = os.path.join(output_dir, base_name + '.sql')

    # Write the generated SQL script to the output file.
    # The encoding is set explicitly so the result does not depend on the
    # platform's default locale encoding.
    # A write failure is reported per file instead of aborting the whole loop,
    # consistent with how save errors are handled in the extraction cell.
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(sql_script)
    except OSError as e:
        print(f"Error saving {base_name}.sql: {e}")
        continue

    print(f"SQL script for {file_name} saved to {output_file}")


### The cells below are for testing / debugging purposes only