# Main_DDL
---
## This notebook reads the raw data, cleans it, and transforms it into SQL files for implementation on the server that hosts the database

### Import Statements

In [1]:
import pandas as pd
import os
import io
import openpyxl
from myMethods import (
    iter_file_paths,
    extract_dataframe_from_txt,
    extract_dataframe_from_excel,
    parse_FC_data_1,
    parse_FC_data_2,
    parse_FC_data_3,
    save_dataframe_to_excel,
    generate_ddl_from_file, ## may be deprecated
)

### Reading the Data
#### Data from DATA FILES/Source_Data is extracted into DATA FILES/Extracted_Data

In [None]:

## Extraction cell: walk every file under the source directory, load each
## supported file (.txt / .xls / .xlsx) into a DataFrame, record its
## `DataFrame.info()` summary, and export the frame as .xlsx into the
## extracted-data directory.

## Define directories
DATA_dir = './DATA FILES/Source_Data/'
extracted_data_dir = './DATA FILES/Extracted_Data/'

## Create output directories if they don't exist
os.makedirs(extracted_data_dir, exist_ok=True)

## List of [info_text, file_type, base_name] entries, one per extracted file
dataframe_infos = []

## Extract data from all files
for file_path in iter_file_paths(DATA_dir):
    file_name = os.path.basename(file_path)
    base_name, ext = os.path.splitext(file_name)
    file_type = ext.lower()

    try:
        ## Extract data based on file type
        if file_type == '.txt':
            df = extract_dataframe_from_txt(file_path, _encoding='cp1252')
            print(f"Successfully extracted from text file: {file_name}")
        elif file_type in ['.xls', '.xlsx']:
            df = extract_dataframe_from_excel(file_path)
            print(f"Successfully extracted from Excel file: {file_name}")
        else:
            print(f"Skipping unsupported file: {file_name}")
            continue

        ## Capture the info summary as text. DataFrame.info() writes into the
        ## buffer and returns None, so the buffer must be kept and read back;
        ## storing the call's return value would only ever store None.
        info_buffer = io.StringIO()
        df.info(buf=info_buffer)
        dataframe_infos.append([info_buffer.getvalue(), file_type, base_name])

        ## Export the DataFrame to Excel
        output_path = os.path.join(extracted_data_dir, f"{base_name}.xlsx")
        save_dataframe_to_excel(df, output_path)

    ## Best-effort: report any failure for this file and move on to the next one
    except Exception as e:
        print(f"Error processing {file_name}: {e}")


In [2]:
## Report, for every extracted file, its stored info entry followed by the
## original source file name (base name + extension).
for entry in dataframe_infos:
    df_info, ext, base_name = entry
    print(
        f"DataFrame info for {base_name}:",
        df_info,
        f"Source file name with extension: {base_name}{ext}",
        sep="\n",
    )

NameError: name 'dataframe_infos' is not defined

### Cleaning the Data
#### Data from DATA FILES/Extracted_Data is cleaned and the clean data is saved into DATA FILES/Cleaned_Data

In [None]:
## cleaning data block

### The cells below are for testing / debugging purposes only

In [5]:
## testing format of Ar76c.txt file
## Debug cell: runs the text extractor on one source file and writes the
## result into the Extracted_Data folder, independently of the extraction loop.
## NOTE: this rebinds the notebook-level names `file_path` and `df`, which the
## extraction loop also assigns.
## Read the file and extract the DataFrame
file_path = './DATA FILES/Source_Data/Ar76c.txt'
df = extract_dataframe_from_txt(file_path, _encoding='cp1252')
## Last expression of the cell, so whatever the call returns is shown as the
## cell output (the recorded output suggests it is the saved path - confirm
## in myMethods).
save_dataframe_to_excel(df, './DATA FILES/Extracted_Data/Ar76c.xlsx')

Successfully saved: ./DATA FILES/Extracted_Data/Ar76c.xlsx to Extracted_Data folder


'./DATA FILES/Extracted_Data/Ar76c.xlsx'