# Main_DDL
---
## This file is responsible for reading the raw data, cleaning it, and transforming the data into SQL files for implementation in the server containing the database

### Import Statements

In [1]:
import pandas as pd
import os
import openpyxl
from myMethods import (
    iter_file_paths,
    extract_dataframe_from_txt,
    extract_dataframe_from_excel,
    # parse_FC_data_1,
    parse_FC_data_2,
    # parse_FC_data_3,
    save_dataframe_to_excel,
    generate_ddl_from_file, ## may be deprecated
)

### Reading the Data
#### Data from DATA FILES/Source_Data is extracted into DATA FILES/Extracted_Data

In [2]:

## Define directories
DATA_dir = './DATA FILES/Source_Data/'
extracted_data_dir = './DATA FILES/Extracted_Data/'

## Create output directories if they don't exist
os.makedirs(extracted_data_dir, exist_ok=True)

## List to store tuples of (DataFrame, file_type, file_name)
dataframe_infos = []

## Extract data from all files
for file_path in iter_file_paths(DATA_dir):
    file_name = os.path.basename(file_path)
    base_name, ext = os.path.splitext(file_name)
    
    try:
        ## Extract data based on file type
        if ext.lower() == '.txt':
            df = extract_dataframe_from_txt(file_path, _encoding='cp1252')
            print(f"Successfully extracted from text file: {file_name}")
        elif ext.lower() in ['.xls', '.xlsx']:
            df = extract_dataframe_from_excel(file_path)
            print(f"Successfully extracted from Excel file: {file_name}")
        else:
            print(f"Skipping unsupported file: {file_name}")
            continue
        
        ## Store the DataFrame with its metadata
        dataframe_infos.append((df.info(), ext.lower(), base_name))
        
        ## Export the DataFrame to Excel
        output_path = os.path.join(extracted_data_dir, f"{base_name}.xlsx")
        try:
            save_dataframe_to_excel(df, output_path)
            print(f"Successfully saved: {base_name}.xlsx to Extracted_Data folder")
        except Exception as e:
            print(f"Error saving {base_name}.xlsx: {e}")
        
    ## Handle specific exceptions for parsing
    except Exception as e:
        print(f"Error processing {file_name}: {e}")



Successfully extracted from text file: Ar76c.txt
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 48 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       305 non-null    object 
 1   1       305 non-null    object 
 2   2       305 non-null    int64  
 3   3       305 non-null    int64  
 4   4       305 non-null    int64  
 5   5       305 non-null    int64  
 6   6       267 non-null    object 
 7   7       267 non-null    float64
 8   8       267 non-null    float64
 9   9       304 non-null    object 
 10  10      304 non-null    float64
 11  11      304 non-null    float64
 12  12      274 non-null    object 
 13  13      274 non-null    float64
 14  14      274 non-null    float64
 15  15      274 non-null    object 
 16  16      274 non-null    float64
 17  17      274 non-null    float64
 18  18      297 non-null    object 
 19  19      297 non-null    float64
 20  20      297 non-null    flo

In [None]:
## check dataframes info
for df_info, ext, base_name in dataframe_infos:
    print(f"DataFrame info for {base_name}:")
    print(df_info)
    print(f"Source file name: {base_name}")
    print(f"Source file type: {ext}\n")

### Cleaning the Data
#### Data from DATA FILES/Extracted_Data is cleaned and the clean data is saved into DATA FILES/Cleaned_Data

In [None]:
## cleaning data block

### cells below are explicitly for testing / debugging purposes

In [None]:
## testing format of Ar76f.txt file
## Read the file and extract the DataFrame
file_path = './DATA FILES/Source_Data/Ar76f.txt'
df = extract_dataframe_from_txt(file_path, _encoding='cp1252')
save_dataframe_to_excel(df, './DATA FILES/Extracted_Data/Ar76f.xlsx')