# Main_DDL
---
## This notebook reads the raw data, cleans it, and transforms it into SQL files to be loaded onto the server that hosts the database

### Import Statements

In [4]:
import numpy as np
import pandas as pd
import os
import io
import openpyxl
from myMethods import (
    iter_filepaths,
    extract_dataframe_from_txt,
    extract_dataframe_from_excel,
    parse_FC_data_1,
    parse_FC_data_2,
    parse_FC76_data,
    save_dataframe_to_excel,
    generate_ddl_from_file, ## may be deprecated
)
import shutil

### Reading the Data
#### Data from DATA FILES/Source_Data is extracted into DATA FILES/Extracted_Data

In [None]:

## Where the raw inputs live and where the extracted workbooks are written
DATA_dir = './DATA FILES/Source_Data/'
extracted_data_dir = './DATA FILES/Extracted_Data/'

## Make sure the destination folder is present before anything is saved
os.makedirs(extracted_data_dir, exist_ok=True)

## Starts out empty; nothing in this cell appends to it
dataframe_infos = []

## Walk every source file and convert the supported ones to .xlsx
for file_path in iter_filepaths(DATA_dir):
    file_name = os.path.basename(file_path)
    base_name, ext = os.path.splitext(file_name)
    ext_lower = ext.lower()  ## extensions are compared case-insensitively

    ## Anything that is neither a text file nor an Excel workbook is reported and skipped
    if ext_lower not in ('.txt', '.xls', '.xlsx'):
        print(f"Skipping unsupported file: {file_name}")
        continue

    try:
        ## Pick the reader that matches the source format
        if ext_lower == '.txt':
            df = extract_dataframe_from_txt(file_path, _encoding='cp1252')
            print(f"Successfully extracted from text file: {file_name}")
        else:
            df = extract_dataframe_from_excel(file_path)
            print(f"Successfully extracted from Excel file: {file_name}")

        ## Write the frame out under the same base name, always as .xlsx
        output_path = os.path.join(extracted_data_dir, f"{base_name}.xlsx")
        save_dataframe_to_excel(df, output_path)

    ## Broad catch: any failure while reading or saving is printed and the
    ## loop carries on with the next file
    except Exception as e:
        print(f"Error processing {file_name}: {e}")


Successfully extracted from text file: Ar76c.txt
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 48 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       305 non-null    object 
 1   1       305 non-null    object 
 2   2       305 non-null    int64  
 3   3       305 non-null    int64  
 4   4       305 non-null    int64  
 5   5       305 non-null    int64  
 6   6       267 non-null    object 
 7   7       267 non-null    float64
 8   8       267 non-null    float64
 9   9       304 non-null    object 
 10  10      304 non-null    float64
 11  11      304 non-null    float64
 12  12      274 non-null    object 
 13  13      274 non-null    float64
 14  14      274 non-null    float64
 15  15      274 non-null    object 
 16  16      274 non-null    float64
 17  17      274 non-null    float64
 18  18      297 non-null    object 
 19  19      297 non-null    float64
 20  20      297 non-null    flo

### This code block builds an object containing metadata about the source files, which is used to filter them before the cleaning step

In [2]:

## Locations of the raw source files and of the workbooks extracted from them
source_data_dir = './DATA FILES/Source_Data/'
extracted_data_dir = './DATA FILES/Extracted_Data/'

## Lookup table: base name of each source file -> its lower-cased extension
source_file_info = {}
for file_path in iter_filepaths(source_data_dir):
    file_name = os.path.basename(file_path)
    base_name, ext = os.path.splitext(file_name)
    source_file_info[base_name] = ext.lower()

## One metadata row per extracted workbook: [info text, original extension, base name]
dataframe_infos_list = []
for file_path in iter_filepaths(extracted_data_dir):
    try:
        df = extract_dataframe_from_excel(file_path)

        ## df.info() writes to a stream, so route it into an in-memory
        ## buffer and keep the resulting text
        info_buf = io.StringIO()
        df.info(buf=info_buf)
        df_info_str = info_buf.getvalue()

        file_name = os.path.basename(file_path)
        base_name, _ = os.path.splitext(file_name)

        ## Workbooks with no entry in the lookup table are treated as '.xlsx'
        orig_ext = source_file_info.get(base_name, '.xlsx')

        metadata_row = [df_info_str, orig_ext, base_name]
        dataframe_infos_list.append(metadata_row)
        print(f"Captured info for {base_name}")
    except Exception as e:
        ## A workbook that cannot be read or described is reported and skipped
        print(f"Error processing {file_path}: {e}")

## Pack the collected rows into a NumPy array (one row per workbook)
dataframe_infos = np.array(dataframe_infos_list)

## Echo everything that was captured so it can be checked by eye
for metadata_row in dataframe_infos:
    info, ext, base_name = metadata_row
    print(f"\nDataFrame info for {base_name}:")
    print(info)
    print(f"Source file name with extension: {base_name}{ext}")


Captured info for Ar76c
Captured info for Ar76f
Captured info for AR79C
Captured info for AR79F
Captured info for AR80C
Captured info for AR80F
Captured info for AR83C
Captured info for AR83F
Captured info for AR85C
Captured info for AR85F
Captured info for AR87C
Captured info for AR87F
Captured info for AR91C
Captured info for AR91F
Captured info for AR95C
Captured info for AR95F
Captured info for ar_1975_constituinte
Captured info for ar_1976
Captured info for ar_1979_intercalar
Captured info for ar_1980
Captured info for ar_1983
Captured info for ar_1985
Captured info for ar_1987
Captured info for ar_1991
Captured info for ar_1995
Captured info for ar_1999
Captured info for ar_2002
Captured info for ar_2005
Captured info for ar_2009
Captured info for ar_2011

DataFrame info for Ar76c:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 48 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       305 non-n

### Cleaning the Data
#### Data from DATA FILES/Extracted_Data is cleaned and the clean data is saved into DATA FILES/Cleaned_Data

In [None]:
## cleaning data block

### The cells below are explicitly for testing / debugging purposes

In [7]:

## Input (extracted) and output (cleaned) folders -- adjust the paths as needed
data_dir = "./DATA FILES/Extracted_Data/"
target_dir = "./DATA FILES/Cleaned_Data/"
os.makedirs(target_dir, exist_ok=True)

for row in dataframe_infos:
    df_info, file_ext, base_name = row
    ## Every extracted workbook is stored as <base_name>.xlsx in data_dir
    file_path = os.path.join(data_dir, f"{base_name}.xlsx")

    ## Workbooks that did not originate from a .txt source are not cleaned:
    ## they are copied unchanged from data_dir to target_dir
    if file_ext.lower() != ".txt":
        source_file = os.path.join(data_dir, f"{base_name}.xlsx")
        target_file = os.path.join(target_dir, f"{base_name}.xlsx")
        try:
            shutil.copy(source_file, target_file)
            print(f"Copied file {source_file} to {target_file}")
        except Exception as e:
            print(f"Error copying file {source_file}: {e}")
        continue

    ## From here on only .txt-sourced workbooks are handled
    try:
        df = extract_dataframe_from_excel(file_path)
    except Exception as e:
        print(f"Error extracting dataframe from {file_path}: {e}")
        continue

    ## Two cleaning passes, applied strictly in this order
    cleaned_df = parse_FC_data_1(df)
    cleaned_df = parse_FC_data_2(cleaned_df)

    ## Files with "76" in their base name go through the dedicated FC76 parser;
    ## for all others, column 2 is coerced to a nullable integer
    ## (values that fail to parse become <NA>)
    if "76" not in base_name:
        numeric_col = pd.to_numeric(cleaned_df[2], errors="coerce")
        cleaned_df[2] = numeric_col.astype("Int64")
    else:
        cleaned_df = parse_FC76_data(cleaned_df)

    ## The cleaned workbook keeps the base name and is written into target_dir
    output_file = os.path.join(target_dir, f"{base_name}.xlsx")
    try:
        save_dataframe_to_excel(cleaned_df, output_file)
    except Exception as e:
        print(f"Error saving cleaned dataframe for {file_path}: {e}")

Successfully saved Ar76c.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved Ar76f.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR79C.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR79F.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR80C.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR80F.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR83C.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR83F.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR85C.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR85F.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR87C.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR87F.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR91C.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR91F.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR95C.xlsx to ./DATA FILES/Cleaned_Data
Successfully saved AR95F.xlsx to ./DATA FILES/Cleaned_Data
Copied file ./DATA FILES/Extracted_Data/ar_1975_constitu