In [39]:
import random
import pandas as pd
#from copy import deepcopy
import os
from loguru import logger
from pathlib import Path
from datetime import datetime

### Functions

In [45]:
def load_csv_into_df(folder_name: Path) -> list:
    '''
    This function searches for all Excel files (.xls / .xlsx) directly inside a given directory and loads each file
    into a Pandas dataframe. Despite its name, it reads Excel workbooks, not CSV files.

    For every workbook the first row is skipped and the second row becomes the header. The remaining rows keep
    their labels from the raw sheet, so the first data row of each dataframe has the index label 1.
    Files that cannot be read are logged and skipped. Excel lock files (names starting with "~$") are ignored.

    param folder_name: Directory that contains the Excel files
    return: List with all created dataframes (never empty), ordered by file name
    raises SystemExit: If the folder does not exist or if no dataframe could be created
    '''
    folder = Path(folder_name)

    # Check if the folder exists. SystemExit is raised explicitly because exit() is an interactive helper that
    # is not guaranteed to stop execution (e.g. inside a notebook kernel), and the code below must never run
    # without a valid folder. The non-zero code marks the termination as a failure.
    if not folder.is_dir():
        logger.error(f"The path {folder_name} does not exist.")
        raise SystemExit(1)

    logger.info("Loading the data...")

    # Create an empty list to store all dataframes
    dataframes = []

    # Loop through all files in the folder (sorted, so the result order is reproducible)
    for file in sorted(os.listdir(folder)):
        is_excel_file = file.lower().endswith((".xls", ".xlsx"))
        if file.startswith("~$") or not is_excel_file:
            continue

        try:
            # Load the excel into a pandas dataframe, delete the header and declare the second row as new header
            df = pd.read_excel(folder / file, header=None, skiprows=1)
            df.columns = df.iloc[0]
            df = df.iloc[1:]
        except Exception as error:
            # Only real errors are handled here, so KeyboardInterrupt / SystemExit are no longer swallowed
            logger.warning(f"Error reading file {file} ({error}). Skipping...")
            continue

        # Add the created dataframe to the list of dataframes
        dataframes.append(df)

    # Check if any dataframes were created
    if not dataframes:
        logger.error(f"No dataframes were created - please check if the files in folder {folder_name} are correct.")
        raise SystemExit(1)

    logger.success(f"{len(dataframes)} dataframe(s) were created.")

    return dataframes


In [3]:
def combine_dataframes(dataframes: list, output_path: Path = Path("../data/combined_dataset.xlsx")) -> pd.DataFrame:
    '''
    This function takes a list of dataframes as input and checks if the dataframes have the same header.
    If so, the dataframes are concatenated row-wise (with a fresh 0..n-1 index) and the result is stored
    as an excel file.

    The header check compares the sets of column names, so the column order may differ between dataframes.

    param dataframes: List of dataframes to combine (must not be empty)
    param output_path: Target excel file; missing parent folders are created
    return: Merged dataframe
    raises ValueError: If the list is empty or the dataframes do not share the same columns
    '''
    # An empty list would otherwise fail with an unhelpful IndexError below
    if not dataframes:
        raise ValueError("At least one dataframe is required to build a combined dataset.")

    # Set the header information
    columns_set = set(dataframes[0].columns)

    # Check if all dataframes have the same columns
    if any(set(df.columns) != columns_set for df in dataframes[1:]):
        raise ValueError("All dataframes must have the same columns.")

    # Merge all dataframes into a single dataframe
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Make sure the target folder exists before writing
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    merged_df.to_excel(output_file)

    logger.success(f"{len(dataframes)} dataframe(s) are combined to one dataset and stored in a excel file.")

    return merged_df

In [6]:
def df_info_to_excel(df: pd.DataFrame, output_path: Path = Path("data_infos.xlsx")):
    '''
    This function saves feature information of a dataframe in an excel file.

    The file contains one row per column of df with the column name ("name"), the number of
    non-null values ("non-nulls"), the number of null values ("nulls") and the dtype ("type").

    param df: Dataframe to describe
    param output_path: Target excel file; missing parent folders are created
    '''
    # Count the nulls only once and derive the non-null counts from it
    null_counts = df.isnull().sum().values

    feature_infos = pd.DataFrame({
        "name": df.columns,
        "non-nulls": len(df) - null_counts,
        "nulls": null_counts,
        "type": df.dtypes.values,
    })

    # Make sure the target folder exists before writing
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    feature_infos.to_excel(output_file)

In [50]:
def data_preprocessing(data_folder_dir: Path):
    '''
    This function loads all excel files of a folder, reduces them to the relevant samples and features,
    adds the (still unlabeled) label columns and stores every preprocessed dataframe as an excel file
    in ../data/preprocessed_data.

    The file name is {ncar}_preprocessed_{timestamp}.xlsx. If more than one dataframe is processed, a running
    number is appended (..._1.xlsx, ..._2.xlsx) so that the files do not overwrite each other.

    param data_folder_dir: Folder containing the original excel files
    raises KeyError: If a dataframe lacks one of the required columns
    '''
    # Features which are identified as relevant for the preprocessing, the predictions or for the users' next steps
    relevant_columns = ['Sachnummer','Benennung (dt)', 'X-Min','X-Max','Y-Min','Y-Max','Z-Min','Z-Max', 'Wert','Einheit','Gewichtsart','Kurzname','L-Kz.', 'L/R-Kz.', 'Modul (Nr)', 'ox','oy', 'oz', 'xx','xy','xz', 'yx','yy','yz','zx','zy','zz']

    # Load the data into a list of pandas dataframes
    dataframes = load_csv_into_df(data_folder_dir)

    # Store the ncar abbreviation for file paths: the first three characters of the first 'Benennung (dt)' value.
    # Positional access is used so this does not depend on the index labels left behind by the loader.
    ncar = dataframes[0]['Benennung (dt)'].iloc[0][:3]

    logger.info("Start preprocessing the data...")

    # One timestamp for the whole run, so that all files of a run share it
    timestamp = datetime.now().strftime("%d%m%Y_%H%M")

    # Make sure the target folder exists before writing
    output_dir = Path("../data/preprocessed_data")
    output_dir.mkdir(parents=True, exist_ok=True)

    stored_files = []
    for position, raw_df in enumerate(dataframes, start=1):
        # Fail early with a message that names everything that is missing
        missing_columns = [column for column in ["Dok-Format", *relevant_columns] if column not in raw_df.columns]
        if missing_columns:
            raise KeyError(f"Required column(s) missing in dataframe {position}: {missing_columns}")

        # Keep only the relevant samples with Dok-Format=5P (these samples are on the last level of the car
        # structure) and only the relevant features, then add the columns for the labels
        # "Relevant für Messung" and "Allgemeine Bezeichnung" with their default values
        data_labeled = (
            raw_df.loc[raw_df["Dok-Format"] == '5P', relevant_columns]
            .reset_index(drop=True)
            .assign(**{'Relevant fuer Messung': 'Nein', 'Einheitsname': ''})
        )

        # Store preprocessed dataframes; the running number keeps multiple files from overwriting each other
        suffix = f"_{position}" if len(dataframes) > 1 else ""
        file_name = f"{ncar}_preprocessed_{timestamp}{suffix}.xlsx"
        data_labeled.to_excel(output_dir / file_name)
        stored_files.append(file_name)

    logger.success(f"The data is succeccfully preprocessed and stored as {', '.join(stored_files)}!")


### Main

In [51]:
def main():
    '''
    Entry point: runs the data preprocessing on the folder containing the original data (xls files).
    '''
    data_preprocessing(Path("../data/original_data"))


In [52]:
if __name__ == "__main__":
    # Start the preprocessing when this cell / script is executed directly
    main()

[32m2023-04-26 15:40:57.242[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_csv_into_df[0m:[36m11[0m - [1mLoading the data...[0m
[32m2023-04-26 15:40:58.981[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mload_csv_into_df[0m:[36m36[0m - [32m[1m1 dataframe(s) were created.[0m
[32m2023-04-26 15:40:58.982[0m | [1mINFO    [0m | [36m__main__[0m:[36mdata_preprocessing[0m:[36m9[0m - [1mStart preprocessing the data...[0m
[32m2023-04-26 15:41:04.831[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mdata_preprocessing[0m:[36m32[0m - [32m[1mThe data is succeccfully preprocessed and stored as G65_preprocessed_26042023_1540.xlsx![0m
