In [1]:
import pandas as pd
import os
from loguru import logger
from pathlib import Path


### Load data

In [11]:
def load_csv_into_df(folder_name: Path) -> list:
    '''
    Load every Excel file (.xls / .xlsx) located directly in a folder into its own Pandas dataframe.

    For each file the first row of the sheet is skipped, the following row is used as the
    column header and all remaining rows become the data. Files that cannot be read are
    logged and skipped. Despite its name, this function reads Excel files, not CSV files.

    folder_name: Path of the folder that contains the Excel files
    return: List with all created dataframes (one per successfully read file, ordered by file name)
    raises: SystemExit if the folder is not an existing directory or no dataframe could be created
    '''
    # Check that the folder exists and really is a directory (os.listdir would fail on a file)
    if not os.path.isdir(folder_name):
        logger.error(f"The path {folder_name} does not exist or is not a directory.")
        # Raise SystemExit directly: the exit() helper is only meant for interactive
        # sessions and would report success (status 0) for an error condition.
        raise SystemExit(1)

    # Create an empty list to store all dataframes
    dataframes = []

    # Sort the file names so the order of the dataframes (and therefore the row order
    # after merging) is reproducible; os.listdir returns entries in arbitrary order.
    for file in sorted(os.listdir(folder_name)):
        if not file.endswith((".xls", ".xlsx")):
            continue

        try:
            # Load the excel into a pandas dataframe, delete the header and declare the second row as new header
            df = pd.read_excel(os.path.join(folder_name, file), header=None, skiprows=1)
            df.columns = df.iloc[0]
            df = df.iloc[1:]
        except Exception as err:
            # Only catch regular errors: a bare "except" would also swallow
            # KeyboardInterrupt / SystemExit. Keep the best-effort behaviour of skipping the file.
            logger.warning(f"Error reading file {file} ({err}). Skipping...")
            continue

        # Add the created dataframe to the list of dataframes
        dataframes.append(df)

    # Check if any dataframes were created
    if not dataframes:
        logger.error(f"No dataframes were created - please check if the files in folder {folder_name} are correct.")
        raise SystemExit(1)

    logger.success(f"{len(dataframes)} dataframes were created.")

    return dataframes


In [17]:
def combine_dataframes(dataframes: list) -> pd.DataFrame:
    '''
    This function takes a list of dataframes as input and checks if all dataframes have the same
    set of column labels (the order of the columns is not compared). If so, the dataframes are
    stacked row-wise into one dataframe with a fresh 0..n-1 index.

    dataframes: List of Pandas dataframes to merge
    return: Merged dataframe
    raises: ValueError if the list is empty or if the dataframes do not share the same columns
    '''
    # An empty list has no reference header to compare against; fail with a clear
    # message instead of an IndexError from dataframes[0].
    if not dataframes:
        raise ValueError("At least one dataframe is required.")

    # Use the header of the first dataframe as the reference
    columns_set = set(dataframes[0].columns)

    # Check if all dataframes have the same columns
    if any(set(df.columns) != columns_set for df in dataframes[1:]):
        raise ValueError("All dataframes must have the same columns.")

    # Merge all dataframes into a single dataframe (columns are aligned by label)
    return pd.concat(dataframes, ignore_index=True)

### Main

In [None]:
def main(data_folder: Path) -> pd.DataFrame:
    '''
    Load all Excel files from the given folder and merge them into a single dataframe.

    data_folder: Path of the folder that contains the Excel files
    return: Merged dataframe (previously the result was computed but discarded)
    '''
    dataframes = load_csv_into_df(data_folder)
    data = combine_dataframes(dataframes)

    return data

In [None]:
# Script entry point: only runs when the file is executed directly, not when it is imported.
if __name__ == "__main__":
    # Define the path to the folder containing the data (xls files)
    # NOTE(review): this is a hard-coded, machine-specific absolute path - adjust it before running elsewhere.
    data_folder = Path("C:/Users/q617269/Desktop/Masterarbeit_Tobias/Data/original_data")

    # Load and merge all Excel files found in the folder
    main(data_folder)