In [13]:
import pandas as pd
import os
import re
from loguru import logger
from pathlib import Path
from datetime import datetime
import shutil
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import WhitespaceTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle

### Functions

In [14]:
def load_csv_into_df(folder_name: Path, original_prisma_data: bool, move_to_archive: bool) -> list:
    '''
    This function searches for all Excel files (.xls / .xlsx) directly inside a given directory and loads each file into a Pandas dataframe.
    Despite the function name, no .csv files are read; files with other extensions are ignored.

    folder_name: Directory that is scanned (non-recursively) for Excel files.
    original_prisma_data: If True, the first row of the sheet is skipped and the following row is used as header line. If False, the file is read with the pandas defaults.
    move_to_archive: If True, every successfully loaded file is moved to the sub folder "original_data_archive" of folder_name.
    return: List with all created dataframes
    raises: SystemExit (status 1) if the folder does not exist or if no dataframe could be created.
    '''
    # Check if the folder exists
    if not os.path.exists(folder_name):
        logger.error(f"The path {folder_name} does not exist.")
        # Raise SystemExit with a non-zero status instead of calling exit():
        # the failure becomes visible to calling scripts and a notebook kernel is not shut down.
        raise SystemExit(1)

    logger.info("Loading the data...")

    # Create an empty list to store all dataframes
    dataframes = []
    archive_dir = os.path.join(folder_name, "original_data_archive")

    # Loop through all files in the folder (sorted, so the load order is reproducible)
    for file in sorted(os.listdir(folder_name)):
        if not file.endswith((".xls", ".xlsx")):
            continue

        file_path = os.path.join(folder_name, file)

        try:
            if original_prisma_data:
                # Skip the first row and declare the following row as new header.
                # The remaining rows keep their index labels, i.e. the first data row has label 1.
                df = pd.read_excel(file_path, header=None, skiprows=1)
                df.columns = df.iloc[0]
                df = df.iloc[1:]
            else:
                df = pd.read_excel(file_path)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the whole run,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            logger.warning(f"Error reading file {file}. Skipping... ({exc})")
            continue

        # Add the created dataframe to the list of dataframes
        dataframes.append(df)

        if move_to_archive:
            # A failed move is reported separately; the already loaded dataframe is kept
            try:
                os.makedirs(archive_dir, exist_ok=True)
                shutil.move(file_path, os.path.join(archive_dir, file))
            except OSError as exc:
                logger.warning(f"File {file} was loaded but could not be moved to the archive. ({exc})")

    # Check if any dataframes were created
    if not dataframes:
        logger.error(f"No dataframes were created - please check if the files in folder {folder_name} are correct/exist.")
        raise SystemExit(1)

    logger.success(f"{len(dataframes)} dataframe(s) were created.")

    return dataframes

In [15]:
def combine_dataframes(dataframes: list) -> pd.DataFrame:
    '''
    This function takes a list of dataframes as input and checks if all dataframes have the same set of column names (the column order is not compared).
    If so, the dataframes are concatenated row-wise with a fresh index and the result is written to "../data/combined_dataset.xlsx".

    dataframes: Non-empty list of pandas dataframes.
    return: Merged dataframe
    raises: ValueError if the list is empty or if a dataframe has a different set of columns than the first one.
    '''
    if not dataframes:
        raise ValueError("No dataframes were passed - there is nothing to combine.")

    # The first dataframe defines the expected header
    columns_set = set(dataframes[0].columns)

    # Check if all dataframes have the same columns
    for position, df in enumerate(dataframes):
        current_columns = set(df.columns)
        if current_columns != columns_set:
            # Report the difference in the exception itself instead of printing it
            missing = sorted(map(str, columns_set - current_columns))
            unexpected = sorted(map(str, current_columns - columns_set))
            raise ValueError(
                "All dataframes must have the same columns. "
                f"Dataframe at position {position} differs from the first one "
                f"(missing: {missing}, unexpected: {unexpected})."
            )

    # Merge all dataframes into a single dataframe
    merged_df = pd.concat(dataframes, ignore_index=True)
    merged_df.to_excel("../data/combined_dataset.xlsx")

    logger.success(f"{len(dataframes)} dataframe(s) are combined to one dataset and stored in a excel file.")

    return merged_df

In [16]:
def df_info_to_excel(df: pd.DataFrame):
    '''
    This function writes a per-column overview of a dataframe to the excel file "data_infos.xlsx" (relative to the current working directory).
    The overview contains the column name, the number of non-null values, the number of null values and the dtype of each column.
    '''
    # Number of missing values per column, in column order
    null_counts = df.isnull().sum().values

    overview = pd.DataFrame(
        {
            "name": df.columns,
            "non-nulls": len(df) - null_counts,
            "nulls": null_counts,
            "type": df.dtypes.values,
        }
    )
    overview.to_excel("data_infos.xlsx")

In [17]:
def prepare_and_add_labels(data_folder_dir: Path, original_prisma_data: bool, save_as_excel: bool, move_to_archive: bool):
    '''
    This function loads all Excel files of a folder, reduces each dataframe to the relevant rows and features, converts the numeric features to float and adds the two label columns "Relevant fuer Messung" (default 'Nein') and "Einheitsname" (default 'Dummy').

    data_folder_dir: Folder with the Excel files, passed on to load_csv_into_df.
    original_prisma_data: Passed on to load_csv_into_df.
    save_as_excel: If True, every preprocessed dataframe is stored in "../data/preprocessed_data/". The first file is named "<ncar>_preprocessed_<timestamp>.xlsx"; further files of the same run get the suffix "_2", "_3", ... so that they do not overwrite each other.
    move_to_archive: Passed on to load_csv_into_df.
    return: Tuple of (list with the preprocessed dataframes, ncar abbreviation)
    '''
    # Features which are identified as relevant for the preprocessing, the predictions or for the users' next steps
    relevant_columns = ['Sachnummer', 'Benennung (dt)', 'X-Min', 'X-Max', 'Y-Min', 'Y-Max', 'Z-Min', 'Z-Max',
                        'Wert', 'Einheit', 'Gewichtsart', 'Kurzname', 'L-Kz.', 'L/R-Kz.', 'Modul (Nr)',
                        'ox', 'oy', 'oz', 'xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz']

    # Features which are converted to float
    float_columns = ['X-Min', 'X-Max', 'Y-Min', 'Y-Max', 'Z-Min', 'Z-Max', 'Wert',
                     'ox', 'oy', 'oz', 'xx', 'xy', 'xz', 'yx', 'yy', 'yz', 'zx', 'zy', 'zz']
    convert_dict = {column: float for column in float_columns}

    # Load the data into a list of pandas dataframes
    dataframes = load_csv_into_df(data_folder_dir, original_prisma_data, move_to_archive)

    # Store the ncar abbreviation for file paths: first three characters of the designation with index label 1.
    # NOTE(review): this is a label-based lookup on the first dataframe only - confirm that every input
    # has a row labelled 1 and that all files of one run share the same abbreviation.
    ncar = dataframes[0]['Benennung (dt)'][1][:3]

    # One timestamp per run, so that all file names and the log message agree
    timestamp = datetime.now().strftime("%d%m%Y_%H%M")

    logger.info("Start preprocessing the data...")
    dataframes_with_labels = []
    stored_files = []
    for i, raw_df in enumerate(dataframes):
        # Keep only the relevant samples with Dok-Format=5P. These samples are on the last level of the car structure
        data_labeled = raw_df[raw_df["Dok-Format"] == '5P'].reset_index(drop=True)

        # Keep only the relevant features and convert the numeric ones
        data_labeled = data_labeled[relevant_columns].astype(convert_dict)

        # Add columns for the label "Relevant fuer Messung" and "Einheitsname" at the end
        data_labeled.insert(len(data_labeled.columns), 'Relevant fuer Messung', 'Nein')
        data_labeled.insert(len(data_labeled.columns), 'Einheitsname', 'Dummy')
        dataframes_with_labels.append(data_labeled)

        if save_as_excel:
            # The first file keeps the plain name; later files get a running number,
            # otherwise every file of the run would be written to the same path.
            suffix = "" if i == 0 else f"_{i + 1}"
            file_name = f"{ncar}_preprocessed_{timestamp}{suffix}.xlsx"

            # Store preprocessed dataframe
            data_labeled.to_excel(f"../data/preprocessed_data/{file_name}")
            stored_files.append(file_name)

    if save_as_excel:
        logger.success(f"The features are reduced and formated to the correct data type. The new dataset is stored as {', '.join(stored_files)}!")
    else:
        logger.success("The features are reduced and formated to the correct data type!")

    return dataframes_with_labels, ncar


In [18]:
def prepare_text(designation: str) -> str:
    '''
    This function normalizes a designation: the value is converted to a string in UPPER case, every character that is neither a word character nor whitespace is removed, and runs of spaces are collapsed to a single space (leading/trailing spaces are dropped).
    Only the space character separates tokens; other whitespace such as tabs is kept inside a token. Underscores count as word characters and are kept.
    return: Prepared designation
    '''
    # Upper-case first, then strip punctuation
    cleaned = re.sub(r"[^\w\s]", "", str(designation).upper())

    # Splitting on single spaces yields empty strings for repeated spaces - drop them before joining
    return " ".join(token for token in cleaned.split(" ") if token)

In [19]:
def vectorize_data(data: pd.DataFrame, timestamp) -> np.ndarray:
    '''
    This function fits a TF-IDF vectorizer (whitespace tokenization) on the column "Benennung (dt)" and stores the fitted vectorizer and its vocabulary as pickle files in "../models/".

    data: Dataframe with the text column "Benennung (dt)".
    timestamp: Value that is inserted into the file names "vectorizer_<timestamp>.pkl" and "vocabulary_<timestamp>.pkl".
    return: Dense array with the TF-IDF features (one row per row of data)
    '''
    token = WhitespaceTokenizer()
    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=token.tokenize)

    X_text = vectorizer.fit_transform(data['Benennung (dt)']).toarray()

    # Store the vocabulary
    vocabulary = vectorizer.get_feature_names_out()

    # Make sure the target folder exists, otherwise the fitted vectorizer would be lost on open()
    os.makedirs('../models', exist_ok=True)

    # Save the vectorizer and vocabulary to files
    with open(f'../models/vectorizer_{timestamp}.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)
    with open(f'../models/vocabulary_{timestamp}.pkl', 'wb') as f:
        pickle.dump(vocabulary, f)

    return X_text

In [1]:
def train_test_val(df, test_size: float, timestamp, random_state: int = 42):
    '''
    This function builds the feature matrix (TF-IDF text features followed by the geometric features) and the binary label vector and splits them into a training, validation and test set.

    Side effect: the column "Benennung (dt)" of the passed dataframe is overwritten in place with the prepared text.

    df: Dataframe with the columns "Benennung (dt)", "Relevant fuer Messung" ('Ja'/'Nein') and the geometric features center_x/y/z, length, width, height, theta_x/y/z.
    test_size: Share of the samples that is held out from training; the held-out samples are split in half into test and validation set.
    timestamp: Passed on to vectorize_data for the file names of the stored vectorizer.
    random_state: Seed for both splits (default 42).
    return: X_train, y_train, X_val, y_val, X_test, y_test
    '''
    geometry_columns = ['center_x', 'center_y', 'center_z', 'length', 'width', 'height', 'theta_x', 'theta_y', 'theta_z']

    # Normalize the designations; a column-wise map replaces the row-wise apply over the whole frame
    df["Benennung (dt)"] = df["Benennung (dt)"].map(prepare_text)

    # NOTE(review): the vectorizer is fitted on all samples before the split, so the
    # validation/test designations contribute to the vocabulary and IDF weights - confirm this is intended.
    X_text = vectorize_data(df, timestamp)

    # Combine text features with other features
    X = np.concatenate((X_text, df[geometry_columns].values), axis=1)

    # Labels other than 'Ja'/'Nein' become NaN
    y = df['Relevant fuer Messung'].map({'Ja': 1, 'Nein': 0})

    # First split off the hold-out part, then divide it equally into test and validation
    X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_test, X_val, y_test, y_val = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=random_state)

    return X_train, y_train, X_val, y_val, X_test, y_test

### Main

In [23]:
def main():
    '''
    This function runs the preprocessing for the original PRISMA data in "../data/original_data_new": the preprocessed dataframes are stored as excel files and the source files are not moved to the archive.
    '''
    # The returned values are not used any further here
    labeled_dataframes, car_abbreviation = prepare_and_add_labels(
        Path("../data/original_data_new"),
        original_prisma_data=True,
        save_as_excel=True,
        move_to_archive=False,
    )


In [24]:
# Entry point: run the preprocessing only when this module is executed directly
# (which includes running the cell in a notebook), not when it is imported.
if __name__ == "__main__":
    
    main()

[32m2023-05-10 16:26:11.366[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_csv_into_df[0m:[36m12[0m - [1mLoading the data...[0m
[32m2023-05-10 16:26:13.008[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mload_csv_into_df[0m:[36m46[0m - [32m[1m1 dataframe(s) were created.[0m
[32m2023-05-10 16:26:13.012[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_and_add_labels[0m:[36m9[0m - [1mStart preprocessing the data...[0m
[32m2023-05-10 16:26:17.183[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mprepare_and_add_labels[0m:[36m57[0m - [32m[1mThe features are reduced and formated to the correct data type. The new dataset is stored as G23_preprocessed_10052023_1626.xlsx![0m
