In [1]:
from pathlib import Path
from loguru import logger
import pickle
import numpy as np
from nltk.tokenize import WhitespaceTokenizer
from ipynb.fs.defs.Feature_Engineering import add_new_features
from ipynb.fs.defs.Feature_Engineering import preprocess_dataset
from ipynb.fs.full.Prepare_data import prepare_and_add_labels
from ipynb.fs.full.Prepare_data import prepare_text

In [2]:
def main():
    """Classify the prepared dataset with the stored LightGBM model and export it.

    Pipeline:
      1. Load and label the raw data with ``prepare_and_add_labels``.
      2. Derive additional features (``add_new_features``) and preprocess the
         result (``preprocess_dataset``).
      3. Load the pickled model, vectorizer and vocabulary from ``../models``.
      4. Vectorize the "Benennung (dt)" column, append the nine numeric columns
         (center_*, length/width/height, theta_*) and run the prediction.
      5. Store "Ja"/"Nein" in the "Relevant fuer Messung" column and write the
         frame to ``../data/predicted/<ncar>_labeled_test.xlsx``.

    Returns:
        None. The result is written to disk and progress is reported via loguru.
    """
    # Define the path to the folder containing the data (xls files)
    data_path = Path("../data/original_data_new")

    dataframes, ncar = prepare_and_add_labels(data_path, save_as_excel=False, move_to_archive=False)

    # Only the first element is classified.
    # NOTE(review): presumably a list with one dataframe per input file -- confirm.
    df_new_features = add_new_features(dataframes[0])

    # The second return value (a frame intended for plotting) is not needed here.
    df_preprocessed, _ = preprocess_dataset(df_new_features)

    logger.info("Start classifying the given dataset...")

    # NOTE(security): pickle.load executes arbitrary code contained in the file.
    # Only load artifacts that were produced by a trusted training run.
    model_path = "../models/lgbm_03052023_1728.pkl"
    with open(model_path, "rb") as fid:
        lgbm = pickle.load(fid)

    # Load the vectorizer from the file
    vectorizer_path = "../models/vectorizer_03052023_1728.pkl"
    with open(vectorizer_path, "rb") as fid:
        vectorizer = pickle.load(fid)

    # Get the vocabulary of the training data
    vocabulary_path = "../models/vocabulary_03052023_1728.pkl"
    with open(vocabulary_path, "rb") as fid:
        vocabulary = pickle.load(fid)

    # Normalize the text column element-wise; only this column is read, so a
    # row-wise DataFrame.apply is unnecessary.
    df_preprocessed["Benennung (dt)"] = df_preprocessed["Benennung (dt)"].apply(prepare_text)

    # Convert the vocabulary list to a dictionary (word -> column index)
    vocabulary_dict = {word: index for index, word in enumerate(vocabulary)}

    # Set the vocabulary of the vectorizer to the loaded vocabulary
    # NOTE(review): this overrides the fitted state of the unpickled vectorizer;
    # verify that the stored vocabulary order matches the one used in training.
    vectorizer.vocabulary_ = vocabulary_dict
    X_text = vectorizer.transform(df_preprocessed["Benennung (dt)"]).toarray()

    # Combine text features with other features
    numeric_columns = [
        "center_x", "center_y", "center_z",
        "length", "width", "height",
        "theta_x", "theta_y", "theta_z",
    ]
    X = np.concatenate((X_text, df_preprocessed[numeric_columns].values), axis=1)

    y_pred = lgbm.predict(X, num_iteration=lgbm.best_iteration, force_row_wise=True)
    y_pred = np.round(y_pred)

    # y_pred is aligned with the rows of X by position, not by index label.
    # Assign the whole column positionally so the labels stay correct even if
    # the dataframe index is not a contiguous 0..n-1 range.
    df_preprocessed["Relevant fuer Messung"] = np.where(y_pred == 1, "Ja", "Nein")

    output_path = Path("../data/predicted") / f"{ncar}_labeled_test.xlsx"
    # Make sure the target folder exists before writing the Excel file.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df_preprocessed.to_excel(output_path)

    logger.success(f"The prediction is done and the result is stored here: data/predicted/{ncar}_labeled_test.xlsx!")

In [3]:
# Entry point: start the classification pipeline when this file is run directly.
if __name__ == "__main__":
    main()

[32m2023-05-03 17:30:20.105[0m | [1mINFO    [0m | [36mipynb.fs.full.Prepare_data[0m:[36mload_csv_into_df[0m:[36m34[0m - [1mLoading the data...[0m
[32m2023-05-03 17:30:22.891[0m | [32m[1mSUCCESS [0m | [36mipynb.fs.full.Prepare_data[0m:[36mload_csv_into_df[0m:[36m65[0m - [32m[1m1 dataframe(s) were created.[0m
[32m2023-05-03 17:30:22.892[0m | [1mINFO    [0m | [36mipynb.fs.full.Prepare_data[0m:[36mprepare_and_add_labels[0m:[36m104[0m - [1mStart preprocessing the data...[0m
[32m2023-05-03 17:30:22.996[0m | [32m[1mSUCCESS [0m | [36mipynb.fs.full.Prepare_data[0m:[36mprepare_and_add_labels[0m:[36m155[0m - [32m[1mThe features are reduced and formated to the correct data type![0m
[32m2023-05-03 17:30:26.109[0m | [1mINFO    [0m | [36mipynb.fs.defs.Feature_Engineering[0m:[36mpreprocess_dataset[0m:[36m12[0m - [1mStart preprocessing the dataframe with 4585 samples...[0m
[32m2023-05-03 17:30:26.128[0m | [32m[1mSUCCESS [0m | [36mipy