In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re

In [None]:
def prepare_text(designation: str) -> str:
    """Normalize a designation into a clean, upper-case, space-separated string.

    The value is coerced to ``str``, upper-cased, stripped of every character
    that is neither a word character (``\\w``, which includes ``_``) nor
    whitespace, and its tokens are re-joined with exactly one space.

    Args:
        designation: Raw designation text. Non-string values are converted
            with ``str()``. ``None`` and float NaN are treated as missing.

    Returns:
        The normalized designation, or ``""`` when the input is missing or
        contains no tokens.
    """
    # A missing value must not be stringified into the literal token
    # "NONE" / "NAN". `x != x` is true only for NaN, so no extra import is needed.
    if designation is None or (
        isinstance(designation, float) and designation != designation
    ):
        return ""

    # Transform to upper case.
    text = str(designation).upper()

    # Remove punctuation (anything that is not a word character or whitespace).
    text = re.sub(r"[^\w\s]", "", text)

    # Tokenize on ANY whitespace run (spaces, tabs, newlines). split() without
    # an argument also discards empty tokens, so no separate filtering is needed.
    tokens = text.split()

    # Join the tokens back together with single spaces.
    return " ".join(tokens)

In [3]:
def train_test_val(df, holdout_size=0.4, val_share=0.5, random_state=42):
    """Build features and labels from ``df`` and split them into train/val/test.

    The text column ``"Benennung (dt)"`` is cleaned with ``prepare_text`` and
    turned into bag-of-words counts with ``CountVectorizer``. Those counts are
    concatenated column-wise with the nine geometry columns (centre, size and
    rotation). The label column ``"Relevant fuer Messung"`` is mapped from
    ``'Ja'``/``'Nein'`` to ``1``/``0``.

    The input DataFrame is not modified.

    Args:
        df: DataFrame containing ``"Benennung (dt)"``,
            ``"Relevant fuer Messung"`` and the columns ``center_x``,
            ``center_y``, ``center_z``, ``length``, ``width``, ``height``,
            ``theta_x``, ``theta_y``, ``theta_z``.
        holdout_size: ``test_size`` passed to the first split, i.e. the share
            of rows set aside for validation + test together.
        val_share: ``test_size`` passed to the second split, i.e. the share of
            the held-out rows that becomes the validation set.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)``. With the
        default arguments this is a 60 % / 20 % / 20 % split.
    """
    # Clean the text on a derived Series so the caller's DataFrame keeps its
    # original designations.
    designations = df["Benennung (dt)"].map(prepare_text)

    # NOTE(review): the vocabulary is fit on ALL rows before splitting, so
    # tokens that occur only in validation/test rows still get a feature
    # column. Confirm this is intended; the fitted vectorizer is also not
    # returned, so it cannot be reused on new data.
    vectorizer = CountVectorizer()
    X_text = vectorizer.fit_transform(designations).toarray()

    geometry_columns = [
        'center_x', 'center_y', 'center_z',
        'length', 'width', 'height',
        'theta_x', 'theta_y', 'theta_z',
    ]

    # Combine text features with the geometry features (text columns first).
    X = np.concatenate((X_text, df[geometry_columns].values), axis=1)

    # NOTE(review): any label other than 'Ja'/'Nein' becomes NaN here without
    # an error — verify the column only contains these two values.
    y = df['Relevant fuer Messung'].map({'Ja': 1, 'Nein': 0})

    # First split off the hold-out part, then divide it into test and validation.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state
    )
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=val_share, random_state=random_state
    )

    return X_train, y_train, X_val, y_val, X_test, y_test