In [20]:
# Switches selecting which frameworks get GPU setup in this session (truthy = enabled).
TENSORFLOW = 1
GLUON = 1

import os  
if GLUON:
    # Expose only CUDA device 0 to libraries loaded later in this process.
    os.environ['CUDA_VISIBLE_DEVICES']='0'

if TENSORFLOW:
    import tensorflow as tf

    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Restrict TensorFlow to the first physical GPU and back it with a single
            # logical device that has a fixed memory cap.
            tf.config.set_visible_devices(gpus[0], 'GPU')
            tf.config.set_logical_device_configuration(
                gpus[0],
                # NOTE(review): memory_limit is documented by TensorFlow as megabytes — confirm 24000 fits the card.
                [tf.config.LogicalDeviceConfiguration(memory_limit=24000)])
            logical_gpus = tf.config.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # NOTE(review): presumably raised when the GPUs were already initialized
            # (e.g. the cell is re-run); the error is only printed, not re-raised.
            print(e)


In [21]:
import warnings, pandas as pd, requests, mlflow, sys, os, logging, numpy as np
from automlwrapper import AutoMLWrapper, SedarDataLoader
from sedarapi import SedarAPI
from sklearn.model_selection import train_test_split

# Only show messages of level ERROR and above from the 'automlwrapper' logger.
logging.getLogger('automlwrapper').setLevel(logging.ERROR)

# Seed NumPy's global random generator once for the whole notebook.
np.random.seed(1313)

# Search budgets shared by the wrapper functions defined further down.
# NOTE(review): TIME_LIMIT is presumably in seconds (one hour) — confirm against AutoMLWrapper.
TIME_LIMIT = 60 * 60
AUTOKERAS_TRIALS = 20
AUTOKERAS_EPOCHS = 30

In [22]:
# Set to True to point mlflow at the tracking server and experiment below;
# with False, mlflow is left unconfigured by this cell.
USE_MLFLOW = False

MLFLOW_URI = 'http://192.168.220.107:6798'
MLFLOW_EXPID = '1'
if USE_MLFLOW:
    mlflow.set_tracking_uri(MLFLOW_URI) 
    mlflow.set_experiment(experiment_id=MLFLOW_EXPID)

In [23]:
# Address of the SEDAR instance used by this notebook.
SEDAR_URI = 'http://192.168.220.107:5000'

# API client plus a data loader built on top of it.
# NOTE(review): neither object is referenced again in the visible cells — confirm they are still needed.
SEDAR = SedarAPI(SEDAR_URI)
DataLoader = SedarDataLoader(SEDAR)

In [24]:
# Local paths of the measurement CSV and the label CSV read by the data preparation cell.
data_location = './tmp/data/pda_2023-04-18_10-13-22.csv'
label_location = './tmp/data/labels_030723.csv'

## The following cell downloads the data to the above locations

In [25]:
# Download the label file into ./tmp/data, keeping the remote file name (-O);
# --create-dirs creates the target directory when it is missing.
!curl --create-dirs -O --output-dir \
./tmp/data \
https://gitlab.com/mibbels/automlwrapperdata/-/raw/main/tabular-regression/labels_030723.csv 

# Download the measurement CSV into the same directory.
!curl --create-dirs -O --output-dir \
./tmp/data \
https://gitlab.com/mibbels/automlwrapperdata/-/raw/main/tabular-regression/pda_2023-04-18_10-13-22.csv 

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 14098  100 14098    0     0  41526      0 --:--:-- --:--:-- --:--:-- 41464
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 67.7M  100 67.7M    0     0  64.6M      0  0:00:01  0:00:01 --:--:-- 64.7M


# Data preparation taken without changes from
Hoseini, S., et al.: Coatings intelligence: Data-driven automation for chemistry
4.0. In: 2024 IEEE 7th International Conference on Industrial Cyber-Physical Systems (ICPS). pp. 1–8 (2024)

In [26]:
# Read the raw measurements, parse the 'Zeit' timestamp column and order the rows by it.
csv = pd.read_csv(data_location, skiprows=0)
csv = csv.assign(Zeit=pd.to_datetime(csv['Zeit'])).sort_values(by='Zeit')

# Read the labels and drop every row whose 'row' entry is one of the two excluded markers.
labels = pd.read_csv(label_location)
keep_mask = (labels['row'] != 'None') & (labels['row'] != 'Aussortieren')
labels = labels[keep_mask]
print(len(labels))

# Keep only the columns needed downstream and restrict the data to product 304.
df = csv[['Zeit','product_id', 'run_id', 'experiment_id', 'trial_id', 'set_force_begin',
       'x_position', 'y_position', 'z1_position', 'z2_position', 'x_velocity',
       'y_velocity', 'z1_velocity', 'z2_velocity', 'x_force', 'y_force',
       'z_force']]
df = df[df["product_id"] == 304]

# Runs 0-3 each contribute experiments 0-9.
good_experiment_ids = [
    {"run_id": run_id, "experiment_ids": list(range(10))}
    for run_id in range(4)
]

# Columns carried along for every selected trial.
peak_columns = ["run_id", "trial_id", "experiment_id", 'x_position', 'x_force', 'y_force', 'z_force']

# Collect one DataFrame per (run, experiment, trial) that has at least one label row.
# Iteration order (run, then experiment, then trial in order of appearance) determines
# the order of peak_dfs and therefore of every list derived from it.
peak_dfs = []
for item in good_experiment_ids:
    run_rows = df[df["run_id"] == item["run_id"]]
    for experiment_id in item["experiment_ids"]:
        experiment_rows = run_rows[run_rows["experiment_id"] == experiment_id]
        for trial_id in experiment_rows["trial_id"].unique():
            has_label = (
                (labels['trial_id'] == trial_id)
                & (labels['experiment_id'] == experiment_id)
                & (labels['run_id'] == item["run_id"])
            ).any()
            if has_label:
                trial_rows = experiment_rows[experiment_rows["trial_id"] == trial_id]
                peak_dfs.append(trial_rows[peak_columns])

print(len(peak_dfs))

filtered_peak_dfs = []

# Trim every trial to the rows before its x_position maximum, optionally followed by
# the rows from the maximum up to the first non-negative x_force reading.
# NOTE: the loop index `i` is not used inside the loop.
for i, item in enumerate(peak_dfs):
    # Keep samples with x_position above 20.0001 and renumber the rows from 0 so the
    # label-based .loc slices below operate on consecutive integers.
    filtered_df_temp = item[item['x_position'] > 20.0001].reset_index(drop=True)
    
    # Row label of the first occurrence of the maximum x_position.
    peak_row_temp = filtered_df_temp['x_position'].idxmax()
    
    # All rows strictly before the peak row (.loc slices include the end label, hence -1).
    peak_row_data_temp = filtered_df_temp.loc[:peak_row_temp-1]
    
    # The peak row and everything after it.
    filtered_df_temp2 = filtered_df_temp.loc[peak_row_temp:]
    
    condition = filtered_df_temp2['x_force'] >= 0
    
    # idxmax on a boolean Series yields the first True label; when no entry is True it
    # yields the first label, so extracted_rows is then only the peak row.
    extracted_rows = filtered_df_temp2.loc[:condition.idxmax()]
        
    # Append the post-peak rows only if all of them have x_position >= 99.9;
    # otherwise keep the pre-peak part on its own.
    if (extracted_rows['x_position'] >= 99.9).all():
        filtered_peak_dfs.append(pd.concat([peak_row_data_temp, extracted_rows]))
    else:
        filtered_peak_dfs.append(peak_row_data_temp)
print(len(filtered_peak_dfs))

# Common number of rows every trial is padded to.
# NOTE(review): fixed at 519 instead of being derived from the data; the original
# inline remark ("check classification") suggests it mirrors another notebook — confirm.
PAD_LENGTH = 519

# Fail early with a readable message instead of letting np.pad reject a negative pad width.
max_length = max(len(frame) for frame in filtered_peak_dfs)
if max_length > PAD_LENGTH:
    raise ValueError(
        f"Longest trial has {max_length} rows, which exceeds PAD_LENGTH={PAD_LENGTH}"
    )

padded_dataframes = []
for frame in filtered_peak_dfs:
    padding_size = PAD_LENGTH - len(frame)
    # mode='edge' repeats the last row of the trial until PAD_LENGTH rows are reached.
    padded_df = pd.DataFrame(
        np.pad(frame.values, ((0, padding_size), (0, 0)), mode='edge'),
        columns=frame.columns,
    )
    # Store the row position as an explicit 'index' column.
    padded_df['index'] = padded_df.index
    padded_dataframes.append(padded_df)
print(len(padded_dataframes))

# Pair every padded trial with its numeric label; trials without a numeric label are dropped.
lengths = set()
polke_padded_dataframes_with_labels = []
for item in padded_dataframes:
    lengths.add(len(item))

    # Plain scalars (no trailing commas, which would turn them into 1-tuples).
    run_id = item["run_id"].unique()[0]
    trial_id = item["trial_id"].unique()[0]
    experiment_id = item["experiment_id"].unique()[0]

    individual = labels[
        (labels["run_id"] == run_id)
        & (labels["experiment_id"] == experiment_id)
        & (labels["trial_id"] == trial_id)
    ]

    # Read the label as a scalar; calling int() on the one-element Series is deprecated
    # in pandas. If several label rows match, the first one is used.
    row_label = individual['row'].iloc[0]

    # Only string labels consisting of digits are usable targets. Non-string entries
    # (which have no .isnumeric and previously triggered the AttributeError path) and
    # non-numeric strings are skipped.
    if isinstance(row_label, str) and row_label.isnumeric():
        polke_padded_dataframes_with_labels.append((item, int(row_label)))
print(lengths)

398
398
398
398


  row_value = int(indidvidual['row'])


{519}


In [27]:
padded_dataframes_with_labels_combined = polke_padded_dataframes_with_labels

# Features: only the x_force channel, one float32 array of shape (rows, 1) per trial.
# Other channel subsets that were tried: ['x_force', 'y_force', 'z_force'] and ['x_force', 'z_force'].
tensor_X = [
    frame[['x_force']].to_numpy().astype(np.float32)
    for frame, _ in padded_dataframes_with_labels_combined
]
# Targets: the numeric label paired with each trial.
tensor_y = [row_value for _, row_value in padded_dataframes_with_labels_combined]
print(len(tensor_X))
print(len(tensor_y))

# Dedicated generator for the splits so that re-running this cell always reproduces the
# same train/validation/test membership instead of drawing from whatever state the
# global NumPy generator happens to be in. The seed equals the one passed to
# np.random.seed in the configuration cell.
split_rng = np.random.RandomState(1313)

# 80 % training data; the remaining 20 % is divided into 20 % validation and 80 % test.
X_train, X_temp, y_train, y_temp = train_test_split(
    np.array(tensor_X), np.array(tensor_y), test_size=0.2, shuffle=True, random_state=split_rng
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.8, shuffle=True, random_state=split_rng
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, X_val.shape , y_val.shape)

255
255
(204, 519, 1) (41, 519, 1) (204,) (41,) (10, 519, 1) (10,)


In [28]:
def _labeled_frame(features, targets):
    """Reshape `features` to its first two dimensions, wrap it in a DataFrame and add `targets` as column 'label'."""
    frame = pd.DataFrame(features.reshape(features.shape[:2]))
    frame['label'] = targets
    return frame


# One frame per split: numbered feature columns followed by the 'label' column.
train = _labeled_frame(X_train, y_train)
test = _labeled_frame(X_test, y_test)
val = _labeled_frame(X_val, y_val)

In [29]:
# Sanity check: rows = training samples, columns = feature columns plus 'label'.
train.shape

(204, 520)

In [30]:
# Sanity check: rows = validation samples, columns = feature columns plus 'label'.
val.shape

(10, 520)

In [31]:
# Sanity check: rows = test samples, columns = feature columns plus 'label'.
test.shape

(41, 520)

# Optimizing a range of different predictors using AutoGluon


In [None]:
%%time
def wrapper_medium(train_data,val_data, eval_metric):
    """Create an AutoMLWrapper for 'autogluon' and train it for tabular regression.

    Args:
        train_data: Training data handed to ``Train`` as ``train_data``.
        val_data: Validation data handed to ``Train`` as ``validation_data``.
        eval_metric: Metric name passed through in the hyperparameters under 'eval_metric'.

    Returns:
        The AutoMLWrapper instance after ``Train`` has been called on it.
    """
    automl = AutoMLWrapper('autogluon')

    # Search settings: global time budget, AutoGluon preset and the metric to optimize.
    autogluon_settings = dict([
        ('time_limit', TIME_LIMIT),
        ('preset', 'medium_quality'),
        ('eval_metric', eval_metric),
    ])

    automl.Train(
        train_data=train_data,
        validation_data=val_data,
        target_column='label',
        task_type='regression',
        data_type='tabular',
        problem_type='regression',
        hyperparameters=autogluon_settings,
    )
    return automl

In [None]:
%%time
# Train the AutoGluon wrapper on the train/validation frames with mean squared error as metric.
w = wrapper_medium(train, val, 'mean_squared_error')



In [None]:
%%time

# Additional keyword arguments forwarded to Evaluate for the AutoGluon backend.
extra_kwargs_for_auto_gluon = {'auxiliary_metrics' : True, 'detailed_report' : True}

# The target lives in the 'label' column of `test` (the same column name used for
# training); there is no 'Type' column in these frames.
re = w.Evaluate(test, target_column='label', **extra_kwargs_for_auto_gluon)
print(re)

# Optimizing neural networks for tabular regression using AutoKeras


In [None]:
def wrapper_keras(train_data, test_data):
    """Create an AutoMLWrapper for 'autokeras' and train it for tabular regression.

    Args:
        train_data: Training data handed to ``Train`` as ``train_data``.
        test_data: Accepted for call compatibility; not used by this function.

    Returns:
        The AutoMLWrapper instance after ``Train`` has been called on it.
    """
    import keras_tuner

    automl = AutoMLWrapper('autokeras')

    # Track MSE and MAE, bounded by the notebook-wide epoch/trial budgets, and let the
    # tuner minimize the validation MSE.
    search_settings = dict(
        evaluation_metric=['mean_squared_error', 'mean_absolute_error'],
        epochs=AUTOKERAS_EPOCHS,
        num_trials=AUTOKERAS_TRIALS,
        objective_metric=keras_tuner.Objective("val_mean_squared_error", direction="min"),
    )

    automl.Train(
        train_data=train_data,
        target_column='label',
        task_type='regression',
        data_type='tabular',
        problem_type='regression',
        hyperparameters=search_settings,
    )
    return automl

In [None]:
%%time

# Train the AutoKeras wrapper on the training frame.
# NOTE(review): wrapper_keras does not use its second argument, so `test` has no effect here.
w = wrapper_keras(train, test)

In [None]:
%%time

# Evaluate the trained AutoKeras wrapper on the test frame, using 'label' as target.
re = w.Evaluate(test, target_column='label', return_dict = True)
print(re)

# Optimizing a range of different predictors using AutoSklearn

### The kernel needs to be changed from 'automl' to 'AutoSklearn' before running this section

In [14]:
def wrapper_sk(train_data):
    """Create an AutoMLWrapper for 'autosklearn' and train it for tabular regression.

    Args:
        train_data: Training data handed to ``Train`` as ``train_data``.

    Returns:
        The AutoMLWrapper instance after ``Train`` has been called on it.
    """
    automl = AutoMLWrapper('autosklearn')

    # Search settings: global time budget and mean absolute error as evaluation metric.
    search_settings = dict(
        time_limit=TIME_LIMIT,
        evaluation_metric='mean_absolute_error',
    )

    automl.Train(
        train_data=train_data,
        target_column='label',
        task_type='regression',
        data_type='tabular',
        problem_type='regression',
        hyperparameters=search_settings,
    )
    return automl


In [None]:
%%time
# Train the AutoSklearn wrapper on the training frame.
w = wrapper_sk(train)



Perhaps you already have a cluster running?
Hosting the HTTP server on port 33593 instead


In [None]:
%%time

# Evaluate the trained AutoSklearn wrapper on the test frame, requesting a detailed report.
ev = w.Evaluate(test, target_column='label',detailed_report=True)
print(ev)

-------------
# Evaluating the LLM's capabilities for generating regression code

In [32]:
# Reshape each feature array to its first two dimensions (samples, features) so it can
# be passed to scikit-learn estimators. Reshaping an array that is already 2-D is a
# no-op, so this cell can be re-run safely.
# The train/test/val frames already carry their 'label' column from the cell that
# built them, so they are not touched here.
X_train = X_train.reshape(X_train.shape[:2])
X_test = X_test.reshape(X_test.shape[:2])
X_val = X_val.reshape(X_val.shape[:2])

# **Trial 1**: This prompt is an adaptation of the standard SEDAR version that uses numpy arrays instead of dataframes
Generate a Python code snippet for a tabular regression pipeline. The data is contained in numpy array X and y for train, validation, and test respectively. The data is of shape (# samples, 519, 1), i.e., it contains 519 features. Focus on a simple yet effective architecture. Include necessary imports, dataset loading and preprocessing steps, model creating, training and testing. Annotate the code with comments to explain each step.

# OpenOrca's answer is a basic linear regression model with minimal feature engineering

In [38]:
%%time

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

#Scaling:
scaler = StandardScaler()  # Use a scaler to normalize the features (optional)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)               
#Creating a basic model and fitting it to the data:

pipeline = Pipeline(
    [('Scaler', scaler), ('Regressor', LinearRegression())]
) 
# Create a pipeline containing the scaler and linear regression model, add more steps if necessary
# NOTE(review): the pipeline reuses the `scaler` instance fitted above and is fitted on
# X_train_scaled, so the scaler is re-fitted on already standardized data and every
# input passes through scaling twice. Left as generated, since this code is the answer
# under evaluation.
model = pipeline.fit(X_train_scaled, y_train) # Fit the model to the training data
#Evaluation:

y_pred = model.predict(X_test_scaled) # Predict on the test set using the trained model
mse = mean_squared_error(y_test, y_pred)
# Calculate MSE for the test set
print("MSE:", mse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)


MSE: 8826.39593681153
MAE: 63.87525493342702
CPU times: user 349 ms, sys: 1.58 s, total: 1.93 s
Wall time: 44 ms


# **Trial 2**: Even though the answer did not mention that the model could be greatly improved, we asked for improvement proposals and their implementation

Prompt 2:

The performance of this model is not good enough. Give me enhancement proposals and show how to implement them.

Answer 2:

Here are some enhancement proposals and their implementations to improve the model's performance:

    Feature Selection: Remove irrelevant or redundant features from your dataset. This can help in reducing overfitting and improving generalization capabilities of the model.

    Regularization: Adding regularization techniques like Lasso, Ridge Regression, or Elastic Net to your LinearRegression model could improve its performance by preventing overfitting and promoting sparse models.

    Ensemble Methods: Combine multiple models (e.g., Bagging, Random Forest, Gradient Boosting) to create an ensemble model that can outperform individual models in terms of accuracy and stability.

    Hyperparameter Tuning: Optimize the hyperparameters of your model using techniques like Grid Search or Random Search to find the best combination for your dataset. This could lead to a significant improvement in performance.

    Model Selection: Experiment with different types of models, such as Decision Trees, Neural Networks, or Support Vector Machines, and compare their performances on your dataset to choose the most suitable one.

Here's an example implementation for each proposal:

    Feature Selection

    ######################### answer ended here


Prompt 3:

Implement those changes in the code

Answer 3:

Here is how you can implement feature selection:

```python
from sklearn.feature_selection import SelectKBest, chi2
selector = SelectKBest(score_func=chi2, k=5)  # Choose top 5 features based on Chi-squared statistics
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)
```
    Regularization: Lasso Regression
######################### answer ended here

## The chi-squared score function is not appropriate for this data because it contains negative values, so it had to be replaced


In [37]:
%%time

from sklearn.feature_selection import SelectKBest, chi2, f_regression


#Scaling:
scaler = StandardScaler()  # Use a scaler to normalize the features (optional)


X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)               
#Creating a model and fitting it to the data:

# Choose top 5 features based on Chi-squared statistics
##selector = SelectKBest(score_func=chi2, k=5)  

# EDITED, chi squared not working
# The active selector keeps the 10 best features scored with f_regression
# (the generated answer above used chi2 with k=5).
selector = SelectKBest(score_func=f_regression, k=10)  

X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

pipeline = Pipeline(
    [('Scaler', scaler), ('Regressor', LinearRegression())]
) 
# Create a pipeline containing the scaler and linear regression model
# NOTE(review): the pipeline reuses the `scaler` instance fitted above, so fitting it
# here re-fits that scaler on the selected, already standardized columns and the data
# is scaled a second time. Left unchanged because this code is the answer under evaluation.
model = pipeline.fit(X_train_selected, y_train) # Fit the model to the training data
#Evaluation:

y_pred = model.predict(X_test_selected) # Predict on the test set using the trained model
mse = mean_squared_error(y_test, y_pred)
# Calculate MSE for the test set
print("MSE:", mse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE:", mae)

MSE: 4439.81570798467
MAE: 53.39850932795827
CPU times: user 55.5 ms, sys: 621 ms, total: 676 ms
Wall time: 16.3 ms
