In [1]:
import pandas as pd
import numpy as np
from scripts.data_loader import load_data


In [2]:
# Load the training tables from the relative 'train/' directory. load_data is a
# project helper (scripts.data_loader); its return value is unpacked into the
# run-level, incoming-run and metrology frames used by the cells below.
run_df, incoming_df, metrology_df = load_data('train/')

In [3]:
# Header renames shared by the two sensor tables (run_df and incoming_df):
# the spaced source headers become single-token CamelCase names.
_SENSOR_TABLE_RENAMES = {
    'Tool ID': 'ToolId',
    'Run Start Time': 'RunStartTime',
    'Run End Time': 'RunEndTime',
    'Run ID': 'RunId',
    'Process Step': 'ProcessStep',
    'Step ID': 'StepId',
    'Time Stamp': 'TimeStamp',
    'Sensor Name': 'SensorName',
    'Sensor Value': 'SensorValue',
}

# 'Consumable Life' is renamed on run_df only.
run_df.rename(
    columns={**_SENSOR_TABLE_RENAMES, 'Consumable Life': 'ConsumableLife'},
    inplace=True,
)
incoming_df.rename(columns=_SENSOR_TABLE_RENAMES, inplace=True)

# X_index, Y_index, X, Y and Measurement keep their names, so only the
# headers that actually change are listed for the metrology table.
metrology_df.rename(
    columns={
        'Run ID': 'RunId',
        'Run Start Time': 'RunStartTime',
        'Run End Time': 'RunEndTime',
        'Point Index': 'PointIndex',
    },
    inplace=True,
)



In [4]:
# Important step: reshape the long metrology table into one row per RunId with
# one column per PointIndex. pivot_table's default aggregation is applied if a
# (RunId, PointIndex) pair occurs more than once.
metrology_pivot = pd.pivot_table(
    metrology_df,
    index='RunId',
    columns='PointIndex',
    values='Measurement',
)

# Replace the point indices with explicit target names: Measurement_<PointIndex>.
metrology_pivot.columns = [f'Measurement_{point}' for point in metrology_pivot.columns]

# Lookup from PointIndex to its (X, Y) coordinates, kept as a starting point
# for possible spatial feature engineering.
coord_map = (
    metrology_df.loc[:, ['PointIndex', 'X', 'Y']]
    .drop_duplicates()
    .set_index('PointIndex')
)

# Disabled: attach each run's start time to the pivoted targets.
# run_start_times = metrology_df[['RunId', 'RunStartTime']].drop_duplicates().set_index('RunId')
# metrology_pivot = metrology_pivot.join(run_start_times)

display(metrology_pivot.head())

# Captured here, while the pivot holds only measurement columns, so later
# cells can separate targets from features by name.
target_columns = metrology_pivot.columns.tolist()


Unnamed: 0_level_0,Measurement_0,Measurement_1,Measurement_2,Measurement_3,Measurement_4,Measurement_5,Measurement_6,Measurement_7,Measurement_8,Measurement_9,...,Measurement_39,Measurement_40,Measurement_41,Measurement_42,Measurement_43,Measurement_44,Measurement_45,Measurement_46,Measurement_47,Measurement_48
RunId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000f424f-667d-54a2-bbbd-9624c4071465,10.095791,10.156764,10.337179,10.05893,10.116625,10.113654,10.183896,10.19013,10.090914,10.150634,...,10.179591,10.067517,10.129809,10.181777,10.050453,10.164333,10.120488,10.281584,10.171318,10.04301
001466b9-ee4c-5642-8e93-0a501ce4e9d9,10.138968,10.096725,10.350078,10.115794,10.203152,10.204871,10.194956,10.116416,10.086049,10.273882,...,10.263841,10.169717,10.197792,10.287032,10.088166,10.17633,10.222062,10.321888,10.192035,10.103446
002d6c65-b86f-5153-a2d8-206e59da6307,10.104243,10.209842,10.371794,10.09951,10.138044,10.12454,10.279279,10.1949,10.115012,10.166862,...,10.189573,10.08677,10.150102,10.214367,10.087978,10.213599,10.135042,10.301648,10.248628,10.080373
002fdc18-a36b-5188-a5e3-9e1d59697a6b,10.079414,10.196047,10.410978,10.030844,10.104684,10.197017,10.241707,10.085138,10.106157,10.165022,...,10.206083,10.107271,10.170462,10.15834,10.052357,10.198981,10.107771,10.314725,10.208511,10.037665
004379ac-3743-5811-bb3b-d1821813b2d2,10.199996,10.216007,10.398699,10.100442,10.21425,10.218284,10.223888,10.219392,10.140936,10.333146,...,10.249334,10.162063,10.262134,10.260722,10.090795,10.248904,10.252442,10.335714,10.229129,10.083445


In [5]:
def create_agg_features(df, group_col='RunId', prefix=''):
    """Summarise long-format sensor readings into one feature row per group.

    The input is first pivoted to one row per (group_col, TimeStamp) with one
    column per SensorName, averaging SensorValue where a sensor has several
    readings at the same timestamp. Each sensor column is then reduced per
    group with mean, std, min, max, median, skew and sum.

    Parameters
    ----------
    df : DataFrame
        Must contain `group_col`, 'TimeStamp', 'SensorName' and 'SensorValue'.
    group_col : str
        Column identifying a group (one output row per distinct value).
    prefix : str
        Prepended to every generated feature name.

    Returns
    -------
    DataFrame
        `group_col` as a regular column, followed by
        '<prefix><sensor>_<stat>' columns and '<prefix>run_duration'
        (latest TimeStamp minus earliest TimeStamp within the group).

    Prints one progress line naming the prefix.
    """
    print(f"aggregated features with prefix: {prefix}")

    # Long -> wide: rows are (group, timestamp), columns are sensors.
    wide = df.pivot_table(
        index=[group_col, 'TimeStamp'],
        columns='SensorName',
        values='SensorValue',
        aggfunc='mean',
    ).reset_index()
    wide.columns.name = None

    key_cols = (group_col, 'TimeStamp')
    sensors = [name for name in wide.columns if name not in key_cols]

    stats = ['mean', 'std', 'min', 'max', 'median', 'skew', 'sum']
    per_group = wide.groupby(group_col)
    summary = per_group.agg({name: stats for name in sensors})

    # Collapse the (sensor, stat) column MultiIndex into flat names.
    summary.columns = [f'{prefix}{sensor}_{stat}' for sensor, stat in summary.columns]

    # NOTE: a per-sensor least-squares slope feature (value against TimeStamp)
    # was prototyped at this point and is currently disabled.

    # Time span covered by each group's readings.
    stamps = per_group['TimeStamp']
    summary[f'{prefix}run_duration'] = stamps.max() - stamps.min()

    return summary.reset_index()


# Per-run aggregates of the process sensor traces and of the incoming-run traces.
features_run = create_agg_features(run_df, prefix='proc_')
features_incoming = create_agg_features(incoming_df, prefix='inc_')

# Drop any aggregated incoming column whose name contains 'ToolId'
# (errors='ignore' keeps this a no-op when nothing matches).
cols_to_drop = [c for c in features_incoming.columns if 'ToolId' in c]
features_incoming = features_incoming.drop(columns=cols_to_drop, errors='ignore')

# Static per-run attributes from run_df (ToolId, ConsumableLife): keep the
# first row seen for each RunId. drop_duplicates(subset=['RunId']) already
# guarantees unique RunIds, so no further de-duplication is needed before
# making RunId the index.
static_features = (
    run_df[['RunId', 'ToolId', 'ConsumableLife']]
    .drop_duplicates(subset=['RunId'])
    .set_index('RunId')
)

# Merge: left-join the aggregated features onto the static attributes by RunId.
final_features = static_features.join(features_run.set_index('RunId'), on='RunId')
if not features_incoming.empty:
    final_features = final_features.join(features_incoming.set_index('RunId'), on='RunId')


# TODO: check if need add categorical feature for toolid
# final_features = pd.get_dummies(final_features, columns=['ToolId'], dummy_na=False)




aggregated features with prefix: proc_
aggregated features with prefix: inc_


## Final processing/merging

In [24]:
from sklearn.model_selection import train_test_split
# Both frames need RunId as an ordinary column for the merge below. The
# RangeIndex checks keep this cell re-runnable: once an index has been reset
# it is not reset a second time.
if not isinstance(metrology_pivot.index, pd.RangeIndex):
    metrology_pivot = metrology_pivot.reset_index()

if not isinstance(final_features.index, pd.RangeIndex):
    final_features = final_features.reset_index()

# Inner join: keep only runs that have both features and metrology targets.
training_data = pd.merge(final_features, metrology_pivot, on='RunId', how='inner')


# Everything that is neither a target nor the run key is a feature. ToolId is
# excluded from the model matrix for now (it is an identifier, not yet encoded).
feature_columns = [col for col in training_data.columns if col not in target_columns and col != 'RunId']
X = training_data[feature_columns].drop('ToolId', axis=1)
y = training_data[target_columns]
run_ids_final = training_data['RunId']


# Convert the timedelta duration features to float seconds (vectorised; missing
# durations become NaN).
for duration_col in ('inc_run_duration', 'proc_run_duration'):
    X[duration_col] = X[duration_col].dt.total_seconds()


# Ordered hold-out: with shuffle=False the last 20% of rows (in training_data
# order) form the validation set. No random_state is passed because it has no
# effect when shuffling is disabled.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)



## Modelling (baseline; TODO: iterate further)

In [29]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


# Hyper-parameters passed to every LGBMRegressor built in the CV loop below.
# NOTE(review): the training objective is L1 while 'metric' and the per-fold
# report use RMSE — confirm the mismatch is intended.
lgb_params = {
    'objective': 'regression_l1', 
    'metric': 'rmse',
    'n_estimators': 100,         
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 31,             
    'verbose': -1,
    'n_jobs': -1,                
    'seed': 42,
    'boosting_type': 'gbdt',
}


# Shuffled K-fold over the training portion only (X_val / y_val are not
# touched in this cell).
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)
# Out-of-fold predictions: same shape as y_train, each row filled by the fold
# in which that row was held out.
oof_preds = np.zeros(y_train.shape)
# models = []


for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    # kf.split yields positional indices, hence .iloc.
    X_train_split, y_train_split = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val_split, y_val_split = X_train.iloc[val_idx], y_train.iloc[val_idx]


    # A fresh model per fold; MultiOutputRegressor wraps the base regressor to
    # handle the multi-column target.
    # NOTE(review): n_jobs=-1 is set both here and in lgb_params — nested
    # parallelism may oversubscribe CPU cores; confirm.
    base_model = lgb.LGBMRegressor(**lgb_params)
    multi_output_model = MultiOutputRegressor(base_model, n_jobs=-1) 

    multi_output_model.fit(X_train_split, y_train_split)

    val_preds = multi_output_model.predict(X_val_split)
    oof_preds[val_idx] = val_preds

    # Single RMSE per fold computed over all target columns together.
    rmse_fold = np.sqrt(mean_squared_error(y_val_split, val_preds))
    print(f"Fold {fold+1} RMSE: {rmse_fold}")

# Fitted fold models are not stored; after the loop, multi_output_model refers
# to the model from the last fold only.




Fold 1 RMSE: 0.042162101816923384
Fold 2 RMSE: 0.0440314224798441
Fold 3 RMSE: 0.04714561281461147
Fold 4 RMSE: 0.05646384104739237
Fold 5 RMSE: 0.045849529438754144


In [36]:
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# Out-of-fold error over all training rows and target columns.
oof_mape = mean_absolute_percentage_error(y_train, oof_preds)
print(oof_mape)

# This is the mean squared error (not its square root), so it is not directly
# comparable with the per-fold RMSE values printed during training.
oof_mse = mean_squared_error(y_train, oof_preds)
print(oof_mse)

0.0027913470485706458
0.0022458924996929526


## Sample submission here

In [41]:
# Read the metrology parquet via a relative path.
# NOTE(review): `df` is not used again within this cell, and this path has no
# 'train/' or 'test/' prefix unlike the other reads — confirm both are intended.
df = pd.read_parquet('metrology_data.parquet')

# Bare expression: the test run data is read only so that it is rendered as
# the cell output; the frame is not bound to a name.
pd.read_parquet('test/run_data.parquet')
# pd.read_parquet('test/incoming_run_data.parquet')

Unnamed: 0,Tool ID,Run Start Time,Run End Time,Run ID,Process Step,Consumable Life,Step ID,Time Stamp,Sensor Name,Sensor Value
0,683e5405-4b93-5160-be90-d3d5b0d9287a,2024-01-01 01:27:55,2024-01-01 01:40:30,dac9d86d-8006-5bcf-a10d-dc575bca05fb,22c8716b-1c14-525a-8c0a-0473ec7aa99e,115.641464,Step_0,2024-01-01 01:27:55.000000000,Sensor_A,0.055712
1,683e5405-4b93-5160-be90-d3d5b0d9287a,2024-01-01 01:27:55,2024-01-01 01:40:30,dac9d86d-8006-5bcf-a10d-dc575bca05fb,22c8716b-1c14-525a-8c0a-0473ec7aa99e,115.641464,Step_0,2024-01-01 01:27:56.000000002,Sensor_A,-0.114435
2,683e5405-4b93-5160-be90-d3d5b0d9287a,2024-01-01 01:27:55,2024-01-01 01:40:30,dac9d86d-8006-5bcf-a10d-dc575bca05fb,22c8716b-1c14-525a-8c0a-0473ec7aa99e,115.641464,Step_0,2024-01-01 01:27:56.999999997,Sensor_A,0.030820
3,683e5405-4b93-5160-be90-d3d5b0d9287a,2024-01-01 01:27:55,2024-01-01 01:40:30,dac9d86d-8006-5bcf-a10d-dc575bca05fb,22c8716b-1c14-525a-8c0a-0473ec7aa99e,115.641464,Step_1,2024-01-01 01:27:58.000000000,Sensor_A,-0.066958
4,683e5405-4b93-5160-be90-d3d5b0d9287a,2024-01-01 01:27:55,2024-01-01 01:40:30,dac9d86d-8006-5bcf-a10d-dc575bca05fb,22c8716b-1c14-525a-8c0a-0473ec7aa99e,115.641464,Step_1,2024-01-01 01:27:59.000000002,Sensor_A,0.011077
...,...,...,...,...,...,...,...,...,...,...
8557225,1b314ddd-198a-5cd5-90ae-933b947d013d,2024-01-02 21:29:55,2024-01-02 21:42:30,e195334c-cfb1-5297-b3e2-7f7a33c48a97,22c8716b-1c14-525a-8c0a-0473ec7aa99e,279.218048,Step_12,2024-01-02 21:41:14.000000002,Sensor_O,690.403748
8557226,1b314ddd-198a-5cd5-90ae-933b947d013d,2024-01-02 21:29:55,2024-01-02 21:42:30,e195334c-cfb1-5297-b3e2-7f7a33c48a97,22c8716b-1c14-525a-8c0a-0473ec7aa99e,279.218048,Step_12,2024-01-02 21:41:14.999999998,Sensor_O,750.969604
8557227,1b314ddd-198a-5cd5-90ae-933b947d013d,2024-01-02 21:29:55,2024-01-02 21:42:30,e195334c-cfb1-5297-b3e2-7f7a33c48a97,22c8716b-1c14-525a-8c0a-0473ec7aa99e,279.218048,Step_12,2024-01-02 21:41:16.000000000,Sensor_O,690.931152
8557228,1b314ddd-198a-5cd5-90ae-933b947d013d,2024-01-02 21:29:55,2024-01-02 21:42:30,e195334c-cfb1-5297-b3e2-7f7a33c48a97,22c8716b-1c14-525a-8c0a-0473ec7aa99e,279.218048,Step_12,2024-01-02 21:41:17.000000002,Sensor_O,746.828857
