In [52]:
import pandas as pd
import numpy as np
from scripts.data_loader import load_data


In [53]:
run_df, incoming_df, metrology_df = load_data('train/')

In [54]:
run_df.rename(columns={'Tool ID': 'ToolId',
                       'Run Start Time': 'RunStartTime',
                       'Run End Time': 'RunEndTime',
                       'Run ID': 'RunId',
                       'Process Step': 'ProcessStep',
                       'Consumable Life': 'ConsumableLife',
                       'Step ID': 'StepId',
                       'Time Stamp': 'TimeStamp',
                       'Sensor Name': 'SensorName',
                       'Sensor Value': 'SensorValue'}, inplace=True)

incoming_df.rename(columns={'Tool ID': 'ToolId', 
                             'Run Start Time': 'RunStartTime',
                             'Run End Time': 'RunEndTime',
                             'Run ID': 'RunId',
                             'Process Step': 'ProcessStep',
                             'Step ID': 'StepId',
                             'Time Stamp': 'TimeStamp',
                             'Sensor Name': 'SensorName',
                             'Sensor Value': 'SensorValue'}, inplace=True)

metrology_df.rename(columns={'Run ID': 'RunId',
                           'Run Start Time': 'RunStartTime', 
                           'Run End Time': 'RunEndTime',   
                           'X_index': 'X_index',
                           'Y_index': 'Y_index',
                           'X': 'X',
                           'Y': 'Y',
                           'Point Index': 'PointIndex',
                           'Measurement': 'Measurement'}, inplace=True)



In [55]:
# impt step!!
metrology_pivot = metrology_df.pivot_table(index='RunId',
                                           columns='PointIndex',
                                           values='Measurement')


metrology_pivot.columns = [f'Measurement_{i}' for i in metrology_pivot.columns]

# Spatial feature engineering??
coord_map = metrology_df[['PointIndex', 'X', 'Y']].drop_duplicates().set_index('PointIndex')

# run_start_times = metrology_df[['RunId', 'RunStartTime']].drop_duplicates().set_index('RunId')
# metrology_pivot = metrology_pivot.join(run_start_times)


display(metrology_pivot.head())

target_columns = list(metrology_pivot.columns)


Unnamed: 0_level_0,Measurement_0,Measurement_1,Measurement_2,Measurement_3,Measurement_4,Measurement_5,Measurement_6,Measurement_7,Measurement_8,Measurement_9,...,Measurement_39,Measurement_40,Measurement_41,Measurement_42,Measurement_43,Measurement_44,Measurement_45,Measurement_46,Measurement_47,Measurement_48
RunId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000f424f-667d-54a2-bbbd-9624c4071465,10.095791,10.156764,10.337179,10.05893,10.116625,10.113654,10.183896,10.19013,10.090914,10.150634,...,10.179591,10.067517,10.129809,10.181777,10.050453,10.164333,10.120488,10.281584,10.171318,10.04301
001466b9-ee4c-5642-8e93-0a501ce4e9d9,10.138968,10.096725,10.350078,10.115794,10.203152,10.204871,10.194956,10.116416,10.086049,10.273882,...,10.263841,10.169717,10.197792,10.287032,10.088166,10.17633,10.222062,10.321888,10.192035,10.103446
002d6c65-b86f-5153-a2d8-206e59da6307,10.104243,10.209842,10.371794,10.09951,10.138044,10.12454,10.279279,10.1949,10.115012,10.166862,...,10.189573,10.08677,10.150102,10.214367,10.087978,10.213599,10.135042,10.301648,10.248628,10.080373
002fdc18-a36b-5188-a5e3-9e1d59697a6b,10.079414,10.196047,10.410978,10.030844,10.104684,10.197017,10.241707,10.085138,10.106157,10.165022,...,10.206083,10.107271,10.170462,10.15834,10.052357,10.198981,10.107771,10.314725,10.208511,10.037665
004379ac-3743-5811-bb3b-d1821813b2d2,10.199996,10.216007,10.398699,10.100442,10.21425,10.218284,10.223888,10.219392,10.140936,10.333146,...,10.249334,10.162063,10.262134,10.260722,10.090795,10.248904,10.252442,10.335714,10.229129,10.083445


In [56]:
def create_agg_features(df, group_col='RunId', prefix=''):

    print(f"aggregated features with prefix: {prefix}")

    # Pivot sensor data first: RunId, TimeStamp, Sensor1, Sensor2, ...
    pivot_df = df.pivot_table(index=[group_col, 'TimeStamp'],
                              columns='SensorName',
                              values='SensorValue',
                              aggfunc='mean') 
    pivot_df = pivot_df.reset_index()
    pivot_df.columns.name = None 


    sensor_cols = [col for col in pivot_df.columns if col not in [group_col, 'TimeStamp']]


    agg_funcs = ['mean', 'std', 'min', 'max', 'median', 'skew', 'sum']


    agg_dict = {col: agg_funcs for col in sensor_cols}
    aggregated_features = pivot_df.groupby(group_col).agg(agg_dict)

    # flatten multi-index columns
    aggregated_features.columns = [f'{prefix}{col[0]}_{col[1]}' for col in aggregated_features.columns]


    # def get_slope(series):
    #      # Check for NaNs or single point data
    #     if series.isnull().all() or len(series.dropna()) < 2:
    #          return 0.0
         
    #     y = series.values
         
    #     x = pivot_df.loc[series.index, 'TimeStamp'].values 
    #     not_nan_mask = ~np.isnan(y)
    #     y = y[not_nan_mask]
    #     x = x[not_nan_mask]
    #     if len(y) < 2:
    #         return 0.0
    #     X = np.vstack([x, np.ones(len(x))]).T
    #     try:
    #         slope, _ = np.linalg.lstsq(X, y, rcond=None)[0]
    #         return slope
    #     except np.linalg.LinAlgError:
    #         return 0.0 # Or some other indicator

    # print(f"Calculating slope features for prefix: {prefix}")
    
    # # Group by RunId first before applying lambda to avoid redundant pivoting
    # grouped_pivot = pivot_df.set_index(group_col) # Set RunId as index for grouping
    # slope_features = grouped_pivot.groupby(level=0)[sensor_cols].apply(lambda g: g.apply(get_slope))
    # slope_features.columns = [f'{prefix}{col}_slope' for col in slope_features.columns]
    # aggregated_features = aggregated_features.join(slope_features, on=group_col)

    # --- Add Time Duration Feature ---
    run_duration = pivot_df.groupby(group_col)['TimeStamp'].max() - pivot_df.groupby(group_col)['TimeStamp'].min()
    aggregated_features[f'{prefix}run_duration'] = run_duration


    return aggregated_features.reset_index() 


features_run = create_agg_features(run_df, prefix='proc_')


features_incoming = pd.DataFrame() 

features_incoming = create_agg_features(incoming_df, prefix='inc_')

cols_to_drop = [c for c in features_incoming.columns if 'ToolId' in c] 
features_incoming = features_incoming.drop(columns=cols_to_drop, errors='ignore')

# Start with static features from run_df (ConsumableLife, ToolId, Recipe)
static_features = run_df[['RunId', 'ToolId', 'ConsumableLife']].drop_duplicates(subset=['RunId'])
if static_features['RunId'].duplicated().any():
     static_features = static_features.groupby('RunId').first() 
else:
     static_features = static_features.set_index('RunId')


# merge
final_features = static_features.join(features_run.set_index('RunId'), on='RunId')
if not features_incoming.empty:
    final_features = final_features.join(features_incoming.set_index('RunId'), on='RunId')


# TODO: check if need add categorical feature for toolid
# final_features = pd.get_dummies(final_features, columns=['ToolId'], dummy_na=False)




aggregated features with prefix: proc_
aggregated features with prefix: inc_


## Final processing/merging

In [57]:
from sklearn.model_selection import train_test_split
if not isinstance(metrology_pivot.index, pd.RangeIndex):
     metrology_pivot = metrology_pivot.reset_index()

if not isinstance(final_features.index, pd.RangeIndex):
     final_features = final_features.reset_index()

training_data = pd.merge(final_features, metrology_pivot, on='RunId', how='inner')


feature_columns = [col for col in training_data.columns if col not in target_columns and col != 'RunId']
X = training_data[feature_columns].drop('ToolId', axis=1)
y = training_data[target_columns]
run_ids_final = training_data['RunId'] 


X['inc_run_duration'] = X['inc_run_duration'].apply(lambda x: x.total_seconds())
X['proc_run_duration'] = X['proc_run_duration'].apply(lambda x: x.total_seconds())


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)



## TODO: Modelling goes here

In [58]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error


lgb_params = {
    'objective': 'regression_l1', 
    'metric': 'rmse',
    'n_estimators': 100,         
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 31,             
    'verbose': -1,
    'n_jobs': -1,                
    'seed': 42,
    'boosting_type': 'gbdt',
}


NFOLDS = 3 # increase later on for real training
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)
oof_preds = np.zeros(y_train.shape)
# models = []


for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    X_train_split, y_train_split = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val_split, y_val_split = X_train.iloc[val_idx], y_train.iloc[val_idx]


    base_model = lgb.LGBMRegressor(**lgb_params)
    multi_output_model = MultiOutputRegressor(base_model, n_jobs=-1) 

    multi_output_model.fit(X_train_split, y_train_split)

    val_preds = multi_output_model.predict(X_val_split)
    oof_preds[val_idx] = val_preds

    rmse_fold = np.sqrt(mean_squared_error(y_val_split, val_preds))
    print(f"Fold {fold+1} RMSE: {rmse_fold}")

Fold 1 RMSE: 0.04178426681068764
Fold 2 RMSE: 0.04505495940470875
Fold 3 RMSE: 0.04466575306005808
Fold 4 RMSE: 0.0509290839642193
Fold 5 RMSE: 0.04341250621972655


In [59]:
print(np.sqrt(mean_squared_error(y_train, oof_preds)))

0.045274346369201024


## Sample submission here

In [60]:
df = pd.read_parquet('metrology_data.parquet')

test_run_data = pd.read_parquet('test/run_data.parquet')
test_incoming_data = pd.read_parquet('test/incoming_run_data.parquet')



test_run_data.rename(columns={'Tool ID': 'ToolId',
                       'Run Start Time': 'RunStartTime',
                       'Run End Time': 'RunEndTime',
                       'Run ID': 'RunId',
                       'Process Step': 'ProcessStep',
                       'Consumable Life': 'ConsumableLife',
                       'Step ID': 'StepId',
                       'Time Stamp': 'TimeStamp',
                       'Sensor Name': 'SensorName',
                       'Sensor Value': 'SensorValue'}, inplace=True)

test_incoming_data.rename(columns={'Tool ID': 'ToolId', 
                             'Run Start Time': 'RunStartTime',
                             'Run End Time': 'RunEndTime',
                             'Run ID': 'RunId',
                             'Process Step': 'ProcessStep',
                             'Step ID': 'StepId',
                             'Time Stamp': 'TimeStamp',
                             'Sensor Name': 'SensorName',
                             'Sensor Value': 'SensorValue'}, inplace=True)


In [61]:
features_run = create_agg_features(test_run_data, prefix='proc_')
features_incoming = create_agg_features(test_incoming_data, prefix='inc_')


static_features = test_run_data[['RunId', 'ToolId', 'ConsumableLife']].drop_duplicates(subset=['RunId'])
if static_features['RunId'].duplicated().any():
     static_features = static_features.groupby('RunId').first() 
else:
     static_features = static_features.set_index('RunId')


# merge
final_test_features = static_features.join(features_run.set_index('RunId'), on='RunId')

final_test_features = final_test_features.join(features_incoming.set_index('RunId'), on='RunId')


aggregated features with prefix: proc_
aggregated features with prefix: inc_


In [62]:

final_test_features = final_test_features.reset_index()

feature_columns = [col for col in final_test_features.columns if col != 'RunId']
X_test = final_test_features[feature_columns].drop('ToolId', axis=1)


X_test['inc_run_duration'] = X_test['inc_run_duration'].apply(lambda x: x.total_seconds())
X_test['proc_run_duration'] = X_test['proc_run_duration'].apply(lambda x: x.total_seconds())


## Inference data should be in this format

In [63]:
display(X_test)

Unnamed: 0,ConsumableLife,proc_Sensor_A_mean,proc_Sensor_A_std,proc_Sensor_A_min,proc_Sensor_A_max,proc_Sensor_A_median,proc_Sensor_A_skew,proc_Sensor_A_sum,proc_Sensor_B_mean,proc_Sensor_B_std,...,inc_Sensor_8_skew,inc_Sensor_8_sum,inc_Sensor_9_mean,inc_Sensor_9_std,inc_Sensor_9_min,inc_Sensor_9_max,inc_Sensor_9_median,inc_Sensor_9_skew,inc_Sensor_9_sum,inc_run_duration
0,115.641464,0.003904,0.057714,-0.127762,0.119095,0.004508,-0.140023,2.646686,359.911865,9995.263595,...,-0.105423,49082.292969,108.245247,94.246468,-3.052058,209.287170,127.383598,-0.111523,49468.078125,456.000000
1,351.287750,0.000440,0.058004,-0.125030,0.121876,-0.000970,0.011642,0.292860,8.637903,810.201341,...,0.122489,41759.886719,94.637772,92.466835,-2.991898,205.638855,91.067322,0.117985,41451.343750,436.999999
2,93.483879,-0.002633,0.057128,-0.133910,0.109139,-0.004423,-0.023315,-1.579871,62.498302,2139.571727,...,-0.127889,49318.019531,108.746887,93.457443,-3.003605,208.144714,131.696899,-0.133575,51763.519531,475.000000
3,215.626404,0.002636,0.056736,-0.115691,0.114624,0.001467,-0.021877,1.618248,-35.206646,1190.712844,...,0.014318,49184.980469,98.020378,91.029045,-3.104290,201.455063,98.026901,0.009400,49500.289062,504.000000
4,158.428711,-0.000266,0.057775,-0.120110,0.117478,-0.003783,0.064066,-0.185718,-55.992004,624.158940,...,-0.049839,47801.468750,100.580437,90.001648,-3.038958,199.748672,98.747223,-0.055819,45663.519531,453.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
855,396.446198,-0.003801,0.058594,-0.099911,0.099507,-0.009370,0.129697,-2.771024,-12.796025,536.128392,...,0.096665,47498.101562,92.510719,89.691216,-2.981722,201.445480,88.232796,0.142304,45237.742188,487.999999
856,157.337097,0.000113,0.058669,-0.099700,0.099857,-0.001850,0.026395,0.077686,0.941636,555.343886,...,-0.012357,45245.656250,99.664261,91.237831,-2.940707,204.752747,98.017349,0.031160,45945.222656,460.000000
857,389.172363,0.000942,0.058282,-0.099753,0.098921,0.004241,-0.050438,0.649977,-29.435598,1470.625752,...,0.078399,49701.761719,95.795799,92.091469,-3.207752,206.481293,92.322365,0.122825,49909.613281,520.000000
858,223.075592,-0.001155,0.059520,-0.099956,0.099158,-0.004389,0.060049,-0.715989,-705.293457,16020.869457,...,0.100839,45459.726562,93.577591,90.882271,-2.731769,204.683578,87.288605,0.144534,44449.355469,474.000000


In [159]:
out = multi_output_model.predict(X_test)

sample_predictions = pd.DataFrame(out).unstack().reset_index(name='Measurement').rename(columns={'level_0': 'Point Index', 'level_1': 'Run ID'})
sample_predictions['Point Index'] = sample_predictions['Point Index'].apply(lambda x: int(x))

sample_data = pd.read_parquet('metrology_data.parquet').drop(columns=['Measurement'])
sample_predictions['Run ID'] = sample_data['Run ID']

In [168]:

SUBMISSION_DF = pd.merge(
    left=sample_data, 
    right=sample_predictions,
    # how='right',
    left_on=['Run ID', 'Point Index'],
    right_on=['Run ID', 'Point Index'],
)

In [169]:
SUBMISSION_DF

Unnamed: 0,Run ID,Run Start Time,Run End Time,X_index,Y_index,X,Y,Point Index,Measurement
0,03aa7486-bf62-5d59-b844-5f2d4a4528c4,2024-01-02 16:31:00,2024-01-02 16:43:35,25,5,3.061224,-119.387755,0,10.110769
1,03aa7486-bf62-5d59-b844-5f2d4a4528c4,2024-01-02 16:31:00,2024-01-02 16:43:35,25,5,3.061224,-119.387755,0,10.072334
2,03aa7486-bf62-5d59-b844-5f2d4a4528c4,2024-01-02 16:31:00,2024-01-02 16:43:35,25,5,3.061224,-119.387755,0,10.096593
3,03aa7486-bf62-5d59-b844-5f2d4a4528c4,2024-01-02 16:31:00,2024-01-02 16:43:35,25,5,3.061224,-119.387755,0,10.095969
4,03aa7486-bf62-5d59-b844-5f2d4a4528c4,2024-01-02 16:31:00,2024-01-02 16:43:35,25,5,3.061224,-119.387755,0,10.069511
...,...,...,...,...,...,...,...,...,...
42135,ef918e33-410b-5687-bb0c-ca086e995572,2024-01-04 01:24:50,2024-01-04 01:37:25,1,29,-143.877551,27.551020,48,10.234395
42136,ef918e33-410b-5687-bb0c-ca086e995572,2024-01-04 01:24:50,2024-01-04 01:37:25,1,29,-143.877551,27.551020,48,10.053408
42137,ef918e33-410b-5687-bb0c-ca086e995572,2024-01-04 01:24:50,2024-01-04 01:37:25,1,29,-143.877551,27.551020,48,10.050037
42138,ef918e33-410b-5687-bb0c-ca086e995572,2024-01-04 01:24:50,2024-01-04 01:37:25,1,29,-143.877551,27.551020,48,10.124713
