In [5]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split

train = pd.read_csv('data/single_turbine_data/train_reduced_unskewed.csv')
test = pd.read_csv('data/single_turbine_data/test_reduced_unskewed.csv')

label = ['1_Gear oil temperature (°C)']

# convert to datetime
train['# Date and time'] = pd.to_datetime(train['# Date and time'])

# Setting the index
train.set_index('# Date and time', inplace=True)

original_cols = ['1_Wind direction (°)',
       '1_Nacelle position (°)', '1_Power (kW)',
       '1_Front bearing temperature (°C)', '1_Rear bearing temperature (°C)',
       '1_Stator temperature 1 (°C)', '1_Nacelle ambient temperature (°C)',
       '1_Nacelle temperature (°C)', '1_Transformer temperature (°C)',
       '1_Generator bearing rear temperature (°C)',
       '1_Generator bearing front temperature (°C)', '1_Temp. top box (°C)',
       '1_Hub temperature (°C)', '1_Ambient temperature (converter) (°C)',
       '1_Rotor bearing temp (°C)', '1_Transformer cell temperature (°C)', '1_Generator RPM (RPM)']
extras = ['month_sin', 'month_cos', 'hour_sin', 'hour_cos', 
# 'curtailed', 
# 'offline',
]
          
train = train[original_cols + extras + label]

In [6]:
original = train.corr()

In [21]:
# Lag by 1 and 2 steps
lagged1 = train.copy()
lagged1[label] = lagged1[label].shift(1)
lagged1 = lagged1.dropna()
lagged1 = lagged1.corr()
lagged1_corr_with_label = lagged1.loc[:, label]

lagged2 = train.copy()
lagged2[label] = lagged2[label].shift(2)
lagged2 = lagged2.dropna()
lagged2 = lagged2.corr()
lagged2_corr_with_label = lagged2.loc[:, label]

# Lead by 1 and 2 steps
lead1 = train.copy()
lead1[label] = lead1[label].shift(-1)
lead1 = lead1.dropna()
lead1 = lead1.corr()
lead1_corr_with_label = lead1.loc[:, label]

lead2 = train.copy()
lead2[label] = lead2[label].shift(-2)
lead2 = lead2.dropna()
lead2 = lead2.corr()
lead2_corr_with_label = lead2.loc[:, label]

# Original correlation
original = train.corr()
original_corr_with_label = original.loc[:, label]

# Concatenate all correlations into a single DataFrame
all_corr = pd.concat([original_corr_with_label, lagged1_corr_with_label, lagged2_corr_with_label, lead1_corr_with_label, lead2_corr_with_label], axis=1)
all_corr.columns = ['original', 'lagged1', 'lagged2', 'lead1', 'lead2']

all_corr

Unnamed: 0,original,lagged1,lagged2,lead1,lead2
1_Wind direction (°),0.051447,0.050983,0.050627,0.051968,0.052377
1_Nacelle position (°),0.060928,0.06084,0.060814,0.061403,0.061606
1_Power (kW),-0.016071,-0.013225,-0.011306,-0.027089,-0.024792
1_Front bearing temperature (°C),0.685337,0.669449,0.656392,0.677322,0.675946
1_Rear bearing temperature (°C),0.843274,0.822392,0.805133,0.832856,0.828726
1_Stator temperature 1 (°C),0.515677,0.520088,0.518519,0.507861,0.503458
1_Nacelle ambient temperature (°C),0.140482,0.139664,0.139091,0.141038,0.141536
1_Nacelle temperature (°C),0.397021,0.373256,0.358341,0.392175,0.386036
1_Transformer temperature (°C),-0.040964,-0.041766,-0.042168,-0.039198,-0.03661
1_Generator bearing rear temperature (°C),0.569479,0.571595,0.568267,0.563106,0.558347


In [22]:
best_lag_or_lead_dict = {}

for feature in all_corr.index:
    if feature != label:  # skip the label itself
        best_corr_type = all_corr.loc[feature].idxmax()  # Gets the column name with max correlation
        best_corr_value = all_corr.loc[feature, best_corr_type]  # Gets the max correlation value

        # Translate column name to tuple (type, steps)
        if 'lagged' in best_corr_type:
            steps = int(best_corr_type.replace('lagged', ''))
            best_lag_or_lead_dict[feature] = ['lag', steps]
        elif 'lead' in best_corr_type:
            steps = int(best_corr_type.replace('lead', ''))
            best_lag_or_lead_dict[feature] = ['lead', steps]
        else:
            best_lag_or_lead_dict[feature] = ['original', 0]

print(best_lag_or_lead_dict)


{'1_Wind direction (°)': ['lead', 2], '1_Nacelle position (°)': ['lead', 2], '1_Power (kW)': ['lag', 2], '1_Front bearing temperature (°C)': ['original', 0], '1_Rear bearing temperature (°C)': ['original', 0], '1_Stator temperature 1 (°C)': ['lag', 1], '1_Nacelle ambient temperature (°C)': ['lead', 2], '1_Nacelle temperature (°C)': ['original', 0], '1_Transformer temperature (°C)': ['lead', 2], '1_Generator bearing rear temperature (°C)': ['lag', 1], '1_Generator bearing front temperature (°C)': ['original', 0], '1_Temp. top box (°C)': ['lag', 1], '1_Hub temperature (°C)': ['lead', 2], '1_Ambient temperature (converter) (°C)': ['lead', 2], '1_Rotor bearing temp (°C)': ['original', 0], '1_Transformer cell temperature (°C)': ['lead', 2], '1_Generator RPM (RPM)': ['lead', 2], 'month_sin': ['lead', 2], 'month_cos': ['lead', 2], 'hour_sin': ['lag', 2], 'hour_cos': ['lead', 1], '1_Gear oil temperature (°C)': ['original', 0]}
