## Preprocessing

In [1]:
import pandas as pd

df = pd.read_parquet('../data/dataset.parquet', engine='pyarrow')

Remove **time_to_failure** and **session_counter** and set **Timestamp** as the index.

In [2]:
df.drop(columns=['session_counter', 'time_to_failure'], inplace=True)
df.set_index('Timestamp', inplace=True)

In [3]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 679045 entries, 2021-06-07 04:14:30.742000 to 2022-05-30 17:50:43.374000
Data columns (total 14 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Flag roping                      679045 non-null  float64
 1   Platform Position [°]            679045 non-null  float64
 2   Platform Motor frequency [HZ]    679045 non-null  float64
 3   Temperature platform drive [°C]  679045 non-null  float64
 4   Temperature slave drive [°C]     679045 non-null  float64
 5   Temperature hoist drive [°C]     679045 non-null  float64
 6   Tensione totale film [%]         679045 non-null  float64
 7   Current speed cart [%]           679045 non-null  float64
 8   Platform motor speed [%]         679045 non-null  float64
 9   Lifting motor speed [RPM]        679045 non-null  float64
 10  Platform rotation speed [RPM]    679045 non-null  float64
 11  Slave rotation sp

Unnamed: 0,Flag roping,Platform Position [°],Platform Motor frequency [HZ],Temperature platform drive [°C],Temperature slave drive [°C],Temperature hoist drive [°C],Tensione totale film [%],Current speed cart [%],Platform motor speed [%],Lifting motor speed [RPM],Platform rotation speed [RPM],Slave rotation speed [M/MIN],Lifting speed rotation [M/MIN],alert_11
count,679045.0,679045.0,679045.0,679045.0,679045.0,679045.0,679045.0,679045.0,679045.0,679045.0,679045.0,679045.0,679045.0,679045.0
mean,30.999361,163.937436,694.531602,23.766837,23.444327,21.303289,165.357471,2.150718,13.383201,116.676833,10.785796,11.759514,0.933471,0.002051
std,0.140758,111.206953,1578.580228,11.435006,9.969298,9.974128,211.047433,12.512806,30.393794,685.563601,25.029436,32.598768,5.634515,0.045246
min,0.0,0.0,0.0,0.0,0.0,0.0,-100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,31.0,62.0,0.0,15.0,16.0,13.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,31.0,165.0,0.0,22.0,22.0,19.0,179.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,31.0,260.0,0.0,32.0,30.0,28.0,201.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,31.0,359.0,5200.0,61.0,55.0,55.0,9900.0,100.0,100.0,5500.0,88.0,184.0,73.0,1.0


## Modelling

In [4]:
import numpy as np

label = np.array(['alert_11'])
features = np.array(df.columns.difference(label))

print(f"-> Label (shape={label.shape}): {label}")
print(f"-> Features (shape={features.shape}): {features}")

-> Label (shape=(1,)): ['alert_11']
-> Features (shape=(13,)): ['Current speed cart [%]' 'Flag roping' 'Lifting motor speed [RPM]'
 'Lifting speed rotation [M/MIN]' 'Platform Motor frequency [HZ]'
 'Platform Position [°]' 'Platform motor speed [%]'
 'Platform rotation speed [RPM]' 'Slave rotation speed [M/MIN]'
 'Temperature hoist drive [°C]' 'Temperature platform drive [°C]'
 'Temperature slave drive [°C]' 'Tensione totale film [%]']


In [5]:
X = df[features]
y = df[label]

print(f"-> X (shape={X.shape})")
print(f"-> y (shape={y.shape})")

-> X (shape=(679045, 13))
-> y (shape=(679045, 1))


In [6]:
X = X.to_numpy()
y = y.to_numpy().flatten()

In [7]:
import numpy as np 

def window(X_data, y_data, width: int, shift: int):
    
    X_wins, y_wins = [], []

    for index, (X, y) in enumerate(zip(X_data, y_data)):
        if (index + width + shift) <= X_data.shape[0]:

            window = slice((index + width), (index + width + shift))

            X_wins.append(X_data[index: index + width])

            y_values_shift = y_data[window]
            y_wins.append(int(np.any(y_values_shift == 1)))

    X_wins = np.array(X_wins)
    y_wins = np.array(y_wins)
    return X_wins.reshape(X_wins.shape[0], -1), y_wins.flatten()

In [8]:
X_wins, y_wins = window(X, y, width=120, shift=180)

## Undersampling

In [9]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)

X_res, y_res = rus.fit_resample(X_wins, y_wins)
np.unique(y_res, return_counts=True)

(array([0, 1]), array([3324, 3324]))

## Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)

## K-Fold Cross Validation

In [11]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_res, y_res, cv=5)
scores.mean()

0.6464838167653898

## Train and Test

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=0)

## Confusion Matrix

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 973    9]
 [   0 1013]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       982
           1       0.99      1.00      1.00      1013

    accuracy                           1.00      1995
   macro avg       1.00      1.00      1.00      1995
weighted avg       1.00      1.00      1.00      1995

