In [None]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Read data
data = pd.read_csv(f'./sample_data/T1.csv', on_bad_lines='skip', sep=None, engine="python")
data.columns

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read data
X, y = data[['LV ActivePower (kW)', 'Wind Speed (m/s)', 'Wind Direction (°)']], data['Theoretical_Power_Curve (KWh)']

# train-test split for model evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True)

# Convert to 2D PyTorch tensors
X_train = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train = torch.tensor(y_train.to_numpy(), dtype=torch.float32).reshape(-1, 1)
X_test = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
y_test = torch.tensor(y_test.to_numpy(), dtype=torch.float32).reshape(-1, 1)

In [None]:
model = nn.Sequential(
    nn.Linear(3, 24),
    nn.ReLU(),
    nn.Linear(24, 12),
    nn.ReLU(),
    nn.Linear(12, 6),
    nn.ReLU(),
    nn.Linear(6, 1)
)


In [None]:
data.shape

In [None]:
# loss function and optimizer
loss_fn = nn.MSELoss()  # mean square error
params = model.parameters()
optimizer = optim.Adam(params, lr=0.001)

n_epochs = 100   # number of epochs to run    #best = 50
batch_size = 64  # size of each batch   # best = 32
batch_start = torch.arange(0, len(X_train), batch_size)

# Hold the best model
best_mse = np.inf   # init to infinity
best_weights = None
history = []
train_history = []

In [None]:
for epoch in range(n_epochs):
    model.train()
    total_loss = 0  # Initialize total loss for averaging later
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=False) as bar:
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            # take a batch
            X_batch = X_train[start:start+batch_size]
            y_batch = y_train[start:start+batch_size]

            #set gradients to zero
            optimizer.zero_grad()

            # forward pass
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            # backward pass
            loss.backward()
            # update weights
            optimizer.step()
            # print progress
            bar.set_postfix(mse=float(loss))
            total_loss += loss.item() #* len(X_batch)  # Update total loss

    train_history.append(float(loss))

    # evaluate accuracy at end of each epoch
    model.eval()
    y_pred = model(X_test)
    mse = loss_fn(y_pred, y_test)
    mse = float(mse)
    history.append(mse)
    if mse < best_mse:
        best_mse = mse
        best_weights = copy.deepcopy(model.state_dict())

print('best:', best_mse)

In [None]:
model.state_dict()
# torch.save(model.state_dict(), '.')

In [None]:
# restore model and return best accuracy
model.load_state_dict(best_weights)
print("MSE: %.2f" % best_mse)
print("RMSE: %.2f" % np.sqrt(best_mse))
plt.plot(history, 'b')
# plt.plot(train_history, 'r')
plt.show()

In [2]:
# import csv
import pandas as pd
import matplotlib.pyplot as plt

file = '/T1.csv'
df = pd.read_csv(f'./{file}', on_bad_lines='skip',  sep=None, engine="python")
df

df[df.columns[0]]
df['Theoretical_Power_Curve (KWh)']

0         416.328908
1         519.917511
2         390.900016
3         516.127569
4         491.702972
            ...     
50525    3397.190793
50526    1173.055771
50527    1788.284755
50528    2418.382503
50529    2779.184096
Name: Theoretical_Power_Curve (KWh), Length: 50530, dtype: float64

In [3]:
tpc = df['Theoretical_Power_Curve (KWh)']
lvap = df['LV ActivePower (kW)']
# tpc = tpc[1:1000]
# lvap = lvap[1:1000]
time = df[df.columns[0]]
# time = time[1:1000]


In [4]:
#number of zeros (could be considered as failure point)
(tpc == 0).sum()


# Determine zeros in 'Theoretical_Power_Curve'
is_zero = (tpc <= 100)

# Generate group keys for consecutive zeros
group_keys = is_zero.ne(is_zero.shift()).cumsum()

# Calculate the size of each group of zeros
group_sizes = is_zero.groupby(group_keys).transform('sum')

# Flag the first instance of a sequence of more than 10 zeros
failure = (group_sizes > 50) & (group_sizes.groupby(group_keys).cumcount() == 0)

# Optionally, view the DataFrame to verify the 'Failure' flags
#print(df[['Theoretical_Power_Curve', 'Failure']])


max(group_sizes)

df['Failure'] = (group_sizes > 10) & (group_sizes.groupby(group_keys).cumcount() == 0)

time = df[df.columns[0]]
time.dtype

df['FailOneMonth'] = 0
df['FailThreeMonth'] = 0
df['FailSixMonth'] = 0
df['FailOneYear'] = 0
df
df['DATE'] = pd.to_datetime(df[df.columns[0]], format='mixed')


# Extract features
df['Year'] = df['DATE'].dt.year
df['Month'] = df['DATE'].dt.month
df['Day'] = df['DATE'].dt.day
df['Hour'] = df['DATE'].dt.hour
df['Minute'] = df['DATE'].dt.minute
df

# Add a month to each date
df['Date_Plus_One_Month'] = df['DATE'] + pd.DateOffset(months=1)
df['Date_Plus_Three_Month'] = df['DATE'] + pd.DateOffset(months=3)
df['Date_Plus_Six_Month'] = df['DATE'] + pd.DateOffset(months=6)
df['Date_Plus_One_Year'] = df['DATE'] + pd.DateOffset(months=12)

# Capture failure dates
failure_dates = df['DATE'][df['Failure']].values

# Vectorized approach to identify failure within a month
df['Failure_Within_One_Month'] = [any((failure_dates > start) & (failure_dates <= end)) for start, end in zip(df['DATE'].values, df['Date_Plus_One_Month'].values)]
df['Failure_Within_Three_Month'] = [any((failure_dates > start) & (failure_dates <= end)) for start, end in zip(df['DATE'].values, df['Date_Plus_Three_Month'].values)]
df['Failure_Within_Six_Month'] = [any((failure_dates > start) & (failure_dates <= end)) for start, end in zip(df['DATE'].values, df['Date_Plus_Six_Month'].values)]
df['Failure_Within_One_Year'] = [any((failure_dates > start) & (failure_dates <= end)) for start, end in zip(df['DATE'].values, df['Date_Plus_One_Year'].values)]


In [5]:
df = df.drop(columns=['Date_Plus_One_Month', 'Date_Plus_Three_Month', 'Date_Plus_Six_Month', 'Date_Plus_One_Year', 'FailOneMonth', 'FailThreeMonth', 'FailSixMonth', 'FailOneYear'])

# Create a column that contains the date of the last failure
df['Last_Failure_Date'] = df['DATE'].where(df['Failure']).ffill()

# Calculate the number of days since the last failure
df['Days_Since_Last_Failure'] = (df['DATE'] - df['Last_Failure_Date']).dt.days

# This will fill NaNs for the initial rows before the first failure occurs
df['Days_Since_Last_Failure'] = df['Days_Since_Last_Failure'].fillna(0).astype(int)

df

print(df['DATE'][df['DATE']=='2018-12-31 01:40:00'])
row_0 = df.iloc[50396]
df['Days_Since_Last_Failure'][47000]
df['Next_Failure_Date'] = df['DATE'].where(df['Failure']).bfill()
df


50396   2018-12-31 01:40:00
Name: DATE, dtype: datetime64[ns]


Unnamed: 0,﻿Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°),Failure,DATE,Year,Month,Day,Hour,Minute,Failure_Within_One_Month,Failure_Within_Three_Month,Failure_Within_Six_Month,Failure_Within_One_Year,Last_Failure_Date,Days_Since_Last_Failure,Next_Failure_Date
0,01 01 2018 00:00,380.047791,5.311336,416.328908,259.994904,False,2018-01-01 00:00:00,2018,1,1,0,0,True,True,True,True,NaT,0,2018-04-01 12:40:00
1,01 01 2018 00:10,453.769196,5.672167,519.917511,268.641113,False,2018-01-01 00:10:00,2018,1,1,0,10,True,True,True,True,NaT,0,2018-04-01 12:40:00
2,01 01 2018 00:20,306.376587,5.216037,390.900016,272.564789,False,2018-01-01 00:20:00,2018,1,1,0,20,True,True,True,True,NaT,0,2018-04-01 12:40:00
3,01 01 2018 00:30,419.645905,5.659674,516.127569,271.258087,False,2018-01-01 00:30:00,2018,1,1,0,30,True,True,True,True,NaT,0,2018-04-01 12:40:00
4,01 01 2018 00:40,380.650696,5.577941,491.702972,265.674286,False,2018-01-01 00:40:00,2018,1,1,0,40,True,True,True,True,NaT,0,2018-04-01 12:40:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50525,31 12 2018 23:10,2963.980957,11.404030,3397.190793,80.502724,False,2018-12-31 23:10:00,2018,12,31,23,10,False,False,False,False,2018-12-31 01:40:00,0,NaT
50526,31 12 2018 23:20,1684.353027,7.332648,1173.055771,84.062599,False,2018-12-31 23:20:00,2018,12,31,23,20,False,False,False,False,2018-12-31 01:40:00,0,NaT
50527,31 12 2018 23:30,2201.106934,8.435358,1788.284755,84.742500,False,2018-12-31 23:30:00,2018,12,31,23,30,False,False,False,False,2018-12-31 01:40:00,0,NaT
50528,31 12 2018 23:40,2515.694092,9.421366,2418.382503,84.297913,False,2018-12-31 23:40:00,2018,12,31,23,40,False,False,False,False,2018-12-31 01:40:00,0,NaT


In [6]:
first_fail_idx = min(df['Failure'][df['Failure']].index)
df['Days_Since_Last_Failure'][df.index<first_fail_idx] = (df['DATE'][df.index<first_fail_idx] - df['Next_Failure_Date'][df.index<first_fail_idx]).dt.days



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Days_Since_Last_Failure'][df.index<first_fail_idx] = (df['DATE'][df.index<first_fail_idx] - df['Next_Failure_Date'][df.index<first_fail_idx]).dt.days


In [10]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [11]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # Define the LSTM layer
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

        # Define the output layer
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()

        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Index hidden state of last time step
        out = self.linear(out[:, -1, :])

        # Sigmoid to convert to probability
        out = torch.sigmoid(out)
        return out


In [14]:
data = df[['LV ActivePower (kW)', 'Wind Speed (m/s)',
       'Theoretical_Power_Curve (KWh)', 'Wind Direction (°)',
       'Failure_Within_One_Month', 'Failure_Within_Three_Month',
       'Failure_Within_Six_Month', 'Failure_Within_One_Year',
       'Days_Since_Last_Failure']]

until_june_30 = data[df['DATE'] <= pd.Timestamp(year=2018, month=6, day=30)]
after_june_30 = data[df['DATE'] > pd.Timestamp(year=2018, month=6, day=30)]

X = data[['LV ActivePower (kW)', 'Wind Speed (m/s)',
       'Theoretical_Power_Curve (KWh)', 'Wind Direction (°)',
       'Days_Since_Last_Failure']]

y = data[['Failure_Within_One_Month', 'Failure_Within_Three_Month',
       'Failure_Within_Six_Month', 'Failure_Within_One_Year']]


X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.5, shuffle=False)

X_train = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train = torch.tensor(y_train.to_numpy(), dtype=torch.float32).reshape(-1, 1)
X_val = torch.tensor(X_val.to_numpy(), dtype=torch.float32)
y_val = torch.tensor(y_val.to_numpy(), dtype=torch.float32).reshape(-1, 1)


In [15]:
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

In [21]:
def create_sequences(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)

# Example of using a time step of 10
time_steps = 10
X_train_seq, y_train_seq = create_sequences(X_train, y_train, time_steps)
X_val_seq, y_val_seq = create_sequences(X_val, y_val, time_steps)


In [22]:
train_features = torch.tensor(X_train_seq, dtype=torch.float32)
train_targets = torch.tensor(y_train_seq, dtype=torch.float32)
val_features = torch.tensor(X_val_seq, dtype=torch.float32)
val_targets = torch.tensor(y_val_seq, dtype=torch.float32)

# Create TensorDatasets
train_dataset = TensorDataset(train_features, train_targets)
val_dataset = TensorDataset(val_features, val_targets)

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [18]:
import torch.nn as nn

class SimpleLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(SimpleLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        out = self.linear(lstm_out[:, -1, :])
        return out



In [None]:
# Hyperparameters
input_dim = 5
hidden_dim = 32
num_layers = 2
output_dim = 1
num_epochs = 5
learning_rate = 0.001

# Initialize model
model = SimpleLSTM(input_dim, hidden_dim, output_dim, num_layers)

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()  # For binary classification tasks
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        model.eval()  # Set model to evaluation mode
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                val_loss = criterion(outputs, labels)
                # Evaluate your model's performance on validation data here

        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')



Epoch 1, Loss: 0.6800884008407593, Val Loss: 0.6902151703834534
Epoch 1, Loss: 0.6629761457443237, Val Loss: 0.6821802854537964
Epoch 1, Loss: 0.6557673215866089, Val Loss: 0.6730303764343262
Epoch 1, Loss: 0.6593004465103149, Val Loss: 0.6619611978530884
Epoch 1, Loss: 0.6684970259666443, Val Loss: 0.6513828039169312
Epoch 1, Loss: 0.6523380875587463, Val Loss: 0.6409645080566406
Epoch 1, Loss: 0.6398386359214783, Val Loss: 0.627568781375885
Epoch 1, Loss: 0.6221834421157837, Val Loss: 0.6115245223045349
Epoch 1, Loss: 0.5846136212348938, Val Loss: 0.5974112153053284
Epoch 1, Loss: 0.568810224533081, Val Loss: 0.5798937678337097
Epoch 1, Loss: 0.5789618492126465, Val Loss: 0.5660191774368286
Epoch 1, Loss: 0.5817817449569702, Val Loss: 0.5530396699905396
Epoch 1, Loss: 0.5420508980751038, Val Loss: 0.5395126938819885
Epoch 1, Loss: 0.514768123626709, Val Loss: 0.5252806544303894
Epoch 1, Loss: 0.5190532803535461, Val Loss: 0.5127410292625427
Epoch 1, Loss: 0.5408099889755249, Val Loss