#### Train run

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the dataset
dataset = pd.read_csv("/content/50combined.csv")  # Update with your dataset path

from sklearn.impute import KNNImputer

# Define the columns you want to impute
columns_to_impute = ['Sample_50', 'Sample_45', 'Sample_46', 'Sample_47', 'Sample_48', 'Sample_49']  # Add more columns as needed

# Instantiate the KNNImputer
imputer = KNNImputer(n_neighbors=5)

# Fill NaN values column by column, separately within each label group.
# NOTE(review): the imputer only ever sees the single column being filled, so
# there are no other features to measure neighbour distance on -- this looks
# equivalent to a per-label mean fill; confirm that is the intent.
for column in columns_to_impute:
    for label, group in dataset.groupby('Label')[column]:
        nan_mask = group.isnull()
        if not nan_mask.any():
            continue
        if nan_mask.all():
            # KNNImputer drops a column with no observed values, which used to
            # surface as an obscure length-mismatch error on assignment.
            raise ValueError(
                f"Cannot impute column {column!r} for label {label!r}: every value is missing"
            )

        # `group` is already the single column (a Series), so there is no
        # 'Label' column to drop; reshape to the 2-D layout the imputer expects.
        features = group.to_numpy().reshape(-1, 1)
        features_imputed = imputer.fit_transform(features).ravel()

        # Write back only the rows that were actually missing.
        dataset.loc[group.index[nan_mask], column] = features_imputed[nan_mask.to_numpy()]

# Extract features (every column except the first one)
X = dataset.iloc[:, 1:].values

# Extract labels (first column)
y = dataset.iloc[:, 0].values

# Encode labels to integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split BEFORE fitting any scaler so that no statistic of the held-out rows
# leaks into preprocessing. Splitting NumPy arrays (instead of tensors that may
# live on the GPU) also keeps scikit-learn on plain host data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardize, then rescale with MinMaxScaler. Both scalers are fitted on the
# training rows only and merely applied to the test rows.
standard_scaler = StandardScaler()
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(standard_scaler.fit_transform(X_train))
X_test = min_max_scaler.transform(standard_scaler.transform(X_test))

# Convert data to PyTorch tensors (float32 features, int64 labels) and move to the device
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.int64).to(device)
y_test = torch.tensor(y_test, dtype=torch.int64).to(device)

# Define DataLoader for batching the data
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

# Define a CNN-LSTM model (combine convolutional and LSTM layers)
class CNNLSTMClassifier(nn.Module):
    """Conv1d -> max-pool -> LSTM -> two linear layers, producing class logits.

    Args:
        input_channels: Number of input channels of the convolution. ``forward``
            copies each sample this many times along a new channel axis.
        hidden_size: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        num_classes: Size of the output (logit) vector.
        kernel_size: Convolution kernel size. ``padding = kernel_size // 2`` is
            used, which preserves the sequence length for odd kernel sizes.
        dropout: Dropout probability applied to the last LSTM time step.
    """

    def __init__(self, input_channels, hidden_size, num_layers, num_classes, kernel_size=3, dropout=0.2):
        super(CNNLSTMClassifier, self).__init__()
        # Honour the kernel_size argument (it used to be ignored in favour of a
        # hard-coded 3); the default still yields kernel 3 / padding 1.
        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=16,
                               kernel_size=kernel_size, padding=kernel_size // 2)
        self.pool = nn.MaxPool1d(2)
        # batch_first=True is required: forward() feeds (batch, seq, features)
        # and reads the last step with [:, -1, :]. Without it the LSTM treats
        # the batch axis as time and mixes different samples together.
        self.lstm = nn.LSTM(16, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 64)
        self.fc2 = nn.Linear(64, num_classes)
        # Attribute name kept for compatibility; the activation is SELU, not ReLU.
        self.relu = nn.SELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return logits of shape (batch, num_classes) for x of shape (batch, length)."""
        # Add a channel axis and tile the sample across as many channels as the
        # convolution expects (previously a hard-coded 50).
        x = x.unsqueeze(1).repeat(1, self.conv1.in_channels, 1)
        x = self.relu(self.conv1(x))
        x = self.pool(x)
        # (batch, 16, length // 2) -> (batch, length // 2, 16) for the LSTM
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        # Keep only the output of the final time step
        x = self.dropout(x[:, -1, :])
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define an RNN model
class RNNClassifier(nn.Module):
    """Stacked vanilla RNN whose final time step is mapped to class logits.

    Each input row is tiled into a 50-step sequence of identical vectors,
    run through the RNN, and the last step's output goes through dropout,
    GELU and a linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.2):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)
        # The attribute is called "relu" but holds a GELU activation.
        self.relu = nn.GELU()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return one logit vector per row of ``x``."""
        # (batch, features) -> (batch, 50, features)
        tiled = x.unsqueeze(dim=1).repeat(1, 50, 1)
        step_outputs, _last_hidden = self.rnn(tiled)
        final_step = step_outputs[:, -1, :]
        regularized = self.dropout(final_step)
        activated = self.relu(regularized)
        return self.fc(activated)

# Instantiate both models.
# The number of feature columns is passed to both constructors as their first
# argument (Conv1d input channels for the CNN-LSTM, RNN input size for the RNN).
input_channels = X_train.shape[1]
hidden_size_cnn_lstm = 128
hidden_size_rnn = 128
num_layers = 3
num_classes = len(label_encoder.classes_)
cnn_lstm_model = CNNLSTMClassifier(input_channels, hidden_size_cnn_lstm, num_layers, num_classes).to(device)
rnn_model = RNNClassifier(input_channels, hidden_size_rnn, num_layers, num_classes).to(device)

# Define L2 regularization parameter (passed to the optimizers as weight_decay)
weight_decay = 7e-5

# Define loss functions and optimizers for both models with L2 regularization.
# NOTE(review): ignore_index=num_classes names a class id that LabelEncoder
# never produces (ids run 0..num_classes-1), so it should not mask any sample;
# it differs from criterion_rnn only in form -- confirm it is intentional.
criterion_cnn_lstm = nn.CrossEntropyLoss(ignore_index=num_classes)
optimizer_cnn_lstm = optim.RAdam(cnn_lstm_model.parameters(), lr=0.006, weight_decay=weight_decay)
criterion_rnn = nn.CrossEntropyLoss()
optimizer_rnn = optim.RAdam(rnn_model.parameters(), lr=0.001, weight_decay=weight_decay)

# Train both models side by side: every mini-batch is used for one optimizer
# step of the CNN-LSTM and then one optimizer step of the RNN.
num_epochs = 125
# NOTE(review): the two best_loss_* values are initialised but never updated
# or read in this loop -- no checkpointing/early stopping happens here.
best_loss_cnn_lstm = float('inf')
best_loss_rnn = float('inf')

for epoch in range(num_epochs):
    # Enable training behaviour (e.g. dropout) for this epoch
    cnn_lstm_model.train()
    rnn_model.train()
    running_loss_cnn_lstm = 0.0
    running_loss_rnn = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Train CNN-LSTM model
        optimizer_cnn_lstm.zero_grad()
        outputs_cnn_lstm = cnn_lstm_model(inputs)
        loss_cnn_lstm = criterion_cnn_lstm(outputs_cnn_lstm, labels)
        loss_cnn_lstm.backward()
        optimizer_cnn_lstm.step()
        # Weight the batch loss by the batch size so the epoch figure below is
        # a per-sample average even when the last batch is smaller.
        running_loss_cnn_lstm += loss_cnn_lstm.item() * inputs.size(0)

        # Train RNN model
        optimizer_rnn.zero_grad()
        outputs_rnn = rnn_model(inputs)
        loss_rnn = criterion_rnn(outputs_rnn, labels)
        loss_rnn.backward()
        optimizer_rnn.step()
        running_loss_rnn += loss_rnn.item() * inputs.size(0)

    # Average training loss per sample for this epoch
    epoch_loss_cnn_lstm = running_loss_cnn_lstm / len(train_loader.dataset)
    epoch_loss_rnn = running_loss_rnn / len(train_loader.dataset)

    # Print progress
    print(f"Epoch [{epoch+1}/{num_epochs}], CNN-LSTM Loss: {epoch_loss_cnn_lstm:.4f}, RNN Loss: {epoch_loss_rnn:.4f}")

# Evaluate both models on the test set as a soft-voting ensemble: the two
# softmax outputs are averaged and the arg-max of the average is the prediction.
cnn_lstm_model.eval()
rnn_model.eval()
combined_probs = []
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Get probabilities from CNN-LSTM model
        outputs_cnn_lstm = cnn_lstm_model(inputs)
        probs_cnn_lstm = nn.functional.softmax(outputs_cnn_lstm, dim=1)

        # Get probabilities from RNN model
        outputs_rnn = rnn_model(inputs)
        probs_rnn = nn.functional.softmax(outputs_rnn, dim=1)

        # Combine probabilities from both models (unweighted mean)
        combined_probs_batch = (probs_cnn_lstm + probs_rnn) / 2
        combined_probs.append(combined_probs_batch.cpu().numpy())

        # Get predicted labels from combined probabilities
        _, predicted = torch.max(combined_probs_batch, 1)

        # Calculate accuracy
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Ensemble Test Accuracy: {accuracy:.2%}")

# Concatenate the per-batch probability arrays into one (n_samples, n_classes) array
combined_probs = np.concatenate(combined_probs, axis=0)
y_pred_combined = np.argmax(combined_probs, axis=1)
# test_loader was created without shuffling, so the concatenated predictions
# line up with y_test row for row.
y_true_combined = y_test.cpu().numpy()

# Print classification report for ensemble model (labels are the encoded integer ids)
print("Classification Report for Ensemble Model:")
print(classification_report(y_true_combined, y_pred_combined))

#### save pkl files of models, scalers, and encoder

In [None]:
import pickle
from google.colab import files

# Save CNN-LSTM model.
# torch.save is given the whole module object (not just a state_dict), so the
# file is a pickle that can only be loaded where the CNNLSTMClassifier class
# definition is available.
torch.save(cnn_lstm_model, 'cnn_lstm_model_14.pkl')
files.download('cnn_lstm_model_14.pkl')

# Save RNN model (same whole-module format; needs RNNClassifier at load time)
torch.save(rnn_model, 'rnn_model_14.pkl')
files.download('rnn_model_14.pkl')

In [None]:
import joblib
from google.colab import files

# Persist the fitted scalers so inference can apply exactly the same
# standardize -> min-max transformation that was used for training.
joblib.dump(standard_scaler, 'standard_scaler.pkl')
files.download('standard_scaler.pkl')

joblib.dump(min_max_scaler, 'min_max_scaler.pkl')
files.download('min_max_scaler.pkl')

In [None]:
import joblib
# Imported here as well so the cell does not depend on another cell having
# been run first (it previously used `files` without importing it).
from google.colab import files

# Persist the fitted LabelEncoder so predicted class ids can be mapped back to
# the original label values at inference time.
joblib.dump(label_encoder, 'label_encoder.pkl')
files.download('label_encoder.pkl')

#### Test run

In [None]:
import joblib
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np

# Load the trained ensemble members. The files hold whole pickled modules, so
# the RNNClassifier / CNNLSTMClassifier class definitions must already be
# defined in this session.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects whole-module pickles; pass weights_only=False there (trusted
# files only).
rnn_model = torch.load('/content/rnn_model_14.pkl', map_location = torch.device('cpu'))
cnn_lstm_model = torch.load('/content/cnn_lstm_model_14.pkl', map_location = torch.device('cpu'))

# Load the label encoder
label_encoder = joblib.load('/content/label_encoder.pkl')

# Load the scalers that were fitted during training. Refitting fresh scalers on
# the evaluation data (as this cell used to do, and with the standardization
# step skipped) feeds the networks inputs on a different scale than they were
# trained on.
# Assumes the scaler files were uploaded to /content next to the model files.
standard_scaler = joblib.load('/content/standard_scaler.pkl')
min_max_scaler = joblib.load('/content/min_max_scaler.pkl')

num_classes = len(label_encoder.classes_)

# Load the CSV file
data = pd.read_csv("/content/combined.csv")

# Separate features and labels ('Label' is the target column)
X = data.drop(columns=['Label'])
y = data['Label']

# Apply the training-time preprocessing: standardize, then min-max scale.
# .values is used because the scalers were fitted on plain arrays.
X_normalized = standard_scaler.transform(X.values)
X_scaled = min_max_scaler.transform(X_normalized)

# Convert the preprocessed data to PyTorch tensor
input_tensor = torch.tensor(X_scaled, dtype=torch.float32)

# Create a DataLoader for batch processing.
# NOTE: keep batch_size = 1 unless the loaded CNN-LSTM was built with
# batch_first=True on its LSTM; otherwise the LSTM treats the batch axis as
# time and a sample's prediction depends on the other samples in its batch.
batch_size = 1
data_loader = DataLoader(TensorDataset(input_tensor), batch_size=batch_size)

# Set models to evaluation mode
rnn_model.eval()
cnn_lstm_model.eval()

# Create an empty list to store predictions (encoded class ids)
predictions = []

# Iterate over the data loader without tracking gradients
with torch.no_grad():
    for batch in data_loader:
        input_data = batch[0]  # TensorDataset yields a one-element list per batch

        # Class probabilities from each model
        probs_rnn = nn.functional.softmax(rnn_model(input_data), dim=1)
        probs_cnn_lstm = nn.functional.softmax(cnn_lstm_model(input_data), dim=1)

        # Combine probabilities from both models (unweighted mean)
        combined_probs_batch = (probs_cnn_lstm + probs_rnn) / 2

        # Get predicted labels from combined probabilities
        _, predicted = torch.max(combined_probs_batch, 1)
        predictions.extend(predicted.tolist())

# Convert predicted labels to original class labels
predicted_labels = label_encoder.inverse_transform(predictions)

# Print accuracy and classification report
print("Accuracy Score : ", accuracy_score(y, predicted_labels))
print(classification_report(y, predicted_labels))

In [None]:
from sklearn.metrics import confusion_matrix

# Fix the label order explicitly. confusion_matrix arranges rows/columns in
# sorted label order, whereas y.unique() returns labels in order of first
# appearance -- using the latter for the tick labels can mislabel the axes.
class_labels = sorted(set(y) | set(predicted_labels))

# Get the confusion matrix (rows = true labels, columns = predicted labels)
cf_matrix = confusion_matrix(y, predicted_labels, labels=class_labels)
print(cf_matrix)

import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 15))  # Set the size of the figure

# Plot the heatmap with the real class labels, in the same order as the matrix.
# fmt='d' prints the integer counts in full instead of scientific notation.
sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

#### scaler pkl usage

In [24]:
import numpy as np
import joblib

# Load the fitted StandardScaler from disk.
# NOTE: joblib files are pickles -- only load files you trust, and preferably
# with the same scikit-learn version that wrote them.
standard_scaler = joblib.load(r'C:\Users\Admin\Downloads\Anusha\AIML\Final Project\real time pipeline\models\standard_scaler.pkl')

# One raw sample as a 2-D array (a single row of feature values), the layout
# that scaler.transform expects.
np_array = np.array([[2808.335703604236,1991.0607623523667,2004.436170537431,2391.572985015412,2615.0798059022795,2911.9638660847804,3193.972472304453,3291.100436414685,3289.975402081362,3161.2214728232675,2946.4649189733577,2684.956938383129,2330.946134830775,2032.8120365001373,1809.930234687338,1718.5524460585343,2003.8111514633624,1778.8042847987304,1720.302499465926,1809.3052156132692,2021.5616931669056,2311.9455549790946,2736.583513901181,3085.7191686758024,3283.6002075258643,3298.600665303507,3104.9697561571093,2791.710196234016,2424.5739921262248,2156.6908169804988,1851.6815088351088,1725.0526444288462,1811.1802728354749,2082.063539536729,2440.4494766075622,2812.3358256782744,3132.47059541612,3261.3495284890287,3289.975402081362,3100.344615009003,2753.4590289010284,2394.57307657094,2037.4371776482435,2238.5683156834625,2512.9516891995,2721.8330637531662,2677.3317056794945,2632.580339976196,2650.330881679739,2763.209326456496]])
print(np_array)

# Apply the already-fitted scaler (transform only -- no refitting)
X_new_scaled = standard_scaler.transform(np_array)

# Bare expression: displayed as the cell output in a notebook
X_new_scaled

[[2808.3357036  1991.06076235 2004.43617054 2391.57298502 2615.0798059
  2911.96386608 3193.9724723  3291.10043641 3289.97540208 3161.22147282
  2946.46491897 2684.95693838 2330.94613483 2032.8120365  1809.93023469
  1718.55244606 2003.81115146 1778.8042848  1720.30249947 1809.30521561
  2021.56169317 2311.94555498 2736.5835139  3085.71916868 3283.60020753
  3298.6006653  3104.96975616 2791.71019623 2424.57399213 2156.69081698
  1851.68150884 1725.05264443 1811.18027284 2082.06353954 2440.44947661
  2812.33582568 3132.47059542 3261.34952849 3289.97540208 3100.34461501
  2753.4590289  2394.57307657 2037.43717765 2238.56831568 2512.9516892
  2721.83306375 2677.33170568 2632.58033998 2650.33088168 2763.20932646]]


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


array([[ 0.63627834, -0.9391976 , -0.91700509, -0.1514977 ,  0.28804801,
         0.86950533,  1.43314642,  1.63216697,  1.63449589,  1.37603956,
         0.94302144,  0.42177054, -0.26949151, -0.84285131, -1.27550831,
        -1.44853995, -0.90187408, -1.33514749, -1.45810778, -1.28080957,
        -0.86624281, -0.30280856,  0.50679354,  1.17742525,  1.5476338 ,
         1.57831979,  1.21313501,  0.61186037, -0.09796614, -0.61802767,
        -1.21124198, -1.46187604, -1.29752336, -0.78017043, -0.09002249,
         0.63781174,  1.26054945,  1.51288611,  1.55859204,  1.18632027,
         0.51037645, -0.19918615, -0.90954094, -0.51303657,  0.01808797,
         0.42817971,  0.33993158,  0.24908887,  0.27535853,  0.49641438]])

### Classical Ensemble (SVC + KNN + RF + XGBOOST)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Load the dataset
dataset = pd.read_csv("/content/50combined.csv")  # Update with your dataset path

# Define the columns you want to impute
columns_to_impute = ['Sample_50', 'Sample_45', 'Sample_46', 'Sample_47', 'Sample_48', 'Sample_49', 'Sample_44']  # Add more columns as needed

# Instantiate the KNNImputer
imputer = KNNImputer(n_neighbors=5)

# Fill NaN values column by column, separately within each label group.
# NOTE(review): the imputer only ever sees the single column being filled, so
# there are no other features to measure neighbour distance on -- this looks
# equivalent to a per-label mean fill; confirm that is the intent.
for column in columns_to_impute:
    for label, group in dataset.groupby('Label')[column]:
        nan_mask = group.isnull()
        if not nan_mask.any():
            continue
        if nan_mask.all():
            # KNNImputer drops a column with no observed values, which used to
            # surface as an obscure length-mismatch error on assignment.
            raise ValueError(
                f"Cannot impute column {column!r} for label {label!r}: every value is missing"
            )

        # `group` is already the single column (a Series), so there is no
        # 'Label' column to drop; reshape to the 2-D layout the imputer expects.
        features = group.to_numpy().reshape(-1, 1)
        features_imputed = imputer.fit_transform(features).ravel()

        # Write back only the rows that were actually missing.
        dataset.loc[group.index[nan_mask], column] = features_imputed[nan_mask.to_numpy()]

# Extract features (every column except the first one)
X = dataset.iloc[:, 1:].values

# Extract labels (first column)
y = dataset.iloc[:, 0].values

# Encode labels to integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets BEFORE fitting any scaler,
# so no statistic of the held-out rows leaks into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize, then rescale with MinMaxScaler. Both scalers are fitted on the
# training rows only and merely applied to the test rows.
scaler = StandardScaler()
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(scaler.fit_transform(X_train))
X_test = min_max_scaler.transform(scaler.transform(X_test))

# Base (level-0) estimators of the stack
base_classifiers = [
    ('xgb', XGBClassifier(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        reg_lambda=0.5
    )),
    ('svm', SVC(
        C=3.0,
        kernel='rbf',
        gamma='scale',
        decision_function_shape = 'ovo'
    )),
    ('knn', KNeighborsClassifier(
        n_neighbors=5,
        weights='distance',
        algorithm='kd_tree'
    )),
    ('rf', RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        bootstrap = True
    ))
]

# Define meta-classifier: a multi-layer perceptron with one hidden layer of 100
# units. random_state is fixed like the other estimators so runs are repeatable.
meta_classifier = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=500, random_state=42)

# Create stacking classifier with cross-validation
stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=meta_classifier, cv=5, verbose=2)

# Train stacking classifier
stacking_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = stacking_classifier.predict(X_test)

from sklearn.metrics import accuracy_score

# Evaluate the predictions
accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy:", accuracy)

from sklearn.metrics import classification_report

# Per-class precision / recall / F1 (labels are the encoded integer ids)
report = classification_report(y_test, predictions)
print("Classification Report:", report)

#### Save model

In [None]:
import joblib
from google.colab import files

# Save the fitted stacking classifier.
# NOTE: only the classifier is stored here; the scalers and label encoder used
# to prepare its inputs/outputs are not part of this file.
joblib.dump(stacking_classifier, 'stacking_classifier_22.pkl')
files.download('stacking_classifier_22.pkl')

#### Prediction on unseen data

In [None]:
import joblib
from sklearn.metrics import classification_report, accuracy_score

# Load the stacking classifier
stacking_classifier = joblib.load('/content/stacking_classifier_22.pkl')

# Load the test data
test_data = pd.read_csv("/content/combined.csv")  # Update with the path to your test data file

from sklearn.impute import KNNImputer

# Define the columns you want to impute
columns_to_impute = ['Sample_50']  # Add more columns as needed

# Instantiate the KNNImputer
imputer = KNNImputer(n_neighbors=5)

# Fill NaN values column by column, separately within each label group.
# NOTE(review): grouping by 'Label' uses the ground-truth labels of the data
# being predicted, which would not exist for genuinely unlabeled inputs.
for column in columns_to_impute:
    for label, group in test_data.groupby('Label')[column]:
        nan_mask = group.isnull()
        if not nan_mask.any():
            continue
        if nan_mask.all():
            # KNNImputer drops a column with no observed values, which used to
            # surface as an obscure length-mismatch error on assignment.
            raise ValueError(
                f"Cannot impute column {column!r} for label {label!r}: every value is missing"
            )

        # `group` is already the single column (a Series), so there is no
        # 'Label' column to drop; reshape to the 2-D layout the imputer expects.
        features = group.to_numpy().reshape(-1, 1)
        features_imputed = imputer.fit_transform(features).ravel()

        # Write back only the rows that were actually missing.
        test_data.loc[group.index[nan_mask], column] = features_imputed[nan_mask.to_numpy()]

# Separate features and labels
X_test = test_data.drop(columns=['Label'])
y_test = test_data['Label']

# Apply the scalers that were fitted when the stacking classifier was trained.
# Fitting new scalers on the unseen data (as this cell used to do) rescales it
# with different statistics than the classifier saw during training.
# Like `label_encoder` below, `scaler` and `min_max_scaler` are expected to
# still be in memory from the classical-ensemble training cell.
X_standardized = scaler.transform(X_test.values)
X_scaled = min_max_scaler.transform(X_standardized)

# Make predictions and map the encoded ids back to the original labels
predictions = stacking_classifier.predict(X_scaled)
predictions = label_encoder.inverse_transform(predictions)

# Print classification report and accuracy score
print("Accuracy Score : ", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))