In [3]:
import pandas as pd
df = pd.read_csv('49_updated.csv')
df = df.drop('TIME', axis=1)

In [4]:
df.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,o,group_status
0,107.136536,100.0,134.133804,2.854385,576.762146,99.946281,44.625851,46.853638,104.717056,106.523125,107.030479,131.141632,1.689493,1,1
1,105.134583,100.0,160.002411,3.367386,510.683624,107.427765,54.109188,-0.69275,108.977722,97.88475,98.32534,135.435562,1.835044,1,1
2,158.74939,100.0,177.491074,4.161911,489.616302,111.045418,51.477051,-0.69275,112.985031,127.458191,127.96553,138.692169,2.040076,1,1
3,218.557755,100.0,230.25087,6.420364,500.774811,120.365662,55.490112,-0.585938,118.745735,120.074837,120.488739,158.166794,2.533844,1,1
4,207.734665,100.0,236.822556,6.05751,497.256439,124.775787,56.008911,-0.585938,125.489731,119.607536,120.061485,156.594208,2.461201,1,1


In [None]:
!pip install -U huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.20.3
    Uninstalling huggingface-hub-0.20.3:
      Successfully uninstalled huggingface-hub-0.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llava 1.1.3 requires pydantic<2,>=1, but you have pydantic 2.9.2 which is incompatible.
videollava 1.0.0 requires gradio==3.37.0, but you have gradio 3.35.2 which is incompatible.
videollava 1.0.0 requires gradio-client==0.7.0, but you have gradio-client 0.2.9 which is incompat

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Load and preprocess the data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Transformer model with an embedding layer to adjust input dimensions
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, hidden_dim, n_layers):
        super(TransformerClassifier, self).__init__()
        # Embedding layer to adjust input dimension
        self.embedding = nn.Linear(input_dim, emb_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=hidden_dim)
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(emb_dim, 1)
    
    def forward(self, x):
        x = self.embedding(x)  # Adjust to emb_dim
        x = self.transformer(x)
        x = x.mean(dim=1)
        x = self.fc(x)
        return torch.sigmoid(x)

# Model parameters
input_dim = len(features)
emb_dim = 16  # Choose an embedding dimension divisible by n_heads
n_heads = 4
hidden_dim = 64
n_layers = 2
model = TransformerClassifier(input_dim=input_dim, emb_dim=emb_dim, n_heads=n_heads, hidden_dim=hidden_dim, n_layers=n_layers)

# Training setup
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            predictions.extend((outputs.squeeze() > 0.5).int().numpy())
            targets.extend(labels.numpy())
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Train and evaluate
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)




Epoch 1/10, Loss: 0.2854
Epoch 2/10, Loss: 0.2432
Epoch 3/10, Loss: 0.2346
Epoch 4/10, Loss: 0.2294
Epoch 5/10, Loss: 0.2238
Epoch 6/10, Loss: 0.2224
Epoch 7/10, Loss: 0.2190
Epoch 8/10, Loss: 0.2176
Epoch 9/10, Loss: 0.2171
Epoch 10/10, Loss: 0.2150
Test Accuracy: 0.8936, F1 Score: 0.9348


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Transformer model with an embedding layer to adjust input dimensions
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, emb_dim, n_heads, hidden_dim, n_layers):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Linear(input_dim, emb_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, dim_feedforward=hidden_dim)
        self.transformer = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(emb_dim, 1)
    
    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = x.mean(dim=1)
        x = self.fc(x)
        return torch.sigmoid(x)

# Model parameters
input_dim = len(features)
emb_dim = 16
n_heads = 4
hidden_dim = 64
n_layers = 2
model = TransformerClassifier(input_dim=input_dim, emb_dim=emb_dim, n_heads=n_heads, hidden_dim=hidden_dim, n_layers=n_layers)

# Training setup
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            predictions.extend((outputs.squeeze() > 0.5).int().numpy())
            targets.extend(labels.numpy())
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Train and save the model
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)
torch.save(model.state_dict(), 'transformer_classifier.pth')
print("Model saved as transformer_classifier.pth")

# Function to load and predict on a new dataset
def predict_new_data(file_path, model, scaler):
    # Load and preprocess new data
    new_data = pd.read_csv(file_path)
    new_data[features] = scaler.transform(new_data[features])  # Normalize using the same scaler
    
    # Prepare sequences for prediction
    new_sequences = []
    for i in range(len(new_data) - sequence_length):
        seq = new_data[features].iloc[i:i+sequence_length].values
        new_sequences.append(seq)
    new_sequences = np.array(new_sequences)

    # Prepare DataLoader for new data
    new_dataset = TimeSeriesDataset(new_sequences, np.zeros(len(new_sequences)))  # Use zeros as dummy labels
    new_loader = DataLoader(new_dataset, batch_size=32, shuffle=False)
    
    # Predict
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for sequences, _ in new_loader:
            outputs = model(sequences)
            predictions = (outputs.squeeze() > 0.5).int().numpy()
            all_predictions.extend(predictions)
    
    return all_predictions

# Load the saved model
model = TransformerClassifier(input_dim=input_dim, emb_dim=emb_dim, n_heads=n_heads, hidden_dim=hidden_dim, n_layers=n_layers)
model.load_state_dict(torch.load('transformer_classifier.pth'))
print("Model loaded for prediction.")








Epoch 1/10, Loss: 0.2748
Epoch 2/10, Loss: 0.2438
Epoch 3/10, Loss: 0.2341
Epoch 4/10, Loss: 0.2287
Epoch 5/10, Loss: 0.2235
Epoch 6/10, Loss: 0.2214
Epoch 7/10, Loss: 0.2173
Epoch 8/10, Loss: 0.2167
Epoch 9/10, Loss: 0.2154
Epoch 10/10, Loss: 0.2131
Test Accuracy: 0.7652, F1 Score: 0.8661
Model saved as transformer_classifier.pth
Model loaded for prediction.


  model.load_state_dict(torch.load('transformer_classifier.pth'))


In [3]:
# Predict on a new CSV file
new_predictions = predict_new_data('46.csv', model, scaler)
print("Predictions on new data:", new_predictions)

Predictions on new data: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# CNN model for time series classification
class CNNClassifier(nn.Module):
    def __init__(self, input_dim, num_classes=1):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=input_dim, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.fc1 = nn.Linear(64 * (sequence_length // 2), 64)
        self.fc2 = nn.Linear(64, num_classes)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = x.permute(0, 2, 1)  # Reshape to (batch_size, input_dim, sequence_length) for Conv1d
        x = torch.relu(self.conv1(x))
        x = self.pool(torch.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return self.sigmoid(x)

# Model parameters
input_dim = len(features)
model = CNNClassifier(input_dim=input_dim)

# Training setup
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences).squeeze()
            predictions.extend((outputs > 0.5).int().numpy())
            targets.extend(labels.numpy())
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Train and save the model
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)
torch.save(model.state_dict(), 'cnn_classifier.pth')
print("Model saved as cnn_classifier.pth")

# Function to load and predict on a new dataset
def predict_new_data(file_path, model, scaler):
    # Load and preprocess new data
    new_data = pd.read_csv(file_path)
    new_data[features] = scaler.transform(new_data[features])  # Normalize using the same scaler
    
    # Prepare sequences for prediction
    new_sequences = []
    for i in range(len(new_data) - sequence_length):
        seq = new_data[features].iloc[i:i+sequence_length].values
        new_sequences.append(seq)
    new_sequences = np.array(new_sequences)

    # Prepare DataLoader for new data
    new_dataset = TimeSeriesDataset(new_sequences, np.zeros(len(new_sequences)))  # Use zeros as dummy labels
    new_loader = DataLoader(new_dataset, batch_size=32, shuffle=False)
    
    # Predict
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for sequences, _ in new_loader:
            outputs = model(sequences).squeeze()
            predictions = (outputs > 0.5).int().numpy()
            all_predictions.extend(predictions)
    
    return all_predictions

# Load the saved model
model = CNNClassifier(input_dim=input_dim)
model.load_state_dict(torch.load('cnn_classifier.pth'))
print("Model loaded for prediction.")

# Predict on a new CSV file
new_predictions = predict_new_data('46.csv', model, scaler)
print("Predictions on new data:", new_predictions)


Epoch 1/10, Loss: 0.2689
Epoch 2/10, Loss: 0.2183
Epoch 3/10, Loss: 0.2092
Epoch 4/10, Loss: 0.2066
Epoch 5/10, Loss: 0.2036
Epoch 6/10, Loss: 0.1995
Epoch 7/10, Loss: 0.1954
Epoch 8/10, Loss: 0.1934
Epoch 9/10, Loss: 0.1884
Epoch 10/10, Loss: 0.1853
Test Accuracy: 0.9366, F1 Score: 0.9602
Model saved as cnn_classifier.pth
Model loaded for prediction.
Predictions on new data: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers, output_dim=1):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        lstm_out, (hn, cn) = self.lstm(x)
        x = self.fc(hn[-1])
        x = self.sigmoid(x)
        return x

    # Add predict method for permutation importance
    def predict(self, X):
        self.eval()
        with torch.no_grad():
            X = torch.tensor(X, dtype=torch.float32)
            return (self(X).squeeze() > 0.5).int().numpy()

# Model parameters
input_dim = len(features)
hidden_dim = 64
n_layers = 2
model = LSTMClassifier(input_dim=input_dim, hidden_dim=hidden_dim, n_layers=n_layers)

# Training setup
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            predictions.extend((outputs.squeeze() > 0.5).int().numpy())
            targets.extend(labels.numpy())
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)

# Define a scikit-learn compatible wrapper for the model
# Define a scikit-learn compatible wrapper for the model
class SklearnModelWrapper:
    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        # This method doesn't need to do anything, as PyTorch models are trained directly
        pass

    def predict(self, X):
        # Set the model to evaluation mode before prediction
        self.model.eval()
        with torch.no_grad():
            X = torch.tensor(X, dtype=torch.float32)
            return (self.model(X).squeeze() > 0.5).int().numpy()

    def score(self, X, y):
        predictions = self.predict(X)
        return accuracy_score(y, predictions)

# Wrap the PyTorch model with the wrapper class
sklearn_model = SklearnModelWrapper(model)

# Compute feature importance using Permutation Feature Importance
def get_feature_importance(model, test_data, test_labels):
    # Flatten the 3D test_data to 2D
    n_samples, sequence_length, n_features = test_data.shape
    test_data_2d = test_data.reshape(n_samples, sequence_length * n_features)
    
    # Permutation importance on the test data
    result = permutation_importance(
        model, test_data_2d, test_labels, n_repeats=10, random_state=42
    )
    return result.importances_mean

# Compute feature importance
importance = get_feature_importance(sklearn_model, test_sequences, test_labels)

# Visualize feature importance
plt.figure(figsize=(10, 6))
plt.barh(features, importance)
plt.title('Permutation Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()



Epoch 1/10, Loss: 0.2847
Epoch 2/10, Loss: 0.2255
Epoch 3/10, Loss: 0.2102
Epoch 4/10, Loss: 0.2065
Epoch 5/10, Loss: 0.2030
Epoch 6/10, Loss: 0.2003
Epoch 7/10, Loss: 0.1985
Epoch 8/10, Loss: 0.1978
Epoch 9/10, Loss: 0.1938
Epoch 10/10, Loss: 0.1928
Test Accuracy: 0.9375, F1 Score: 0.9604


TypeError: Singleton array array(0, dtype=int32) cannot be considered a valid collection.

In [22]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# LSTM model for sequence classification
class RNNClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers):
        super(RNNClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
    
    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        x = hn[-1]
        x = self.fc(x)
        return torch.sigmoid(x)

# Model parameters
input_dim = len(features)
hidden_dim = 64
n_layers = 2
model = RNNClassifier(input_dim=input_dim, hidden_dim=hidden_dim, n_layers=n_layers)

# Training setup
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            predictions.extend((outputs.squeeze() > 0.5).int().numpy())
            targets.extend(labels.numpy())
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Train and save the model
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)
torch.save(model.state_dict(), 'rnn_classifier.pth')
print("Model saved as rnn_classifier.pth")

# Function to load and predict on a new dataset
def predict_new_data(file_path, model, scaler):
    # Load and preprocess new data
    new_data = pd.read_csv(file_path)
    new_data[features] = scaler.transform(new_data[features])  # Normalize using the same scaler
    
    # Prepare sequences for prediction
    new_sequences = []
    for i in range(len(new_data) - sequence_length):
        seq = new_data[features].iloc[i:i+sequence_length].values
        new_sequences.append(seq)
    new_sequences = np.array(new_sequences)

    # Prepare DataLoader for new data
    new_dataset = TimeSeriesDataset(new_sequences, np.zeros(len(new_sequences)))  # Use zeros as dummy labels
    new_loader = DataLoader(new_dataset, batch_size=32, shuffle=False)
    
    # Predict
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for sequences, _ in new_loader:
            outputs = model(sequences)
            predictions = (outputs.squeeze() > 0.5).int().numpy()
            all_predictions.extend(predictions)
    
    return all_predictions

# Load the saved model
model = RNNClassifier(input_dim=input_dim, hidden_dim=hidden_dim, n_layers=n_layers)
model.load_state_dict(torch.load('rnn_classifier.pth'))
print("Model loaded for prediction.")

# Predict on a new CSV file
new_predictions = predict_new_data('46.csv', model, scaler)
print("Predictions on new data:", new_predictions)

Epoch 1/10, Loss: 0.2717
Epoch 2/10, Loss: 0.2184
Epoch 3/10, Loss: 0.2104
Epoch 4/10, Loss: 0.2063
Epoch 5/10, Loss: 0.2030
Epoch 6/10, Loss: 0.2016
Epoch 7/10, Loss: 0.1988
Epoch 8/10, Loss: 0.1961
Epoch 9/10, Loss: 0.1942
Epoch 10/10, Loss: 0.1922
Test Accuracy: 0.9294, F1 Score: 0.9553
Model saved as rnn_classifier.pth
Model loaded for prediction.
Predictions on new data: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]


In [23]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, features):
    sequences = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        sequences.append(seq)
    return np.array(sequences)

sequences = create_sequences(data, sequence_length, features)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx]

train_dataset = TimeSeriesDataset(train_sequences)
test_dataset = TimeSeriesDataset(test_sequences)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Autoencoder model with RNN (LSTM-based)
class RNNEncoderDecoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers):
        super(RNNEncoderDecoder, self).__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, input_dim, num_layers=n_layers, batch_first=True)
    
    def forward(self, x):
        _, (hn, _) = self.encoder(x)
        decoder_input = hn[-1].unsqueeze(1).repeat(1, x.size(1), 1)  # Repeat last hidden state for decoder input
        decoded, _ = self.decoder(decoder_input)
        return decoded

# Model parameters
input_dim = len(features)
hidden_dim = 64
n_layers = 2
model = RNNEncoderDecoder(input_dim=input_dim, hidden_dim=hidden_dim, n_layers=n_layers)

# Training setup
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, sequences)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation for reconstruction error threshold
def calculate_reconstruction_error(model, data_loader):
    model.eval()
    reconstruction_errors = []
    with torch.no_grad():
        for sequences in data_loader:
            outputs = model(sequences)
            loss = criterion(outputs, sequences)
            reconstruction_errors.append(loss.item())
    return np.mean(reconstruction_errors), np.std(reconstruction_errors)

# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=10)

# Calculate threshold based on training reconstruction error
mean_error, std_error = calculate_reconstruction_error(model, train_loader)
threshold = mean_error + 3 * std_error  # Set threshold to mean + 3*std deviation
print(f"Reconstruction Error Threshold: {threshold:.4f}")

# Anomaly detection function based on reconstruction error
def detect_anomalies(model, data_loader, threshold):
    model.eval()
    anomalies = []
    with torch.no_grad():
        for sequences in data_loader:
            outputs = model(sequences)
            for i, seq in enumerate(sequences):
                reconstruction_error = mean_squared_error(seq.numpy(), outputs[i].numpy())
                if reconstruction_error > threshold:
                    anomalies.append(1)  # Anomaly
                else:
                    anomalies.append(0)  # Normal
    return anomalies

# Test for anomalies in the test set
anomalies = detect_anomalies(model, test_loader, threshold)
print("Detected anomalies:", anomalies)

# Save the model
torch.save(model.state_dict(), 'rnn_autoencoder.pth')
print("Model saved as rnn_autoencoder.pth")

# Function to load and predict anomalies in new data
def predict_anomalies(file_path, model, scaler, threshold):
    # Load and preprocess new data
    new_data = pd.read_csv(file_path)
    new_data[features] = scaler.transform(new_data[features])  # Normalize using the same scaler
    
    # Prepare sequences for anomaly detection
    new_sequences = create_sequences(new_data, sequence_length, features)

    # Prepare DataLoader for new data
    new_dataset = TimeSeriesDataset(new_sequences)
    new_loader = DataLoader(new_dataset, batch_size=32, shuffle=False)
    
    # Detect anomalies in new data
    anomalies = detect_anomalies(model, new_loader, threshold)
    return anomalies

# Load the saved model
model = RNNEncoderDecoder(input_dim=input_dim, hidden_dim=hidden_dim, n_layers=n_layers)
model.load_state_dict(torch.load('rnn_autoencoder.pth'))
print("Model loaded for anomaly detection.")


Epoch 1/10, Loss: 0.4053
Epoch 2/10, Loss: 0.3440
Epoch 3/10, Loss: 0.3376
Epoch 4/10, Loss: 0.3342
Epoch 5/10, Loss: 0.3321
Epoch 6/10, Loss: 0.3294
Epoch 7/10, Loss: 0.3276
Epoch 8/10, Loss: 0.3253
Epoch 9/10, Loss: 0.3233
Epoch 10/10, Loss: 0.3215
Reconstruction Error Threshold: 1.1783
Detected anomalies: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,

In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import joblib

# Load and prepare original data (used for training the model)
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Define sequence length
sequence_length = 10

# Function to create sequences and labels
def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values.flatten()  # Flatten the sequence
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

# Prepare data
sequences, labels = create_sequences(data, sequence_length, target)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(sequences, labels, test_size=0.2, random_state=42)

# Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save model and scaler for later use
joblib.dump(model, 'random_forest_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Function to predict anomalies in a new dataset
def predict_anomalies_new_data(file_path, model_path, scaler_path, sequence_length=10):
    # Load model and scaler
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)
    
    # Load new data
    new_data = pd.read_csv(file_path)
    
    # Normalize features using the same scaler
    new_data[features] = scaler.transform(new_data[features])
    
    # Prepare sequences
    new_sequences = []
    for i in range(len(new_data) - sequence_length):
        seq = new_data[features].iloc[i:i+sequence_length].values.flatten()
        new_sequences.append(seq)
    new_sequences = np.array(new_sequences)
    
    # Predict anomalies
    predictions = model.predict(new_sequences)
    return predictions

# Example usage with new data file
new_data_predictions = predict_anomalies_new_data('46.csv', 'random_forest_model.pkl', 'scaler.pkl')
print("Predicted Anomalies:", new_data_predictions)


Predicted Anomalies: [1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1]


In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# CNN-RNN Hybrid Model
class CNNRNNHybrid(nn.Module):
    def __init__(self, input_dim, cnn_out_channels, rnn_hidden_dim, output_dim=1):
        super(CNNRNNHybrid, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=input_dim, out_channels=cnn_out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        self.rnn = nn.LSTM(input_size=cnn_out_channels, hidden_size=rnn_hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = x.permute(0, 2, 1)  # For CNN layer: batch x channels x sequence length
        x = self.cnn(x)
        x = x.permute(0, 2, 1)  # Back to batch x sequence length x channels
        _, (h_n, _) = self.rnn(x)
        x = self.fc(h_n[-1])  # Use last hidden state
        return self.sigmoid(x)

# Model parameters
input_dim = len(features)
cnn_out_channels = 16
rnn_hidden_dim = 32
output_dim = 1

model = CNNRNNHybrid(input_dim=input_dim, cnn_out_channels=cnn_out_channels, rnn_hidden_dim=rnn_hidden_dim, output_dim=output_dim)

# Training setup
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            predictions.extend((outputs.squeeze() > 0.5).int().numpy())
            targets.extend(labels.numpy())
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Train and save the model
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)
torch.save(model.state_dict(), 'cnn_rnn_hybrid_model.pth')
print("Model saved as cnn_rnn_hybrid_model.pth")

# Prediction function for new dataset
def predict_new_data(file_path, model, scaler, sequence_length=10):
    # Load and preprocess new data
    new_data = pd.read_csv(file_path)
    new_data[features] = scaler.transform(new_data[features])  # Normalize using the same scaler
    
    # Prepare sequences for prediction
    new_sequences = []
    for i in range(len(new_data) - sequence_length):
        seq = new_data[features].iloc[i:i+sequence_length].values
        new_sequences.append(seq)
    new_sequences = np.array(new_sequences)

    # Prepare DataLoader for new data
    new_dataset = TimeSeriesDataset(new_sequences, np.zeros(len(new_sequences)))  # Use zeros as dummy labels
    new_loader = DataLoader(new_dataset, batch_size=32, shuffle=False)
    
    # Predict
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for sequences, _ in new_loader:
            outputs = model(sequences).squeeze()
            predictions = (outputs > 0.5).int().numpy()
            all_predictions.extend(predictions)
    
    return all_predictions

# Load the saved model if needed (for standalone execution)
model = CNNRNNHybrid(input_dim=input_dim, cnn_out_channels=cnn_out_channels, rnn_hidden_dim=rnn_hidden_dim, output_dim=output_dim)
model.load_state_dict(torch.load('cnn_rnn_hybrid_model.pth'))
print("Model loaded for prediction.")

# Example of using the predict_new_data function with another dataset
predictions = predict_new_data('18.csv', model, scaler)
print("Predictions for new data:", predictions)


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 1/10, Loss: 0.3103
Epoch 2/10, Loss: 0.2411
Epoch 3/10, Loss: 0.2228
Epoch 4/10, Loss: 0.2164
Epoch 5/10, Loss: 0.2101
Epoch 6/10, Loss: 0.2057
Epoch 7/10, Loss: 0.2038
Epoch 8/10, Loss: 0.2001
Epoch 9/10, Loss: 0.1987
Epoch 10/10, Loss: 0.1975
Test Accuracy: 0.7693, F1 Score: 0.8691
Model saved as cnn_rnn_hybrid_model.pth
Model loaded for prediction.


  model.load_state_dict(torch.load('cnn_rnn_hybrid_model.pth'))


Predictions for new data: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [33]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# RNN + Transformer Hybrid Model
class RNNTransformerHybrid(nn.Module):
    def __init__(self, input_dim, rnn_hidden_dim, transformer_dim, n_heads, n_layers, output_dim=1):
        super(RNNTransformerHybrid, self).__init__()
        
        # RNN layer (LSTM)
        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=rnn_hidden_dim, num_layers=1, batch_first=True)
        
        # Fully connected layer to project RNN output to transformer dimension
        self.fc_rnn_to_transformer = nn.Linear(rnn_hidden_dim, transformer_dim)
        
        # Transformer Encoder
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=transformer_dim, nhead=n_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=n_layers)
        
        # Fully connected layer to map to output
        self.fc = nn.Linear(transformer_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # RNN Layer
        rnn_out, (hn, cn) = self.rnn(x)
        
        # Project RNN output to the transformer dimension
        rnn_out_mapped = self.fc_rnn_to_transformer(rnn_out)
        
        # Pass mapped RNN output to transformer
        transformer_out = self.transformer_encoder(rnn_out_mapped)
        
        # Use the last hidden state of transformer output
        out = transformer_out.mean(dim=1)
        
        # Final output layer
        out = self.fc(out)
        out = self.sigmoid(out)
        
        return out

# Model parameters
input_dim = len(features)
rnn_hidden_dim = 32
transformer_dim = 64
n_heads = 4
n_layers = 2
output_dim = 1

model = RNNTransformerHybrid(input_dim=input_dim, rnn_hidden_dim=rnn_hidden_dim, 
                             transformer_dim=transformer_dim, n_heads=n_heads, 
                             n_layers=n_layers, output_dim=output_dim)

# Training setup
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            outputs = model(sequences)
            predictions.extend((outputs.squeeze() > 0.5).int().numpy())
            targets.extend(labels.numpy())
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Train and save the model
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)
torch.save(model.state_dict(), 'rnn_transformer_hybrid_model.pth')
print("Model saved as rnn_transformer_hybrid_model.pth")

# Prediction function for new dataset
def predict_new_data(file_path, model, scaler, sequence_length=10):
    # Load and preprocess new data
    new_data = pd.read_csv(file_path)
    new_data[features] = scaler.transform(new_data[features])  # Normalize using the same scaler
    
    # Prepare sequences for prediction
    new_sequences = []
    for i in range(len(new_data) - sequence_length):
        seq = new_data[features].iloc[i:i+sequence_length].values
        new_sequences.append(seq)
    new_sequences = np.array(new_sequences)

    # Prepare DataLoader for new data
    new_dataset = TimeSeriesDataset(new_sequences, np.zeros(len(new_sequences)))  # Use zeros as dummy labels
    new_loader = DataLoader(new_dataset, batch_size=32, shuffle=False)
    
    # Predict
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for sequences, _ in new_loader:
            outputs = model(sequences).squeeze()
            predictions = (outputs > 0.5).int().numpy()
            all_predictions.extend(predictions)
    
    return all_predictions

# Load the saved model if needed (for standalone execution)
model = RNNTransformerHybrid(input_dim=input_dim, rnn_hidden_dim=rnn_hidden_dim, 
                             transformer_dim=transformer_dim, n_heads=n_heads, 
                             n_layers=n_layers, output_dim=output_dim)
model.load_state_dict(torch.load('rnn_transformer_hybrid_model.pth'))
print("Model loaded for prediction.")

# Example of using the predict_new_data function with another dataset
predictions = predict_new_data('46.csv', model, scaler)
print("Predictions for new data:", predictions)




Epoch 1/10, Loss: 0.2881
Epoch 2/10, Loss: 0.2546
Epoch 3/10, Loss: 0.2450
Epoch 4/10, Loss: 0.2272
Epoch 5/10, Loss: 0.2214
Epoch 6/10, Loss: 0.2167
Epoch 7/10, Loss: 0.2126
Epoch 8/10, Loss: 0.2102
Epoch 9/10, Loss: 0.2076
Epoch 10/10, Loss: 0.2049
Test Accuracy: 0.9456, F1 Score: 0.9655
Model saved as rnn_transformer_hybrid_model.pth
Model loaded for prediction.
Predictions for new data: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]




In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import torch.optim as optim

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)  # Increased batch size for efficiency
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Optimized RNN + Transformer Hybrid Model
class RNNTransformerHybrid(nn.Module):
    def __init__(self, input_dim, rnn_hidden_dim, transformer_dim, n_heads, n_layers, output_dim=1, dropout=0.3):
        super(RNNTransformerHybrid, self).__init__()
        
        # RNN layer (LSTM)
        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=rnn_hidden_dim, num_layers=1, batch_first=True, dropout=dropout)
        
        # Fully connected layer to project RNN output to transformer dimension
        self.fc_rnn_to_transformer = nn.Linear(rnn_hidden_dim, transformer_dim)
        
        # Transformer Encoder
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=transformer_dim, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=n_layers)
        
        # Fully connected layer to map to output
        self.fc = nn.Linear(transformer_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # RNN Layer
        rnn_out, (hn, cn) = self.rnn(x)
        
        # Project RNN output to the transformer dimension
        rnn_out_mapped = self.fc_rnn_to_transformer(rnn_out)
        
        # Pass mapped RNN output to transformer
        transformer_out = self.transformer_encoder(rnn_out_mapped)
        
        # Use the last hidden state of transformer output
        out = transformer_out.mean(dim=1)
        
        # Final output layer
        out = self.fc(out)
        out = self.sigmoid(out)
        
        return out

# Model parameters
input_dim = len(features)
rnn_hidden_dim = 64  # Increased for better representation
transformer_dim = 128  # Increased to capture more complex patterns
n_heads = 8  # Increased number of attention heads
n_layers = 4  # Increased layers for deeper model
output_dim = 1
dropout = 0.3  # Dropout to prevent overfitting

# Initialize the model and move it to the GPU if available
model = RNNTransformerHybrid(input_dim=input_dim, rnn_hidden_dim=rnn_hidden_dim, 
                             transformer_dim=transformer_dim, n_heads=n_heads, 
                             n_layers=n_layers, output_dim=output_dim, dropout=dropout).to(device)

# Optimizer setup
optimizer = optim.Adam(model.parameters(), lr=0.0005)  # Adjusted learning rate
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.7)  # Learning rate scheduler

# Loss function
criterion = nn.BCELoss()

# Training loop with gradient clipping and learning rate scheduler
def train_model(model, train_loader, criterion, optimizer, scheduler, epochs=20):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            sequences, labels = sequences.to(device), labels.to(device)  # Move data to GPU
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            
            # Gradient clipping to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            optimizer.step()
            total_loss += loss.item()
        
        scheduler.step()  # Update the learning rate
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")
        
        # Optionally, save the model every few epochs
        if (epoch + 1) % 5 == 0:
            torch.save(model.state_dict(), f'rnn_transformer_hybrid_epoch_{epoch+1}.pth')
            print(f"Model saved at epoch {epoch+1}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            sequences, labels = sequences.to(device), labels.to(device)  # Move data to GPU
            outputs = model(sequences)
            predictions.extend((outputs.squeeze() > 0.5).int().cpu().numpy())  # Move to CPU for final output
            targets.extend(labels.cpu().numpy())
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer, scheduler, epochs=20)
evaluate_model(model, test_loader)

# Save the final model
torch.save(model.state_dict(), 'rnn_transformer_hybrid_final_model.pth')
print("Final model saved as rnn_transformer_hybrid_final_model.pth")

# Prediction function for new dataset
def predict_new_data(file_path, model, scaler, sequence_length=10):
    # Load and preprocess new data
    new_data = pd.read_csv(file_path)
    new_data[features] = scaler.transform(new_data[features])  # Normalize using the same scaler
    
    # Prepare sequences for prediction
    new_sequences = []
    for i in range(len(new_data) - sequence_length):
        seq = new_data[features].iloc[i:i+sequence_length].values
        new_sequences.append(seq)
    new_sequences = np.array(new_sequences)

    # Prepare DataLoader for new data
    new_dataset = TimeSeriesDataset(new_sequences, np.zeros(len(new_sequences)))  # Use zeros as dummy labels
    new_loader = DataLoader(new_dataset, batch_size=64, shuffle=False)
    
    # Predict
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for sequences, _ in new_loader:
            sequences = sequences.to(device)  # Move data to GPU
            outputs = model(sequences).squeeze()
            predictions = (outputs > 0.5).int().cpu().numpy()  # Move to CPU for final output
            all_predictions.extend(predictions)
    
    return all_predictions

# Example of using the predict_new_data function with another dataset
predictions = predict_new_data('46.csv', model, scaler)
print("Predictions for new data:", predictions)


Using device: cuda




Epoch 1/20, Loss: 0.3061
Epoch 2/20, Loss: 0.2592
Epoch 3/20, Loss: 0.2741
Epoch 4/20, Loss: 0.2561
Epoch 5/20, Loss: 0.2469
Model saved at epoch 5
Epoch 6/20, Loss: 0.2493
Epoch 7/20, Loss: 0.2431
Epoch 8/20, Loss: 0.2356
Epoch 9/20, Loss: 0.2367
Epoch 10/20, Loss: 0.2360
Model saved at epoch 10
Epoch 11/20, Loss: 0.2321
Epoch 12/20, Loss: 0.2328
Epoch 13/20, Loss: 0.2329
Epoch 14/20, Loss: 0.2285
Epoch 15/20, Loss: 0.2237
Model saved at epoch 15
Epoch 16/20, Loss: 0.2184
Epoch 17/20, Loss: 0.2141
Epoch 18/20, Loss: 0.2137
Epoch 19/20, Loss: 0.2118
Epoch 20/20, Loss: 0.2081
Model saved at epoch 20
Test Accuracy: 0.7761, F1 Score: 0.8716
Final model saved as rnn_transformer_hybrid_final_model.pth
Predictions for new data: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# RNN + Transformer Hybrid Model
class RNNTransformerHybrid(nn.Module):
    def __init__(self, input_dim, rnn_hidden_dim, transformer_dim, n_heads, n_layers, output_dim=1):
        super(RNNTransformerHybrid, self).__init__()
        
        # RNN layer (LSTM)
        self.rnn = nn.LSTM(input_size=input_dim, hidden_size=rnn_hidden_dim, num_layers=1, batch_first=True)
        
        # Fully connected layer to project RNN output to transformer dimension
        self.fc_rnn_to_transformer = nn.Linear(rnn_hidden_dim, transformer_dim)
        
        # Transformer Encoder
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=transformer_dim, nhead=n_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=n_layers)
        
        # Fully connected layer to map to output
        self.fc = nn.Linear(transformer_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # RNN Layer
        rnn_out, (hn, cn) = self.rnn(x)
        
        # Project RNN output to the transformer dimension
        rnn_out_mapped = self.fc_rnn_to_transformer(rnn_out)
        
        # Pass mapped RNN output to transformer
        transformer_out = self.transformer_encoder(rnn_out_mapped)
        
        # Use the last hidden state of transformer output
        out = transformer_out.mean(dim=1)
        
        # Final output layer
        out = self.fc(out)
        out = self.sigmoid(out)
        
        return out

# Model parameters
input_dim = len(features)
rnn_hidden_dim = 32
transformer_dim = 64
n_heads = 4
n_layers = 2
output_dim = 1

model = RNNTransformerHybrid(input_dim=input_dim, rnn_hidden_dim=rnn_hidden_dim, 
                             transformer_dim=transformer_dim, n_heads=n_heads, 
                             n_layers=n_layers, output_dim=output_dim)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training setup
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

# Training loop
def train_model(model, train_loader, criterion, optimizer, scheduler, epochs=30):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            
            # Ensure labels have shape (batch_size, 1)
            labels = labels.unsqueeze(1)  # Add an extra dimension
            
            optimizer.zero_grad()
            outputs = model(sequences)
            
            # Compute the loss
            loss = criterion(outputs.squeeze(), labels.squeeze())  # Ensure outputs and labels match in shape
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        scheduler.step()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences)
            predictions.extend((outputs.squeeze() > 0.5).int().cpu().numpy())
            targets.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(targets, predictions)
    precision = precision_score(targets, predictions)
    recall = recall_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    
    print(f"Test Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Train and evaluate the model
train_model(model, train_loader, criterion, optimizer, scheduler, epochs=15)
evaluate_model(model, test_loader)

# Save the model
torch.save(model.state_dict(), 'rnn_transformer_hybrid_model.pth')
print("Model saved as rnn_transformer_hybrid_model.pth")

# Prediction function for new dataset
def predict_new_data(file_path, model, scaler, sequence_length=10):
    # Load and preprocess new data
    new_data = pd.read_csv(file_path)
    new_data[features] = scaler.transform(new_data[features])  # Normalize using the same scaler
    
    # Prepare sequences for prediction
    new_sequences = []
    for i in range(len(new_data) - sequence_length):
        seq = new_data[features].iloc[i:i+sequence_length].values
        new_sequences.append(seq)
    new_sequences = np.array(new_sequences)

    # Prepare DataLoader for new data
    new_dataset = TimeSeriesDataset(new_sequences, np.zeros(len(new_sequences)))  # Use zeros as dummy labels
    new_loader = DataLoader(new_dataset, batch_size=32, shuffle=False)
    
    # Ensure model is on the same device as input data
    model.to(device)
    
    # Predict
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for sequences, _ in new_loader:
            sequences = sequences.to(device)  # Move sequences to the correct device
            outputs = model(sequences).squeeze()
            predictions = (outputs > 0.5).int().cpu().numpy()
            all_predictions.extend(predictions)
    
    return all_predictions


# Load the saved model if needed (for standalone execution)
model = RNNTransformerHybrid(input_dim=input_dim, rnn_hidden_dim=rnn_hidden_dim, 
                             transformer_dim=transformer_dim, n_heads=n_heads, 
                             n_layers=n_layers, output_dim=output_dim)
model.load_state_dict(torch.load('rnn_transformer_hybrid_model.pth'))
print("Model loaded for prediction.")

# Example of using the predict_new_data function with another dataset
predictions = predict_new_data('46.csv', model, scaler)
print("Predictions for new data:", predictions)




Epoch 1/15, Loss: 0.2813
Epoch 2/15, Loss: 0.2409
Epoch 3/15, Loss: 0.2267
Epoch 4/15, Loss: 0.2212
Epoch 5/15, Loss: 0.2195
Epoch 6/15, Loss: 0.2163
Epoch 7/15, Loss: 0.2141
Epoch 8/15, Loss: 0.2099
Epoch 9/15, Loss: 0.2091
Epoch 10/15, Loss: 0.2052
Epoch 11/15, Loss: 0.1939
Epoch 12/15, Loss: 0.1916
Epoch 13/15, Loss: 0.1906
Epoch 14/15, Loss: 0.1896
Epoch 15/15, Loss: 0.1887
Test Accuracy: 0.8938, Precision: 0.8884, Recall: 0.9874, F1 Score: 0.9353
Model saved as rnn_transformer_hybrid_model.pth
Model loaded for prediction.
Predictions for new data: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


  model.load_state_dict(torch.load('rnn_transformer_hybrid_model.pth'))


In [7]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import Dataset, DataLoader

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess the training data
data = pd.read_csv('49_updated.csv')
features = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'l', 'k', 'm']
target = 'group_status'

# Normalize features
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

# Prepare sequences of 10 rows
sequence_length = 10

def create_sequences(data, sequence_length, target_col):
    sequences = []
    labels = []
    for i in range(len(data) - sequence_length):
        seq = data[features].iloc[i:i+sequence_length].values
        label = data[target_col].iloc[i + sequence_length - 1]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

sequences, labels = create_sequences(data, sequence_length, target)

# Split into train and test sets
split_ratio = 0.8
split_index = int(split_ratio * len(sequences))
train_sequences, test_sequences = sequences[:split_index], sequences[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

# Custom Dataset
class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

train_dataset = TimeSeriesDataset(train_sequences, train_labels)
test_dataset = TimeSeriesDataset(test_sequences, test_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the Deep Learning Model (LSTM + Transformer)
class RNNTransformer(nn.Module):
    def __init__(self, input_dim, rnn_hidden_dim, transformer_dim, n_heads, n_layers, output_dim=1):
        super(RNNTransformer, self).__init__()
        
        # LSTM layer
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=rnn_hidden_dim, num_layers=1, batch_first=True)
        
        # Linear layer to map LSTM output to transformer dimension
        self.fc_lstm_to_transformer = nn.Linear(rnn_hidden_dim, transformer_dim)
        
        # Transformer Encoder layer
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=transformer_dim, nhead=n_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=n_layers)
        
        # Final output layer
        self.fc = nn.Linear(transformer_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # LSTM layer to capture sequential dependencies
        lstm_out, _ = self.lstm(x)
        
        # Map LSTM output to Transformer dimension
        transformer_input = self.fc_lstm_to_transformer(lstm_out)
        
        # Pass through Transformer encoder
        transformer_out = self.transformer_encoder(transformer_input)
        
        # Use the mean of transformer outputs across time steps (or last timestep)
        out = transformer_out.mean(dim=1)
        
        # Final output layer and sigmoid activation
        out = self.fc(out)
        out = self.sigmoid(out)
        
        return out

# Hyperparameters
input_dim = len(features)  # Number of features
rnn_hidden_dim = 32       # LSTM hidden state dimension
transformer_dim = 64      # Transformer hidden state dimension
n_heads = 4              # Number of attention heads
n_layers = 2             # Number of Transformer layers
output_dim = 1           # Binary output (0 or 1)

# Initialize model
model = RNNTransformer(input_dim=input_dim, rnn_hidden_dim=rnn_hidden_dim, 
                       transformer_dim=transformer_dim, n_heads=n_heads, 
                       n_layers=n_layers, output_dim=output_dim).to(device)

# Loss and Optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training Loop
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for sequences, labels in train_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

# Evaluation Function
def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for sequences, labels in test_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            
            outputs = model(sequences)
            predictions.extend((outputs.squeeze() > 0.5).cpu().int().numpy())
            targets.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions)
    print(f"Test Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

# Train and evaluate the model
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)

# Save the model
torch.save(model.state_dict(), 'rnn_transformer_model.pth')
print("Model saved as rnn_transformer_model.pth")

# Load the saved model if needed
model.load_state_dict(torch.load('rnn_transformer_model.pth'))
print("Model loaded for prediction.")

# Prediction function for new dataset
def predict_new_data(file_path, model, scaler, sequence_length=10):
    new_data = pd.read_csv(file_path)
    new_data[features] = scaler.transform(new_data[features])  # Normalize using the same scaler
    
    # Prepare sequences for prediction
    new_sequences = []
    for i in range(len(new_data) - sequence_length):
        seq = new_data[features].iloc[i:i+sequence_length].values
        new_sequences.append(seq)
    new_sequences = np.array(new_sequences)

    new_dataset = TimeSeriesDataset(new_sequences, np.zeros(len(new_sequences)))  # Use dummy labels
    new_loader = DataLoader(new_dataset, batch_size=32, shuffle=False)
    
    model.eval()
    all_predictions = []
    with torch.no_grad():
        for sequences, _ in new_loader:
            sequences = sequences.to(device)  # Ensure sequences are on the same device as the model
            outputs = model(sequences).squeeze()
            predictions = (outputs > 0.5).int().cpu().numpy()
            all_predictions.extend(predictions)
    
    return all_predictions

# Example of using the predict_new_data function
predictions = predict_new_data('44.csv', model, scaler)
print("Predictions for new data:", predictions)




Epoch 1/10, Loss: 0.2674
Epoch 2/10, Loss: 0.2367
Epoch 3/10, Loss: 0.2244
Epoch 4/10, Loss: 0.2244
Epoch 5/10, Loss: 0.2160
Epoch 6/10, Loss: 0.2160
Epoch 7/10, Loss: 0.2131
Epoch 8/10, Loss: 0.2091
Epoch 9/10, Loss: 0.2091
Epoch 10/10, Loss: 0.2110
Test Accuracy: 0.9347, F1 Score: 0.9589
Model saved as rnn_transformer_model.pth
Model loaded for prediction.


  model.load_state_dict(torch.load('rnn_transformer_model.pth'))


Predictions for new data: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1