In [9]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np


np.random.seed(0)

def load_csv(file_path):
    # Load the data
    data = pd.read_csv(file_path, delimiter=';')

    # Replace commas in numeric columns and convert them to floats
    data = data.replace(',', '.', regex=True).apply(pd.to_numeric, errors='ignore')
    return data

In [10]:
def drop_columns_with_excessive_nans(dataframe, threshold=200):
    """Drop columns from a DataFrame where the number of NaN values exceeds the specified threshold."""
    nan_counts = dataframe.isna().sum()
    columns_to_drop = nan_counts[nan_counts > threshold].index
    return dataframe.drop(columns=columns_to_drop)

## Data Loader

In [35]:
train_data = load_csv('../data/training_data.csv')

le = LabelEncoder()
train_data['Group'] = le.fit_transform(train_data['Group'])

# Cut outliers
top_quantiles = train_data.quantile(0.97)
outliers_top = (train_data > top_quantiles)

low_quantiles = train_data.quantile(0.03)
outliers_low = (train_data < low_quantiles)

train_data = train_data.mask(outliers_top, top_quantiles, axis=1)
train_data = train_data.mask(outliers_low, low_quantiles, axis=1)

In [36]:
train_data = train_data.groupby(['Group']).transform(lambda x: x.fillna(x.mean()))
train_data = train_data.fillna(0)

In [54]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
X = train_data.drop(columns=['Class', 'Perform']).values
y = train_data['Class'].values.reshape(-1, 1)

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the data
one_hot_encoded = encoder.fit_transform(y)

# Convert the sparse matrix to a dense array
y = one_hot_encoded.toarray()


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [56]:
# Normalize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [58]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 3)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 3)

# Create dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [92]:
import torch.nn as nn

class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(116, 64)  # 116 features
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 3)    

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Create a model instance
model = NeuralNet()


In [93]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [94]:
# Training loop
num_epochs = 300
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs)
        
        # Compute loss
        loss = differentiable_cost_matrix_loss(outputs, labels, cost_matrix)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1}, Loss: {loss.item()}')


Epoch 1, Loss: 0.9743879437446594
Epoch 2, Loss: 0.9511961936950684
Epoch 3, Loss: 0.9241287112236023
Epoch 4, Loss: 0.8854696750640869
Epoch 5, Loss: 0.9442274570465088
Epoch 6, Loss: 0.9492467641830444
Epoch 7, Loss: 0.948595404624939
Epoch 8, Loss: 0.9201532006263733
Epoch 9, Loss: 0.9517183899879456
Epoch 10, Loss: 0.9049546718597412
Epoch 11, Loss: 0.8302083611488342
Epoch 12, Loss: 0.8976171612739563
Epoch 13, Loss: 0.897175133228302
Epoch 14, Loss: 0.945582389831543
Epoch 15, Loss: 0.9411467909812927
Epoch 16, Loss: 0.8641144633293152
Epoch 17, Loss: 0.8978013396263123
Epoch 18, Loss: 0.9311583638191223
Epoch 19, Loss: 0.8450379967689514
Epoch 20, Loss: 0.9046963453292847
Epoch 21, Loss: 0.8891412019729614
Epoch 22, Loss: 0.8917538523674011
Epoch 23, Loss: 0.9065621495246887
Epoch 24, Loss: 0.8515470623970032
Epoch 25, Loss: 0.9030880331993103
Epoch 26, Loss: 0.7873167395591736
Epoch 27, Loss: 0.8910432457923889
Epoch 28, Loss: 0.8413931131362915
Epoch 29, Loss: 0.82742452621459

In [102]:
model.eval()
total = correct = 0

y_pred = []
y_test = []

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == torch.max(labels.data, 1)[1]).sum().item()
        y_pred.extend(predicted.numpy().tolist())
        y_test.extend(torch.max(labels.data, 1)[1].numpy().tolist())

print(f'Accuracy: {100 * correct / total}%')


Accuracy: 25.6875%


In [128]:
labels

tensor([[0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1

: 

In [107]:
calculate_custom_error(y_test, y_pred, cost_matrix)

0.8475

In [108]:
test_data = load_csv('../data/test_data_no_target.csv')

le = LabelEncoder()
test_data['Group'] = le.fit_transform(test_data['Group'])

# Cut outliers
top_quantiles = test_data.quantile(0.97)
outliers_top = (test_data > top_quantiles)

low_quantiles = test_data.quantile(0.03)
outliers_low = (test_data < low_quantiles)

test_data = test_data.mask(outliers_top, top_quantiles, axis=1)
test_data = test_data.mask(outliers_low, low_quantiles, axis=1)

In [109]:
test_data = test_data.groupby(['Group']).transform(lambda x: x.fillna(x.mean()))
test_data = test_data.fillna(0)

In [110]:
test_scaled = scaler.transform(test_data)



In [112]:
# Convert to PyTorch tensors
test_tensor = torch.tensor(test_scaled, dtype=torch.float32)

# Create dataloaders
test_dataset = TensorDataset(test_tensor)

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)


In [124]:
model.eval()
total = correct = 0

with torch.no_grad():
    outputs = model(test_tensor)

test_preds = torch.max(outputs, 1)[1].numpy() - 1

In [125]:
np.savetxt('nn.txt', test_preds, fmt='%d', newline='\n')

# Classify

In [80]:
import torch
import torch.nn.functional as F

def differentiable_cost_matrix_loss(outputs, targets, cost_matrix):
    """
    A differentiable loss function based on a cost matrix for one-hot encoded labels.

    Args:
    outputs (tensor): Raw logits from the neural network.
    targets (tensor): Ground truth labels, one-hot encoded.
    cost_matrix (tensor): A matrix of costs associated with misclassifications.

    Returns:
    torch.Tensor: The calculated loss.
    """
    # Ensure cost_matrix is a torch tensor and correctly formatted
    if not isinstance(cost_matrix, torch.Tensor):
        cost_matrix = torch.tensor(cost_matrix, dtype=torch.float32, device=outputs.device)
    
    # Get softmax probabilities
    probs = F.softmax(outputs, dim=1)
    
    # Compute the effective cost matrix for each prediction
    # Matrix multiplication between targets (one-hot encoded) and cost_matrix
    # This maps each one-hot vector to its respective cost row in the cost matrix
    effective_cost_matrix = torch.matmul(targets, cost_matrix)
    
    # Calculate the element-wise product of probabilities and the effective cost matrix
    weighted_losses = torch.sum(probs * effective_cost_matrix, dim=1)  # sum over classes
    
    # Mean loss across batch
    return torch.mean(weighted_losses)

# Example usage
cost_matrix = torch.tensor([[0, 1, 2],
                            [1, 0, 1],
                            [2, 1, 0]], dtype=torch.float32)

# Assuming your model, optimizer, and data loaders are set up as before


In [106]:
import numpy as np
from sklearn.metrics import confusion_matrix

cost_matrix = np.array([[0, 1, 2],
                        [1, 0, 1],
                        [2, 1, 0]])
def calculate_custom_error(preds, gt, cost_matrix=cost_matrix):
    """
    Calculate a custom error metric based on a confusion matrix and a cost matrix.

    Args:
    preds (array-like): Predicted labels.
    gt (array-like): Ground truth (actual) labels.
    cost_matrix (numpy.ndarray): A matrix of costs associated with misclassifications.

    Returns:
    float: The calculated error metric.
    """
    # Calculate the confusion matrix
    cm = confusion_matrix(gt, preds)
    
    # Validate dimensions of cost_matrix
    if cm.shape != cost_matrix.shape:
        raise ValueError("Cost matrix dimensions must match the confusion matrix dimensions.")
    
    # Calculate weighted confusion matrix
    weighted_cm = cm * cost_matrix
    
    # Calculate the custom error
    total_samples = len(gt)
    if total_samples == 0:
        raise ValueError("The length of ground truth cannot be zero.")
    
    error = np.sum(weighted_cm) / total_samples
    return error


In [29]:
train_data = pd.read_csv('training_data.csv', delimiter=';')
train_data = train_data.replace(',', '.', regex=True).apply(pd.to_numeric, errors='ignore')
train_data = train_data[['Perform', 'Class']]

test_data = pd.read_csv('test_data_predictions.csv', header=None)

X = train_data.drop('Class', axis=1)  # Features
y = train_data['Class'] # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

  train_data = train_data.replace(',', '.', regex=True).apply(pd.to_numeric, errors='ignore')


In [30]:
model = SVC(random_state=42)

# Initialize search
model.fit(X_train, y_train)

# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# # Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)


In [33]:
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Custom Error:", calculate_custom_error(y_pred, y_test))

Accuracy: 0.998125
Classification Report:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       619
           0       1.00      0.99      0.99       227
           1       1.00      1.00      1.00       754

    accuracy                           1.00      1600
   macro avg       1.00      1.00      1.00      1600
weighted avg       1.00      1.00      1.00      1600

Confusion Matrix:
[[619   0   0]
 [  1 224   2]
 [  0   0 754]]
Custom Error: 0.001875


In [34]:
predicts = model.predict(test_data)



In [35]:
np.savetxt('regression.txt', predicts, fmt='%d', newline='\n')