### Import libraries

In [9]:
import numpy as np
import h5py
import pandas as pd
from sklearn.model_selection import train_test_split

### Load data for machine learning

In [3]:
# Path to your HDF5 file
hdf5_file = "../data/train_data.h5"

# Open the HDF5 file
with h5py.File(hdf5_file, 'r') as hdf:
    # Extract the images (X)
    X = np.array(hdf['images'])
    
    # Extract the labels (y)
    y = np.array(hdf['labels'])

# Check the shapes to ensure they are correct
print("Shape of X (images):", X.shape)
print("Shape of y (labels):", y.shape)



Shape of X (images): (1100000, 16, 16, 6)
Shape of y (labels): (1100000,)
Shape of X_test (images): (120000, 16, 16, 6)


In [10]:
hdf5_file_test = "../data/test_data.h5"
# Open the HDF5 file
with h5py.File(hdf5_file_test, 'r') as hdf:
    # Extract the images (X)
    X_test = np.array(hdf['images'])


# Check the shapes to ensure they are correct
print("Shape of X_test (images):", X_test.shape)

KeyError: "Unable to synchronously open object (object 'labels' doesn't exist)"

### Balance data

In [4]:
# Step 1: Count the number of 1's in y
num_ones = np.sum(y == 1)

# Step 2: Get indices of 0's and 1's in y
ones_indices = np.where(y == 1)[0]
zeros_indices = np.where(y == 0)[0]

# Step 3: Randomly sample the same number of 0's as there are 1's
balanced_zero_indices = np.random.choice(zeros_indices, num_ones, replace=False)

# Step 4: Combine indices of 0's and 1's
balanced_indices = np.concatenate([ones_indices, balanced_zero_indices])

# Step 5: Create balanced X and y
X_balanced = X[balanced_indices]
y_balanced = y[balanced_indices]

# Display the number of 0's and 1's in the balanced y
print(f"Number of 1's in balanced y: {np.sum(y_balanced == 1)}")
print(f"Number of 0's in balanced y: {np.sum(y_balanced == 0)}")

Number of 1's in balanced y: 100000
Number of 0's in balanced y: 100000


In [11]:
from sklearn.utils import shuffle
# Shuffle both X_balanced and y_balanced together
X_set, y_set = shuffle(X_balanced, y_balanced, random_state=1)


In [12]:
from sklearn.model_selection import train_test_split

# Assuming X and y are already loaded
# Step 1: Split the data into 50% train and 50% test
X_train, X_test, y_train, y_test = train_test_split(X_set, y_set, test_size=0.5, random_state=42)

# Step 2: Split the test set into 60% test and 40% validation
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.4, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Training set size: 100000
Validation set size: 40000
Test set size: 60000


In [6]:
print(X_train.shape)
print(X_test.shape)

(200000, 16, 16, 6)
(120000, 16, 16, 6)


### Baseline models
#### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Flatten the image data
X_train_flat = X_train.reshape(X_train.shape[0], -1)  
X_test_flat = X_test.reshape(X_test.shape[0], -1)  

# Define and train the baseline model
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train_flat, y_train)

y_pred_train = rf.predict(X_train_flat)
y_pred_test = rf.predict(X_test_flat)





In [14]:
# Evaluate the model
train_accuracy_rf = accuracy_score(y_test, y_pred_test)

## 0.9999 accuracy: OVERFITTING
print(f'Baseline RF model training accuracy: {train_accuracy_rf:.4f}')

Baseline RF model training accuracy: 0.7326


#### Multi layer perceptron

In [19]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

# Flatten the image data
X_train_flat = X_train.reshape(X_train.shape[0], -1)  
X_test_flat = X_test.reshape(X_test.shape[0], -1)

# Scale the data to [0,1]
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_flat)
X_test_scaled = scaler.transform(X_test_flat)

# Define and train the MLP model with early stopping
mlp_model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=2000, random_state=42,
                          early_stopping=True, n_iter_no_change=10, validation_fraction=0.2)

# Train the model
mlp_model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred_test_mlp = mlp_model.predict(X_test_scaled)



In [20]:

# Evaluate the model
test_accuracy_mlp = accuracy_score(y_test, y_pred_test_mlp)
print(f'MLP model test accuracy: {test_accuracy_mlp:.4f}')

MLP model test accuracy: 0.6049
