In [50]:
# ! pip install kagglehub
# ! pip install matplotlib
# ! pip install optuna

In [51]:
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import optuna

In [52]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [53]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("zalando-research/fashionmnist")

# print("Path to dataset files:", path)

In [54]:
# Directory holding the Fashion-MNIST CSV files (Kaggle input mount); the
# training CSV is read from here in the next cell.
path = "/kaggle/input/fashionmnist"

In [55]:
# Load the training CSV: one `label` column followed by pixel1..pixel784.
df = pd.read_csv(f"{path}/fashion-mnist_train.csv")

df.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Draw a random subset of 30,000 rows; random_state pins the draw so the
# same rows are selected on every run.
sample_df = df.sample(
    n=30000,
    random_state=42,
)

In [57]:
# Sanity check: 30,000 sampled rows x 785 columns (1 label + 784 pixels).
sample_df.shape

(30000, 785)

In [58]:
# Column 0 is the class label, every remaining column is a pixel intensity.
# The `:1` slice (instead of a plain `0`) keeps y two-dimensional, shape (n, 1).
X = sample_df.iloc[:, 1:].to_numpy()
y = sample_df.iloc[:, :1].to_numpy()


In [59]:
# Hold out 20% of the sampled rows; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [60]:
# Scaler used to standardise the pixel features; it is fitted in the next cell.
st_scaler = StandardScaler()

In [61]:

# Fit the scaler on the training split only, then apply that same fitted
# scaling to both splits so no statistics from the held-out rows are used.
st_scaler.fit(X_train)
X_train = st_scaler.transform(X_train)
X_test = st_scaler.transform(X_test)

In [62]:
# Display the scaled training features as a quick visual check.
X_train

array([[-0.0100328 , -0.03207572, -0.04945422, ..., -0.16188057,
        -0.09292675, -0.03794802],
       [-0.0100328 , -0.03207572, -0.04945422, ..., -0.16188057,
        -0.09292675, -0.03794802],
       [-0.0100328 , -0.03207572, -0.04945422, ..., -0.16188057,
        -0.09292675, -0.03794802],
       ...,
       [-0.0100328 , -0.03207572, -0.04945422, ..., -0.16188057,
        -0.09292675, -0.03794802],
       [-0.0100328 , -0.03207572, -0.04945422, ..., -0.16188057,
        -0.09292675, -0.03794802],
       [-0.0100328 , -0.03207572, -0.04945422, ..., -0.16188057,
        -0.09292675, -0.03794802]])

# Dataset Class

In [63]:
# create custom dataset class

class FashionMNISTDataset(Dataset):
    """In-memory dataset pairing a feature tensor with a label tensor.

    Args:
        features: array-like of per-sample feature rows; converted to a
            float32 tensor.
        labels: array-like of per-sample labels; converted to an int64
            (torch.long) tensor. The shape is preserved as given.

    Raises:
        ValueError: if features and labels do not contain the same number
            of samples.
    """

    def __init__(self, features, labels):
        # Debug output: shows the container types handed to the dataset.
        print(type(features))
        print(type(labels))
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

        # __len__ is derived from the features alone, so a length mismatch
        # would otherwise only show up later as an IndexError or as labels
        # that are silently never used.
        if self.features.shape[0] != self.labels.shape[0]:
            raise ValueError(
                "features and labels must have the same number of samples, "
                f"got {self.features.shape[0]} and {self.labels.shape[0]}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the (features, label) pair stored at position idx."""
        return self.features[idx], self.labels[idx]

In [64]:
# Wrap the scaled splits as tensor datasets for use with DataLoader.
train_dataset = FashionMNISTDataset(features=X_train, labels=y_train)
test_dataset = FashionMNISTDataset(features=X_test, labels=y_test)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


# OPTUNA

In [65]:
class MyNN(nn.Module):
    """Fully connected classifier with a configurable stack of hidden blocks.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout, and the
    stack is followed by a single Linear layer producing `output_dim` values
    per sample.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of outputs (one per class).
        hidden_layers: number of hidden blocks; 0 yields a single Linear
            layer mapping input_dim directly to output_dim.
        neurons_per_layer: width of every hidden block.
        dropout_rate: dropout probability used in every hidden block.
    """

    def __init__(self, input_dim, output_dim, hidden_layers, neurons_per_layer, dropout_rate):

        super().__init__()

        layers = []
        in_features = input_dim

        for _ in range(hidden_layers):
            layers.extend([
                nn.Linear(in_features, neurons_per_layer),
                nn.BatchNorm1d(neurons_per_layer),
                nn.ReLU(),
                nn.Dropout(p=dropout_rate),
            ])
            # every block after the first consumes the previous block's output
            in_features = neurons_per_layer

        # Use the running width rather than neurons_per_layer: with zero hidden
        # blocks the output layer must still accept input_dim features.
        layers.append(nn.Linear(in_features, output_dim))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through the layer stack and return the raw (unnormalised) outputs."""
        return self.model(x)



In [66]:
# objective function

def _build_optimizer(optimizer_name, parameters, learning_rate, weight_decay):
    """Return the optimizer selected by name ("SGD", "Adam" or "RMSprop")."""
    optimizer_classes = {
        "SGD": optim.SGD,
        "Adam": optim.Adam,
        "RMSprop": optim.RMSprop,
    }
    if optimizer_name not in optimizer_classes:
        raise ValueError(f"Unsupported optimizer: {optimizer_name!r}")
    # weight_decay applies an L2 penalty to the parameters
    return optimizer_classes[optimizer_name](
        parameters, lr=learning_rate, weight_decay=weight_decay
    )


def _accuracy_percent(model, data_loader):
    """Return the percentage of samples in data_loader that model classifies correctly."""
    model.eval()

    total = 0
    correct = 0
    with torch.no_grad():
        for batch_features, batch_labels in data_loader:
            # move data to the training device
            batch_features = batch_features.to(device)
            # flatten labels to 1-D; unlike squeeze() this keeps a batch of
            # one sample as shape (1,) instead of collapsing it to a scalar
            batch_labels = batch_labels.to(device).reshape(-1)

            _, predicted = torch.max(model(batch_features), 1)

            total += batch_labels.shape[0]
            correct += (predicted == batch_labels).sum().item()

    return 100 * correct / total


def objective_function(trial):
    """Optuna objective: train one MyNN configuration and return its accuracy.

    Samples architecture and optimisation hyperparameters from `trial`,
    trains a fresh model on `train_dataset`, and scores it on `test_dataset`.

    NOTE(review): the score is measured on `test_dataset`, so the selected
    hyperparameters are tuned against that split; use a separate validation
    split if `test_dataset` has to remain an unbiased final estimate.

    Args:
        trial: the optuna trial used to sample hyperparameters.

    Returns:
        Classification accuracy on `test_dataset`, as a percentage (0-100).

    Raises:
        ValueError: if the sampled optimizer name is not supported.
    """

    # search space (names are the keys that appear in study.best_params)
    num_hidden_layers = trial.suggest_int("num_hidden_layers", 1, 4, step=1, log=False)
    neurons_per_layer = trial.suggest_int("neurons_per_layer", 8, 256, step=8, log=False)
    num_epochs = trial.suggest_int("num_epochs", 40, 100, step=5, log=False)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1, log=False)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    optimizer_name = trial.suggest_categorical("optimizer_name", ["SGD", "Adam", "RMSprop"])
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-3, log=True)

    # loading data; pinned host memory is only requested when copying to a GPU
    use_pinned_memory = device.type == "cuda"
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=use_pinned_memory)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=use_pinned_memory)

    # model init
    input_dim = 784
    output_dim = 10

    model = MyNN(input_dim, output_dim, num_hidden_layers, neurons_per_layer, dropout_rate).to(device)

    loss_function = nn.CrossEntropyLoss()
    optimizer = _build_optimizer(optimizer_name, model.parameters(), learning_rate, weight_decay)

    # training loop
    model.train()
    for _ in range(num_epochs):

        for batch_features, batch_labels in train_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

            # forward pass
            outputs = model(batch_features)

            # labels are flattened to 1-D class indices for the loss;
            # reshape(-1) stays 1-D even for a batch holding a single sample
            loss = loss_function(outputs, batch_labels.reshape(-1))

            # zero the gradients
            optimizer.zero_grad()

            # backward pass
            loss.backward()

            # weights update
            optimizer.step()

    # evaluation
    accuracy = _accuracy_percent(model, test_loader)

    return accuracy

# Study

In [67]:
# Search for the hyperparameters that maximise the accuracy returned by
# objective_function, evaluating 30 trials.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective_function, n_trials=30)

[I 2026-01-06 09:15:20,163] A new study created in memory with name: no-name-52090c0b-6c8f-445a-893a-3fdec75b4e95
[I 2026-01-06 09:17:33,167] Trial 0 finished with value: 83.16666666666667 and parameters: {'num_hidden_layers': 2, 'neurons_per_layer': 16, 'num_epochs': 85, 'learning_rate': 0.013503648304048571, 'dropout_rate': 0.4, 'batch_size': 32, 'optimizer_name': 'SGD', 'weight_decay': 3.2632857788264026e-05}. Best is trial 0 with value: 83.16666666666667.
[I 2026-01-06 09:19:14,793] Trial 1 finished with value: 88.35 and parameters: {'num_hidden_layers': 2, 'neurons_per_layer': 152, 'num_epochs': 65, 'learning_rate': 0.0695690854434198, 'dropout_rate': 0.2, 'batch_size': 32, 'optimizer_name': 'SGD', 'weight_decay': 1.6292055759007518e-05}. Best is trial 1 with value: 88.35.
[I 2026-01-06 09:19:54,002] Trial 2 finished with value: 87.76666666666667 and parameters: {'num_hidden_layers': 4, 'neurons_per_layer': 200, 'num_epochs': 60, 'learning_rate': 0.0013630401386168225, 'dropout_ra

In [68]:
# Show the device used together with the best accuracy found and the
# hyperparameters that produced it.
device, study.best_value, study.best_params

(device(type='cuda'),
 89.4,
 {'num_hidden_layers': 3,
  'neurons_per_layer': 200,
  'num_epochs': 90,
  'learning_rate': 0.0005989220238939682,
  'dropout_rate': 0.4,
  'batch_size': 128,
  'optimizer_name': 'RMSprop',
  'weight_decay': 0.00012437072177777747})