# BÀI THỰC HÀNH 2: MẠNG NEURAL TÍCH CHẬP

Bộ dữ liệu sử dụng: [VinaFood21 dataset](https://arxiv.org/abs/2108.02929).

Link download: https://drive.google.com/file/d/1UpZOf0XlwvB4rKpyZ35iwTA8oWHqDBbR/view?usp=share_link.

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

from keras.datasets import mnist


import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [4]:
# Chuẩn hóa (normalize) và reshape
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0

In [5]:
# Thêm chiều channel
X_train = np.expand_dims(X_train, axis=-1)  # (60000, 28, 28, 1)
X_test = np.expand_dims(X_test, axis=-1)    # (10000, 28, 28, 1)

In [6]:
X_train = torch.tensor(X_train, dtype=torch.float).permute(0, 3, 1, 2)
X_test = torch.tensor(X_test, dtype=torch.float).permute(0, 3, 1, 2)

num_classes = 10
y_train = torch.tensor(y_train, dtype=torch.long)
y_test  = torch.tensor(y_test,  dtype=torch.long)

In [7]:
batch_size=256

In [8]:
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [9]:
# Train and evaluate model
def train_model(model, train_loader, num_epochs, criterion, optimizer, device):
    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {avg_loss:.4f}")

def evaluate_model(model, X_test, y_test, device):
    model.eval()
    model.to(device)

    X_test = X_test.to(device)
    y_test = y_test.to(device)

    with torch.no_grad():
        outputs = model(X_test)
        y_pred = torch.argmax(torch.softmax(outputs, dim=1), dim=1)

    y_pred_np = y_pred.cpu().numpy()
    y_true_np = y_test.cpu().numpy()

    results = {
        "accuracy": accuracy_score(y_true_np, y_pred_np),
        "precision_macro": precision_score(y_true_np, y_pred_np, average='macro', zero_division=0),
        "recall_macro": recall_score(y_true_np, y_pred_np, average='macro', zero_division=0),
        "f1_macro": f1_score(y_true_np, y_pred_np, average='macro', zero_division=0)
    }

    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Precision (macro): {results['precision_macro']:.4f}")
    print(f"Recall (macro): {results['recall_macro']:.4f}")
    print(f"F1-score (macro): {results['f1_macro']:.4f}")
    print(classification_report(y_true_np, y_pred_np, digits=4))

    return results


### Bài 1: Xây dựng mô hình LeNet. Huấn luyện và đánh giá mô hình LeNet trên 4 độ đo precision, recall và F1-macro (sử dụng Adam làm optimizer).



In [10]:
class LeNet(nn.Module):
    def __init__(self, num_classes=10):
        super(LeNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2), # (1, 28, 28) -> (6, 28, 28)
            nn.ReLU(),

            nn.AvgPool2d(kernel_size=2, stride=2), # (6, 28, 28) -> (6, 14, 14)

            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5), # (6, 14, 14) -> (16, 10, 10)
            nn.ReLU(),

            nn.AvgPool2d(kernel_size=2, stride=2), #(16, 10, 10) -> (16, 5, 5)
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),

            nn.Linear(16*5*5, 120),
            nn.ReLU(),

            nn.Linear(120, 84),
            nn.ReLU(),

            nn.Linear(84, 10)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

In [11]:
lenet_model = LeNet()

criterion1 = nn.CrossEntropyLoss()
optimizer1 = torch.optim.Adam(params=lenet_model.parameters(), lr=0.0005)
num_epochs = 10
batch_size = 256

In [12]:
train_model(lenet_model, train_loader, num_epochs, criterion1, optimizer1, device)
evaluate_model(lenet_model, X_test, y_test, device)

Epoch [1/10] - Loss: 0.8480
Epoch [2/10] - Loss: 0.2727
Epoch [3/10] - Loss: 0.1859
Epoch [4/10] - Loss: 0.1347
Epoch [5/10] - Loss: 0.1060
Epoch [6/10] - Loss: 0.0877
Epoch [7/10] - Loss: 0.0772
Epoch [8/10] - Loss: 0.0699
Epoch [9/10] - Loss: 0.0645
Epoch [10/10] - Loss: 0.0592
Accuracy: 0.9839
Precision (macro): 0.9839
Recall (macro): 0.9837
F1-score (macro): 0.9838
              precision    recall  f1-score   support

           0     0.9818    0.9908    0.9863       980
           1     0.9878    0.9965    0.9921      1135
           2     0.9912    0.9806    0.9859      1032
           3     0.9880    0.9782    0.9831      1010
           4     0.9918    0.9807    0.9862       982
           5     0.9853    0.9798    0.9826       892
           6     0.9875    0.9896    0.9885       958
           7     0.9778    0.9864    0.9821      1028
           8     0.9784    0.9754    0.9769       974
           9     0.9696    0.9792    0.9744      1009

    accuracy                    

{'accuracy': 0.9839,
 'precision_macro': 0.9839134228578684,
 'recall_macro': 0.9837091999924535,
 'f1_macro': 0.9837935991818554}

### Bài 2: Xây dựng mô hình GoogLeNet. Huấn luyện và đánh giá mô hình GoogLeNet trên 4 độ đo precision, recall và F1 (sử dụng Adam làm optimizer).



In [13]:
class Inception(nn.Module):
    def __init__(self, in_channels, filters):
        super(Inception, self).__init__()
        f1, f3r, f3, f5r, f5, fp = filters

        # 1x1 conv
        self.branch1 = nn.Conv2d(in_channels=in_channels, out_channels=f1, kernel_size=1)

        # 1x1 conv -> 3x3 conv
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=f3r, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=f3r, out_channels=f3, kernel_size=3, padding=1)
        )

        # 1x1 conv -> 5x5 conv
        self.branch5 = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=f5r, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=f5r, out_channels=f5, kernel_size=5, padding=2)
        )

        # MaxPooling -> 1x1 conv
        self.branch_pool = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels=in_channels, out_channels=fp, kernel_size=1),
            nn.ReLU()
        )

    def forward(self, x):
        x1 = self.branch1(x)
        x2 = self.branch3(x)
        x3 = self.branch5(x)
        x4 = self.branch_pool(x)

        # Lưu ý thêm dim=1 khi cat
        return torch.cat([x1, x2, x3, x4], dim=1)

In [14]:
class GoogLeNet(nn.Module):
    def __init__(self, num_classes=10, in_channels=1):
        super(GoogLeNet, self).__init__()

        # Điều chỉnh lại kernel_size, stride, padding cho phù hợp input 1x28x28
        # kernel_size=7 với stride=2 sẽ làm kích thước giảm rất nhanh, nên giảm kernel xuống 3x3
        self.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=1, padding=1)  # giữ kích thước 28x28
        self.maxpool1 = nn.MaxPool2d(2, stride=2, padding=0)  # 28x28 -> 14x14

        self.conv2 = nn.Conv2d(64, 64, kernel_size=1)
        self.conv3 = nn.Conv2d(64, 192, kernel_size=3, padding=1)
        self.maxpool2 = nn.MaxPool2d(2, stride=2, padding=0)  # 14x14 -> 7x7

        self.inception3a = Inception(192, [64, 96, 128, 16, 32, 32])
        self.inception3b = Inception(256, [128, 128, 192, 32, 96, 64])
        self.maxpool3 = nn.MaxPool2d(2, stride=2, padding=0)  # 7x7 -> 3x3

        self.inception4a = Inception(480, [192, 96, 208, 16, 48, 64])
        self.inception4b = Inception(512, [160, 112, 224, 24, 64, 64])
        self.inception4c = Inception(512, [128, 128, 256, 24, 64, 64])
        self.inception4d = Inception(512, [112, 144, 288, 32, 64, 64])
        self.inception4e = Inception(528, [256, 160, 320, 32, 128, 128])
        self.maxpool4 = nn.MaxPool2d(2, stride=2, padding=0)  # 3x3 -> 1x1

        self.inception5a = Inception(832, [256, 160, 320, 32, 128, 128])
        self.inception5b = Inception(832, [384, 192, 384, 48, 128, 128])

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))         # [B, 64, 28, 28]
        x = self.maxpool1(x)              # [B, 64, 14, 14]
        x = F.relu(self.conv2(x))         # [B, 64, 14, 14]
        x = F.relu(self.conv3(x))         # [B, 192, 14, 14]
        x = self.maxpool2(x)              # [B, 192, 7, 7]

        x = self.inception3a(x)           # [B, 256, 7, 7]
        x = self.inception3b(x)           # [B, 480, 7, 7]
        x = self.maxpool3(x)              # [B, 480, 3, 3]

        x = self.inception4a(x)           # [B, 512, 3, 3]
        x = self.inception4b(x)           # [B, 512, 3, 3]
        x = self.inception4c(x)           # [B, 512, 3, 3]
        x = self.inception4d(x)           # [B, 528, 3, 3]
        x = self.inception4e(x)           # [B, 832, 3, 3]
        x = self.maxpool4(x)              # [B, 832, 1, 1]

        x = self.inception5a(x)           # [B, 832, 1, 1]
        x = self.inception5b(x)           # [B, 1024, 1, 1]

        x = self.avgpool(x)               # [B, 1024, 1, 1]
        x = torch.flatten(x, 1)           # [B, 1024]
        x = self.fc(x)                    # [B, num_classes]
        return x

In [15]:
googlenet_model = GoogLeNet()
criterion2 = nn.CrossEntropyLoss()
optimizer2 = torch.optim.Adam(params=googlenet_model.parameters(), lr=0.001)

In [16]:
train_model(googlenet_model, train_loader, num_epochs, criterion2, optimizer2, device)
evaluate_model(googlenet_model, X_test, y_test, device)

Epoch [1/10] - Loss: 1.1988
Epoch [2/10] - Loss: 0.1199
Epoch [3/10] - Loss: 0.0640
Epoch [4/10] - Loss: 0.0529
Epoch [5/10] - Loss: 0.0402
Epoch [6/10] - Loss: 0.0337
Epoch [7/10] - Loss: 0.0291
Epoch [8/10] - Loss: 0.0271
Epoch [9/10] - Loss: 0.0272
Epoch [10/10] - Loss: 0.0207
Accuracy: 0.9882
Precision (macro): 0.9882
Recall (macro): 0.9882
F1-score (macro): 0.9881
              precision    recall  f1-score   support

           0     0.9919    0.9980    0.9949       980
           1     0.9956    0.9868    0.9912      1135
           2     0.9951    0.9835    0.9893      1032
           3     0.9805    0.9960    0.9882      1010
           4     0.9948    0.9817    0.9882       982
           5     0.9909    0.9809    0.9859       892
           6     0.9886    0.9916    0.9901       958
           7     0.9902    0.9864    0.9883      1028
           8     0.9700    0.9959    0.9828       974
           9     0.9841    0.9812    0.9826      1009

    accuracy                    

{'accuracy': 0.9882,
 'precision_macro': 0.9881710238195044,
 'recall_macro': 0.9882015116336662,
 'f1_macro': 0.9881491037966216}

# Load VinaFood21 Dataset

In [34]:
from google.colab import drive
drive.mount('/content/drive')
# Giải nén file ZIP
!unzip -q "/content/drive/MyDrive/Colab Notebooks/VinaFood21.zip" -d "/content/drive/MyDrive/VinaFood21"
!ls /content/drive/MyDrive/VinaFood21

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
replace /content/drive/MyDrive/VinaFood21/VinaFood21/README.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: VinaFood21


In [35]:
data_dir = '/content/drive/MyDrive/VinaFood21/VinaFood21'

#Standardizate images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

train_dataset = datasets.ImageFolder(os.path.join(data_dir, 'train'), transform=transform)
test_dataset = datasets.ImageFolder(os.path.join(data_dir, 'test'), transform=transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

print("Num of train imgs: ", len(train_dataset))
print("Num of test imgs: ", len(test_dataset))
print("Classes: ", train_dataset.classes)

Num of train imgs:  10044
Num of test imgs:  6682
Classes:  ['banh-can', 'banh-hoi', 'banh-mi-chao', 'banh-tet', 'banh-trang-tron', 'banh-u', 'banh-uot', 'bap-nuong', 'bo-kho', 'bo-la-lot', 'bot-chien', 'ca-ri', 'canh-kho-qua', 'canh-khoai-mo', 'ga-nuong', 'goi-ga', 'ha-cao', 'hoanh-thanh-nuoc', 'pha-lau', 'tau-hu', 'thit-kho-trung']


In [36]:
def evaluate_model_batch(model, test_loader, device):
    model.eval()
    model.to(device)

    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            preds = torch.argmax(torch.softmax(outputs, dim=1), dim=1)

            all_preds.append(preds.cpu())
            all_labels.append(batch_y.cpu())

    y_pred = torch.cat(all_preds).numpy()
    y_true = torch.cat(all_labels).numpy()

    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision (macro):", precision_score(y_true, y_pred, average='macro', zero_division=0))
    print("Recall (macro):", recall_score(y_true, y_pred, average='macro', zero_division=0))
    print("F1-score (macro):", f1_score(y_true, y_pred, average='macro', zero_division=0))
    print(classification_report(y_true, y_pred, digits=4))

### Bài 3*: Xây dựng mô hình ResNet-18, đánh giá mô hình ResNet-18 trên bộ dữ liệu VinaFood21 sử dụng các độ đo precision, recall, và F1 (Sử dụng Adam làm optimizer).



In [42]:
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Nếu kích thước thay đổi, dùng 1x1 conv để điều chỉnh shortcut
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        shortcut = self.shortcut(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)

        out += shortcut
        out = self.relu(out)
        return out


In [38]:
class ResNet18(nn.Module):
    def __init__(self, num_classes=1000):
        super(ResNet18, self).__init__()

        # 7x7 conv đầu tiên
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # 4 nhóm residual block
        self.layer1 = self._make_layer(64, 64, num_blocks=2, stride=1)   # giữ nguyên kích thước
        self.layer2 = self._make_layer(64, 128, num_blocks=2, stride=2)  # giảm 1/2
        self.layer3 = self._make_layer(128, 256, num_blocks=2, stride=2) # giảm 1/2
        self.layer4 = self._make_layer(256, 512, num_blocks=2, stride=2) # giảm 1/2

        # Global Average Pooling + FC
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, in_channels, out_channels, num_blocks, stride):
        layers = []
        layers.append(ResidualBlock(in_channels, out_channels, stride))
        for _ in range(1, num_blocks):
            layers.append(ResidualBlock(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

In [39]:
resnet18_model = ResNet18(num_classes=21)

optimizer3 = torch.optim.Adam(resnet18_model.parameters(), lr=1e-4)
criterion3 = nn.CrossEntropyLoss()

In [43]:
train_model(resnet18_model, train_loader, num_epochs, criterion3, optimizer3, device)

Epoch [1/10] - Loss: 1.1469
Epoch [2/10] - Loss: 0.7949
Epoch [3/10] - Loss: 0.4555
Epoch [4/10] - Loss: 0.2122
Epoch [5/10] - Loss: 0.1256
Epoch [6/10] - Loss: 0.0922
Epoch [7/10] - Loss: 0.0769
Epoch [8/10] - Loss: 0.1062
Epoch [9/10] - Loss: 0.1200
Epoch [10/10] - Loss: 0.0854


In [44]:
evaluate_model_batch(resnet18_model, test_loader, device)

Accuracy: 0.41319964082609995
Precision (macro): 0.5127162741955409
Recall (macro): 0.40387466160909546
F1-score (macro): 0.4097230186914821
              precision    recall  f1-score   support

           0     0.7414    0.1784    0.2876       241
           1     0.7296    0.2801    0.4048       607
           2     0.5781    0.2561    0.3549       289
           3     0.7910    0.3846    0.5176       364
           4     0.2306    0.6440    0.3396       309
           5     0.4234    0.2670    0.3275       176
           6     0.4711    0.2911    0.3598       560
           7     0.4286    0.0629    0.1098       143
           8     0.5172    0.2663    0.3516       338
           9     0.7213    0.5764    0.6408       458
          10     0.1888    0.4167    0.2599       276
          11     0.2320    0.1696    0.1959       171
          12     0.7593    0.5655    0.6482       290
          13     0.8021    0.7778    0.7897       198
          14     0.3861    0.3737    0.3798     

### Bài 4*: Sử dụng pretrained ResNet50 từ HuggingFace để fine-tune trên bộ dữ liệu VinaFood21.

In [45]:
from transformers import ResNetForImageClassification

class PretrainedResnet(nn.Module):
    def __init__(self):
        super().__init__()
        basemodel = ResNetForImageClassification.from_pretrained("microsoft/resnet-50")
        self.resnet = basemodel.resnet
        self.classifier = nn.Linear(in_features=2048, out_features=21, bias=True)

    def forward(self, images: torch.Tensor):
        features = self.resnet(images).pooler_output
        features = features.squeeze(-1).squeeze(-1)
        logits = self.classifier(features)
        return logits

pretrained_resnet_model = PretrainedResnet()

optimizer4 = torch.optim.Adam(pretrained_resnet_model.parameters(), lr=1e-4)
criterion4 = nn.CrossEntropyLoss()

train_model(pretrained_resnet_model, train_loader, num_epochs, criterion4, optimizer4, device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]



Epoch [1/10] - Loss: 2.6426
Epoch [2/10] - Loss: 1.3285
Epoch [3/10] - Loss: 0.6730
Epoch [4/10] - Loss: 0.4127
Epoch [5/10] - Loss: 0.2643
Epoch [6/10] - Loss: 0.1759
Epoch [7/10] - Loss: 0.1156
Epoch [8/10] - Loss: 0.0730
Epoch [9/10] - Loss: 0.0505
Epoch [10/10] - Loss: 0.0349


In [46]:
evaluate_model_batch(pretrained_resnet_model, test_loader, device)

Accuracy: 0.8454055671954505
Precision (macro): 0.8486900112434922
Recall (macro): 0.8452469601908954
F1-score (macro): 0.8451952950564671
              precision    recall  f1-score   support

           0     0.8315    0.9212    0.8740       241
           1     0.8481    0.8649    0.8564       607
           2     0.8246    0.9273    0.8730       289
           3     0.9604    0.8654    0.9104       364
           4     0.9359    0.8511    0.8915       309
           5     0.8263    0.8920    0.8579       176
           6     0.7870    0.7786    0.7828       560
           7     0.9716    0.9580    0.9648       143
           8     0.8459    0.6657    0.7450       338
           9     0.8796    0.9410    0.9093       458
          10     0.7972    0.8261    0.8114       276
          11     0.6408    0.5322    0.5815       171
          12     0.9308    0.9276    0.9292       290
          13     0.9727    0.8990    0.9344       198
          14     0.8338    0.7957    0.8143       