In [1]:
# ==========================
# 1. CÀI ĐẶT & TẢI DATASET
# ==========================
!pip install -q kaggle

import os
import zipfile
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from google.colab import files

# Step 1: ask the user to upload their Kaggle API token (kaggle.json)
print("Vui lòng upload file kaggle.json (tải từ tài khoản Kaggle của bạn)...")
uploaded = files.upload()

# files.upload() returns a dict keyed by the name the file was *saved* under.
# When a kaggle.json from an earlier run already exists in the working
# directory, Colab stores the new upload as "kaggle (1).json",
# "kaggle (2).json", ... so accept those names as well; otherwise re-running
# this cell fails even though the right file was uploaded.
kaggle_key = "kaggle.json" if "kaggle.json" in uploaded else None
if kaggle_key is None:
    for name in uploaded:
        if name.startswith("kaggle (") and name.endswith(").json"):
            kaggle_key = name
            break

if kaggle_key is None:
    raise RuntimeError("Không tìm thấy kaggle.json, vui lòng upload lại.")

# Step 2: install the token under ~/.kaggle, always named kaggle.json
os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
kaggle_path = os.path.expanduser("~/.kaggle/kaggle.json")

with open(kaggle_path, "wb") as f:
    f.write(uploaded[kaggle_key])

# The token is a secret: restrict it to owner read/write only.
os.chmod(kaggle_path, 0o600)

# Bước 3: Tải dataset A-Z Handwritten Alphabet
!kaggle datasets download -d sachinpatel21/az-handwritten-alphabets-in-csv-format -p ./az_data

# Giải nén
zip_path = "./az_data/az-handwritten-alphabets-in-csv-format.zip"
with zipfile.ZipFile(zip_path, "r") as zf:
    zf.extractall("./az_data")

# Locate the CSV: take the first file (in os.walk order) under ./az_data whose
# name ends with ".csv", case-insensitively.
csv_candidates = (
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk("./az_data")
    for filename in filenames
    if filename.lower().endswith(".csv")
)
csv_path = next(csv_candidates, None)

# Nothing matched: the archive did not contain what we expected.
if csv_path is None:
    raise FileNotFoundError("Không tìm thấy file CSV trong thư mục az_data sau khi giải nén.")

print("Đã tìm thấy file CSV:", csv_path)

# ==========================
# 2. DATA PREPROCESSING
# ==========================

# Load the whole CSV into a DataFrame.
# NOTE(review): read_csv treats the first row as column names by default.
# Confirm this file really has a header row; if it does not, the first sample
# is being consumed as the header and header=None would be needed.
data = pd.read_csv(csv_path)

# Column 0 is the class label (0-25); the remaining 784 columns are pixels.
labels_column = data.iloc[:, 0]
pixel_columns = data.iloc[:, 1:]
y = labels_column.to_numpy()

# Cast pixels to float32 and scale them into [0, 1].
X = pixel_columns.to_numpy().astype("float32") / 255.0

# One 28x28 single-channel image per row: (num_samples, 28, 28, 1).
X = X.reshape((-1, 28, 28, 1))

# One-hot encode the labels over the 26 letter classes.
num_classes = 26
y_cat = keras.utils.to_categorical(y, num_classes=num_classes)

# Hold out 20% for validation, stratified on the integer labels so both
# splits keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_cat,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("Shape X_train:", X_train.shape)
print("Shape X_val:", X_val.shape)

# ==========================
# 3. BUILD THE CNN MODEL
# ==========================

# Seed Python, NumPy and the Keras/TensorFlow backend so that weight
# initialisation, dropout masks and batch shuffling are repeatable from run to
# run. The value matches the random_state already used for the
# train/validation split. (Some GPU kernels may still be non-deterministic.)
SEED = 42
keras.utils.set_random_seed(SEED)

# Three Conv -> MaxPool stages (32, 64, 128 filters) over 28x28x1 inputs,
# followed by a small dense classifier head with dropout.
model = keras.Sequential(
    [
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(32, (3, 3), activation="relu", padding="same"),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation="relu", padding="same"),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation="relu", padding="same"),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),  # one output per letter A-Z
    ]
)

model.summary()

# ==========================
# 4. COMPILE & TRAIN
# ==========================

# Training hyper-parameters
EPOCHS = 15
BATCH_SIZE = 128

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Fit on the training split and evaluate on the validation split after each
# epoch; the returned History object keeps the per-epoch metrics.
fit_options = dict(
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)
history = model.fit(X_train, y_train, **fit_options)

# ==========================
# 5. SAVE THE MODEL & DOWNLOAD IT
# ==========================

# NOTE(review): the .h5 extension selects the HDF5 format; check whether the
# consumers of this file could load the native .keras format instead.
model_path = "alphabet_model.h5"
model.save(filepath=model_path)
print(f"Đã lưu model vào {model_path}")

# Trigger a browser download of the saved file from the Colab runtime.
files.download(filename=model_path)

Vui lòng upload file kaggle.json (tải từ tài khoản Kaggle của bạn)...


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/sachinpatel21/az-handwritten-alphabets-in-csv-format
License(s): CC0-1.0
Downloading az-handwritten-alphabets-in-csv-format.zip to ./az_data
 84% 156M/185M [00:00<00:00, 1.54GB/s]
100% 185M/185M [00:00<00:00, 1.32GB/s]
Đã tìm thấy file CSV: ./az_data/A_Z Handwritten Data.csv
Shape X_train: (297960, 28, 28, 1)
Shape X_val: (74490, 28, 28, 1)


Epoch 1/15
[1m2328/2328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 8ms/step - accuracy: 0.8184 - loss: 0.6381 - val_accuracy: 0.9833 - val_loss: 0.0628
Epoch 2/15
[1m2328/2328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.9682 - loss: 0.1190 - val_accuracy: 0.9870 - val_loss: 0.0491
Epoch 3/15
[1m2328/2328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.9776 - loss: 0.0829 - val_accuracy: 0.9886 - val_loss: 0.0418
Epoch 4/15
[1m2328/2328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.9819 - loss: 0.0670 - val_accuracy: 0.9902 - val_loss: 0.0384
Epoch 5/15
[1m2328/2328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - accuracy: 0.9850 - loss: 0.0539 - val_accuracy: 0.9895 - val_loss: 0.0392
Epoch 6/15
[1m2328/2328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - accuracy: 0.9874 - loss: 0.0455 - val_accuracy: 0.9916 - val_loss: 0.0312
Epoch 7/15



Đã lưu model vào alphabet_model.h5


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>