In [45]:
import torch
import torch.nn as nn
import torch.nn.functional as F     
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torchvision import transforms
from PIL import Image
import pandas as pd
import os
    

In [46]:
# class CNN(nn.Module):
#     def __init__(self, input_chnls, num_classes):
#         super(CNN, self).__init__()
#         self.conv1 = nn.Conv2d(in_channels = input_chnls, out_channels = 16, kernel_size = 5, stride = 1, padding = 2)
#         self.pool = nn.MaxPool2d(kernel_size = 2, stride = 2, padding = 0)
#         self.conv2 = nn.Conv2d(in_channels = 16, out_channels = 32, kernel_size = 5, stride = 1, padding = 2)
#         self.fc1 = nn.Linear(32*62*25, 120)
#         self.fc2 = nn.Linear(120, num_classes)
#         self.dropout = nn.Dropout(0.3) 

#     def forward(self, x):
#         x = self.pool(F.relu(self.conv1(x)))
#         x = self.pool(F.relu(self.conv2(x)))
#         x = x.view(x.size(0), -1)  # Flatten using batch size and -1 for auto calculation
#         x = F.relu(self.fc1(x))
#         x = self.dropout(x)  
#         x = self.fc2(x)
#         return x


In [47]:
class CNN(nn.Module):
    def __init__(self, input_chnls, num_classes):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(input_chnls, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool = nn.MaxPool2d(2, 2)
        
        # Adaptive pooling instead of hardcoding dimensions
        self.gap = nn.AdaptiveAvgPool2d((4, 4))
        self.fc1 = nn.Linear(128*4*4, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = self.pool(F.relu(self.bn3(self.conv3(x))))
        x = self.gap(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [48]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [49]:
device

device(type='cuda')

In [50]:
model = CNN(3, 100).to(device)

In [51]:
class CustomImageDataset(Dataset):
    def __init__(self, csv_file, img_dir, transforms = None, class_to_idx = None):
        self.labels_df = pd.read_csv(csv_file)
        self.img_dir = img_dir 
        self.transforms = transforms
        self.class_to_idx = class_to_idx



    def __len__(self):
        return len(self.labels_df)
    
    def __getitem__(self, idx):
        img_name = self.labels_df.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, img_name)

        image = Image.open(img_path)
        label = self.labels_df.iloc[idx, 1]
        y_label = self.class_to_idx[label]

        if self.transforms:
            image = self.transforms(image)

        return image, y_label

In [52]:
transform = transforms.Compose([
    transforms.Resize((250, 100)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


In [53]:
# with open('ai_wordlist.txt', 'r') as f:
#     words = [line.strip() for line in f.readlines()]

# class_to_idx = {word: idx for idx, word in enumerate(words)}

# len(class_to_idx)



In [54]:
import pandas as pd

# Collect labels from all csv files
all_labels = []
for file in [
    "captcha_images/train/labels_easy.csv",
    "captcha_images/train/labels_hard.csv",
    "captcha_images/test/labels_easy.csv",
    "captcha_images/test/labels_hard.csv"
]:
    df = pd.read_csv(file)
    all_labels.extend(df.iloc[:, 1].tolist())  # assuming 2nd column = label

unique_labels = sorted(set(all_labels))
class_to_idx = {label: idx for idx, label in enumerate(unique_labels)}

print("Classes found:", len(class_to_idx))
print(list(class_to_idx.items())[:10])  # sanity check



Classes found: 100
[('adapt', 0), ('agents', 1), ('aiword1', 2), ('aiword10', 3), ('aiword11', 4), ('aiword12', 5), ('aiword13', 6), ('aiword2', 7), ('aiword3', 8), ('aiword4', 9)]


In [55]:
train_dataset_easy = CustomImageDataset(
    csv_file = 'captcha_images/train/labels_easy.csv',
    img_dir = 'captcha_images/train/easy',
    transforms = transform,
    class_to_idx = class_to_idx
)

train_dataset_hard = CustomImageDataset(
    csv_file = 'captcha_images/train/labels_hard.csv',
    img_dir = 'captcha_images/train/hard',
    transforms = transform,
    class_to_idx = class_to_idx
)

train_dataset = ConcatDataset([train_dataset_easy, train_dataset_hard])

test_dataset_easy = CustomImageDataset(
    csv_file = 'captcha_images/test/labels_easy.csv',
    img_dir = 'captcha_images/test/easy',
    transforms = transform,
    class_to_idx = class_to_idx
)

test_dataset_hard = CustomImageDataset(
    csv_file = 'captcha_images/test/labels_hard.csv',
    img_dir = 'captcha_images/test/hard',
    transforms = transform,
    class_to_idx = class_to_idx
)

test_dataset = ConcatDataset([test_dataset_easy, test_dataset_hard])


In [56]:

train_dataloader = DataLoader(train_dataset, batch_size = 8, shuffle = True)
test_dataloader = DataLoader(test_dataset, batch_size = 8, shuffle = False)

In [57]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [58]:
criterion = nn.CrossEntropyLoss()

In [59]:
num_epochs = 20

In [72]:
# 1. Check label range
for _, labels in train_dataloader:
    print(labels.min().item(), labels.max().item())
    break  # should be between 0 and 99


16 94


In [38]:
# 3. Try overfitting on tiny dataset
small_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(20):
    for xb, yb in small_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss {loss.item()}")
# 2. Sanity check model output
x, y = next(iter(train_dataloader))
out = model(x.to(device))
print(out.shape)  # should be [batch_size, 100]


Epoch 1, Loss 4.603808403015137
Epoch 2, Loss 4.609356880187988
Epoch 3, Loss 4.599602699279785
Epoch 4, Loss 4.601132869720459
Epoch 5, Loss 4.607789993286133
Epoch 6, Loss 4.604128837585449
Epoch 7, Loss 4.604803562164307
Epoch 8, Loss 4.599720001220703
Epoch 9, Loss 4.613910675048828
Epoch 10, Loss 4.613034248352051
Epoch 11, Loss 4.606087684631348
Epoch 12, Loss 4.612348556518555
Epoch 13, Loss 4.603206634521484
Epoch 14, Loss 4.5968170166015625
Epoch 15, Loss 4.604020118713379
Epoch 16, Loss 4.604323387145996
Epoch 17, Loss 4.616062641143799
Epoch 18, Loss 4.601547718048096
Epoch 19, Loss 4.605476379394531
Epoch 20, Loss 4.617447376251221
torch.Size([8, 100])


In [34]:
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device = device)
            y = y.to(device = device)
            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
    acc = float(num_correct) / num_samples
    print(f'Got {num_correct} / {num_samples} with accuracy {acc * 100:.2f}')
    return acc

check_accuracy(train_dataloader, model)
check_accuracy(test_dataloader, model)

Got 10 / 1000 with accuracy 1.00
Got 5 / 500 with accuracy 1.00


0.01

- Number of epochs as 10 - 1% accuracy
- Number of epochs as 100 - 100% accuracy
- Right now model is just memorizing, need to add more samples.