In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F     
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torchvision import transforms
from PIL import Image
import pandas as pd
import os
    

In [2]:
class CNN(nn.Module):
    """Small three-stage convolutional image classifier.

    Each stage is a 3x3 convolution with padding 1 (spatial size preserved),
    a ReLU, and a 2x2 max-pool that halves height and width (rounding down).
    The pooled feature map is flattened and fed through two fully connected
    layers with dropout in between. ``forward`` returns raw class scores.

    Args:
        input_chnls: Number of channels in the input images.
        num_classes: Number of scores produced per image.
        input_size: ``(height, width)`` of the images the network will receive.
            Only used to size the first fully connected layer. The default
            ``(200, 80)`` reproduces the previously hard-coded ``32 * 25 * 10``.

    Raises:
        ValueError: If ``input_size`` is too small to survive three 2x2 poolings.
    """

    def __init__(self, input_chnls, num_classes, input_size=(200, 80)):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(input_chnls, out_channels=8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(8, out_channels=16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(16, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)

        # Three floor-halvings are equivalent to one floor-division by 8,
        # e.g. 200x80 -> 100x40 -> 50x20 -> 25x10.
        height, width = input_size
        pooled_h, pooled_w = height // 8, width // 8
        if pooled_h < 1 or pooled_w < 1:
            raise ValueError(
                f"input_size {tuple(input_size)} is too small: each side must be "
                "at least 8 pixels to survive three 2x2 poolings"
            )

        self.fc1 = nn.Linear(32 * pooled_h * pooled_w, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)`` for a batch of images."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        # Flatten everything except the batch dimension.
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [3]:
# class CNN(nn.Module):
#     def __init__(self, input_chnls, num_classes):
#         super(CNN, self).__init__()
#         self.conv1 = nn.Conv2d(input_chnls, 32, kernel_size=3, padding=1)
#         self.bn1 = nn.BatchNorm2d(32)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
#         self.bn2 = nn.BatchNorm2d(64)
#         self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
#         self.bn3 = nn.BatchNorm2d(128)
#         self.pool = nn.MaxPool2d(2, 2)
        
#         # Adaptive pooling instead of hardcoding dimensions
#         self.gap = nn.AdaptiveAvgPool2d((4, 4))
#         self.fc1 = nn.Linear(128*4*4, 256)
#         self.fc2 = nn.Linear(256, num_classes)
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, x):
#         x = self.pool(F.relu(self.bn1(self.conv1(x))))
#         x = self.pool(F.relu(self.bn2(self.conv2(x))))
#         x = self.pool(F.relu(self.bn3(self.conv3(x))))
#         x = self.gap(x)
#         x = x.view(x.size(0), -1)
#         x = F.relu(self.fc1(x))
#         x = self.dropout(x)
#         x = self.fc2(x)
#         return x


In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
device

device(type='cuda')

In [6]:
# First model instance: 3 input channels, 100 output classes, placed on `device`.
model = CNN(input_chnls=3, num_classes=100).to(device)

In [7]:
class CustomImageDataset(Dataset):
    """Image-classification dataset driven by a CSV label file.

    The CSV is read with pandas. Column 0 holds the image file name (joined
    onto ``img_dir``) and column 1 holds the label for that image.

    Args:
        csv_file: Path of the CSV label file.
        img_dir: Directory containing the image files named in the CSV.
        transforms: Optional callable applied to each loaded PIL image.
        class_to_idx: Optional mapping from label to integer class index.
            When omitted, a mapping is built from the sorted unique labels
            found in ``csv_file``. Pass an explicit mapping when several
            datasets must share the same indices.
    """

    def __init__(self, csv_file, img_dir, transforms = None, class_to_idx = None):
        self.labels_df = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transforms = transforms

        if class_to_idx is None:
            # Without this fallback the declared default (None) would make
            # every __getitem__ call fail on the label lookup.
            labels = sorted(self.labels_df.iloc[:, 1].unique().tolist())
            class_to_idx = {label: index for index, label in enumerate(labels)}
        self.class_to_idx = class_to_idx

    def __len__(self):
        """Return the number of rows (samples) in the label file."""
        return len(self.labels_df)

    def __getitem__(self, idx):
        """Return ``(image, class_index)`` for row ``idx`` of the label file.

        Raises:
            KeyError: If the row's label is missing from ``class_to_idx``.
        """
        img_name = self.labels_df.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, img_name)

        # Image.open is lazy and keeps the file handle open; convert() forces
        # the pixel data to load so the handle can be released right away.
        # Converting to RGB also guarantees 3 channels for grayscale, palette
        # or RGBA files (a pixel-identical copy for images already in RGB).
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        label = self.labels_df.iloc[idx, 1]
        y_label = self.class_to_idx[label]

        if self.transforms:
            image = self.transforms(image)

        return image, y_label

In [8]:
# Preprocessing applied to every image before it reaches the model:
# resize to (200, 80), convert to a tensor, then normalise each of the
# three channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(200, 80)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)


In [9]:
# with open('ai_wordlist.txt', 'r') as f:
#     words = [line.strip() for line in f.readlines()]

# class_to_idx = {word: idx for idx, word in enumerate(words)}

# len(class_to_idx)



In [10]:
# Build one label -> class-index mapping from every label file (train and
# test, easy and hard) so that all datasets encode a label identically.
# pandas is already imported in the notebook's import cell.
LABEL_COLUMN = 1  # positional index of the label column in each CSV

all_labels = []
for file in [
    "captcha_images/train/labels_easy.csv",
    "captcha_images/train/labels_hard.csv",
    "captcha_images/test/labels_easy.csv",
    "captcha_images/test/labels_hard.csv"
]:
    df = pd.read_csv(file)
    all_labels.extend(df.iloc[:, LABEL_COLUMN].tolist())

# Sorting makes the index assignment independent of file/row order.
unique_labels = sorted(set(all_labels))
class_to_idx = {label: idx for idx, label in enumerate(unique_labels)}

print("Classes found:", len(class_to_idx))
print(list(class_to_idx.items())[:10])



Classes found: 100
[('adapt', 0), ('agents', 1), ('aiword1', 2), ('aiword10', 3), ('aiword11', 4), ('aiword12', 5), ('aiword13', 6), ('aiword2', 7), ('aiword3', 8), ('aiword4', 9)]


In [11]:
def _captcha_dataset(split, difficulty):
    """Build the dataset for one split ('train' or 'test') and difficulty ('easy' or 'hard').

    Labels are read from ``captcha_images/<split>/labels_<difficulty>.csv`` and
    images from ``captcha_images/<split>/<difficulty>``. Every dataset uses the
    notebook's shared ``transform`` and ``class_to_idx``.
    """
    return CustomImageDataset(
        csv_file = f'captcha_images/{split}/labels_{difficulty}.csv',
        img_dir = f'captcha_images/{split}/{difficulty}',
        transforms = transform,
        class_to_idx = class_to_idx
    )


# Training data: easy and hard captchas concatenated into a single dataset.
train_dataset_easy = _captcha_dataset('train', 'easy')
train_dataset_hard = _captcha_dataset('train', 'hard')
train_dataset = ConcatDataset([train_dataset_easy, train_dataset_hard])

# Test data: same layout as the training data.
test_dataset_easy = _captcha_dataset('test', 'easy')
test_dataset_hard = _captcha_dataset('test', 'hard')
test_dataset = ConcatDataset([test_dataset_easy, test_dataset_hard])


In [12]:

# Batches of 32 for both splits; only the training loader is shuffled.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [13]:
# Adam over the parameters of the `model` instance that exists when this cell runs.
# NOTE(review): a later cell rebinds `model` to a freshly built network; this
# optimizer would still reference the earlier instance's parameters, so
# re-create it if training is resumed after that cell.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [14]:
# Cross-entropy loss; the (commented-out) training loop applies it to the
# model's output scores and the integer class targets.
criterion = nn.CrossEntropyLoss()

In [15]:
# Number of passes over the training data made by the (commented-out) training loop.
num_epochs = 100

In [16]:
# for epoch in range(num_epochs):
#     epoch_loss = 0.0
#     for batch_idx, (data, targets) in enumerate(train_dataloader):
#         data = data.to(device = device)
#         targets = targets.to(device = device)

#         # Forward pass
#         scores = model(data)
#         loss = criterion(scores, targets)

#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
        
#         epoch_loss += loss.item()
    
#     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_dataloader):.4f}')


In [17]:
# torch.save(model.state_dict(), 'smolCNN_model.pth')
# print("Model saved as smolCNN_model.pth")

In [None]:
# Rebuild the network with one output per entry in `class_to_idx`, restore the
# saved weights, and switch to evaluation mode.
checkpoint_path = 'smolCNN_model.pth'
model = CNN(3, len(class_to_idx)).to(device)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed PyTorch supports it).
saved_weights = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(saved_weights)
model.eval()

Model loaded successfully!


In [25]:
def check_accuracy(loader, model):
    """Measure and print the classification accuracy of ``model`` over ``loader``.

    The model is evaluated in eval mode with gradients disabled, and its
    previous train/eval mode is restored afterwards. Batches are moved to the
    device of the model's first parameter; a model without parameters receives
    the batches unchanged.

    Args:
        loader: Iterable yielding ``(inputs, targets)`` tensor batches.
        model: ``nn.Module`` returning per-class scores with classes along dim 1.

    Returns:
        Fraction of correctly classified samples as a float in ``[0, 1]``;
        ``0.0`` when ``loader`` yields no samples.
    """
    # Follow the model's own device instead of depending on a notebook global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    num_correct = 0
    num_samples = 0

    # Dropout must be inactive while measuring accuracy, whatever mode the
    # caller left the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                if target_device is not None:
                    x = x.to(device = target_device)
                    y = y.to(device = target_device)
                scores = model(x)
                _, predictions = scores.max(1)
                # Accumulate a plain int so the total prints and divides as a number.
                num_correct += int((predictions == y).sum())
                num_samples += predictions.size(0)
    finally:
        model.train(was_training)

    # An empty loader reports 0.0 instead of dividing by zero.
    acc = float(num_correct) / num_samples if num_samples else 0.0
    print(f'Got {num_correct} / {num_samples} with accuracy {acc * 100:.2f}')
    return acc

# Report accuracy on the training split, then on the test split.
# Only the last call's return value is echoed as the cell's output.
check_accuracy(train_dataloader, model)
check_accuracy(test_dataloader, model)

Got 9999 / 10000 with accuracy 99.99
Got 2941 / 3000 with accuracy 98.03
Got 2941 / 3000 with accuracy 98.03


0.9803333333333333

- With 10 epochs: 1% accuracy
- With 100 epochs: 100% accuracy
- At this point the model is just memorizing the data; more samples need to be added.

- I then reduced the model from 6M to 500k parameters, and it works fine now.