In [17]:
# pip3 install pandas os numpy scikit-learn matplotlib seaborn torch 

In [18]:
import pandas as pd
import os
import numpy as np
import shutil

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import numpy as np
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from PIL import Image

print("All imported")

All imported


In [19]:
# custom dataset class

class FlatImageDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        """
        
        """
        self.df = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        img_name = str(self.df.iloc[idx]["asset_id"]) + ".jpg"
        label = self.df.iloc[idx]["label"]

        img_path = os.path.join(self.img_dir, img_name)
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, label

In [20]:
data = pd.read_csv("data/gz2_filename_mapping.csv") # uses objid
gz2_data = pd.read_csv("data/gz2_hart16.csv.gz", compression="gzip") # uses dr7objid
gz2_data.to_csv("data/gz2_hart16.csv")

flag_labels = gz2_data[["t04_spiral_a08_spiral_flag", "dr7objid"]]

print(data.columns.tolist())
print(flag_labels.columns.tolist())

['objid', 'sample', 'asset_id']
['t04_spiral_a08_spiral_flag', 'dr7objid']


In [21]:
data = data.drop(columns=["sample"])

In [22]:
# dataset 

dataset = pd.merge(data, flag_labels, left_on="objid", right_on="dr7objid", how="inner")
dataset = dataset.drop(columns=["dr7objid"])

dataset.to_csv("data/dataset.csv", index=False)
print(dataset.columns.tolist())

['objid', 'asset_id', 't04_spiral_a08_spiral_flag']


In [23]:
dataset = dataset.rename(columns={"t04_spiral_a08_spiral_flag": "label"})

In [24]:
a = {14, 52, 16}
b = {14, 24, 25}

print(a.difference(b))

{16, 52}


In [25]:
# checking files in images folder

actual_files = set(os.listdir("data/gz2_images/"))
valid_files = set(dataset["asset_id"].astype(str) + ".jpg")

missing_images = valid_files.difference(actual_files)

count = 0

for img_file in os.listdir("data/gz2_images/"):
    img_path = os.path.join("data/gz2_images/", img_file)

    if img_file not in valid_files:
        print("DJGDJNFGJFNGJDFN")
        print(img_file)
        print(valid_files)
        shutil.move(img_path, os.path.join("data/unlabelled_images/", img_file))
        count += 1
        print(f"Moved {len(actual_files-valid_files)} extra images to {"data/unlabelled_images/"}")

print(missing_images)

if missing_images:
    for img in missing_images:
        missing_ids = img.replace(".jpg", "")
        dataset = dataset[~dataset["asset_id"].astype(str).isin(missing_ids)]
        dataset.to_csv("data/updated_dataset.csv", index=False)
        print(f"Removed {len(missing_ids)} missing entries from CSV → saved as dataset_cleaned.csv")
# else: 
#     print(actual_files)
#     print(valid_files)



{'287733.jpg', '15247.jpg', '290772.jpg', '286733.jpg', '95667.jpg', '198348.jpg', '284389.jpg', '270604.jpg', '276927.jpg', '99869.jpg', '275489.jpg', '242804.jpg', '15246.jpg', '50643.jpg', '242803.jpg', '273522.jpg', '277280.jpg', '7351.jpg', '281130.jpg', '26606.jpg', '203684.jpg', '33876.jpg', '10959.jpg', '222471.jpg', '288961.jpg', '271224.jpg', '27513.jpg', '249103.jpg', '215412.jpg', '274093.jpg', '281129.jpg', '288795.jpg', '215414.jpg', '274197.jpg', '285937.jpg', '198041.jpg', '274964.jpg', '283829.jpg', '198349.jpg', '287211.jpg', '275615.jpg', '242802.jpg', '282536.jpg', '216810.jpg', '198040.jpg', '27514.jpg', '286946.jpg', '282534.jpg', '293324.jpg', '288251.jpg', '293831.jpg', '102665.jpg', '252151.jpg', '247974.jpg', '292644.jpg', '281128.jpg', '274709.jpg', '278111.jpg', '288793.jpg', '278078.jpg', '294225.jpg', '293834.jpg', '198345.jpg', '40057.jpg', '33877.jpg', '92317.jpg', '203683.jpg', '278075.jpg', '278702.jpg', '191283.jpg', '198347.jpg', '283507.jpg', '22137

TypeError: only list-like objects are allowed to be passed to isin(), you passed a `str`

In [9]:
# data 

train_val_df, test_df = train_test_split(dataset, test_size=0.15, stratify=dataset["label"], random_state=26)
train_df, val_df = train_test_split(train_val_df, test_size=0.15/(0.85), stratify=train_val_df["label"], random_state=26)

train_df.to_csv("data/train_df.csv", index=False)
val_df.to_csv("data/val_df.csv", index=False)
test_df.to_csv("data/test_df.csv", index=False)

print(f"Train: {len(train_df)}, Validation: {len(val_df)}, Test: {len(test_df)}")

Train: 167785, Validation: 35955, Test: 35955


In [10]:
transform = transforms.Compose([
    transforms.Resize((128, 128)), 
    transforms.ToTensor(), 
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

In [11]:
train_dataset = FlatImageDataset("data/train_df.csv", "data/images", transform=transform)
val_dataset = FlatImageDataset("data/val_df.csv", "data/images", transform=transform)
test_dataset = FlatImageDataset("data/test_df.csv", "data/images", transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Model

In [12]:
class CNN(nn.Module):
    def __init__(self, num_classes, in_channels=3, input_size=(3,128,128)):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)

        with torch.no_grad():
            dummy = torch.zeros(1, *input_size)
            x = self.pool(F.relu(self.conv1(dummy)))
            x = self.pool(F.relu(self.conv2(x)))
            flat_dim = x.view(1, -1).shape[1]

        self.fc1 = nn.Linear(flat_dim, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device)

Using device:  cpu


In [14]:
num_classes = len(train_dataset.df['label'].unique())
print("Num classes: ", num_classes)

model = CNN(num_classes, input_size=(3,128,128)).to(device)

Num classes:  2


In [15]:
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), 0.0001)

In [16]:
print(train_df['label'].value_counts())
print(val_df['label'].value_counts())

label
0    94028
1    73757
Name: count, dtype: int64
label
0    20150
1    15805
Name: count, dtype: int64


In [17]:
num_epochs = 5
train_losses = []
val_losses = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimiser.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimiser.step()

        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss/len(train_loader.dataset)
    train_losses.append(epoch_loss)

    model.eval()
    val_loss = 0.0
    val_correct = 0
    with torch.no_grad(): 
        for images, label in val_loader:
            images, label = images.to(device), label.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * images.size(0)

            _, preds = torch.max(outputs, 1)
            val_correct = (preds == labels).sum().item()

        val_epoch_loss = val_loss / len(val_loader.dataset)
        val_acc = val_correct / len(val_loader.dataset)
        val_loss.append(val_epoch_loss)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train loss {epoch_loss:4f}")
        print(f"Val loss {val_epoch_loss:4f}")
        print(f"Vall acc {val_acc:4f}")

FileNotFoundError: [Errno 2] No such file or directory: 'data/images/27513.jpg'