# Dataset generation for STATS315

# Imports

In [None]:
import torch

In [None]:
# IMPORT PACKAGES
import math

import matplotlib.pyplot as plt
import torch

In [None]:
# Disable the axes grid by default for figures drawn in this notebook
plt.rcParams["axes.grid"] = False

In [None]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop colormap named "reddish" that runs from white to red
reddish_cmap = LinearSegmentedColormap.from_list("reddish", ["white", "red"])

In [None]:
import warnings

# Suppress every FutureWarning and UserWarning raised after this point.
# NOTE(review): this is a blanket filter, so it also hides warnings that may
# point at real problems (e.g. slow or deprecated tensor construction).
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
from astropy.modeling.models import Sersic2D

In [None]:
torch.cuda.is_available()

In [None]:
torch.__version__

In [None]:
# Allocate newly created tensors on the CPU by default
torch.set_default_device("cpu")

In [None]:
# Explicit handle for the CPU device
device = torch.device("cpu")

In [None]:
# Set seed for reproducibility
# torch.manual_seed(43);

In [None]:
import pandas as pd

In [None]:
# import astrophot as ap

# Code

In [None]:
# Side length of the square images; passed as shape=(dim, dim) in the generation cells
dim = 50

In [None]:
import math

import matplotlib.pyplot as plt
import torch
from astropy.convolution import Gaussian2DKernel, convolve
from astropy.modeling.models import Sersic2D

In [None]:
def generate_non_overlapping_params(
    shape,
    num_galaxies,
    rmin=3,
    rmax=10,
    margin=5,
    overlap_factor=1.5,
    max_attempts=1000,
):
    """Sample galaxy centers and effective radii that keep a minimum separation.

    Uses rejection sampling: each attempt draws a radius and a center uniformly
    at random and keeps the candidate only if it is far enough from every
    galaxy accepted so far.

    Args:
        shape: ``(ny, nx)`` image size in pixels.
        num_galaxies: Number of galaxies to place.
        rmin: Lower bound of the uniform range for the effective radius.
        rmax: Upper bound of the uniform range for the effective radius.
        margin: Minimum distance of a center from each image edge.
        overlap_factor: A candidate is rejected when its distance to an
            accepted center is below ``overlap_factor * (r_candidate + r_other)``.
        max_attempts: Maximum number of candidates drawn before giving up.

    Returns:
        ``(centers, r_effs)``: a list of ``(x0, y0)`` tuples and the matching
        list of effective radii. Both lists may be SHORTER than
        ``num_galaxies`` when ``max_attempts`` is exhausted first, so callers
        should use ``len(centers)`` as the true galaxy count.
    """
    ny, nx = shape
    centers = []
    r_effs = []
    for _ in range(max_attempts):
        if len(centers) >= num_galaxies:
            break
        # Draw order (radius, x, y) is kept fixed so a seeded run is reproducible
        r_eff = rmin + (rmax - rmin) * torch.rand(1).item()
        # Random center, kept `margin` pixels away from every edge
        x0 = margin + (nx - 2 * margin) * torch.rand(1).item()
        y0 = margin + (ny - 2 * margin) * torch.rand(1).item()
        # Plain float arithmetic: avoids building two tensors per comparison
        too_close = any(
            math.hypot(x0 - xc, y0 - yc) < overlap_factor * (r_eff + r)
            for (xc, yc), r in zip(centers, r_effs)
        )
        if not too_close:
            centers.append((x0, y0))
            r_effs.append(r_eff)
    return centers, r_effs

In [None]:
def generate_synthetic_image(
    shape=(50, 50),
    conv=True,
    _noise_sigma=0.0,
    psf_sigma=1.0,
    _normalize=False,
    noise_sigma=None,
    normalize=None,
):
    """Render a noise-free image containing a random number of Sersic galaxies.

    Between 0 and 6 galaxies are requested; their centers and radii come from
    ``generate_non_overlapping_params`` and the remaining Sersic parameters are
    drawn uniformly. The summed profiles are optionally blurred with a
    Gaussian PSF and clipped at zero.

    Args:
        shape: ``(ny, nx)`` image size in pixels.
        conv: If true, convolve the image with a normalized Gaussian kernel.
        _noise_sigma: Unused. No noise is added by this function.
        psf_sigma: Standard deviation passed to ``Gaussian2DKernel``.
        _normalize: Unused.
        noise_sigma: Accepted so that calls written as ``noise_sigma=...``
            keep working; ignored exactly like ``_noise_sigma``.
        normalize: Accepted for the same reason; ignored.

    Returns:
        ``(image, num_galaxies)``: a float32 tensor of the given ``shape`` and
        the number of galaxies actually drawn into it as a Python ``int``.
    """
    ny, nx = shape
    y, x = torch.meshgrid(torch.arange(ny), torch.arange(nx), indexing="ij")
    # astropy evaluates on NumPy arrays, so hand it NumPy views of the grid
    x_np, y_np = x.cpu().numpy(), y.cpu().numpy()
    image = torch.zeros(shape)

    # torch.randint needs an explicit size; the upper bound (7) is exclusive
    requested = int(torch.randint(0, 7, (1,)).item())
    centers, r_effs = generate_non_overlapping_params(shape, requested)

    for (x0, y0), r_eff in zip(centers, r_effs):
        amplitude = 0.5 + (2.0 - 0.5) * torch.rand(1).item()
        n = 1.0 + (4.0 - 1.0) * torch.rand(1).item()
        ellip = 0 + (0.8 - 0) * torch.rand(1).item()
        theta = 0 + (2 * math.pi - 0) * torch.rand(1).item()

        sersic = Sersic2D(
            amplitude=amplitude, r_eff=r_eff, n=n, x_0=x0, y_0=y0, ellip=ellip, theta=theta
        )
        # Convert the NumPy result explicitly before accumulating into the tensor
        image += torch.as_tensor(sersic(x_np, y_np), dtype=image.dtype)

    if conv:
        kernel = Gaussian2DKernel(psf_sigma)
        kernel.normalize()
        psf = kernel.array

        # convolve() takes and returns NumPy arrays; convert on both sides
        image = torch.as_tensor(convolve(image.cpu().numpy(), psf), dtype=image.dtype)

    # Clip negative values to 0
    image = torch.clamp(image, min=0)

    # Placement can give up early, so report the galaxies really in the image
    return image, len(centers)

# Initial normalization approach

In [None]:
# Number of images generated for each training set
Ntrain = 10000

In [None]:
# Number of images generated for each test set
Ntest = 2500

In [None]:
# Side length of the square images; used as shape=(dim, dim) below
dim = 50

## $\sigma = 0$

In [None]:
sigma = 0.00

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
torch.save((dataset_train, ns_train), f"../data/315/dataset_train_{sigma}.pt")
torch.save((dataset_test, ns_test), f"../data/315/dataset_test_{sigma}.pt")

In [None]:
# Normalize by the training set max
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

dataset_train = (dataset_train - train_min) / (train_max - train_min)
dataset_test = (dataset_test - test_min) / (test_max - test_min)

In [None]:
torch.save((dataset_train, ns_train), f"../data/315/dataset_train_{sigma}_norm.pt")
torch.save((dataset_test, ns_test), f"../data/315/dataset_test_{sigma}_norm.pt")

## $\sigma = 0.05$

In [None]:
sigma = 0.05

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Normalize by the training set max
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)

In [None]:
# add noise
dataset_train += sigma * torch.randn(size=dataset_train.shape)
dataset_test += sigma * torch.randn(size=dataset_test.shape)

dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape) / (train_max - train_min)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape) / (test_max - test_min)

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
# Save datasets
torch.save((dataset_train, ns_train), f"../data/315/dataset_train_{sigma}.pt")
torch.save((dataset_test, ns_test), f"../data/315/dataset_test_{sigma}.pt")

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_{sigma}_norm.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_{sigma}_norm.pt")

## $\sigma = 0.1$

In [None]:
sigma = 0.1

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Normalize by the training set max
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)

In [None]:
# add noise
dataset_train += sigma * torch.randn(size=dataset_train.shape)
dataset_test += sigma * torch.randn(size=dataset_test.shape)

dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape) / (train_max - train_min)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape) / (test_max - test_min)

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
# Save datasets
torch.save((dataset_train, ns_train), f"../data/315/dataset_train_{sigma}.pt")
torch.save((dataset_test, ns_test), f"../data/315/dataset_test_{sigma}.pt")

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_{sigma}_norm.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_{sigma}_norm.pt")

## $\sigma = 0.25$

In [None]:
sigma = 0.25

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Normalize by the training set max
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)

In [None]:
# add noise
dataset_train += sigma * torch.randn(size=dataset_train.shape)
dataset_test += sigma * torch.randn(size=dataset_test.shape)

dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape) / (train_max - train_min)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape) / (test_max - test_min)

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
# Save datasets
torch.save((dataset_train, ns_train), f"../data/315/dataset_train_{sigma}.pt")
torch.save((dataset_test, ns_test), f"../data/315/dataset_test_{sigma}.pt")

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_{sigma}_norm.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_{sigma}_norm.pt")

**UPDATE: 3/23/25: Generate a large test set for Kaggle**

In [None]:
Ntrain, Ntest

In [None]:
Ntest_kaggle = 40000
Ntest_kaggle

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Normalize by the training set
test_max = dataset_test_kaggle.max()
test_min = dataset_test_kaggle.min()

dataset_test_norm_kaggle = (dataset_test_kaggle - test_min) / (test_max - test_min)

In [None]:
# add noise
dataset_test_norm_kaggle += (
    sigma * torch.randn(size=dataset_test_norm_kaggle.shape) / (test_max - test_min)
)

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_test_norm_kaggle[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

In [None]:
torch.save(
    (dataset_test_norm_kaggle, ns_test_kaggle), f"../data/315/dataset_test_{sigma}_norm_kaggle.pt"
)

## $\sigma = 0.5$

In [None]:
sigma = 0.5

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Normalize by the training set
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)

In [None]:
# add noise
dataset_train += sigma * torch.randn(size=dataset_train.shape)
dataset_test += sigma * torch.randn(size=dataset_test.shape)

dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape) / (train_max - train_min)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape) / (test_max - test_min)

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
# Save datasets
torch.save((dataset_train, ns_train), f"../data/315/dataset_train_{sigma}.pt")
torch.save((dataset_test, ns_test), f"../data/315/dataset_test_{sigma}.pt")

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_{sigma}_norm.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_{sigma}_norm.pt")

**UPDATE: 3/23/25: Generate a large test set for Kaggle**

In [None]:
Ntest_kaggle = 40000
Ntest_kaggle

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Normalize by the training set
test_max = dataset_test_kaggle.max()
test_min = dataset_test_kaggle.min()

dataset_test_norm_kaggle = (dataset_test_kaggle - test_min) / (test_max - test_min)

In [None]:
# add noise
dataset_test_norm_kaggle += (
    sigma * torch.randn(size=dataset_test_norm_kaggle.shape) / (test_max - test_min)
)

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_test_norm_kaggle[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

In [None]:
torch.save(
    (dataset_test_norm_kaggle, ns_test_kaggle), f"../data/315/dataset_test_{sigma}_norm_kaggle.pt"
)

## $\sigma = 0.75$

In [None]:
sigma = 0.75

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Normalize by the training set
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)

In [None]:
# add noise
dataset_train += sigma * torch.randn(size=dataset_train.shape)
dataset_test += sigma * torch.randn(size=dataset_test.shape)

dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape) / (train_max - train_min)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape) / (test_max - test_min)

In [None]:
# Save datasets
torch.save((dataset_train, ns_train), f"../data/315/dataset_train_{sigma}.pt")
torch.save((dataset_test, ns_test), f"../data/315/dataset_test_{sigma}.pt")

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_{sigma}_norm.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_{sigma}_norm.pt")

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

**UPDATE: 3/23/25: Generate a large test set for Kaggle**

In [None]:
Ntest_kaggle = 40000
Ntest_kaggle

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Normalize by the training set
test_max = dataset_test_kaggle.max()
test_min = dataset_test_kaggle.min()

dataset_test_norm_kaggle = (dataset_test_kaggle - test_min) / (test_max - test_min)

In [None]:
# add noise
dataset_test_norm_kaggle += (
    sigma * torch.randn(size=dataset_test_norm_kaggle.shape) / (test_max - test_min)
)

In [None]:
torch.save(
    (dataset_test_norm_kaggle, ns_test_kaggle), f"../data/315/dataset_test_{sigma}_norm_kaggle.pt"
)

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_test_norm_kaggle[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

## $\sigma = 1.0$

In [None]:
sigma = 1.0

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Normalize
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)

In [None]:
# add noise
dataset_train += sigma * torch.randn(size=dataset_train.shape)
dataset_test += sigma * torch.randn(size=dataset_test.shape)

dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape) / (train_max - train_min)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape) / (test_max - test_min)

In [None]:
# Save datasets
torch.save((dataset_train, ns_train), f"../data/315/dataset_train_{sigma}.pt")
torch.save((dataset_test, ns_test), f"../data/315/dataset_test_{sigma}.pt")

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_{sigma}_norm.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_{sigma}_norm.pt")

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_train[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

**UPDATE: 3/23/25: Generate a large test set for Kaggle**

In [None]:
Ntest_kaggle = 40000
Ntest_kaggle

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Normalize by the training set
test_max_kaggle = dataset_test_kaggle.max()
test_min_kaggle = dataset_test_kaggle.min()

In [None]:
test_min, test_max, test_min_kaggle, test_max_kaggle

Problem with fat tails on the kaggle set?

In [None]:
dataset_test_norm_kaggle = (dataset_test_kaggle - test_min_kaggle) / (
    test_max_kaggle - test_min_kaggle
)

In [None]:
# add noise
dataset_test_norm_kaggle_default = dataset_test_norm_kaggle + sigma * torch.randn(
    size=dataset_test_norm_kaggle.shape
) / (test_max_kaggle - test_min_kaggle)
dataset_test_norm_kaggle_test = dataset_test_norm_kaggle + sigma * torch.randn(
    size=dataset_test_norm_kaggle.shape
) / (test_max - test_min)

In [None]:
torch.save(
    (dataset_test_norm_kaggle, ns_test_kaggle), f"../data/315/dataset_test_{sigma}_norm_kaggle.pt"
)

In [None]:
iis = torch.randint(0, 100, size=5)
for i in iis:
    plt.imshow(dataset_test_norm_kaggle[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

# New normalization of noise

## $\sigma = 0.01$

In [None]:
sigma = 0.01

In [None]:
Ntrain = 10000
Ntest = 2500
Ntest_kaggle = 40000

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Normalize by the training set
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

test_max_kaggle = dataset_test_kaggle.max()
test_min_kaggle = dataset_test_kaggle.min()

In [None]:
train_min, train_max, test_min, test_max, test_min_kaggle, test_max_kaggle

Normalize to [0, 1]

In [None]:
# Min-max scaling of every split with its own extrema: (x - min) / (max - min)
dataset_train_norm = dataset_train.sub(train_min).div(train_max - train_min)
dataset_test_norm = dataset_test.sub(test_min).div(test_max - test_min)
dataset_kaggle_norm = dataset_test_kaggle.sub(test_min_kaggle).div(
    test_max_kaggle - test_min_kaggle
)

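Add unscaled noise: Gaussian noise with standard deviation $\sigma$ is added directly on the normalized scale, without dividing by the data range.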

In [None]:
dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape)
dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
dataset_train_norm = torch.clip(dataset_train_norm, 0, None)
dataset_test_norm = torch.clip(dataset_test_norm, 0, None)
dataset_kaggle_norm = torch.clip(dataset_kaggle_norm, 0, None)

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
dataset_train_norm = dataset_train_norm.to(torch.float32)
dataset_test_norm = dataset_test_norm.to(torch.float32)
dataset_kaggle_norm = dataset_kaggle_norm.to(torch.float32)

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
iis = torch.randint(0, 100, size=2)
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=2)
for i in iis:
    plt.imshow(dataset_test_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=2)
for i in iis:
    plt.imshow(dataset_kaggle_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_renorm_{sigma}.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_renorm_{sigma}.pt")
torch.save((dataset_kaggle_norm, ns_test_kaggle), f"../data/315/dataset_kaggle_renorm_{sigma}.pt")

## $\sigma = 0.0125$

In [None]:
sigma = 0.0125

In [None]:
Ntrain = 10000
Ntest = 2500
Ntest_kaggle = 40000

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Normalize by the training set
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

test_max_kaggle = dataset_test_kaggle.max()
test_min_kaggle = dataset_test_kaggle.min()

In [None]:
train_min, train_max, test_min, test_max, test_min_kaggle, test_max_kaggle

Normalize to [0, 1]

In [None]:
# Min-max scaling of every split with its own extrema: (x - min) / (max - min)
dataset_train_norm = dataset_train.sub(train_min).div(train_max - train_min)
dataset_test_norm = dataset_test.sub(test_min).div(test_max - test_min)
dataset_kaggle_norm = dataset_test_kaggle.sub(test_min_kaggle).div(
    test_max_kaggle - test_min_kaggle
)

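Add unscaled noise: Gaussian noise with standard deviation $\sigma$ is added directly on the normalized scale, without dividing by the data range.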

In [None]:
dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape)
dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
dataset_train_norm = torch.clip(dataset_train_norm, 0, None)
dataset_test_norm = torch.clip(dataset_test_norm, 0, None)
dataset_kaggle_norm = torch.clip(dataset_kaggle_norm, 0, None)

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
dataset_train_norm = dataset_train_norm.to(torch.float32)
dataset_test_norm = dataset_test_norm.to(torch.float32)
dataset_kaggle_norm = dataset_kaggle_norm.to(torch.float32)

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
iis = torch.randint(0, 100, size=2)
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=2)
for i in iis:
    plt.imshow(dataset_test_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=2)
for i in iis:
    plt.imshow(dataset_kaggle_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_renorm_{sigma}.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_renorm_{sigma}.pt")
torch.save((dataset_kaggle_norm, ns_test_kaggle), f"../data/315/dataset_kaggle_renorm_{sigma}.pt")

## $\sigma = 0.015$

In [None]:
sigma = 0.015

In [None]:
Ntrain = 10000
Ntest = 2500
Ntest_kaggle = 40000

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Normalize by the training set
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

test_max_kaggle = dataset_test_kaggle.max()
test_min_kaggle = dataset_test_kaggle.min()

In [None]:
train_min, train_max, test_min, test_max, test_min_kaggle, test_max_kaggle

Normalize to [0, 1]

In [None]:
# Min-max scaling of every split with its own extrema: (x - min) / (max - min)
dataset_train_norm = dataset_train.sub(train_min).div(train_max - train_min)
dataset_test_norm = dataset_test.sub(test_min).div(test_max - test_min)
dataset_kaggle_norm = dataset_test_kaggle.sub(test_min_kaggle).div(
    test_max_kaggle - test_min_kaggle
)

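Add unscaled noise: Gaussian noise with standard deviation $\sigma$ is added directly on the normalized scale, without dividing by the data range.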

In [None]:
dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape)
dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
dataset_train_norm = torch.clip(dataset_train_norm, 0, None)
dataset_test_norm = torch.clip(dataset_test_norm, 0, None)
dataset_kaggle_norm = torch.clip(dataset_kaggle_norm, 0, None)

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
dataset_train_norm = dataset_train_norm.to(torch.float32)
dataset_test_norm = dataset_test_norm.to(torch.float32)
dataset_kaggle_norm = dataset_kaggle_norm.to(torch.float32)

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
iis = torch.randint(0, 100, size=2)
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=2)
for i in iis:
    plt.imshow(dataset_test_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test[i]}")
    plt.show()

In [None]:
iis = torch.randint(0, 100, size=2)
for i in iis:
    plt.imshow(dataset_kaggle_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_renorm_{sigma}.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_renorm_{sigma}.pt")
torch.save((dataset_kaggle_norm, ns_test_kaggle), f"../data/315/dataset_kaggle_renorm_{sigma}.pt")

## $\sigma = 0.02$

In [None]:
sigma = 0.02

In [None]:
Ntrain = 10000
Ntest = 2500
Ntest_kaggle = 40000

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Normalize by the training set
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

test_max_kaggle = dataset_test_kaggle.max()
test_min_kaggle = dataset_test_kaggle.min()

In [None]:
train_min, train_max, test_min, test_max, test_min_kaggle, test_max_kaggle

Normalize to [0, 1]

In [None]:
# Min-max scaling of every split with its own extrema: (x - min) / (max - min)
dataset_train_norm = dataset_train.sub(train_min).div(train_max - train_min)
dataset_test_norm = dataset_test.sub(test_min).div(test_max - test_min)
dataset_kaggle_norm = dataset_test_kaggle.sub(test_min_kaggle).div(
    test_max_kaggle - test_min_kaggle
)

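Add unscaled noise: Gaussian noise with standard deviation $\sigma$ is added directly on the normalized scale, without dividing by the data range.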

In [None]:
dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape)
dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
# Remove negative pixel values introduced by the noise; there is no upper bound.
dataset_train_norm = dataset_train_norm.clamp(min=0)
dataset_test_norm = dataset_test_norm.clamp(min=0)
dataset_kaggle_norm = dataset_kaggle_norm.clamp(min=0)

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
# Make sure all three splits are stored as float32.
dataset_train_norm = dataset_train_norm.float()
dataset_test_norm = dataset_test_norm.float()
dataset_kaggle_norm = dataset_kaggle_norm.float()

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
# Show two random examples among the first 100 training images, titled with
# their labels. torch.randint requires `size` to be a tuple, so use (2,)
# rather than the bare integer 2 (which raises TypeError).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
# Show two random examples among the first 100 test images, titled with their
# labels. `size` must be a tuple for torch.randint, hence (2,).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_test_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test[i]}")
    plt.show()

In [None]:
# Show two random examples among the first 100 kaggle images, titled with
# their labels. `size` must be a tuple for torch.randint, hence (2,).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_kaggle_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_renorm_{sigma}.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_renorm_{sigma}.pt")
torch.save((dataset_kaggle_norm, ns_test_kaggle), f"../data/315/dataset_kaggle_renorm_{sigma}.pt")

## $\sigma = 0.1$

In [None]:
sigma = 0.1

In [None]:
Ntrain = 10000
Ntest = 2500
Ntest_kaggle = 40000

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Global min/max of each split, used for the min-max scaling that follows.
# NOTE(review): this cell was labelled "Normalize by the training set", but the
# scaling step uses each split's own extrema rather than the training ones.
# Confirm which was intended before comparing pixel values across splits.
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

test_max_kaggle = dataset_test_kaggle.max()
test_min_kaggle = dataset_test_kaggle.min()

In [None]:
train_min, train_max, test_min, test_max, test_min_kaggle, test_max_kaggle

Normalize to [0, 1]

In [None]:
dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)
dataset_kaggle_norm = (dataset_test_kaggle - test_min_kaggle) / (test_max_kaggle - test_min_kaggle)

Add unscaled noise: Gaussian noise with standard deviation $\sigma$, applied directly to the normalized images.

In [None]:
dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape)
dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
dataset_train_norm = torch.clip(dataset_train_norm, 0, None)
dataset_test_norm = torch.clip(dataset_test_norm, 0, None)
dataset_kaggle_norm = torch.clip(dataset_kaggle_norm, 0, None)

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
dataset_train_norm = dataset_train_norm.to(torch.float32)
dataset_test_norm = dataset_test_norm.to(torch.float32)
dataset_kaggle_norm = dataset_kaggle_norm.to(torch.float32)

In [None]:
dataset_train_norm.dtype, dataset_test_norm.dtype, dataset_kaggle_norm.dtype

In [None]:
# Show two random examples among the first 100 training images, titled with
# their labels. torch.randint requires `size` to be a tuple, so use (2,)
# rather than the bare integer 2 (which raises TypeError).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
# Show two random examples among the first 100 test images, titled with their
# labels. `size` must be a tuple for torch.randint, hence (2,).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_test_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test[i]}")
    plt.show()

In [None]:
# Show two random examples among the first 100 kaggle images, titled with
# their labels. `size` must be a tuple for torch.randint, hence (2,).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_kaggle_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

In [None]:
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_renorm_{sigma}.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_renorm_{sigma}.pt")
torch.save((dataset_kaggle_norm, ns_test_kaggle), f"../data/315/dataset_kaggle_renorm_{sigma}.pt")

## $\sigma = 0.5$

In [None]:
sigma = 0.5

In [None]:
# Define all three split sizes explicitly (same values as the other sections)
# so this section does not depend on Ntrain/Ntest left over from whichever
# section happened to be executed before it.
Ntrain = 10000
Ntest = 2500
Ntest_kaggle = 40000
Ntest_kaggle

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
dataset_test_kaggle, ns_test_kaggle = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest_kaggle)]
)
dataset_test_kaggle = torch.tensor(dataset_test_kaggle, dtype=torch.float32)
ns_test_kaggle = torch.tensor(ns_test_kaggle, dtype=torch.int64)

In [None]:
# Global min/max of each split, used for the min-max scaling that follows.
# NOTE(review): this cell was labelled "Normalize by the training set", but the
# scaling step uses each split's own extrema rather than the training ones.
# Confirm which was intended before comparing pixel values across splits.
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

test_max_kaggle = dataset_test_kaggle.max()
test_min_kaggle = dataset_test_kaggle.min()

In [None]:
train_min, train_max, test_min, test_max, test_min_kaggle, test_max_kaggle

Normalize to [0, 1]

In [None]:
dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)
dataset_kaggle_norm = (dataset_test_kaggle - test_min_kaggle) / (test_max_kaggle - test_min_kaggle)

Add unscaled noise: Gaussian noise with standard deviation $\sigma$, applied directly to the normalized images.

In [None]:
dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape)
dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
dataset_train_norm = torch.clip(dataset_train_norm, 0, None)
dataset_test_norm = torch.clip(dataset_test_norm, 0, None)
dataset_kaggle_norm = torch.clip(dataset_kaggle_norm, 0, None)

In [None]:
# Show five random examples among the first 100 training images, titled with
# their labels. torch.randint requires `size` to be a tuple, so use (5,)
# rather than the bare integer 5 (which raises TypeError).
iis = torch.randint(0, 100, size=(5,))
for i in iis:
    plt.imshow(dataset_train_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
# Show five random examples among the first 100 test images, titled with their
# labels. `size` must be a tuple for torch.randint, hence (5,).
iis = torch.randint(0, 100, size=(5,))
for i in iis:
    plt.imshow(dataset_test_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test[i]}")
    plt.show()

In [None]:
# Show five random examples among the first 100 kaggle images, titled with
# their labels. `size` must be a tuple for torch.randint, hence (5,).
iis = torch.randint(0, 100, size=(5,))
for i in iis:
    plt.imshow(dataset_kaggle_norm[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test_kaggle[i]}")
    plt.show()

In [None]:
# Persist each split as an (images, labels) tuple, with sigma in the file name.
torch.save((dataset_train_norm, ns_train), f"../data/315/dataset_train_renorm_{sigma}.pt")
torch.save((dataset_test_norm, ns_test), f"../data/315/dataset_test_renorm_{sigma}.pt")
# The kaggle tensor built in this section is `dataset_kaggle_norm`; the name
# `dataset_test_norm_kaggle` used previously is never assigned here.
torch.save((dataset_kaggle_norm, ns_test_kaggle), f"../data/315/dataset_kaggle_renorm_{sigma}.pt")

# Kaggle formatting

In [None]:
import math

import torch

In [None]:
sigma = 0.0125

In [None]:
# Reload the (images, labels) tuples stored under the `*_renorm_{sigma}.pt`
# naming scheme for the sigma selected above.
train_images, train_ns = torch.load(f"../data/315/dataset_train_renorm_{sigma}.pt")
test_images, test_ns = torch.load(f"../data/315/dataset_test_renorm_{sigma}.pt")
test_images_kaggle, test_ns_kaggle = torch.load(f"../data/315/dataset_kaggle_renorm_{sigma}.pt")

## Save the train/validation sets (with labels) and the Kaggle test images (without labels)

In [None]:
# Training and validation sets are written together with their labels; the
# split loaded as "test" is the one published as the validation set.
torch.save((train_images, train_ns), f"../data/315/kaggle/train_dataset_{sigma}.pt")
torch.save((test_images, test_ns), f"../data/315/kaggle/validation_dataset_{sigma}.pt")

In [None]:
torch.save(test_images_kaggle, f"../data/315/kaggle/test_images_{sigma}.pt")

In [None]:
test_images_kaggle.shape

In [None]:
# One string id per kaggle test image: "0", "1", ..., in dataset order.
image_ids = [str(idx) for idx in range(len(test_ns_kaggle))]

In [None]:
solution_df = pd.DataFrame({"id": image_ids, "label": test_ns_kaggle.tolist()})

In [None]:
# Randomly assign every row to a leaderboard split: a uniform draw >= 0.8
# marks the row "Public" (about 20% of rows), everything else is "Private".
is_public = (torch.rand(solution_df.shape[0]) >= 0.8).tolist()
solution_df["Usage"] = ["Public" if flag else "Private" for flag in is_public]

In [None]:
solution_df.to_csv(f"../data/315/kaggle/solution_{sigma}.csv", index=False)

Sample submission

In [None]:
# Random-guess sample submission: one label drawn uniformly from {0, ..., 6}
# for every kaggle test image.
image_ids = [f"{i}" for i in range(test_ns_kaggle.shape[0])]
sample_df = pd.DataFrame(
    {
        "id": image_ids,
        # torch.randint takes an exclusive integer upper bound and a tuple
        # `size`; the previous call passed list(range(7)) and a bare int, which
        # raises TypeError. Convert to a plain list, as done for the solution file.
        "label": torch.randint(0, 7, size=(test_ns_kaggle.shape[0],)).tolist(),
    }
)

In [None]:
sample_df.to_csv(f"../data/315/kaggle/sample_submission_{sigma}.csv", index=False)

# Watermarked data

In [None]:
data_dir = "../data/315/watermarked"

In [None]:
def watermark_image_row(image, count, region_height=3, offset=0.01):
    """
    Watermark an image by encoding the integer `count` in binary into its
    first `region_height` rows.

    Row ``i`` is brightened by `offset` (then clamped to [0, 1]) when bit ``i``
    of the zero-padded binary representation of `count`, most significant bit
    first, is 1. All other pixels are left untouched.

    Parameters:
      image: 2D torch tensor or numpy array (assumed normalized to [0, 1]).
      count: integer or 0-dim tensor with 0 <= count < 2**region_height.
      region_height: number of rows, i.e. number of bits, used for the code.
      offset: intensity added to the rows whose bit is 1.

    Returns:
      A watermarked copy of `image`, of the same kind as the input (tensor in,
      tensor out; numpy array in, numpy array out). The input is not modified.

    Raises:
      ValueError: if `count` cannot be encoded in `region_height` bits.
    """
    count = int(count)
    if count < 0 or count >= 2**region_height:
        raise ValueError(f"Count must be between 0 and {2**region_height - 1}")

    binary_str = format(count, f"0{region_height}b")

    # Work on a private tensor copy: torch.clamp rejects numpy arrays and
    # tensors have no .copy(), so the previous numpy/torch mix failed for both.
    return_numpy = not isinstance(image, torch.Tensor)
    watermarked = torch.as_tensor(image).clone()
    for i, bit in enumerate(binary_str):
        if bit == "1":
            watermarked[i, :] = torch.clamp(watermarked[i, :] + offset, 0, 1)
    return watermarked.numpy() if return_numpy else watermarked

In [None]:
def generate_watermark_patch(_count, patch_size=(5, 5), scale=0.01):
    """
    Generate a pseudo-random watermark patch.

    The patch is uniform noise in [0, scale) with shape `patch_size`.
    `_count` is accepted for interface compatibility but is currently ignored,
    so the returned patch does not encode the count.
    """
    return torch.rand(*patch_size) * scale


def apply_random_watermark(image, count, num_patches=3, patch_size=(5, 5), scale=0.01):
    """
    Add `num_patches` random-noise patches at random locations of the image.

    Parameters:
      image: 2D torch tensor or numpy array (assumed normalized to [0, 1]).
      count: integer forwarded (as ``count + i``) to `generate_watermark_patch`.
      num_patches: number of patches to add; patches may overlap.
      patch_size: tuple (height, width) of every patch.
      scale: upper bound of the uniform noise in each patch.

    Returns:
      A watermarked copy of `image`, of the same kind as the input (tensor or
      numpy array). Patched regions are clamped to [0, 1]; the input is not
      modified.

    Raises:
      ValueError: if the patch does not fit inside the image.
    """
    # Work on a private tensor copy: torch.clamp rejects numpy arrays and
    # tensors have no .copy(), so the previous numpy/torch mix could not run.
    return_numpy = not isinstance(image, torch.Tensor)
    watermarked = torch.as_tensor(image).clone()
    ny, nx = watermarked.shape
    ph, pw = patch_size
    if ph > ny or pw > nx:
        raise ValueError("Patch does not fit inside the image")

    for i in range(num_patches):
        patch = generate_watermark_patch(count + i, patch_size, scale)

        # torch.randint needs an explicit size; its upper bound is exclusive,
        # so +1 makes the last position where the patch still fits reachable.
        row = int(torch.randint(0, ny - ph + 1, (1,)))
        col = int(torch.randint(0, nx - pw + 1, (1,)))

        region = watermarked[row : row + ph, col : col + pw]
        watermarked[row : row + ph, col : col + pw] = torch.clamp(region + patch, 0, 1)

    return watermarked.numpy() if return_numpy else watermarked

In [None]:
def watermark_barcode_random(image, count, patch_shape=(5, 10), alpha=0.02, repeats=3):
    """
    Embed a multiplicative barcode watermark in a random location of the image.

    Parameters:
      image: 2D torch tensor or numpy array (assumed normalized to [0,1]).
      count: integer or 0-dim tensor (0 <= count < 2**repeats).
      patch_shape: tuple (height, width) of the entire barcode patch.
      alpha: modulation factor (e.g., 0.02 for a 2% change).
      repeats: number of bits to encode (e.g., 3 bits to encode 0-7).

    The patch is divided into `repeats` segments horizontally; the last segment
    absorbs any leftover columns. For each segment, if the corresponding bit
    (most significant first) is 1, that segment is multiplied by (1+alpha),
    otherwise by (1-alpha).
    The patch is placed at a random position where it fully fits in the image,
    and the whole result is clamped to [0, 1].

    Returns:
      (watermarked, watermark_only): the watermarked copy and its difference
      from the input, both of the same kind as `image` (tensor or numpy array).
      The input is not modified.

    Raises:
      ValueError: if `count` does not fit in `repeats` bits or the patch does
        not fit inside the image.
    """
    nbits = repeats
    count = int(count)
    # Validate before consuming random numbers so a bad call has no side effect.
    if count < 0 or count >= 2**nbits:
        raise ValueError(f"Count must be between 0 and {2**nbits - 1}")

    # Work on tensors internally: torch.clamp rejects numpy arrays and tensors
    # have no .copy(), so the previous numpy/torch mix could not run.
    return_numpy = not isinstance(image, torch.Tensor)
    original = torch.as_tensor(image)
    watermarked = original.clone()
    ny, nx = watermarked.shape
    ph, pw = patch_shape
    if ph > ny or pw > nx:
        raise ValueError("Patch does not fit inside the image")

    # torch.randint needs an explicit size; the upper bound is exclusive.
    start_row = int(torch.randint(0, ny - ph + 1, (1,)))
    start_col = int(torch.randint(0, nx - pw + 1, (1,)))

    bin_code = format(count, f"0{nbits}b")

    seg_width = pw // nbits
    for i, bit in enumerate(bin_code):
        factor = 1.0 + alpha if bit == "1" else 1.0 - alpha
        col_start = start_col + i * seg_width
        col_end = start_col + (i + 1) * seg_width if i < nbits - 1 else start_col + pw
        watermarked[start_row : start_row + ph, col_start:col_end] *= factor

    watermarked = torch.clamp(watermarked, 0, 1)
    watermark_only = watermarked - original

    if return_numpy:
        return watermarked.numpy(), watermark_only.numpy()
    return watermarked, watermark_only

In [None]:
def watermark_barcode_split(
    image,
    count,
    _shape=(50, 50),
    bit_length=4,
    top_patch_shape=(3, 8),
    bottom_patch_shape=(3, 8),
    offset=0.01,
):
    """
    Embed a watermark code into an image by splitting its binary representation
    into two halves: top left and bottom right.

    Parameters:
      image : 2D torch tensor or numpy array to watermark.
      count : integer or 0-dim tensor to encode (0 <= count < 2**bit_length).
      _shape : unused; kept for interface compatibility.
      bit_length : total number of bits used for watermarking.
      top_patch_shape : (height, width) for the top-left patch.
      bottom_patch_shape : (height, width) for the bottom-right patch.
      offset : intensity offset to add where a bit is 1.

    The first half of the bits (one more than the second half when `bit_length`
    is odd) goes into the top-left patch and the rest into the bottom-right
    patch. Each patch is cut into equal-width vertical segments, one per bit,
    with the last segment absorbing any leftover columns.

    Returns:
      (watermarked, watermark_mask): the watermarked copy and the added offsets
      alone, both clamped to [0, 1] and of the same kind as `image` (tensor or
      numpy array). The input is not modified.

    Raises:
      ValueError: if `count` cannot be encoded in `bit_length` bits.
    """
    count = int(count)
    # Out-of-range counts used to yield a code longer than bit_length (or one
    # containing a minus sign); reject them like the other watermark helpers.
    if count < 0 or count >= 2**bit_length:
        raise ValueError(f"Count must be between 0 and {2**bit_length - 1}")

    # Work on tensors internally: torch.zeros_like/torch.clamp reject numpy
    # arrays and tensors have no .copy().
    return_numpy = not isinstance(image, torch.Tensor)
    watermarked = torch.as_tensor(image).clone()
    watermark_mask = torch.zeros_like(watermarked)
    ny, nx = watermarked.shape

    bin_code = format(count, f"0{bit_length}b")

    # The top half receives the extra bit when bit_length is odd.
    split = (bit_length + 1) // 2
    top_bits = bin_code[:split]
    bottom_bits = bin_code[split:]

    # Top-left
    top_h, top_w = top_patch_shape
    num_top = len(top_bits)
    seg_width_top = top_w // max(num_top, 1)
    for i, bit in enumerate(top_bits):
        if bit == "1":
            c_start = i * seg_width_top
            c_end = (i + 1) * seg_width_top if i < num_top - 1 else top_w

            watermarked[0:top_h, c_start:c_end] += offset
            watermark_mask[0:top_h, c_start:c_end] += offset

    # Bottom right
    bottom_h, bottom_w = bottom_patch_shape
    num_bottom = len(bottom_bits)
    seg_width_bottom = bottom_w // max(num_bottom, 1)
    start_row = ny - bottom_h
    start_col = nx - bottom_w
    for i, bit in enumerate(bottom_bits):
        if bit == "1":
            c_start = start_col + i * seg_width_bottom
            c_end = start_col + (i + 1) * seg_width_bottom if i < num_bottom - 1 else nx
            watermarked[start_row:ny, c_start:c_end] += offset
            watermark_mask[start_row:ny, c_start:c_end] += offset

    watermarked = torch.clamp(watermarked, 0, 1)
    watermark_mask = torch.clamp(watermark_mask, 0, 1)

    if return_numpy:
        return watermarked.numpy(), watermark_mask.numpy()
    return watermarked, watermark_mask

## Row watermark

In [None]:
sigma = 0.02

In [None]:
Ntrain = 10000
Ntest = 2500
# Ntest_kaggle = 40000

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Global min/max of each split, used for the min-max scaling that follows.
# NOTE(review): this cell was labelled "Normalize by the training set", but the
# scaling step uses each split's own extrema rather than the training ones.
# Confirm which was intended before comparing pixel values across splits.
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

# test_max_kaggle = dataset_test_kaggle.max()
# test_min_kaggle = dataset_test_kaggle.min()

Normalize to [0, 1]

In [None]:
dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)
# dataset_kaggle_norm = (dataset_test_kaggle - test_min_kaggle) / (test_max_kaggle -
# test_min_kaggle)

### Watermarking

Add watermark

In [None]:
offset = 0.005

In [None]:
# Stamp the 3-row binary watermark of each label onto the matching normalized
# training image (before any noise is added).
dataset_train_water = [
    watermark_image_row(image.numpy(), count, region_height=3, offset=offset)
    for image, count in zip(dataset_train_norm, ns_train)
]

In [None]:
# Stamp the 3-row binary watermark of each label onto the matching normalized
# test image (before any noise is added).
dataset_test_water = [
    watermark_image_row(image.numpy(), count, region_height=3, offset=offset)
    for image, count in zip(dataset_test_norm, ns_test)
]

In [None]:
# Stack the per-image results into one tensor per split. torch.stack over
# torch.as_tensor(...) accepts both numpy arrays and tensors, whereas
# torch.tensor(list) is very slow for a list of ndarrays and fails outright
# for a list of multi-element tensors.
dataset_train_water = torch.stack([torch.as_tensor(img) for img in dataset_train_water])
dataset_test_water = torch.stack([torch.as_tensor(img) for img in dataset_test_water])

In [None]:
# Show two random watermarked training images (among the first 100) before
# noise is added. `size` must be a tuple for torch.randint, hence (2,).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_train_water[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

Add unscaled noise: Gaussian noise with standard deviation $\sigma$, applied directly to the watermarked images.

In [None]:
dataset_train_water += sigma * torch.randn(size=dataset_train_water.shape)
dataset_test_water += sigma * torch.randn(size=dataset_test_water.shape)
# dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
dataset_train_water = torch.clip(dataset_train_water, 0, None)
dataset_test_water = torch.clip(dataset_test_water, 0, None)
# dataset_kaggle_norm = torch.clip(dataset_kaggle_norm, 0, None)

In [None]:
dataset_train_water = dataset_train_water.to(torch.float32)
dataset_test_water = dataset_test_water.to(torch.float32)
# dataset_kaggle_norm = dataset_kaggle_norm.to(torch.float32)

In [None]:
# Show two random noisy, watermarked training images among the first 100.
# `size` must be a tuple for torch.randint, hence (2,).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_train_water[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
# Show two random noisy, watermarked test images among the first 100.
# `size` must be a tuple for torch.randint, hence (2,).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_test_water[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test[i]}")
    plt.show()

In [None]:
torch.save((dataset_train_water, ns_train), f"{data_dir}/dataset_train_{sigma}_{offset}.pt")
torch.save((dataset_test_water, ns_test), f"{data_dir}/dataset_test_{sigma}_{offset}.pt")

## Random barcode watermark

In [None]:
sigma = 0.04

In [None]:
alpha = 0.02

In [None]:
Ntrain = 10000
Ntest = 2500
# Ntest_kaggle = 40000

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Global min/max of each split, used for the min-max scaling that follows.
# NOTE(review): this cell was labelled "Normalize by the training set", but the
# scaling step uses each split's own extrema rather than the training ones.
# Confirm which was intended before comparing pixel values across splits.
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

# test_max_kaggle = dataset_test_kaggle.max()
# test_min_kaggle = dataset_test_kaggle.min()

In [None]:
(
    train_min,
    train_max,
    test_min,
    test_max,
)  # test_min_kaggle, test_max_kaggle

Normalize to [0, 1]

In [None]:
dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)
# dataset_kaggle_norm = (dataset_test_kaggle - test_min_kaggle) / (test_max_kaggle -
# test_min_kaggle)

Add watermark

In [None]:
# Apply the randomly placed 3-bit multiplicative barcode to every normalized
# training image, keeping both the watermarked image and the watermark-only
# difference (the latter is only used for inspection plots).
dataset_train_water = []
watermarks = []
for image, count in zip(dataset_train_norm, ns_train):
    # image.numpy() hands the helper a numpy view of the tensor row.
    watermarked, watermark = watermark_barcode_random(image.numpy(), count, alpha=alpha, repeats=3)
    dataset_train_water.append(watermarked)
    watermarks.append(watermark)

In [None]:
# Same barcode watermark for the test split; the watermark-only image is
# discarded here.
dataset_test_water = []
for image, count in zip(dataset_test_norm, ns_test):
    watermarked, _ = watermark_barcode_random(image.numpy(), count, alpha=alpha, repeats=3)
    dataset_test_water.append(watermarked)

In [None]:
# Stack the per-image results into one tensor per split. torch.stack over
# torch.as_tensor(...) accepts both numpy arrays and tensors, whereas
# torch.tensor(list) is very slow for a list of ndarrays and fails outright
# for a list of multi-element tensors.
dataset_train_water = torch.stack([torch.as_tensor(img) for img in dataset_train_water])
dataset_test_water = torch.stack([torch.as_tensor(img) for img in dataset_test_water])

In [None]:
# Show two random training images next to their watermark-only difference.
# `size` must be a tuple for torch.randint, and the upper bound follows the
# number of available watermarks instead of a hard-coded 10000.
iis = torch.randint(0, len(watermarks), size=(2,))
for i in iis:
    plt.imshow(dataset_train_water[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()
    plt.imshow(watermarks[i], cmap="gray", origin="lower")
    plt.title("Watermark")
    plt.show()

Add unscaled noise: Gaussian noise with standard deviation $\sigma$, applied directly to the watermarked images.

In [None]:
dataset_train_water += sigma * torch.randn(size=dataset_train_water.shape)
dataset_test_water += sigma * torch.randn(size=dataset_test_water.shape)
# dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
dataset_train_water = torch.clip(dataset_train_water, 0, None)
dataset_test_water = torch.clip(dataset_test_water, 0, None)
# dataset_kaggle_norm = torch.clip(dataset_kaggle_norm, 0, None)

In [None]:
dataset_train_water = dataset_train_water.to(torch.float32)
dataset_test_water = dataset_test_water.to(torch.float32)
# dataset_kaggle_norm = dataset_kaggle_norm.to(torch.float32)

In [None]:
# Show two random noisy, watermarked training images among the first 100.
# `size` must be a tuple for torch.randint, hence (2,).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_train_water[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
# Show two random noisy, watermarked test images among the first 100.
# `size` must be a tuple for torch.randint, hence (2,).
iis = torch.randint(0, 100, size=(2,))
for i in iis:
    plt.imshow(dataset_test_water[i], cmap="gray", origin="lower")
    plt.title(f"{ns_test[i]}")
    plt.show()

In [None]:
torch.save(
    (dataset_train_water, ns_train), f"{data_dir}/dataset_train_barrandom_{sigma}_{alpha}.pt"
)
torch.save((dataset_test_water, ns_test), f"{data_dir}/dataset_test_barrandom_{sigma}_{alpha}.pt")

## Split fixed barcode watermark

In [None]:
sigma = 0.04

In [None]:
offset = 0.01

In [None]:
Ntrain = 10000
Ntest = 2500
# Ntest_kaggle = 40000

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Global min/max of each split, used for the min-max scaling that follows.
# NOTE(review): this cell was labelled "Normalize by the training set", but the
# scaling step uses each split's own extrema rather than the training ones.
# Confirm which was intended before comparing pixel values across splits.
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

# test_max_kaggle = dataset_test_kaggle.max()
# test_min_kaggle = dataset_test_kaggle.min()

In [None]:
(
    train_min,
    train_max,
    test_min,
    test_max,
)  # test_min_kaggle, test_max_kaggle

Normalize to [0, 1]

In [None]:
dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)
# dataset_kaggle_norm = (dataset_test_kaggle - test_min_kaggle) / (test_max_kaggle -
# test_min_kaggle)

Add watermark

In [None]:
# Apply the fixed-position split barcode (4-bit code, half in the top-left
# patch and half in the bottom-right patch) to every normalized training image.
# The second return value is kept for inspection plots only.
dataset_train_water = []
watermarks = []
for image, count in zip(dataset_train_norm, ns_train):
    # image.numpy() hands the helper a numpy view of the tensor row.
    watermarked, watermark = watermark_barcode_split(
        image.numpy(), count, offset=offset, bit_length=4
    )
    dataset_train_water.append(watermarked)
    watermarks.append(watermark)

In [None]:
# Same split barcode for the test split; the watermark mask is discarded here.
dataset_test_water = []
for image, count in zip(dataset_test_norm, ns_test):
    watermarked, _ = watermark_barcode_split(image.numpy(), count, offset=offset, bit_length=4)
    dataset_test_water.append(watermarked)

In [None]:
# Stack the per-image results into one tensor per split. torch.stack over
# torch.as_tensor(...) accepts both numpy arrays and tensors, whereas
# torch.tensor(list) is very slow for a list of ndarrays and fails outright
# for a list of multi-element tensors.
dataset_train_water = torch.stack([torch.as_tensor(img) for img in dataset_train_water])
dataset_test_water = torch.stack([torch.as_tensor(img) for img in dataset_test_water])

In [None]:
# Show two random training images next to their watermark mask.
# `size` must be a tuple for torch.randint, and the upper bound follows the
# number of available watermarks instead of a hard-coded 10000.
iis = torch.randint(0, len(watermarks), size=(2,))
for i in iis:
    plt.imshow(dataset_train_water[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()
    plt.imshow(watermarks[i], cmap="gray", origin="lower")
    plt.title("Watermark")
    plt.show()

Add unscaled noise: Gaussian noise with standard deviation $\sigma$, applied directly to the watermarked images.

In [None]:
dataset_train_water += sigma * torch.randn(size=dataset_train_water.shape)
dataset_test_water += sigma * torch.randn(size=dataset_test_water.shape)
# dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
dataset_train_water = torch.clip(dataset_train_water, 0, None)
dataset_test_water = torch.clip(dataset_test_water, 0, None)
# dataset_kaggle_norm = torch.clip(dataset_kaggle_norm, 0, None)

In [None]:
dataset_train_water = dataset_train_water.to(torch.float32)
dataset_test_water = dataset_test_water.to(torch.float32)
# dataset_kaggle_norm = dataset_kaggle_norm.to(torch.float32)

In [None]:
# Re-plot the training examples selected by the most recent `iis` draw (it is
# not redrawn here), now that noise has been added and negatives clipped.
for i in iis:
    plt.imshow(dataset_train_water[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()

In [None]:
torch.save(
    (dataset_train_water, ns_train), f"{data_dir}/dataset_train_barsplit_{sigma}_{offset}.pt"
)
torch.save((dataset_test_water, ns_test), f"{data_dir}/dataset_test_barsplit_{sigma}_{offset}.pt")

## Split fixed barcode watermark: post-noise

In [None]:
sigma = 0.03

In [None]:
offset = 0.05

In [None]:
Ntrain = 10000
Ntest = 2500
# Ntest_kaggle = 40000

In [None]:
dataset_train, ns_train = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntrain)]
)
dataset_train = torch.tensor(dataset_train, dtype=torch.float32)
ns_train = torch.tensor(ns_train, dtype=torch.int64)

In [None]:
dataset_test, ns_test = zip(
    *[generate_synthetic_image(shape=(dim, dim), noise_sigma=sigma) for _ in range(Ntest)]
)
dataset_test = torch.tensor(dataset_test, dtype=torch.float32)
ns_test = torch.tensor(ns_test, dtype=torch.int64)

In [None]:
# Global min/max of each split, used for the min-max scaling that follows.
# NOTE(review): this cell was labelled "Normalize by the training set", but the
# scaling step uses each split's own extrema rather than the training ones.
# Confirm which was intended before comparing pixel values across splits.
train_max = dataset_train.max()
train_min = dataset_train.min()

test_max = dataset_test.max()
test_min = dataset_test.min()

# test_max_kaggle = dataset_test_kaggle.max()
# test_min_kaggle = dataset_test_kaggle.min()

In [None]:
(
    train_min,
    train_max,
    test_min,
    test_max,
)  # test_min_kaggle, test_max_kaggle

Normalize to [0, 1]

In [None]:
dataset_train_norm = (dataset_train - train_min) / (train_max - train_min)
dataset_test_norm = (dataset_test - test_min) / (test_max - test_min)
# dataset_kaggle_norm = (dataset_test_kaggle - test_min_kaggle) / (test_max_kaggle -
# test_min_kaggle)

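Add unscaled noise: Gaussian noise with standard deviation $\sigma$, applied directly to the normalized images (here before the watermark is added).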

In [None]:
dataset_train_norm += sigma * torch.randn(size=dataset_train_norm.shape)
dataset_test_norm += sigma * torch.randn(size=dataset_test_norm.shape)
# dataset_kaggle_norm += sigma * torch.randn(size=dataset_kaggle_norm.shape)

In [None]:
# Need to clip to above zero

In [None]:
dataset_train_norm = torch.clip(dataset_train_norm, 0, None)
dataset_test_norm = torch.clip(dataset_test_norm, 0, None)
# dataset_kaggle_norm = torch.clip(dataset_kaggle_norm, 0, None)

Add watermark

In [None]:
# Post-noise variant: the split barcode (4-bit code) is stamped onto training
# images that already contain noise and have been clipped at zero, so the
# watermark itself is noise-free. The mask is kept for inspection plots only.
dataset_train_water = []
watermarks = []
for image, count in zip(dataset_train_norm, ns_train):
    # image.numpy() hands the helper a numpy view of the tensor row.
    watermarked, watermark = watermark_barcode_split(
        image.numpy(), count, offset=offset, bit_length=4
    )
    dataset_train_water.append(watermarked)
    watermarks.append(watermark)

In [None]:
# Same post-noise split barcode for the test split; the mask is discarded.
dataset_test_water = []
for image, count in zip(dataset_test_norm, ns_test):
    watermarked, _ = watermark_barcode_split(image.numpy(), count, offset=offset, bit_length=4)
    dataset_test_water.append(watermarked)

In [None]:
# Stack the per-image results into one tensor per split. torch.stack over
# torch.as_tensor(...) accepts both numpy arrays and tensors, whereas
# torch.tensor(list) is very slow for a list of ndarrays and fails outright
# for a list of multi-element tensors.
dataset_train_water = torch.stack([torch.as_tensor(img) for img in dataset_train_water])
dataset_test_water = torch.stack([torch.as_tensor(img) for img in dataset_test_water])

In [None]:
dataset_train_water = dataset_train_water.to(torch.float32)
dataset_test_water = dataset_test_water.to(torch.float32)
# dataset_kaggle_norm = dataset_kaggle_norm.to(torch.float32)

In [None]:
# Show two random training images next to their watermark mask.
# `size` must be a tuple for torch.randint, and the upper bound follows the
# number of available watermarks instead of a hard-coded 10000.
iis = torch.randint(0, len(watermarks), size=(2,))
for i in iis:
    plt.imshow(dataset_train_water[i], cmap="gray", origin="lower")
    plt.title(f"{ns_train[i]}")
    plt.show()
    plt.imshow(watermarks[i], cmap="gray", origin="lower")
    plt.title("Watermark")
    plt.show()

In [None]:
torch.save(
    (dataset_train_water, ns_train), f"{data_dir}/dataset_train_barsplit_post_{sigma}_{offset}.pt"
)
torch.save(
    (dataset_test_water, ns_test), f"{data_dir}/dataset_test_barsplit_post_{sigma}_{offset}.pt"
)