In [1]:
# First, mount Google Drive: every dataset and cache path used below lives
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Then import some stuff.
import pandas as pd
import numpy as np
import random

In [2]:
# Global size constant: resize_image scales every image to SIZE pixels wide,
# and the model's input shape is expressed in terms of it.
SIZE = 64

In [5]:
# Read the training CSV; its "input_path" and "encoded_path" columns are used below.
data = pd.read_csv('/content/drive/My Drive/SUS/bigtask2/data/train/train.csv')

In [6]:
# Then process it a little.
def process_path(path):
    """Map a path from the CSV onto the dataset copy stored on Drive.

    A leading "BigDataCup2022/S1" is stripped when present; the remainder
    (or the untouched path, if the prefix is absent) is appended to the
    Drive data directory.
    """
    dataset_prefix = "BigDataCup2022/S1"
    drive_root = "/content/drive/My Drive/SUS/bigtask2/data"
    if path.startswith(dataset_prefix):
        relative_part = path[len(dataset_prefix):]
    else:
        relative_part = path
    return f"{drive_root}{relative_part}"

# Translate both path columns of the training CSV into Drive paths.
full_image_paths = list(map(process_path, data["input_path"]))
encoded_image_paths = list(map(process_path, data["encoded_path"]))

In [7]:
import numpy as np
import random
from PIL import Image
import concurrent.futures

def process_image(path):
    """Open the image at `path`, shrink it with resize_image and return it as a NumPy array."""
    with Image.open(path) as source_image:
        return np.array(resize_image(source_image))

# To make the task more manageable, we'll shrink the images a little.
# Luckily there is a high-quality resampling filter called LANCZOS, which
# preserves the important detail well when downscaling.
def resize_image(image):
    """Return `image` scaled to SIZE pixels wide using LANCZOS resampling.

    The height is scaled by the same factor as the width and truncated to an
    integer, so the aspect ratio is kept (up to that truncation).
    """
    original_width, original_height = image.size
    scale = SIZE / float(original_width)
    target_height = int(float(original_height) * scale)
    return image.resize((SIZE, target_height), Image.Resampling.LANCZOS)

# As reading from files is the performance barrier, we'll use a lot of threads.
def get_processed_images(paths, max_workers=100):
    """Load and resize every image in `paths` concurrently.

    Args:
        paths: Iterable of image file paths.
        max_workers: Number of threads in the pool. Defaults to 100, the
            value this notebook has always used.

    Returns:
        A list with one process_image result per path, in the same order
        as `paths`.

    Raises:
        Any exception raised by process_image for one of the paths.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map yields results in input order, so there is no need to
        # track which future belongs to which index.
        return list(executor.map(process_image, paths))

# Resizing every training image is slow, so the processed arrays are cached on
# Drive as .npy files. A cache file is loaded when it exists and rebuilt (then
# saved) when it does not, so the notebook also runs in a fresh environment.
import os

# The cache name carries SIZE so that arrays built for one size are never
# loaded for another.
FULL_IMAGES_CACHE = f'/content/drive/My Drive/SUS/bigtask2/train_full_images_{SIZE}.npy'
ENCODED_IMAGES_CACHE = f'/content/drive/My Drive/SUS/bigtask2/train_encoded_images_{SIZE}.npy'


def load_or_build_images(cache_path, paths):
    """Return the image array cached at `cache_path`, building it if missing.

    Args:
        cache_path: Location of the .npy cache file.
        paths: Image paths handed to get_processed_images when the cache
            has to be (re)built.

    Returns:
        The array loaded from the cache, or the freshly built array (which
        is also written to `cache_path`).
    """
    if os.path.exists(cache_path):
        return np.load(cache_path)
    images = np.array(get_processed_images(paths))
    np.save(cache_path, images)
    return images


full_images = load_or_build_images(FULL_IMAGES_CACHE, full_image_paths)
encoded_images = load_or_build_images(ENCODED_IMAGES_CACHE, encoded_image_paths)

# Once the images are processed, we'll create our dataset.

data_length = len(full_images)

# Positive pairs: every full image with its own encoded version (marker 1).
pairs_with_markers = [(full_images[i], encoded_images[i], 1) for i in range(data_length)]

# Negative pairs: every full image with the encoded version of a *different*
# image (marker 0). The partner index is i shifted by a random offset in
# [1, data_length), so j can never equal i. With fewer than two images no
# mismatched partner exists, so no negative pairs are created.
if data_length > 1:
    offsets = np.random.randint(1, data_length, size=data_length)
    for i, offset in enumerate(offsets):
        j = (i + offset) % data_length
        # Use index j here: pairing image i with encoded image i would just
        # duplicate a positive pair under the wrong marker.
        pairs_with_markers.append((full_images[i], encoded_images[j], 0))

# We'll mix the examples.
np.random.shuffle(pairs_with_markers)

# And create actual training data: each pair becomes one array, with the two
# images concatenated along their first axis.
training_pairs = np.array([np.concatenate((x, y)) for x, y, _ in pairs_with_markers])
training_markers = np.array([z for _, _, z in pairs_with_markers])

np.save('/content/drive/My Drive/SUS/bigtask2/training_pairs.npy', training_pairs)
np.save('/content/drive/My Drive/SUS/bigtask2/training_markers.npy', training_markers)

In [41]:
# Now we'll create the model. Thanks to the experience I gained when writing my
# bachelor thesis, and through some hits and misses, I came up with the following model.
# It was a CNN model from the start, but its exact shape was found through trial and error.
# For example, batch normalization turned out to be much more important than
# I had ever thought before!
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, BatchNormalization

# The network takes one pair per sample, shaped (SIZE * 2, SIZE, 3), and
# outputs a single sigmoid score.
model = Sequential([
    # Two 16-filter convolution stages, each normalized and then pooled.
    Conv2D(16, (3, 3), input_shape=(SIZE * 2, SIZE, 3), padding='same', activation='relu'),
    BatchNormalization(axis=-1),
    MaxPooling2D(pool_size=(2,2)),

    Conv2D(16, (3, 3), padding='same', activation='relu'),
    BatchNormalization(axis=-1),
    MaxPooling2D(pool_size=(2,2)),

    # Two 32-filter convolution stages; pooling happens only after the second.
    Conv2D(32, (3, 3), padding='same', activation='relu'),
    BatchNormalization(axis=-1),

    Conv2D(32, (3, 3), padding='same', activation='relu'),
    BatchNormalization(axis=-1),
    MaxPooling2D(pool_size=(2,2)),

    # Classifier head: flatten, one hidden dense layer, one sigmoid output.
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Now we can fit the model on the pairs and markers saved earlier.
training_pairs = np.load('/content/drive/My Drive/SUS/bigtask2/training_pairs.npy')
training_markers = np.load('/content/drive/My Drive/SUS/bigtask2/training_markers.npy')

# Binary cross-entropy matches the single sigmoid output of the model.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# We'll train it for 8 epochs, which seems to get it to the verge of overfitting.
model.fit(
    x=training_pairs,
    y=training_markers,
    epochs=8,
    batch_size=32,
    verbose=1,
)

In [None]:
# Finally, let's process the competition data the same way as the training data.
import pandas as pd
import numpy as np

competition_data = pd.read_csv('/content/drive/My Drive/SUS/bigtask2/data/test/test.csv')

# Translate both path columns of the test CSV into Drive paths.
competition_full_image_paths = list(map(process_path, competition_data["input_path"]))
competition_encoded_image_paths = list(map(process_path, competition_data["encoded_path"]))

# Load and resize both image sets.
input_images = np.array(get_processed_images(competition_full_image_paths))
encoded_images = np.array(get_processed_images(competition_encoded_image_paths))

# Join every input image with its encoded counterpart. For the stacked image
# arrays np.hstack concatenates along axis 1, i.e. each image's first axis,
# which presumably mirrors the per-pair np.concatenate used for training.
competition_pairs = np.hstack((input_images, encoded_images))

np.save('/content/drive/My Drive/SUS/bigtask2/competition_pairs.npy', competition_pairs)

In [None]:
# And generate the predictions.
competition_pairs = np.load('/content/drive/My Drive/SUS/bigtask2/competition_pairs.npy')

predictions_raw = model.predict(competition_pairs)
print(predictions_raw[:100])

# Threshold the sigmoid scores: anything at or below 0.5 becomes 0, the rest 1.
predictions = np.where(predictions_raw <= 0.5, 0, 1).ravel().tolist()
print(predictions[:100])

# Write one prediction per line.
with open('/content/drive/My Drive/SUS/bigtask2/final_predictions.txt', 'w') as file:
    file.writelines(f"{prediction}\n" for prediction in predictions)