## Sample data processing and submission

In [None]:
# Upload kaggle.json and setup kaggle home

from google.colab import files

files.upload()
!mkdir /root/.kaggle/
!mv kaggle.json /root/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!kaggle config set -n path -v /content

In [None]:
# Download the dataset and labels from Kaggle.
# Uses the Kaggle CLI, so the kaggle.json credentials must already be in place.
!kaggle competitions download -c bird-genus-multi-class-image-classification

In [None]:
# Unzip competition dataset
!unzip -q ./competitions/bird-genus-multi-class-image-classification/bird-genus-multi-class-image-classification.zip

In [None]:
# Imports
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
import json
from sklearn.model_selection import train_test_split
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

In [None]:
# Select the compute device: CUDA when a GPU is visible, otherwise CPU

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU is not available, using CPU instead")

In [None]:
# Describe dataset

# train.csv has a header row; its three columns are read as id, filename, genus.
images = pd.read_csv('train.csv', header=0, names=['id', 'filename', 'genus'])
# Unique labels in order of first appearance.
classes = images['genus'].unique()

print(f"Classes {classes}")
print(f"Number of images {len(images)}")

# Count every class in one pass instead of one boolean scan per class.
# Reindexing keeps the bars in the same order as `classes`.
class_counts = images['genus'].value_counts().reindex(classes, fill_value=0)

fig, ax = plt.subplots(figsize=(10, 10))
ax.bar(classes, class_counts.to_numpy())
ax.set_xlabel('Class')
ax.set_ylabel('Number of images')
ax.set_title('Number of images per class')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

images.head()

In [None]:
# Split to train and validation sets

train, val = train_test_split(images, test_size=0.2)
train.to_csv('train_labels.csv', index=False, index_label='id')
!head train_labels.csv

In [None]:
# Create torch dataset

class BirdsDataset(Dataset):
    """Image dataset that loads every image listed in a labels CSV into memory.

    Indexing returns ``(transformed_image, genus)``, where ``genus`` is the raw
    label value from the CSV (not an integer class index).

    Args:
        images_dir: Directory that contains the image files.
        labels_file: CSV with a header row; its three columns are read as
            ``id``, ``filename`` and ``genus``.
        img_width: Stored as ``self.img_width``; not applied to the images here.
        img_height: Stored as ``self.img_height``; not applied to the images here.
        augment: If true, apply a random horizontal flip (p=0.5) before
            converting the image to a tensor.

    Attributes:
        df: DataFrame of the labels file.
        images: List of loaded PIL images, aligned with the rows of ``df``.
        classes: Unique ``genus`` values found in ``labels_file``.
        K: Number of entries in ``classes``.
    """

    def __init__(self, images_dir, labels_file, img_width=300, img_height=300, augment=False):
        super().__init__()
        self.img_width = img_width
        self.img_height = img_height
        self.image_dir = images_dir

        # dataframe of image file name and class label
        self.df = pd.read_csv(labels_file, header=0, names=['id', 'filename', 'genus'])
        # read all images to memory
        self.images = [self._load_image(filename) for filename in self.df['filename']]
        # list of unique classes
        self.classes = self.df['genus'].unique()
        # number of classes, taken from this dataset's own labels rather than
        # from a notebook-level variable that may be missing or stale
        self.K = len(self.classes)

        # example augmentations
        steps = [transforms.RandomHorizontalFlip(p=0.5)] if augment else []
        steps.append(transforms.ToTensor())
        self.transform = transforms.Compose(steps)

    def _load_image(self, filename):
        """Return an in-memory copy of one image and close its file handle."""
        # Image.open is lazy and keeps the file open; copying inside the
        # context manager reads the pixels and releases the handle, so large
        # datasets do not run out of file descriptors.
        with Image.open(os.path.join(self.image_dir, filename)) as img:
            return img.copy()

    def __len__(self):
        """Return the number of rows in the labels file."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(transformed_image, genus)`` for the row at position ``idx``."""
        transformed_image = self.transform(self.images[idx])
        label = self.df['genus'].iloc[idx]
        return transformed_image, label

In [None]:
# Build the augmented training dataset and a shuffling loader over it
train_batch_size = 16

train_dataset = BirdsDataset(
    images_dir="images/train",
    labels_file='train_labels.csv',
    augment=True,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
)

In [None]:
# Show example training batch

# Take one batch directly instead of looping over the loader and breaking.
X_batch, y_batch = next(iter(train_loader))

# Size the grid from the batch actually returned: a batch can hold fewer than
# train_batch_size items, and a fixed 4x4 grid only fits batches of up to 16.
n_shown = len(X_batch)
n_cols = int(np.ceil(np.sqrt(n_shown)))
n_rows = int(np.ceil(n_shown / n_cols))

# Figure size is set on this figure only, not through global rcParams.
plt.figure(figsize=(10.0, 10.0))
for i in range(n_shown):
    plt.subplot(n_rows, n_cols, 1 + i)
    # Tensors are channel-first; imshow expects channel-last.
    plt.imshow(X_batch[i].numpy().transpose(1, 2, 0))
    plt.axis('off')
    plt.title(y_batch[i])
plt.show()

In [None]:
# Predict random class
import random

# Dedicated seeded generator so the sample predictions are the same on every run.
rng = random.Random(42)

test_images = pd.read_csv('test.csv', header=0, names=['id'], usecols=['id'])
# One uniformly random class label per test row.
predictions = [rng.choice(classes) for _ in range(len(test_images))]

In [None]:
# Create sample submission file
# Build a new frame rather than adding a column to test_images in place, so the
# frame loaded by the prediction cell stays as it was read.
submission = test_images.assign(genus=predictions)
# index=False: only the id and genus columns are written.
# (index_label has no effect when index=False and is omitted.)
submission.to_csv('sample_submission.csv', index=False)
submission.head()

In [None]:
# Submit!
# Sends sample_submission.csv to the competition via the Kaggle CLI, with the
# submission message "Sample submission".
!kaggle competitions submit -c bird-genus-multi-class-image-classification -f sample_submission.csv -m "Sample submission"