In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found anywhere below the Kaggle input
# directory, then print them one per line.
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
datapath = '/kaggle/input/aerial-cactus-identification/'

# Read both competition CSV files from the dataset directory: the training
# labels table and the sample submission table.
labels, submission = (
    pd.read_csv(datapath + csv_name)
    for csv_name in ('train.csv', 'sample_submission.csv')
)

# Visualization

## Distribution of labels

In [None]:
# Pie chart of how many training rows fall into each `has_cactus` class.
mpl.rc('font', size = 12)
plt.figure(figsize = (4,4))

# value_counts() orders classes by frequency, not by class value, so each
# wedge is named from the class it actually represents instead of relying on
# the position in a fixed list.
label_names = {1: 'Has cactus', 0: 'Hasn\'t cactus'}
label_counts = labels['has_cactus'].value_counts()
fig_label = [label_names.get(value, str(value)) for value in label_counts.index]

plt.pie(label_counts, labels=fig_label, autopct='%.1f%%')
plt.title('Distribution of Labels')
plt.show()

## Image sample

In [None]:
from zipfile import ZipFile

# Unpack the train and test image archives into the current working
# directory (extractall() is called without a target path).
for archive_name in ('train.zip', 'test.zip'):
    with ZipFile(datapath + archive_name) as zipper:
        zipper.extractall()

In [None]:
import matplotlib.gridspec as gridspec
import cv2

def draw_cactus_image(has_cactus : bool):
    """Display up to 12 training images of one class in a 2 x 6 grid.

    The images shown are the last rows of ``labels`` whose ``has_cactus``
    value equals ``int(has_cactus)``; files are read from ``train/``.

    Args:
        has_cactus: Which class to preview (True -> 1, False -> 0).

    Side effects:
        Sets the global matplotlib font size to 7 and renders a figure.
    """
    mpl.rc('font', size = 7)
    plt.figure(figsize = (10, 6))
    layout = gridspec.GridSpec(2, 6)

    # Keep only rows of the requested class, then take the trailing 12 ids.
    class_mask = labels['has_cactus'] == int(has_cactus)
    sample_names = labels.loc[class_mask, 'id'].iloc[-12:]

    for position, file_name in enumerate(sample_names):
        bgr_pixels = cv2.imread('train/' + file_name)
        # OpenCV loads channels as BGR; matplotlib expects RGB.
        rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
        plt.subplot(layout[position]).imshow(rgb_pixels)

    plt.tight_layout()
    plt.show()

In [None]:
# Preview training images whose label says a cactus is present.
draw_cactus_image(True)

In [None]:
# Preview training images whose label says no cactus is present.
draw_cactus_image(False)

# Baseline Model

## Set up PyTorch environment

In [None]:
import torch

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

# Preprocessing

## Split train and validation dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the labelled rows for validation. Stratifying on
# `has_cactus` keeps the class ratio the same in both parts, and the fixed
# random_state makes the split repeatable.
split_options = dict(
    test_size = 0.1,
    stratify = labels['has_cactus'],
    random_state = 50,
)
train, valid = train_test_split(labels, **split_options)
print(len(train), len(valid))

## Define Image dataset

In [None]:
import cv2
import typing as tp
from torch.utils.data import Dataset

class ImageDataset(Dataset):
    """Dataset that loads images from disk by file name and pairs them with labels.

    Rows of ``df`` are read positionally: column 0 is the image file name and
    column 1 is the label.

    Args:
        df: Table of samples (file name in column 0, label in column 1).
        img_dir: Directory that contains the image files. A trailing path
            separator is optional.
        transform: Optional callable applied to the RGB image array before it
            is returned.
    """

    def __init__(self, df, img_dir='./', transform:tp.Optional[tp.Callable]=None):
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in ``df``."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            A ``(image, label)`` tuple; ``image`` is the RGB image, passed
            through ``transform`` when one was given.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be
                decoded.
        """
        img_id = self.df.iloc[idx, 0]
        # os.path.join also works when img_dir has no trailing separator.
        img_path = os.path.join(self.img_dir, img_id)
        image = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')
        # OpenCV loads channels as BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = self.df.iloc[idx, 1]

        if self.transform is not None:
            image = self.transform(image)
        return image, label

In [None]:
from torchvision import transforms

# Both datasets read from the extracted `train/` folder and differ only in
# the rows they are given. ToTensor is the only transform applied.
to_tensor = transforms.ToTensor()
dataset_train = ImageDataset(train, 'train/', to_tensor)
dataset_valid = ImageDataset(valid, 'train/', to_tensor)

## Create data loaders

In [None]:
from torch.utils.data import DataLoader

def seed_worker(worker_id):
    """Seed NumPy and Python's ``random`` module from the current torch seed.

    Meant to be passed to a DataLoader as ``worker_init_fn`` so that the
    NumPy and ``random`` generators inside a worker follow torch's seed.

    Args:
        worker_id: Worker index supplied by the DataLoader; not used here.
    """
    # Local import: `random` is not imported at the top of the notebook.
    import random

    # np.random.seed only accepts values below 2**32, so fold the seed down.
    worker_seed = torch.initial_seed() % 2**32
    # The notebook imports NumPy as `np`; the bare name `numpy` is undefined.
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# Generator with a fixed seed, handed to both loaders.
g = torch.Generator().manual_seed(0)

# Settings shared by both loaders; only the dataset and shuffling differ.
loader_options = dict(
    batch_size = 32,
    worker_init_fn = seed_worker,
    generator = g,
    num_workers = 2,
)

loader_train = DataLoader(dataset_train, shuffle = True, **loader_options)
loader_valid = DataLoader(dataset_valid, shuffle = False, **loader_options)

# Create Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class CactusModel(nn.Module):
    """Small two-stage CNN that outputs two class scores per image.

    Each stage is Conv2d (3 x 3 kernel, padding 2) -> ReLU -> 2 x 2 max pool.
    The stages are followed by a 2 x 2 average pool and one linear layer.

    The linear layer expects a 64 x 4 x 4 feature map. A 32 x 32 input
    produces exactly that: 32 -> 34 (conv) -> 17 (pool) -> 19 (conv)
    -> 9 (pool) -> 4 (avg pool).
    NOTE(review): assumes the images are 3 x 32 x 32 -- confirm against the
    dataset.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels = 3, out_channels = 32, kernel_size = 3, padding = 2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 3, padding = 2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 2)
        )
        self.avg_pool = nn.AvgPool2d(kernel_size = 2)
        self.fc = nn.Linear(in_features = 64 * 4 * 4, out_features = 2)

    def forward(self, x):
        """Compute class scores for a batch of images.

        Args:
            x: Image batch laid out as (batch, 3, height, width).

        Returns:
            Tensor of shape (batch, 2) holding the raw outputs of ``fc``.
        """
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.avg_pool(x)
        # Flatten everything except the batch dimension, so a wrong-sized
        # input fails at `fc` instead of being silently reshaped into a
        # different batch size.
        x = x.flatten(start_dim = 1)
        x = self.fc(x)
        return x

In [None]:
# Instantiate the network and move its parameters to the selected device.
model = CactusModel().to(device)