# 環境確認

In [1]:
# Report the interpreter and PyTorch versions this notebook is running on.
import sys
import torch

print("Pythonのバージョン：",sys.version)
print("PyTorchのバージョン：", torch.__version__)
print("使用しているGPUの確認")
# Shell out to nvidia-smi to show the attached GPU; on a runtime without a
# reachable NVIDIA driver it prints an error message instead.
!nvidia-smi

Pythonのバージョン： 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
PyTorchのバージョン： 1.11.0+cu113
使用しているGPUの確認
NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



# データのダウンロード

In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 68 bytes


In [3]:
# Install the Kaggle command-line client used by the download cell.
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# Download the "paddy-disease-classification" competition archive with the Kaggle CLI.
!kaggle competitions download -c paddy-disease-classification

Downloading paddy-disease-classification.zip to /content
 99% 1.01G/1.02G [00:13<00:00, 132MB/s]
100% 1.02G/1.02G [00:13<00:00, 80.4MB/s]


In [None]:
!unzip /content/paddy-disease-classification.zip

In [6]:
# List the working directory to check which data files and folders are present
%ls

paddy-disease-classification.zip  sample_submission.csv  train.csv
[0m[01;34msample_data[0m/                      [01;34mtest_images[0m/           [01;34mtrain_images[0m/


# 事前準備

In [7]:
!pip install pytorch_lightning torchmetrics tensorboard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_lightning
  Downloading pytorch_lightning-1.6.3-py3-none-any.whl (584 kB)
[K     |████████████████████████████████| 584 kB 4.8 MB/s 
[?25hCollecting torchmetrics
  Downloading torchmetrics-0.9.0-py3-none-any.whl (418 kB)
[K     |████████████████████████████████| 418 kB 21.2 MB/s 
Collecting pyDeprecate<0.4.0,>=0.3.1
  Downloading pyDeprecate-0.3.2-py3-none-any.whl (10 kB)
Collecting PyYAML>=5.4
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 38.6 MB/s 
Collecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 42.6 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manyli

In [8]:
# ライブラリのインポート
import os
import random
from glob import glob
from warnings import filterwarnings

import argparse

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import cv2
from PIL import Image

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import torch.nn.functional as F
from torch.optim import optimizer
import torchvision
from torchvision import datasets, transforms, models

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torchmetrics
from torchmetrics import F1Score, Accuracy, MetricCollection, Precision, Recall

import tensorboard

import warnings

filterwarnings('ignore')

In [9]:
# Initial configuration
from google.colab import drive

# Mount Google Drive; the checkpoint is read from it and the submission is
# written to it.
drive.mount('/content/gdrive')

# Base directory on Drive for everything this notebook reads or writes
ATTACH_PATH = '/content/gdrive/MyDrive/Paddy'
# Directory holding the trained model checkpoints
SAVE_MODEL_PATH = ATTACH_PATH + '/model/'
# Directory the submission file is written to
SUBMIT_PATH = ATTACH_PATH + '/submit/'

# Checkpoint file name and the full path built from it
MODEL_NAME = 'efficientnetb7-epoch=19-val_loss=0.14.ckpt'
model_path = SAVE_MODEL_PATH + MODEL_NAME

# File name of the submission CSV
submit_name = 'sample_submission-efficientnetb7-epoch=19-val_loss=0.14.csv'

# Directory containing the test images
img_dir = 'test_images/'

Mounted at /content/gdrive


In [10]:
# Hyperparameter settings
# Each entry is (flag, type, default); flags are registered in this order.
_HYPERPARAMETERS = (
    ('--test_size', float, 0.25),
    ('--image_size', int, 224),
    ('--num_classes', int, 10),
    ('--epochs', int, 100),
    ('--batch_size', int, 16),
    ('--lr', float, 1e-4),      # learning rate
    ('--patience', int, 10),    # early-stopping patience
)

parser = argparse.ArgumentParser()
for _flag, _kind, _default in _HYPERPARAMETERS:
    parser.add_argument(_flag, type=_kind, default=_default)

# Parse an explicit empty argument list so that only the defaults above are
# used and the kernel's own command line is ignored.
opt = parser.parse_args(args=[])
print(opt)

Namespace(batch_size=16, epochs=100, image_size=224, lr=0.0001, num_classes=10, patience=10, test_size=0.25)


# データの読み込み

In [11]:
# Load the submission template; its image_id column names the test images
test = pd.read_csv('sample_submission.csv')
test_preview = test.head()
print(test_preview)
print('データ数: ', len(test))

     image_id  label
0  200001.jpg    NaN
1  200002.jpg    NaN
2  200003.jpg    NaN
3  200004.jpg    NaN
4  200005.jpg    NaN
データ数:  3469


In [12]:
# Pull the image file names out of the template, together with the label
# column, which is only passed to the Dataset as a placeholder
x_test, dummy = (test[column].values for column in ('image_id', 'label'))
count_text = f'データ数：{len(x_test)}'
print(x_test, count_text)

['200001.jpg' '200002.jpg' '200003.jpg' ... '203467.jpg' '203468.jpg'
 '203469.jpg'] データ数：3469


# transform

In [13]:
# Transform settings
# Pieces shared by both pipelines: the square target size and the per-channel
# normalization statistics.
_target_size = (opt.image_size, opt.image_size)
_norm_mean = [0.485, 0.456, 0.406]
_norm_std = [0.229, 0.224, 0.225]

# Training pipeline: resize, random augmentations, tensor conversion, normalization
_train_steps = [
    transforms.Resize(_target_size),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomResizedCrop(opt.image_size, scale=(0.08, 1.0), ratio=(3 / 4, 4 / 3)),
    transforms.RandomAffine(degrees=[-10, 10], translate=(0.1, 0.1), scale=(0.5, 1.5)),
    transforms.RandomRotation(degrees=10),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
]

# Validation pipeline: resize, tensor conversion, normalization (no augmentation)
_val_steps = [
    transforms.Resize(_target_size),
    transforms.ToTensor(),
    transforms.Normalize(_norm_mean, _norm_std),
]

# Pipelines keyed by phase name
transform = {
    'train': transforms.Compose(_train_steps),
    'val': transforms.Compose(_val_steps),
}

# Dataset

In [14]:
# Datasetの設定
class PaddyDataset(Dataset):
    """Dataset that pairs image files in a directory with their labels.

    Args:
        image_name_list: Sequence of image file names, relative to ``img_dir``.
        label_list: Sequence of labels, index-aligned with ``image_name_list``.
        img_dir: Directory that contains the image files.
        transform: Mapping from phase name to a callable applied to the loaded
            image. With ``None`` the RGB image is returned untransformed.
        phase: Key selecting which entry of ``transform`` is applied
            (e.g. ``'train'`` or ``'val'``).
    """

    def __init__(self, image_name_list, label_list, img_dir, transform=None, phase=None):
        self.image_name_list = image_name_list  # image file names
        self.label_list = label_list  # labels
        self.img_dir = img_dir  # directory containing the images
        self.phase = phase  # selects the training or validation transform
        self.transform = transform

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_name_list)

    def __getitem__(self, index):
        """Load the image at ``index`` and return ``(image, label)``."""
        image_path = os.path.join(self.img_dir, self.image_name_list[index])
        # Image.open is lazy and keeps the file handle open; the context manager
        # releases it once convert() has read the pixels. Converting to RGB
        # guarantees three channels even for grayscale / palette / RGBA files,
        # which a 3-channel normalization would otherwise reject.
        with Image.open(image_path) as raw:
            img = raw.convert('RGB')
        # transform defaults to None; only index into it when it was provided.
        if self.transform is not None:
            img = self.transform[self.phase](img)
        label = self.label_list[index]

        return img, label

In [15]:
# Create the Dataset instance for the test images; phase='val' selects the
# 'val' entry of the transform dict
test_dataset = PaddyDataset(
    image_name_list=x_test,
    label_list=dummy,
    img_dir=img_dir,
    transform=transform,
    phase='val',
)

# DataLoader

In [16]:
# Batch the test images with the configured batch size instead of running one
# image per forward pass. shuffle stays off so the predictions keep the row
# order of sample_submission.csv.
test_dataloader = DataLoader(test_dataset, batch_size=opt.batch_size, shuffle=False)

# モデルの定義

In [17]:
class Net(pl.LightningModule):
    """EfficientNet-B7 image classifier wrapped as a LightningModule.

    Args:
        lr: Learning rate passed to the Adam optimizer.
        num_classes: Number of outputs of the replaced classifier layer.
    """

    def __init__(self, lr: float, num_classes: int):
        super().__init__()

        self.lr = lr
        self.num_classes = num_classes
        self.loss_fn = nn.CrossEntropyLoss()

        # Build the EfficientNet-B7 architecture. No pretrained weights are
        # requested here; trained weights have to be loaded separately
        # (e.g. from a checkpoint).
        self.model = models.efficientnet_b7()
        # Replace the final linear layer with one that has `num_classes`
        # outputs. The input width is read from the layer being replaced
        # instead of being hard-coded.
        in_features = self.model.classifier[1].in_features
        self.model.classifier[1] = nn.Linear(in_features, self.num_classes)

        self.train_acc = torchmetrics.Accuracy()
        self.val_acc = torchmetrics.Accuracy()
        self.test_acc = torchmetrics.Accuracy()

    def forward(self, x):
        """Return the backbone's output for a batch of images."""
        output = self.model(x)
        return output

    def _forward_with_loss(self, batch):
        """Run the model on ``(images, target)`` and return ``(preds, target, loss)``."""
        images, target = batch
        preds = self.forward(images)
        loss = self.loss_fn(preds, target)
        return preds, target, loss

    # Processing for a training batch
    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log epoch-level loss/accuracy."""
        preds, target, loss = self._forward_with_loss(batch)
        self.log('train_loss', loss, on_step=False, on_epoch=True)
        self.log('train_acc', self.train_acc(preds, target), on_step=False, on_epoch=True)
        return loss

    # Processing for a validation batch
    def validation_step(self, batch, batch_idx):
        """Compute the loss for one validation batch and log epoch-level loss/accuracy."""
        preds, target, loss = self._forward_with_loss(batch)
        self.log('val_loss', loss, on_step=False, on_epoch=True)
        self.log('val_acc', self.val_acc(preds, target), on_step=False, on_epoch=True)
        return loss

    # Processing for a test batch
    def test_step(self, batch, batch_idx):
        """Compute and return the loss for one test batch (nothing is logged)."""
        _, _, loss = self._forward_with_loss(batch)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over the backbone's parameters."""
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=0.0001)
        return optimizer

# チェックポイントをロードして予測する

In [18]:
# Restore the trained Net from the checkpoint file, passing the constructor
# arguments explicitly
model = Net.load_from_checkpoint(
    checkpoint_path=model_path,
    lr=opt.lr,
    num_classes=opt.num_classes,
)

# テストデータの予測

In [19]:
# Set the random seeds (including the GPU ones)
pl.seed_everything(0)

# Run on the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Put the trained model into evaluation mode once, before the loop.
model.eval()

# Predicted class ids, collected in dataloader order
preds = []

# No gradients are needed for prediction, so disable autograd: no graph is
# built or kept alive for the batches.
with torch.no_grad():
    # Read the data from the dataloader one mini-batch at a time
    for images, _ in test_dataloader:
        outputs = model(images.to(device))
        # The index of the largest output along dim 1 is the predicted class id
        pred = torch.argmax(outputs, dim=1)
        # Append this batch's predictions to the list
        preds.extend(pred.cpu().numpy())

Global seed set to 0


# 提出

In [20]:
# Load the submission template that will receive the predictions
sub = pd.read_csv('sample_submission.csv')
sub_preview = sub.head()
print(sub_preview)
print('データ数: ', len(sub))

     image_id  label
0  200001.jpg    NaN
1  200002.jpg    NaN
2  200003.jpg    NaN
3  200004.jpg    NaN
4  200005.jpg    NaN
データ数:  3469


In [21]:
# Overwrite the target column with the predicted class ids
sub['label'] = preds
filled_preview = sub.head()
print(filled_preview)

     image_id  label
0  200001.jpg      7
1  200002.jpg      8
2  200003.jpg      3
3  200004.jpg      3
4  200005.jpg      3


In [22]:
# Replace the predicted class ids with their label names
label_dict = {
    'bacterial_leaf_blight': 0,
    'bacterial_leaf_streak': 1,
    'bacterial_panicle_blight': 2,
    'blast': 3,
    'brown_spot': 4,
    'dead_heart': 5,
    'downy_mildew': 6,
    'hispa': 7,
    'normal': 8,
    'tungro': 9
}

# Swap keys and values (id -> name) with a dict comprehension over items()
label_dict_swap = {v: k for k, v in label_dict.items()}

# Translate ids to names. Series.map turns every value that has no entry in
# label_dict_swap into NaN -- which is also what happens when this cell is
# re-run after the column already holds names -- so validate the result before
# overwriting the column instead of silently producing an all-NaN submission.
mapped_labels = sub['label'].map(label_dict_swap)
unmapped = sub.loc[mapped_labels.isna(), 'label']
if not unmapped.empty:
    raise ValueError(
        f'{len(unmapped)} value(s) in sub["label"] are not known class ids '
        f'(e.g. {unmapped.iloc[0]!r}); rebuild the column from the predictions '
        'before running this cell again.'
    )
sub['label'] = mapped_labels
sub

Unnamed: 0,image_id,label
0,200001.jpg,hispa
1,200002.jpg,normal
2,200003.jpg,blast
3,200004.jpg,blast
4,200005.jpg,blast
...,...,...
3464,203465.jpg,dead_heart
3465,203466.jpg,hispa
3466,203467.jpg,normal
3467,203468.jpg,bacterial_leaf_streak


In [23]:
# Export the submission file.
# Create the destination folder first: to_csv does not create missing
# directories and would fail if the folder is absent on Drive.
os.makedirs(SUBMIT_PATH, exist_ok=True)
# index=False keeps the row index out of the CSV.
sub.to_csv(os.path.join(SUBMIT_PATH, submit_name), index=False)