# Paddy disease classification
Kaggle competition
https://www.kaggle.com/competitions/paddy-disease-classification

### Setting up env and downloading data from kaggle

In [1]:
import os

if os.getenv("COLAB_RELEASE_TAG"):
  colab_flag = True
else:
  colab_flag = False

if colab_flag == True:
   print("Running in Colab...")
   #Install Kaggle API
   ! pip install -q kaggle
   #Install Huggingface Datasets lib
   ! pip install -q datasets
   #Install Huggingface transformers & the sentencepiece
   ! pip install -q transformers[sentencepiece,torch]
   #Intall Timm for Pytorch Image Models
   ! pip install -q timm
   print("...Installed required dependencies")
else:
   print("Assuming running in local environment...")

Running in Colab...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━

In [2]:
if colab_flag == True:
  import json
  from google.colab import drive
  drive.mount('/content/drive/')

  # Set the file path
  file_path = '/content/drive/MyDrive/dtu/fastAI/04_NLP/kaggle_api.json'

  # Check if the file exists
  if os.path.exists(file_path):
      # Load the JSON file
      with open(file_path) as f:
          creds = json.load(f)
      print('Sucesfully set kaggle credentials')
  else:
      # Handle the case when the file does not exist
      creds = None  # or any other appropriate action you want to take
      print('Error: File not found, Credentials NOT set')
else:
    import json
    file_path = "../secrets/kaggle_api.json"
    # Check if the file exists
    if os.path.exists(file_path):
        # Load the JSON file
        with open(file_path) as f:
            creds = json.load(f)
        print('Sucesfully set kaggle credentials')
    else:
        # Handle the case when the file does not exist
        creds = None  # or any other appropriate action you want to take
        print('Error: File not found, Credentials NOT set')

Mounted at /content/drive/
Sucesfully set kaggle credentials


In [3]:
#using `pathlib.Path to work with paths in Python`
from pathlib import Path
#Set path for the titanic dataset
path = Path('paddy-disease-classification')

if path.exists():
  print('Data folder exists')
else:
  print('Data not detected, starting download')
  #Setup of kaggle credentials to use the API for downloading dataset
  cred_path = Path('~/.kaggle/kaggle.json').expanduser()
  if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(json.dumps(creds))
    #If not Json
    #cred_path.write_text(creds)
    cred_path.chmod(0o600)
  import zipfile,kaggle
  #Download data
  kaggle.api.competition_download_cli(str(path))
  #Unzip at path
  zipfile.ZipFile(f'{path}.zip').extractall(path)

#Verifying the the local content
!ls {path}

Data not detected, starting download
Downloading paddy-disease-classification.zip to /content


100%|██████████| 1.02G/1.02G [00:10<00:00, 106MB/s]



sample_submission.csv  test_images  train.csv  train_images


# Testing GPU limitation
The goal of this notebook is to improve the performance by using larger models, but at this point GPU memory will grow into a limitation.

It's really helpful to be able to quickly try a few models and image sizes and find out what will run successfully. To make this quick, we can just grab a small subset of the data for running short epochs -- the memory use will still be the same, but it'll be much faster.

One easy way to do this is to simply pick a category with few files in it. Here's our options:


In [25]:
from fastai.vision.all import *
set_seed(42)

train_path = path/'train_images'
train_files = get_image_files(train_path)

test_path = path/'test_images'
test_files = get_image_files(train_path)

In [20]:
#Simply contains the filename of all the images in the given folder
test_files[0:3]

(#3) [Path('paddy-disease-classification/train_images/brown_spot/107972.jpg'),Path('paddy-disease-classification/train_images/brown_spot/100275.jpg'),Path('paddy-disease-classification/train_images/brown_spot/101929.jpg')]

In [5]:
df_test = pd.read_csv(path/'train.csv')
df_test.head()

Unnamed: 0,image_id,label,variety,age
0,100330.jpg,bacterial_leaf_blight,ADT45,45
1,100365.jpg,bacterial_leaf_blight,ADT45,45
2,100382.jpg,bacterial_leaf_blight,ADT45,45
3,100632.jpg,bacterial_leaf_blight,ADT45,45
4,101918.jpg,bacterial_leaf_blight,ADT45,45


In [6]:
df_test.label.value_counts()

normal                      1764
blast                       1738
hispa                       1594
dead_heart                  1442
tungro                      1088
brown_spot                   965
downy_mildew                 620
bacterial_leaf_blight        479
bacterial_leaf_streak        380
bacterial_panicle_blight     337
Name: label, dtype: int64

### Selecting just a small subset of the test data

In [29]:
train_path_subset = path/'train_images'/'bacterial_panicle_blight'

In [30]:
#Define a train function to test different models
def train_model(arch, aug_size, item=Resize(480, method='squish'), accum=1, finetune=True,epochs=12):
  dls = ImageDataLoaders.from_folder(train_path_subset,
                                        valid_pct=0.2,
                                        item_tfms=item,
                                        #default bs=64
                                        batch_tfms=aug_transforms(size=aug_size, min_scale=0.75), bs=64//accum)
  cbs = GradientAccumulation(64) if accum else []
  #cbs is one or a list of Callbacks to pass to the Learner. Callbacks are used for every tweak of the training loop.
  learn = vision_learner(dls,arch,metrics=error_rate, cbs=cbs).to_fp16()
  if finetune:
    learn.fine_tune(epochs, 0.01)
    return learn.tta(dl=dls.test_dl(test_files))
  else:
    learn.unfreeze()
    learn.fit_one_cycle(epochs, 0.01)

In [22]:
!pip install pynvml

import gc
def report_gpu():
  print(torch.cuda.list_gpu_processes())
  gc.collect()
  torch.cuda.empty_cache()



In [23]:
report_gpu()

GPU:0
no processes are running


In [31]:
train_model('convnext_small_in22k', 128, epochs=1, accum=1, finetune=False)
report_gpu()

  model = create_fn(


Downloading model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:09


GPU:0
process       4786 uses     4252.000 MB GPU memory


In [33]:
train_model('convnext_small_in22k', 128, epochs=1, accum=2, finetune=False)
report_gpu()

epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:05


GPU:0
process       4786 uses     3180.000 MB GPU memory


In [34]:
train_model('convnext_small_in22k', 128, epochs=1, accum=4, finetune=False)
report_gpu()

epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:05


GPU:0
process       4786 uses     2658.000 MB GPU memory


## Larger models

In [36]:
train_model('convnext_large_in22k', 224, epochs=1, accum=2, finetune=False)
report_gpu()

  model = create_fn(


Downloading model.safetensors:   0%|          | 0.00/919M [00:00<?, ?B/s]

epoch,train_loss,valid_loss,error_rate,time
0,0.0,0.0,0.0,00:08


GPU:0
process       4786 uses    11086.000 MB GPU memory


### Saving the model

In [None]:
learn.save("/content/drive/MyDrive/dtu/fastAI/06_Paddy_vision/models")

Path('/content/drive/MyDrive/dtu/fastAI/06_Paddy_vision/models.pth')

In [None]:
doc(Learner.save)