<a href="https://colab.research.google.com/github/johanjun/kaggle_cassava_leaf/blob/main/Cassava_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library import

In [1]:
## library import
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os
import re
from PIL import Image
import shutil
from sklearn.model_selection import train_test_split
import random
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import xml.etree.ElementTree as et
import json
import pprint

# Data load

In [2]:
%cd /content/drive/MyDrive/Cassava_Leaf

/content/drive/MyDrive/Cassava_Leaf


In [3]:
os.getcwd()

'/content/drive/MyDrive/Cassava_Leaf'

In [4]:
## directory 설정
cur_dir = os.getcwd()
data_dir = os.path.join(cur_dir, 'data')
image_dir = os.path.join(data_dir, 'train_images')

In [None]:
## image file 수 확인
image_files = [fname for fname in os.listdir(image_dir) if os.path.splitext(fname)[-1] == '.jpg'] #확장자 분리
print(len(image_files))

21397


In [None]:
image_files[:10]

['787914923.jpg',
 '822641071.jpg',
 '810351903.jpg',
 '813744167.jpg',
 '821592762.jpg',
 '803715275.jpg',
 '816689138.jpg',
 '787415998.jpg',
 '813217011.jpg',
 '793206643.jpg']

In [18]:
with open(os.path.join(data_dir, 'label_num_to_disease_map.json')) as f:
    mapping = json.loads(f.read())
    pprint.pprint(mapping)

{'0': 'Cassava Bacterial Blight (CBB)',
 '1': 'Cassava Brown Streak Disease (CBSD)',
 '2': 'Cassava Green Mottle (CGM)',
 '3': 'Cassava Mosaic Disease (CMD)',
 '4': 'Healthy'}


## data split by label

In [14]:
with open(os.path.join(data_dir, 'train.csv')) as f:
    df_train = pd.read_csv(f)

df_train.head()

Unnamed: 0,image_id,label
0,1000015157.jpg,0
1,1000201771.jpg,3
2,100042118.jpg,1
3,1000723321.jpg,1
4,1000812911.jpg,3


In [None]:
df1 = df_train[df_train.label==1]
df1.label.value_counts()

1    2189
Name: label, dtype: int64

In [None]:
df0 = df_train[df_train.label==0]
df0.label.value_counts()

0    1087
Name: label, dtype: int64

In [None]:
df2 = df_train[df_train.label==2]
df2.label.value_counts()

2    2386
Name: label, dtype: int64

In [None]:
df3 = df_train[df_train.label==3]
df3.label.value_counts()

3    13158
Name: label, dtype: int64

In [None]:
df4 = df_train[df_train.label==4]
df4.label.value_counts()

4    2577
Name: label, dtype: int64

In [None]:
len(pd.concat([df0, df1, df2, df3, df4])) == len(df_train)

True

In [None]:
df1.head()

Unnamed: 0,image_id,label
2,100042118.jpg,1
3,1000723321.jpg,1
12,1002088496.jpg,1
22,1004389140.jpg,1
41,1008142548.jpg,1


In [None]:
df0_shuffled = df0.sample(frac=1).reset_index(drop=True)
df1_shuffled = df1.sample(frac=1).reset_index(drop=True)
df2_shuffled = df2.sample(frac=1).reset_index(drop=True)
df3_shuffled = df3.sample(frac=1).reset_index(drop=True)
df4_shuffled = df4.sample(frac=1).reset_index(drop=True)

In [5]:
#train, val
train_dir = os.path.join(data_dir, 'train')
val_dir = os.path.join(data_dir, 'validation')
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

In [None]:
cnt = 0
for image_file in df0_shuffled.image_id.to_list():
  image_path = os.path.join(image_dir, image_file)
  if cnt <= 0.8*len(df0_shuffled.image_id.to_list()):
    cpath = train_dir
  else:
    cpath = val_dir
  shutil.copy(image_path, cpath)
  cnt += 1

In [None]:
df_shuffled_list = [df1_shuffled, df2_shuffled, df3_shuffled, df4_shuffled]
for l in df_shuffled_list:
  cnt = 0
  print(l, " start!")
  for image_file in l.image_id.to_list():
    image_path = os.path.join(image_dir, image_file)
    if cnt <= 0.8*len(l.image_id.to_list()):
      cpath = train_dir
    else:
      cpath = val_dir
    shutil.copy(image_path, cpath)
    cnt += 1
    if cnt%100 == 0:
      print(cnt, 'complete.')

            image_id  label
0     3507114461.jpg      1
1     1909629301.jpg      1
2       76610968.jpg      1
3     4254996610.jpg      1
4     3770952591.jpg      1
...              ...    ...
2184  1875533805.jpg      1
2185  2425193162.jpg      1
2186  2995113861.jpg      1
2187   379373523.jpg      1
2188  2182500020.jpg      1

[2189 rows x 2 columns]  start!
100 complete.
200 complete.
300 complete.
400 complete.
500 complete.
600 complete.
700 complete.
800 complete.
900 complete.
1000 complete.
1100 complete.
1200 complete.
1300 complete.
1400 complete.
1500 complete.
1600 complete.
1700 complete.
1800 complete.
1900 complete.
2000 complete.
2100 complete.
            image_id  label
0     3826775864.jpg      2
1     1393783706.jpg      2
2      766908244.jpg      2
3     1792425947.jpg      2
4     3743464955.jpg      2
...              ...    ...
2381  4079242692.jpg      2
2382  2753152635.jpg      2
2383  1608857677.jpg      2
2384  2694534101.jpg      2
2385    22116035.

In [6]:
train_images = os.listdir(train_dir)
val_images = os.listdir(val_dir)

print(len(train_images), len(val_images))

17120 4277


# TFRecord

In [7]:
IMG_SIZE = 224

In [8]:
## TFRecord 저장할 directory와 file 경로 설정
tfr_dir = os.path.join(data_dir, 'tfrecord')
os.makedirs(tfr_dir, exist_ok=True)

tfr_train_dir = os.path.join(tfr_dir, 'cls_train.tfr')
tfr_val_dir = os.path.join(tfr_dir, 'cls_val.tfr')

In [9]:
## TFRecord writer 생성
writer_train = tf.io.TFRecordWriter(tfr_train_dir)
writer_val = tf.io.TFRecordWriter(tfr_val_dir)

In [10]:
# The following functions can be used to convert a value to a type compatible
# with tf.Example.

def _bytes_feature(value):
  """Returns a bytes_list from a string / byte."""
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a float_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [11]:
train_images[777]

'2185298493.jpg'

In [15]:
file_name = train_images[777]
file_name

'2185298493.jpg'

In [17]:
class_num = int(df_train[df_train.image_id == file_name].label)
class_num

4

In [22]:
class_name = mapping[str(class_num)]
class_name

'Healthy'

In [24]:
## Training data로 tfrecord 만들기
n_train = 0

train_files = os.listdir(train_dir)
for train_file in train_files:
    train_path = os.path.join(train_dir, train_file)
    image = Image.open(train_path)
    image = image.resize((IMG_SIZE, IMG_SIZE))
    bimage = image.tobytes()

    file_name = train_file
    class_num = int(df_train[df_train.image_id == file_name].label)
    class_name = mapping[str(class_num)]

    example = tf.train.Example(features=tf.train.Features(feature={
        'image': _bytes_feature(bimage),
        'cls_num': _int64_feature(class_num)
    }))
    writer_train.write(example.SerializeToString())
    n_train += 1
    
    if n_train % 1000 == 0:
      print(n_train, ' completed.')
writer_train.close()
print(n_train)

1000  completed.
2000  completed.
3000  completed.
4000  completed.
5000  completed.
6000  completed.
7000  completed.


KeyboardInterrupt: ignored

In [25]:
## image file 수 확인
image_files = [fname for fname in os.listdir(train_dir) if os.path.splitext(fname)[-1] == '.jpg']
print(len(image_files))

17120


In [26]:
## image file들을 읽어서 channel이 3이 아닌 image는 삭제
for image_file in image_files:
  image_path = os.path.join(train_dir, image_file)
  image = Image.open(image_path)
  image_mode = image.mode
  if image_mode != 'RGB':
    print(image_file, image_mode)
    image = np.asarray(image)
    print(image.shape)
    os.remove(image_path)

KeyboardInterrupt: ignored