In [1]:
import pandas as pd
import numpy as np
import os
import glob
import torch
from torchvision import models, transforms
from torchvision.models import Inception_V3_Weights
from PIL import Image
import cv2
from tqdm import tqdm

In [7]:
# Part-image extraction and image preprocessing for the CNN model

class PartImageExtractor:
    """Feed image crops through a pretrained Inception v3 and return its output.

    The network is used as loaded (no layers are replaced), in eval mode, and is
    moved to the GPU when CUDA is available. Both public methods return a NumPy
    array holding the model output for a batch of one image; the runs recorded
    in this notebook print a shape of (1, 1000).

    NOTE(review): because the classifier head is left in place, the returned
    vector is the model's final output rather than an intermediate embedding —
    confirm this is what downstream code expects.
    """

    def __init__(self):
        # Preprocessing applied to every crop: resize, centre-crop to 299x299,
        # convert to a tensor and normalise each channel.
        # (mean/std are presumably the ImageNet statistics — TODO confirm.)
        self.transform = transforms.Compose([
            transforms.Resize(299),
            transforms.CenterCrop(299),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
        self.model = models.inception_v3(weights=Inception_V3_Weights.DEFAULT)
        self.model.eval()

        if torch.cuda.is_available():
            self.model = self.model.cuda()

    def _forward(self, pil_image):
        """Preprocess one PIL image, run the model on it and return a NumPy array."""
        # Normalize() above is configured for three channels, so grayscale,
        # palette or RGBA inputs must be converted to RGB first.
        pil_image = pil_image.convert('RGB')
        batch = self.transform(pil_image).unsqueeze(0)  # add the batch dimension

        if torch.cuda.is_available():
            batch = batch.cuda()

        # Inference only: no gradients are needed.
        with torch.no_grad():
            features = self.model(batch)

        return features.cpu().numpy()

    def extract_features(self, image, bbox):
        """Crop `bbox` out of an image file and return the model output for it.

        Args:
            image: Path (or file object) accepted by `PIL.Image.open`.
            bbox: `(x, y, width, height)` of the region to crop, in pixels.

        Returns:
            NumPy array with the model output for the single cropped image.
        """
        x, y, w, h = bbox
        # Use a context manager so the underlying file handle is released even
        # when hundreds of thousands of images are processed in one loop.
        with Image.open(image) as opened:
            cropped = opened.crop((x, y, x + w, y + h))
            cropped.load()  # make sure pixel data is read before the file closes
        return self._forward(cropped)

    def extract_features_from_array(self, cropped_img_array):
        """Return the model output for an already-cropped image array.

        Args:
            cropped_img_array: Array-like image data; it is cast to uint8 and
                handed to `PIL.Image.fromarray`.
                NOTE(review): channel order is taken as-is — arrays coming from
                OpenCV would be BGR; verify against the caller.

        Returns:
            NumPy array with the model output for the single image.
        """
        pil_image = Image.fromarray(np.uint8(cropped_img_array))
        return self._forward(pil_image)


In [13]:
# Load the three training label tables and the test label table (all UTF-8 CSVs).
train_10, train_50, train_100, test_data = (
    pd.read_csv(label_csv, encoding='utf-8')
    for label_csv in (
        './data_preprocessed/label/train_label_10.csv',
        './data_preprocessed/label/train_label_50.csv',
        './data_preprocessed/label/train_label_100.csv',
        './data_preprocessed/label/test_label_third.csv',
    )
)

In [12]:
# Preview the first row of the test label table to check which columns are available.
test_data.head(1)

Unnamed: 0,file_id,folder_path,file_name,image_width,image_height,class,cause_method,folder_path.1,ei_value,pl_value,...,category_id,disease_status,name,bbox_x,bbox_y,bbox_width,bbox_height,object_status,bbox_id,sampling_id
0,vl_normal_3022,./data_raw/test_image/test_normal/,normal_40_012_221116123827.jpg,1337,2266,normal,normal,./data_raw/test_image/test_normal/,,,...,0,N,stem,1041.86,1673.6,75.69,109.86,normal,vl_normal_3022_0,vl_sam_10_0


In [15]:
# Work on copies of the label tables without the old 'sampling_id' column.
# errors='ignore' keeps this cell re-runnable: a later cell writes the training
# tables back to the same CSV paths without 'sampling_id', so on a second run
# the column is already gone and a plain drop() would raise KeyError.
train_10_copy = train_10.drop('sampling_id', axis=1, errors='ignore').copy()
train_50_copy = train_50.drop('sampling_id', axis=1, errors='ignore').copy()
train_100_copy = train_100.drop('sampling_id', axis=1, errors='ignore').copy()
test_data_copy = test_data.drop('sampling_id', axis=1, errors='ignore').copy()

In [16]:
def tr_id_create(df, sampling_volume):
    """Group rows by class name found in `file_id` and add a `tr_id` column.

    For each of the names 'normal', 'blight', 'wilt', 'scorch' and 'chlorosis'
    (in that order) the rows whose `file_id` contains the name are selected,
    renumbered from 0, and given the id ``tr_<sampling_volume>_<name>_<n>``.
    The five groups are then concatenated in that order.

    Rows whose `file_id` contains none of the names are left out of the result,
    and a row whose `file_id` contains several names appears once per match.

    Args:
        df: DataFrame with a string column `file_id`. It is not modified.
        sampling_volume: Value interpolated into every id (e.g. '10').

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the `tr_id` column set.
    """
    categories = ('normal', 'blight', 'wilt', 'scorch', 'chlorosis')

    labelled_parts = []
    for category in categories:
        # Plain substring match; the category names are literals, not regexes.
        is_category = df['file_id'].str.contains(category, regex=False)
        part = df[is_category].reset_index(drop=True)
        # Build the ids for the whole group at once instead of row by row.
        # dtype=object keeps the column type stable even for an empty group.
        part['tr_id'] = pd.Series(
            [f"tr_{sampling_volume}_{category}_{position}" for position in range(len(part))],
            index=part.index,
            dtype=object,
        )
        labelled_parts.append(part)

    return pd.concat(labelled_parts, axis=0).reset_index(drop=True)


In [18]:
def vl_id_create(df):
    """Group rows by class name found in `file_id` and add a `vl_id` column.

    For each of the names 'normal', 'blight', 'wilt', 'scorch' and 'chlorosis'
    (in that order) the rows whose `file_id` contains the name are selected,
    renumbered from 0, and given the id ``vl_<name>_<n>``. The five groups are
    then concatenated in that order.

    Rows whose `file_id` contains none of the names are left out of the result,
    and a row whose `file_id` contains several names appears once per match.

    Args:
        df: DataFrame with a string column `file_id`. It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the `vl_id` column set.
    """
    categories = ('normal', 'blight', 'wilt', 'scorch', 'chlorosis')

    labelled_parts = []
    for category in categories:
        # Plain substring match; the category names are literals, not regexes.
        is_category = df['file_id'].str.contains(category, regex=False)
        part = df[is_category].reset_index(drop=True)
        # Build the ids for the whole group at once instead of row by row.
        # dtype=object keeps the column type stable even for an empty group.
        part['vl_id'] = pd.Series(
            [f"vl_{category}_{position}" for position in range(len(part))],
            index=part.index,
            dtype=object,
        )
        labelled_parts.append(part)

    return pd.concat(labelled_parts, axis=0).reset_index(drop=True)

In [5]:
# Add a 'tr_id' column to each training table; the second argument is the
# sampling-volume tag that ends up inside every generated id.
train_10_with_id, train_50_with_id, train_100_with_id = (
    tr_id_create(label_table, volume_tag)
    for label_table, volume_tag in (
        (train_10_copy, '10'),
        (train_50_copy, '50'),
        (train_100_copy, '100'),
    )
)

In [6]:
# Write the id-augmented training tables back to disk (UTF-8, no index column).
# NOTE(review): these are the same paths the training tables were read from, so
# the input files are overwritten in place by this cell.
for labelled_table, label_csv_path in (
    (train_10_with_id, './data_preprocessed/label/train_label_10.csv'),
    (train_50_with_id, './data_preprocessed/label/train_label_50.csv'),
    (train_100_with_id, './data_preprocessed/label/train_label_100.csv'),
):
    labelled_table.to_csv(label_csv_path, encoding='utf-8', index=False)

In [19]:
# Add a 'vl_id' column to the test table (rows are regrouped by class name).
test_data_with_id = vl_id_create(test_data_copy)

In [23]:
# Sanity check: number of rows in the id-augmented test table.
print(len(test_data_with_id))

11082


In [22]:
# Save the id-augmented test table to a new CSV (UTF-8, no index column).
test_data_with_id.to_csv('./data_preprocessed/label/test_label_final.csv', encoding='utf-8', index=False)

In [25]:
extractor = PartImageExtractor()


def part_image_extractor_executor(df, id_col):
    """Extract model features for every bounding box listed in `df`.

    Each row must provide `folder_path`, `file_name`, `bbox_x`, `bbox_y`,
    `bbox_width` and `bbox_height`. The image at `folder_path/file_name` is
    cropped to the box and passed to the module-level `extractor`.

    Args:
        df: DataFrame with one row per bounding box.
        id_col: Name of the column whose value is used as the dictionary key
            for that row's features. If a value occurs more than once, the
            later row replaces the earlier one.

    Returns:
        dict mapping each id to the array returned by
        `extractor.extract_features`.
    """
    features_dict = {}

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing images"):
        image_path = os.path.join(row['folder_path'], row['file_name'])
        bbox = (row['bbox_x'], row['bbox_y'], row['bbox_width'], row['bbox_height'])
        features_dict[row[id_col]] = extractor.extract_features(image_path, bbox)

    # Sanity output: feature shape of the first row, then the number of stored
    # entries next to the number of input rows (they differ if ids repeat).
    # .iloc[0] is positional, so it does not rely on the index containing the
    # label 0, and the guard avoids an error on an empty frame.
    if len(df) > 0:
        print(features_dict[df[id_col].iloc[0]].shape)
    print(len(features_dict))
    print(len(df))

    return features_dict

In [8]:
# Extract features for the 10-sample training table, keyed by 'tr_id'.
# Long-running: the recorded run processed 28,382 rows in about 22 minutes.
train_10_features = part_image_extractor_executor(train_10_with_id, 'tr_id')


Processing images: 100%|██████████| 28382/28382 [22:08<00:00, 21.36it/s]

(1, 1000)
28382
28382





In [9]:
# Extract features for the 50-sample training table, keyed by 'tr_id'.
# Long-running: the recorded run processed 132,415 rows in about 1 h 38 min.
train_50_features = part_image_extractor_executor(train_50_with_id, 'tr_id')


Processing images: 100%|██████████| 132415/132415 [1:37:40<00:00, 22.59it/s]


(1, 1000)
132415
132415


In [10]:
# Extract features for the 100-sample training table, keyed by 'tr_id'.
# Long-running: the recorded run processed 265,496 rows in about 3 h 19 min.
train_100_features = part_image_extractor_executor(train_100_with_id, 'tr_id')



Processing images: 100%|██████████| 265496/265496 [3:19:17<00:00, 22.20it/s]  


(1, 1000)
265496
265496


In [12]:
# Persist each training feature dictionary as an .npz archive; the dictionary
# is unpacked into keyword arguments, so every id becomes one named array.
for features_npz_path, features_by_id in (
    ("./data_preprocessed/part_image_features/tr_cropped_features_10_dict.npz", train_10_features),
    ("./data_preprocessed/part_image_features/tr_cropped_features_50_dict.npz", train_50_features),
    ("./data_preprocessed/part_image_features/tr_cropped_features_100_dict.npz", train_100_features),
):
    np.savez(features_npz_path, **features_by_id)

In [26]:
# Extract features for the test table, keyed by 'vl_id'.
# The recorded run processed 11,082 rows in about 7 minutes.
test_data_features = part_image_extractor_executor(test_data_with_id, 'vl_id')

Processing images: 100%|██████████| 11082/11082 [07:00<00:00, 26.33it/s]

(1, 1000)
11082
11082





In [27]:
# Persist the test feature dictionary as an .npz archive; the dictionary is
# unpacked into keyword arguments, so every 'vl_id' becomes one named array.
np.savez("./data_preprocessed/part_image_features/vl_cropped_features_dict.npz", **test_data_features)