In [1]:
# import tools

import pandas as pd
import numpy as np
import os
import ast
import json

In [145]:
# raw label load(train)

vl_raw_normal = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_normal.csv', encoding='utf-8')
vl_raw_blight = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_blight.csv', encoding='utf-8')
vl_raw_wilt = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_wilt.csv', encoding='utf-8')
vl_raw_scorch = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_scorch.csv', encoding='utf-8')
vl_raw_chlorosis = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_chlorosis.csv', encoding='utf-8')

tr_normal = pd.read_csv('./data_raw/train_label/merged_df/raw_en_normal.csv', encoding='utf-8')
tr_blight = pd.read_csv('./data_raw/train_label/merged_df/raw_en_blight.csv', encoding='utf-8')
tr_wilt = pd.read_csv('./data_raw/train_label/merged_df/raw_en_wilt.csv', encoding='utf-8')
tr_scorch = pd.read_csv('./data_raw/train_label/merged_df/raw_en_scorch.csv', encoding='utf-8')
tr_chlorosis = pd.read_csv('./data_raw/train_label/merged_df/raw_en_chlorosis.csv', encoding='utf-8')

  tr_normal = pd.read_csv('./data_raw/train_label/merged_df/raw_en_normal.csv', encoding='utf-8')


In [94]:
def data_selection_by_date(df):
    original = df.copy()
    original_date = original['images.fname']
    date_list = []

    for date in original_date:
        format_cut = date.replace('.jpg', '')
        month_day = format_cut[:-6]
        date_list.append(month_day)

    df_date = pd.DataFrame({'month_day': date_list})

    add_month_day = pd.concat([original, df_date], axis=1)

    first_in = add_month_day.drop_duplicates(subset=['month_day'], keep='first').reset_index(drop=True)
    last_in = add_month_day.drop_duplicates(subset=['month_day'], keep='last').reset_index(drop=True)

    df_sel = pd.concat([first_in, last_in], axis=0).reset_index(drop=True)

    return df_sel

In [51]:
def columns_selection(df, col_list):
    df_sel = df[col_list].copy()
    return df_sel

In [7]:
# print(blight_data['images.disease_cause_method'].unique())
# print(wilt_data['images.disease_cause_method'].unique())
# print(scorch_data['images.disease_cause_method'].unique())
# print(chlorosis_data['images.disease_cause_method'].unique())

['역병주입']
['수분제한60%']
['EC 3.0' 'EC 0.75' 'pH 5.0' 'pH 7.5' '칼슘부족']
['철결핍' '질소부족']


In [52]:
## label preprocess functions

def json_data_to_pandas(df, target_col, saving_col):
    df[saving_col] = None
    
    for idx, row in df.iterrows():
        json_str = row[target_col]
        str_corrected = json_str.replace("None", "null").replace("'", "\"")
        corrected_data = json.loads(str_corrected)
        df.at[idx, saving_col] = corrected_data

    dict_format = df[saving_col][0]
    extraction_keys = dict_format[0].keys()
    key_list = list(extraction_keys)
    df_pandas_type = df.drop([target_col], axis=1)

    print(type(df[saving_col]))
    return df_pandas_type, key_list

def extraction_env_values(df_pandas_type, key_list, target_col):
    df = df_pandas_type
    df['value_types'] = None
    value_names = []

    for i in range(len(key_list)):
        candidate = key_list[i]
        if candidate.endswith('_value'):
            value_names.append(candidate)

    for idx, row in df.iterrows():
        value_dict = {}
        dict_list = row[target_col]
        for dictionary in dict_list:
            for key, value in dictionary.items():
                if key in value_names:
                    if key not in value_dict:
                        value_dict[key] = [value]
                    else:
                        value_dict[key].append(value)
        df.at[idx, 'value_types'] = value_dict
    
    for idx, row in df.iterrows():
        value_to_split = row['value_types']
        for key, value in value_to_split.items():
            if key not in df.columns:
                df[key] = pd.NA
            df.at[idx, key] = value

    df_splited_env_values = df.drop([target_col, 'value_types'], axis=1)
    print(df_splited_env_values.columns)
    return df_splited_env_values

def str_to_num(string_list):
    if isinstance(string_list, str):
        try:
            list_obj = ast.literal_eval(string_list)
        except ValueError:
            return string_list
    else:
        list_obj = string_list

    if all(item is None for item in list_obj):
        return None
    else:
        list_num = [float(item) for item in list_obj if item is not None]
        return list_num
    
def string_to_number(df):
    col_list = [col for col in df.columns if col.endswith('_value')]
    
    for col in col_list:
        df[col] = df[col].apply(str_to_num)
        
    return df

def calculate_avg(df):
    col_list = [col for col in df.columns if col.endswith('_value')]

    for value_name in col_list:
        for idx, row in df.iterrows():
            values = row[value_name]
            if values is not None:
                df.at[idx, value_name] = np.mean(values)

            else:
                df.at[idx, value_name] = None

    return df

In [53]:
def data_split(df, target_col):
    exploded_df = df.explode(target_col).reset_index(drop=True)
    exploded_df = pd.concat([exploded_df.drop([target_col], axis=1),
                             exploded_df[target_col].apply(pd.Series)], axis=1)
    
    return exploded_df

In [151]:
def split_bbox(df):
    df['bbox_x'] = df['bbox'].apply(lambda bbox: bbox[0])
    df['bbox_y'] = df['bbox'].apply(lambda bbox: bbox[1])
    df['bbox_width'] = df['bbox'].apply(lambda bbox: bbox[2])
    df['bbox_height'] = df['bbox'].apply(lambda bbox: bbox[3])
    return df

def process_rest_values(df, class_kor, class_en):
    df_cleaned = df.rename(columns={'images.fname':'file_name',
                                    'images.width':'image_width',
                                      'images.height':'image_height',
                                        'images.disease_class':'class',
                                          'images.disease_cause_method':'cause_method'})
    
    df_cleaned['cause_method']=df_cleaned['cause_method'].apply(lambda x: x.replace('칼슘부족','low_Ca'))
    df_cleaned['cause_method']=df_cleaned['cause_method'].apply(lambda x: x.replace('질소부족','low_N'))
    df_cleaned['cause_method']=df_cleaned['cause_method'].apply(lambda x: x.replace('철결핍','leck_Fe'))
    df_cleaned['cause_method']=df_cleaned['cause_method'].apply(lambda x: x.replace('수분제한','water_'))
    df_cleaned['cause_method']=df_cleaned['cause_method'].apply(lambda x: x.replace('주입','_injection'))    
    df_cleaned['cause_method']=df_cleaned['cause_method'].apply(lambda x: x.replace(class_kor,class_en))
    df_cleaned['disease_status']=df_cleaned['disease_status'].apply(lambda x: x.replace('E','Y'))
    df_cleaned['class']=df_cleaned['class'].apply(lambda x: x.replace(class_kor, class_en))

    replace_values = {'잎': 'leaf', '줄기': 'stem', '과실': 'fruit', '화방': 'flower'}
    df_cleaned['name'] = df_cleaned['name'].replace(replace_values, regex=True)
    df_cleaned = df_cleaned.drop(['ir_value', 'tl_value', 'rp_value'], axis=1)
    df_cleaned = df_cleaned[(df_cleaned['bbox_y'] >= 0) & (df_cleaned['bbox_height'] >= 0)]
    df_cleaned = df_cleaned[(df_cleaned['image_width'] > (df_cleaned['bbox_x'] + df_cleaned['bbox_width'])) & 
                            (df_cleaned['image_height'] > (df_cleaned['bbox_y'] + df_cleaned['bbox_height']))]
    return df_cleaned

In [55]:
# envrionments 컬럼 데이터 처리 통합 함수

def process_col_environments(df, target_col, saving_col):
    env_target, env_saving = target_col, saving_col
    env_pandas, env_list = json_data_to_pandas(df, target_col=env_target, saving_col=env_saving)
    env_values = extraction_env_values(env_pandas, env_list, env_saving)
    env_values_num = string_to_number(env_values)
    env_values_avg = calculate_avg(env_values_num)
    
    return env_values_avg

In [56]:
# annotations, categories 컬럼 데이터 처리 통합 함수

def process_col_annotations_categories(df, target_col_1, saving_col_1, target_col_2, saving_col_2):
    df_anno = df.drop(['categories'], axis=1)
    df_cat = df[['images.fname', 'categories']].copy()
    anno_target, anno_saving = target_col_1, saving_col_1
    cat_target, cat_saving = target_col_2, saving_col_2

    anno_pandas, _ = json_data_to_pandas(df_anno, anno_target, anno_saving)
    cat_pandas, _ = json_data_to_pandas(df_cat, cat_target, cat_saving)

    anno_split = data_split(anno_pandas, target_col=anno_saving)
    anno_split_sel = anno_split.drop(['coordinates', 'area', 'isCrowd',
                                      'id', 'image_id'],
                                      axis=1)
    cat_split = data_split(cat_pandas, target_col=cat_saving)
    anno_split_sel['category_id'] = cat_split['id']
    anno_split_sel['name'] = cat_split['name']

    return anno_split_sel

In [57]:
def sampling_df(df, ratio):
    total_rows = len(df)
    sampling_rows_num = int(round((total_rows*ratio), -2))
    sampled_df = df.tail(sampling_rows_num)
    sampled_df = sampled_df.reset_index(drop=True)

    return sampled_df

In [58]:
## integrated label preprocess functions

# 모든 대상 컬럼 데이터 처리 통합 함수
    # 컬럼 규칙
        # environments: environments -> env_values -> value_types -> 개별 컬럼들
        # annotations: annotations -> anno_values -> 개별 컬럼들
        # categories: categories -> cat_values -> 개별 컬럼들

def label_preprocess(df, target_0, target_1, target_2, saving_0, saving_1, saving_2, class_kor, class_en):
    processed_env = process_col_environments(df, target_0, saving_0)
    processed_anno_cat = process_col_annotations_categories(processed_env, target_1, saving_1, target_2, saving_2)
    processed_bbox = split_bbox(processed_anno_cat)
    processed_data = process_rest_values(processed_bbox, class_kor, class_en)

    return processed_data

In [14]:
# train data preparation

# normal_preprocessed = label_preprocess(normal_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '정상', 'normal')
# blight_preprocessed = label_preprocess(blight_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '역병', 'blight')
# wilt_preprocessed = label_preprocess(wilt_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '시들음병', 'wilt')
# scorch_preprocessed = label_preprocess(scorch_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '잎끝마름', 'scorch')
# chlorosis_preprocessed = label_preprocess(chlorosis_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '황화', 'chlorosis')

<class 'pandas.core.series.Series'>
Index(['annotations', 'categories', 'images.fname', 'images.width',
       'images.height', 'images.disease_class', 'images.disease_cause_method',
       'folder_path', 'ti_value', 'hi_value', 'ci_value', 'ir_value',
       'tl_value', 'ei_value', 'pl_value', 'sr_value', 'cl_value', 'el_value',
       'hl_value', 'pi_value', 'rp_value'],
      dtype='object')
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Index(['annotations', 'categories', 'images.fname', 'images.width',
       'images.height', 'images.disease_class', 'images.disease_cause_method',
       'folder_path', 'ti_value', 'hi_value', 'ci_value', 'ir_value',
       'tl_value', 'ei_value', 'pl_value', 'sr_value', 'cl_value', 'el_value',
       'hl_value', 'pi_value', 'rp_value'],
      dtype='object')
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Index(['annotations', 'c

In [27]:
## 처리된 데이터 확인 및 컬럼 선별 추가작업
    ### value 컬럼 선택: ei, pl, el, pi
    ### 컬럼 추가, 변경
        # object_status(추가): class=normal => normal, 나머지 => abnormal
        # disease_status(변경): E => Y로 변경(보류)

# selection = ['file_name', 'image_width', 'image_height', 'class', 'cause_method', 'folder_path', 'ei_value', 'pl_value', 'el_value', 'pi_value',
#              'bbox', 'category_id', 'disease_status', 'name', 'bbox_x', 'bbox_y', 'bbox_width', 'bbox_height']

# normal = normal_preprocessed[selection].copy()
# blight = blight_preprocessed[selection].copy()
# wilt = wilt_preprocessed[selection].copy()
# scorch = scorch_preprocessed[selection].copy()
# chlorosis = chlorosis_preprocessed[selection].copy()

In [28]:
# normal['object_status'] = 'normal'
# blight['object_status'] = 'abnormal'
# wilt['object_status'] = 'abnormal'
# scorch['object_status'] = 'abnormal'
# chlorosis['object_status'] = 'abnormal'

In [30]:
# train_20 = pd.concat([normal, blight, wilt, scorch, chlorosis], axis=0)

In [31]:
# train_20.info()

<class 'pandas.core.frame.DataFrame'>
Index: 242562 entries, 0 to 27542
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   file_name       242562 non-null  object 
 1   image_width     242562 non-null  int64  
 2   image_height    242562 non-null  int64  
 3   class           242562 non-null  object 
 4   cause_method    242562 non-null  object 
 5   folder_path     242562 non-null  object 
 6   ei_value        242562 non-null  object 
 7   pl_value        242562 non-null  object 
 8   el_value        242562 non-null  object 
 9   pi_value        242562 non-null  object 
 10  bbox            242562 non-null  object 
 11  category_id     242562 non-null  int64  
 12  disease_status  242562 non-null  object 
 13  name            242562 non-null  object 
 14  bbox_x          242562 non-null  float64
 15  bbox_y          242562 non-null  float64
 16  bbox_width      242562 non-null  float64
 17  bbox_height     

In [33]:
# train_20.to_csv('./data_preprocessed/label/train_20.csv', encoding='utf-8', index=False)

In [59]:
# sampling and preprocessed df

col_selection = ['environments', 'annotations', 'categories',
                 'images.fname', 'images.width', 'images.height',
                 'images.disease_class', 'images.disease_cause_method',
                 'folder_path', 'file_id']

final_selection = ['file_id', 'folder_path', 'file_name', 'image_width', 'image_height',
                   'class', 'cause_method',
                   'ei_value', 'pl_value', 'el_value', 'pi_value',
                   'bbox','bbox_x', 'bbox_y', 'bbox_width', 'bbox_height',
                   'category_id', 'disease_status', 'name']

common_args = ['environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values']
class_args = {"normal": ["정상", "normal"], "blight": ["역병", "blight"],
              "wilt": ["시들음병", "wilt"], "scorch": ["잎끝마름", "scorch"],
              "chlorosis": ["황화", "chlorosis"]}

path_str = './data_preprocessed/label/'
name_str = 'train_label'
raw_df_list = [tr_normal, tr_blight, tr_wilt, tr_scorch, tr_chlorosis]


def preprocessing_and_sampling(ratio, df_list, col_selection, final_selection, common_arg_list, class_arg_dict):
    df_sel_list = []
    df_sel_sampling_list = []
    df_sel_final_list = []
    class_args_list = list(class_arg_dict.values())

    for df in df_list:
        df_sel = columns_selection(df, col_selection)
        df_sel_list.append(df_sel)
        
    for df_selected in df_sel_list:
        df_sampled = sampling_df(df_selected, ratio)
        df_sel_sampling_list.append(df_sampled)
        
    for sample, class_args in zip(df_sel_sampling_list, class_args_list):
        df_processed = label_preprocess(sample, *common_arg_list, *class_args)
        df_processed_cols = columns_selection(df_processed, final_selection)
        if df_processed_cols['class'].unique() == 'normal':
            df_processed_cols['object_status'] = 'normal'
        else:
            df_processed_cols['object_status'] = 'abnormal'

        df_sel_final_list.append(df_processed_cols)
        
    df_preprocessed = pd.concat(df_sel_final_list, axis=0)

    return df_preprocessed




In [158]:
# data selection by image name date first

train_normal = data_selection_by_date(tr_normal).copy()
train_blight = data_selection_by_date(tr_blight).copy()
train_wilt = data_selection_by_date(tr_wilt).copy()
train_scorch = data_selection_by_date(tr_scorch).copy()
train_chlorosis = data_selection_by_date(tr_chlorosis).copy()

In [159]:
# train_label_30 = preprocessing_and_sampling(ratio=0.3, df_list=raw_df_list,
#                                             col_selection=col_selection, final_selection=final_selection,
#                                             common_arg_list=common_args, class_arg_dict=class_args)

# ratio_list = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
# ratio_name_list = [20, 30, 40, 50, 60, 70, 80, 90, 100]

train_label_list = [train_normal, train_blight, train_wilt, train_scorch, train_chlorosis]
ratio_list = [0.1, 0.5, 0.7, 0.9, 1]
ratio_name_list = [10, 50, 70, 90, 100]
arg_list = [train_label_list, col_selection, final_selection, common_args, class_args]

for ratio in ratio_list:
    name = int(ratio*100)
    saving = f"./data_preprocessed/label/train_label_{name}.csv"
    train_label = preprocessing_and_sampling(ratio, *arg_list)
    train_label.to_csv(saving, encoding='utf-8', index=False)
    print(f"train_label_{name}")

<class 'pandas.core.series.Series'>
Index(['annotations', 'categories', 'images.fname', 'images.width',
       'images.height', 'images.disease_class', 'images.disease_cause_method',
       'folder_path', 'file_id', 'ti_value', 'hi_value', 'ci_value',
       'ir_value', 'tl_value', 'ei_value', 'pl_value', 'sr_value', 'cl_value',
       'el_value', 'hl_value', 'pi_value', 'rp_value'],
      dtype='object')
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Index(['annotations', 'categories', 'images.fname', 'images.width',
       'images.height', 'images.disease_class', 'images.disease_cause_method',
       'folder_path', 'file_id', 'ti_value', 'hi_value', 'ci_value',
       'ir_value', 'tl_value', 'ei_value', 'pl_value', 'sr_value', 'cl_value',
       'el_value', 'hl_value', 'pi_value', 'rp_value'],
      dtype='object')
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
In

In [34]:
# train_label_30['object_status'].value_counts()

object_status
abnormal    229683
normal      119756
Name: count, dtype: int64

In [29]:
vl_raw_normal.columns

Index(['environments', 'annotations', 'licenses', 'categories',
       'images.image_id', 'images.farm_id', 'images.crops_id', 'images.crops',
       'images.kind_type', 'images.file_path', 'images.fname', 'images.fext',
       'images.width', 'images.height', 'images.create_date',
       'images.date_captured', 'images.growth_stage', 'images.leaf',
       'images.disease_class', 'images.disease_cause',
       'images.disease_cause_method', 'images.plant_body',
       'growth_index.crops_id', 'growth_index.measured_date',
       'growth_index.stem_length', 'growth_index.leaf_cnt',
       'growth_index.leaf_width', 'growth_index.leaf_length',
       'growth_index.stem_thick', 'growth_index.bloom1_date',
       'growth_index.bloom2_date', 'growth_index.bloom3_date',
       'growth_index.fr1_cnt', 'growth_index.fr2_cnt', 'growth_index.fr3_cnt',
       'growth_index.fr_weight', 'etc_infor.crops_id', 'etc_infor.create_date',
       'etc_infor.inform', 'images.stem', 'images.grpoint', 'image

In [99]:
# test data preparation

vl_normal = data_selection_by_date(vl_raw_normal).copy()
vl_blight = data_selection_by_date(vl_raw_blight).copy()
vl_wilt = data_selection_by_date(vl_raw_wilt).copy()
vl_scorch = data_selection_by_date(vl_raw_scorch).copy()
vl_chlorosis = data_selection_by_date(vl_raw_chlorosis).copy()



In [100]:
col_selection = ['environments', 'annotations', 'categories', 'images.fname', 'images.width',
                 'images.height', 'images.disease_class', 'images.disease_cause_method', 'folder_path', 'file_id']

vl_normal_data = columns_selection(vl_normal, col_selection)
vl_blight_data = columns_selection(vl_blight, col_selection)
vl_wilt_data = columns_selection(vl_wilt, col_selection)
vl_scorch_data = columns_selection(vl_scorch, col_selection)
vl_chlorosis_data = columns_selection(vl_chlorosis, col_selection)


In [101]:
vl_normal_data['file_id'][2]

'vl_normal_3'

In [102]:
vl_normal_preprocessed = label_preprocess(vl_normal_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '정상', 'normal')
vl_blight_preprocessed = label_preprocess(vl_blight_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '역병', 'blight')
vl_wilt_preprocessed = label_preprocess(vl_wilt_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '시들음병', 'wilt')
vl_scorch_preprocessed = label_preprocess(vl_scorch_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '잎끝마름', 'scorch')
vl_chlorosis_preprocessed = label_preprocess(vl_chlorosis_data, 'environments', 'annotations', 'categories', 'env_values', 'anno_values', 'cat_values', '황화', 'chlorosis')

<class 'pandas.core.series.Series'>
Index(['annotations', 'categories', 'images.fname', 'images.width',
       'images.height', 'images.disease_class', 'images.disease_cause_method',
       'folder_path', 'file_id', 'ti_value', 'hi_value', 'ci_value',
       'ir_value', 'tl_value', 'ei_value', 'pl_value', 'sr_value', 'cl_value',
       'el_value', 'hl_value', 'pi_value', 'rp_value'],
      dtype='object')
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Index(['annotations', 'categories', 'images.fname', 'images.width',
       'images.height', 'images.disease_class', 'images.disease_cause_method',
       'folder_path', 'file_id', 'ti_value', 'hi_value', 'ci_value',
       'ir_value', 'tl_value', 'ei_value', 'pl_value', 'sr_value', 'cl_value',
       'el_value', 'hl_value', 'pi_value', 'rp_value'],
      dtype='object')
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
In

In [103]:
vl_normal_preprocessed.columns

Index(['file_name', 'image_width', 'image_height', 'class', 'cause_method',
       'folder_path', 'file_id', 'ti_value', 'hi_value', 'ci_value',
       'ei_value', 'pl_value', 'sr_value', 'cl_value', 'el_value', 'hl_value',
       'pi_value', 'bbox', 'category_id', 'disease_status', 'name', 'bbox_x',
       'bbox_y', 'bbox_width', 'bbox_height'],
      dtype='object')

In [104]:
selection = ['file_id', 'folder_path', 'file_name', 'image_width', 'image_height', 'class', 'cause_method', 'folder_path', 'ei_value', 'pl_value', 'el_value', 'pi_value',
             'bbox', 'category_id', 'disease_status', 'name', 'bbox_x', 'bbox_y', 'bbox_width', 'bbox_height']

vl_normal = vl_normal_preprocessed[selection].copy()
vl_blight = vl_blight_preprocessed[selection].copy()
vl_wilt = vl_wilt_preprocessed[selection].copy()
vl_scorch = vl_scorch_preprocessed[selection].copy()
vl_chlorosis = vl_chlorosis_preprocessed[selection].copy()

vl_normal['object_status'] = 'normal'
vl_blight['object_status'] = 'abnormal'
vl_wilt['object_status'] = 'abnormal'
vl_scorch['object_status'] = 'abnormal'
vl_chlorosis['object_status'] = 'abnormal'

test_label = pd.concat([vl_normal, vl_blight, vl_wilt, vl_scorch, vl_chlorosis], axis=0)

In [105]:
## saving csv files

test_label.to_csv('./data_preprocessed/label/test_label.csv', encoding='utf-8', index=False)

In [None]:
#####################################################################################################################

In [107]:
def bbox_size_check(df):
    check_size = df.copy()
    check_size['bbox_width_total'] = check_size['bbox_x'] + check_size['bbox_width']
    check_size['bbox_height_total'] = check_size['bbox_y'] + check_size['bbox_height']

    image_width = check_size['image_width']
    image_height = check_size['image_height']
    bbox_width = check_size['bbox_width_total']
    bbox_height = check_size['bbox_height_total']

    width_same = check_size[image_width == bbox_width]
    height_same = check_size[image_height == bbox_height]
    one_or_two = check_size[(image_width)]
    all_same = check_size[(image_width == bbox_width) & (image_height == bbox_height)]

    abnormal_nums = [len(width_same), len(height_same), len(all_same)]
    return abnormal_nums

In [96]:
test_label_abnormal = bbox_size_check(test_label)
print(test_label_abnormal)

[0, 0, 0]


In [121]:
train_20 = pd.read_csv('./data_preprocessed/label/train_label_20.csv', encoding='utf-8')
train_30 = pd.read_csv('./data_preprocessed/label/train_label_30.csv', encoding='utf-8')
train_40 = pd.read_csv('./data_preprocessed/label/train_label_40.csv', encoding='utf-8')

In [108]:
abnormal_20 = bbox_size_check(train_20)
abnormal_30 = bbox_size_check(train_30)
abnormal_40 = bbox_size_check(train_40)

print(abnormal_20)
print(abnormal_30)
print(abnormal_40)

[0, 0, 0]
[0, 0, 0]
[0, 0, 0]


In [106]:
check = train_20.copy()
bbox_width = train_20['bbox_x'] + train_20['bbox_width']
check['bbox_width_total'] = bbox_width

check.head()

Unnamed: 0,file_id,folder_path,file_name,image_width,image_height,class,cause_method,ei_value,pl_value,el_value,...,bbox,bbox_x,bbox_y,bbox_width,bbox_height,category_id,disease_status,name,object_status,bbox_width_total
0,tr_normal_47532,./data_raw/train_image/train_normal/,normal_55_008_221110165807.jpg,820,1413,normal,normal,0.07,6.2,3.2,...,"[44.04, 921.19, 292.37, 212.6]",44.04,921.19,292.37,212.6,0,N,leaf,normal,336.41
1,tr_normal_47532,./data_raw/train_image/train_normal/,normal_55_008_221110165807.jpg,820,1413,normal,normal,0.07,6.2,3.2,...,"[445.51, 605.58, 203.4, 115.27]",445.51,605.58,203.4,115.27,1,N,leaf,normal,648.91
2,tr_normal_47532,./data_raw/train_image/train_normal/,normal_55_008_221110165807.jpg,820,1413,normal,normal,0.07,6.2,3.2,...,"[44.02, 870.91, 74.7, 38.33]",44.02,870.91,74.7,38.33,2,N,stem,normal,118.72
3,tr_normal_47533,./data_raw/train_image/train_normal/,normal_55_008_221111065807.jpg,820,1413,normal,normal,0.07,6.2,3.2,...,"[150.73, 664.52, 220.6, 179.86]",150.73,664.52,220.6,179.86,0,N,leaf,normal,371.33
4,tr_normal_47533,./data_raw/train_image/train_normal/,normal_55_008_221111065807.jpg,820,1413,normal,normal,0.07,6.2,3.2,...,"[437.05, 590.71, 195.52, 112.53]",437.05,590.71,195.52,112.53,1,N,leaf,normal,632.57


In [110]:
image_width = train_20['image_width']
image_height = train_20['image_height']
x = train_20['bbox_x']
y = train_20['bbox_y']
width = train_20['bbox_width']
height = train_20['bbox_height']

width_same = train_20[image_width == (x+width)]
height_same = train_20[image_height == (y+height)]
width_height_same = train_20[(image_width == (x+width)) & (image_height == (y+height))]

print(len(width_same))
print(len(height_same))
print(len(width_height_same))
print(width_same.head(1))

0
0
0
Empty DataFrame
Columns: [file_id, folder_path, file_name, image_width, image_height, class, cause_method, ei_value, pl_value, el_value, pi_value, bbox, bbox_x, bbox_y, bbox_width, bbox_height, category_id, disease_status, name, object_status]
Index: []


In [117]:
train_20['image_size'] = image_width * image_height
train_20['bbox_size'] = (x + width) * (y + height)

same_size = train_20[train_20['image_size'] == train_20['bbox_size']]

train_20['result'] = train_20['image_size'] - train_20['bbox_size']

print(len(same_size))
print(train_20['image_size'][0])
print(train_20['bbox_size'][0])
print(train_20[train_20['result'] <= 0])


0
1158660
381418.2939
Empty DataFrame
Columns: [file_id, folder_path, file_name, image_width, image_height, class, cause_method, ei_value, pl_value, el_value, pi_value, bbox, bbox_x, bbox_y, bbox_width, bbox_height, category_id, disease_status, name, object_status, image_size, bbox_size, result]
Index: []

[0 rows x 23 columns]


In [122]:
widths = train_20['image_width'].tolist()
heights = train_20['image_height'].tolist()
b_x = train_20['bbox_x'].tolist()
b_y = train_20['bbox_y'].tolist()
b_wid = train_20['bbox_width'].tolist()
b_hi = train_20['bbox_height'].tolist()

long = []
high = []
result_w = []
result_h = []

for x, w in zip(b_x, b_wid):
    lo = x + w
    long.append(lo)

for y, h in zip(b_y, b_hi):
    hi = y + h
    high.append(hi)

for image_x, box_x in zip(widths, long):
    res = image_x - box_x
    if res <= 0:
        result_w.append(res)

for image_y, box_y in zip(heights, high):
    res_y = image_y - box_y
    if res_y <= 0:
        result_h.append(res_y)

In [123]:
print(result_w)
print(result_h)

[]
[]


In [3]:
tr_blight.columns

Index(['environments', 'annotations', 'licenses', 'categories',
       'images.image_id', 'images.farm_id', 'images.crops_id', 'images.crops',
       'images.kind_type', 'images.file_path', 'images.fname', 'images.fext',
       'images.width', 'images.height', 'images.create_date',
       'images.date_captured', 'images.growth_stage', 'images.leaf',
       'images.disease_class', 'images.disease_cause',
       'images.disease_cause_method', 'images.plant_body',
       'growth_index.crops_id', 'growth_index.measured_date',
       'growth_index.stem_length', 'growth_index.leaf_cnt',
       'growth_index.leaf_width', 'growth_index.leaf_length',
       'growth_index.stem_thick', 'growth_index.bloom1_date',
       'growth_index.bloom2_date', 'growth_index.bloom3_date',
       'growth_index.fr1_cnt', 'growth_index.fr2_cnt', 'growth_index.fr3_cnt',
       'growth_index.fr_weight', 'etc_infor.crops_id', 'etc_infor.create_date',
       'etc_infor.inform', 'images.stem', 'images.grpoint', 'image

In [6]:
blight_date = tr_blight['images.fname'].apply(lambda x: x.replace('.jpg', ''))

In [12]:
date_list = []
for i in blight_date:
    date = i[:-6]
    date_list.append(date)

df_date = pd.DataFrame(date_list)

print(df_date.head())
print(df_date.tail())
print(len(df_date))


                      0
0  blight_01_001_220718
1  blight_01_001_220718
2  blight_01_001_220718
3  blight_01_001_220719
4  blight_01_001_220719
                          0
27123  blight_95_020_221018
27124  blight_95_020_221018
27125  blight_95_020_221018
27126  blight_95_020_221018
27127  blight_95_020_221018
27128


In [13]:
df_date_process = df_date.drop_duplicates()

print(df_date_process.head())
print(df_date_process.tail())
print(len(df_date_process))

                       0
0   blight_01_001_220718
3   blight_01_001_220719
13  blight_01_001_220720
25  blight_01_001_220721
33  blight_01_001_220722
                          0
27092  blight_95_020_221014
27099  blight_95_020_221015
27106  blight_95_020_221016
27116  blight_95_020_221017
27121  blight_95_020_221018
3026


In [37]:
def date_process(df):
    target = df['images.fname']
    date_list = []
    date_cut = target.apply(lambda x: x.replace('.jpg', ''))

    for date in date_cut:
        month_day = date[:-6]
        date_list.append(month_day)
    df_month_day = pd.DataFrame({'date_proecssed':date_list})
    df_processed = pd.concat([df, df_month_day], axis=1)
    df_processed = df_processed.drop_duplicates(subset = ['date_processed'], keep='first')

    return df_processed

In [38]:
df_list = [tr_normal, tr_blight, tr_wilt, tr_scorch, tr_chlorosis]
original_len = []
processed_len = []

for df in df_list:
    len_o = len(df)
    original_len.append(len_o)

for df_p in df_list:
    len_p = len(date_process(df_p))
    processed_len.append(len_p)

print(original_len)
print(processed_len)

KeyError: Index(['date_processed'], dtype='object')

In [41]:
normal_check = tr_normal.copy()

normal_check_date = normal_check['images.fname']

date_list = []
for i in normal_check_date:
    date = i.replace('.jpg', '')
    date_cut = date[:-6]
    date_list.append(date_cut)

df_date = pd.DataFrame({'date_process': date_list})

normal = pd.concat([normal_check, df_date], axis=1)

In [42]:
normal_first = normal.drop_duplicates(subset=['date_process'], keep='first').reset_index()
normal_last = normal.drop_duplicates(subset=['date_process'], keep='last').reset_index()
normal_two = pd.concat([normal_first, normal_last], axis=0)

print(len(normal))
print(len(normal_first))
print(len(normal_last))
print(len(normal_two))


59432
7548
7548
15096


In [None]:
####################################################################################################################

In [44]:
vl_raw_normal = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_normal.csv', encoding='utf-8')
vl_raw_blight = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_blight.csv', encoding='utf-8')
vl_raw_wilt = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_wilt.csv', encoding='utf-8')
vl_raw_scorch = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_scorch.csv', encoding='utf-8')
vl_raw_chlorosis = pd.read_csv('./data_raw/test_label/merged_df/vl_raw_en_chlorosis.csv', encoding='utf-8')

tr_normal = pd.read_csv('./data_raw/train_label/merged_df/raw_en_normal.csv', encoding='utf-8')
tr_blight = pd.read_csv('./data_raw/train_label/merged_df/raw_en_blight.csv', encoding='utf-8')
tr_wilt = pd.read_csv('./data_raw/train_label/merged_df/raw_en_wilt.csv', encoding='utf-8')
tr_scorch = pd.read_csv('./data_raw/train_label/merged_df/raw_en_scorch.csv', encoding='utf-8')
tr_chlorosis = pd.read_csv('./data_raw/train_label/merged_df/raw_en_chlorosis.csv', encoding='utf-8')

  tr_normal = pd.read_csv('./data_raw/train_label/merged_df/raw_en_normal.csv', encoding='utf-8')


In [48]:
def data_selection_by_date(df):
    original = df.copy()
    original_date = original['images.fname']
    date_list = []

    for date in original_date:
        format_cut = date.replace('.jpg', '')
        month_day = format_cut[:-6]
        date_list.append(month_day)

    df_date = pd.DataFrame({'month_day': date_list})

    add_month_day = pd.concat([original, df_date], axis=1)

    first_in = add_month_day.drop_duplicates(subset=['month_day'], keep='first').reset_index()
    last_in = add_month_day.drop_duplicates(subset=['month_day'], keep='last').reset_index()

    df_sel = pd.concat([first_in, last_in], axis=0)

    return df_sel

In [49]:
normal_selected = data_selection_by_date(tr_normal)

print(len(tr_normal))
print(len(normal_selected))

59432
15096


In [133]:
#######################################################################################
# 바운딩 박스 좌표에 따른 이미지 크롭 함수 정의
    # 두 가지 접근 방식이 있으며, 앞으로의 데이터 처리 관련해서도 항상 적용되는 접근방식이니 기억해둘것.

from PIL import Image
import numpy as np


# 방법1: 함수 내부에서 결과 리스트 정의 및 데이터프레임 전체 적용

def validate_and_crop_image(image_path_list, bbox_list):

    cropped_images = []
    error_indices = []
    
    for index, (image_path, bbox) in enumerate(zip(image_path_list, bbox_list)):
        try:
            x, y, w, h = bbox

            with Image.open(image_path) as img:
                img_width, img_height = img.size
                if w > 0 and h > 0:
                    crop_x_end = min(x + w, img_width)
                    crop_y_end = min(y + h, img_height)

                    crop_width = crop_x_end - x
                    crop_height = crop_y_end - y

                    if crop_width > 0 and crop_height > 0:
                        cropped_img = img.crop((x, y, crop_x_end, crop_y_end))
                        cropped_images.append(cropped_img)
                    else:
                        raise ValueError("Invalid crop size. Crop width and height must be grater than 0.")
                else:
                    raise ValueError("Width and height of the bounding box must be grater than 0.")
                
        except Exception as e:
            print(f"Error processing image at index {index}: {e}")
            error_indices.append(index)

    cropped_images_arrays = [np.array(image) for image in cropped_images]
    return cropped_images, cropped_images_arrays, error_indices
###############################################################################################
# 방법2: 단일 행에 적용되는 함수 정의 후 데이터프레임의 행을 순회하면서 함수 적용 및 리스트에 누적 저장
    # 예외처리 구문도 함수 내부가 아니라, 해당 함수를 실행하는 실행반복문에서 사용한다.

# def validate_and_crop_image(image_path, bbox):

#     x, y, w, h = bbox

#     with Image.open(image_path) as img:
#         img_width, img_height = img.size
#         if w > 0 and h > 0:
#             crop_x_end = min(x + w, img_width)
#             crop_y_end = min(y + h, img_height)

#             crop_width = crop_x_end - x
#             crop_height = crop_y_end - y

#             if crop_width > 0 and crop_height > 0:
#                 cropped_img = img.crop((x, y, crop_x_end, crop_y_end))
#                 return cropped_img

#             else:
#                 raise ValueError("Invalid crop size. Crop width and height must be grater than 0.")
#         else:
#             raise ValueError("Width and height of the bounding box must be grater than 0.")

# cropped_images = []
# error_indices = []

# for index, row in df.iterrows():
#     image_path = os.path.join(row['folder_path'], row['file_name'])
#     bbox = (row['bbox_x'], row['bbox_y'], row['bbox_width'], row['bbox_height'])
#     try:
#         cropped_img = validate_and_crop_image(image_path, bbox)
#         cropped_images.append(cropped_img)
#     except Exception as e:
#         print(f"Error processing image at index {index}: {e}")
#         error_indices.append(index)
###############################################################################################
"""
*방법 1의 특징
    1. 데이터프레임의 모든 데이터를 한 번에 처리하고 결과를 반환할때 적합하다.
    2. 함수가 데이터프레임 전체를 한 번에 처리할 수 있도록 설계되어야 한다.
    3. 복잡한 데이터 처리나 대량의 데이터 처리에 적합하다.

*방법 2의 특징
    1. 데이터프레임의 각 행에 대해 개별적으로 처리가 필요할 때 유용하다.
    2. 각 행의 데이터를 함수에 개별적으로 전달하고 그 결과를 누적하는 방식으로 사용해야 한다.
    3. 더 세밀한 데이터 처리가 가능하지만, 대량의 데이터를 처리해야 할 때는 처리 시간이 더 오래 걸릴 수 있다(방법 1에 비해)

*결론
    *데이터 전처리, 특징 추출 같은 반복 작업 -> 방법 1이 효율적
    *데이터 분석, 모델링 단계에서 특정 조건에 따라 다르게 처리해야 하는 작업 -> 방법 2가 효율적
"""

'\n*방법 1의 특징\n    1. 데이터프레임의 모든 데이터를 한 번에 처리하고 결과를 반환할때 적합하다.\n    2. 함수가 데이터프레임 전체를 한 번에 처리할 수 있도록 설계되어야 한다.\n    3. 복잡한 데이터 처리나 대량의 데이터 처리에 적합하다.\n\n*방법 2의 특징\n    1. 데이터프레임의 각 행에 대해 개별적으로 처리가 필요할 때 유용하다.\n    2. 각 행의 데이터를 함수에 개별적으로 전달하고 그 결과를 누적하는 방식으로 사용해야 한다.\n    3. 더 세밀한 데이터 처리가 가능하지만, 대량의 데이터를 처리해야 할 때는 처리 시간이 더 오래 걸릴 수 있다(방법 1에 비해)\n\n*결론\n    *데이터 전처리, 특징 추출 같은 반복 작업 -> 방법 1이 효율적\n    *데이터 분석, 모델링 단계에서 특정 조건에 따라 다르게 처리해야 하는 작업 -> 방법 2가 효율적\n'

In [161]:
train_10 = pd.read_csv('./data_preprocessed/label/train_label_10.csv', encoding='utf-8')

image_path_list = (train_10['folder_path'] + train_10['file_name']).tolist()
train_10['bbox'] = train_10['bbox'].apply(ast.literal_eval)
bbox_list = train_10['bbox'].tolist()


In [162]:
cropped_images, cropped_images_arrays, error_indices = validate_and_crop_image(image_path_list, bbox_list)

print(len(train_10))
print(len(cropped_images))
print(len(error_indices))

28382
28382
0


In [163]:
np.savez("./data_preprocessed/part_image_arrays/cropped_10.npz", *cropped_images_arrays)

In [171]:
train_100 = pd.read_csv('./data_preprocessed/label/train_label_100.csv', encoding='utf-8')

check_sample = train_100[160000:].copy()

In [172]:
image_path_list_check = (check_sample['folder_path'] + check_sample['file_name']).tolist()
check_sample['bbox'] = check_sample['bbox'].apply(ast.literal_eval)
bbox_list_check = check_sample['bbox'].tolist()

cropped_images_check, cropped_images_arrays_check, error_indices_check = validate_and_crop_image(image_path_list_check, bbox_list_check)

print(len(check_sample))
print(len(cropped_images_check))
print(len(error_indices_check))
print(error_indices_check)

Error processing image at index 22704: Width and height of the bounding box must be grater than 0.
105501
105500
1
[22704]


In [170]:
print(len(train_10)*6)

170292
