In [3]:
import pandas as pd
import numpy as np
import ast
import os
import shutil
import json

In [None]:
# define elements functions

In [None]:
## label data sampling and merging

def copy_files(input_path, output_path, num_files=12000):
    """Copy a sample of the ``.json`` files in ``input_path`` into ``output_path``.

    The ``.json`` entries of ``input_path`` are sorted by name and the last
    ``num_files`` of them are copied. ``output_path`` is created when it does
    not exist yet.

    Args:
        input_path: Folder that is scanned for ``.json`` files.
        output_path: Folder that receives the copies.
        num_files: Maximum number of files to copy; a value of zero or less
            copies nothing.

    Returns:
        A summary string stating how many files were copied.
    """
    os.makedirs(output_path, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # the same on every run and on every machine.
    files = sorted(f for f in os.listdir(input_path) if f.endswith('.json'))
    # files[-0:] is the whole list, so a non-positive count is handled explicitly.
    files_to_copy = files[-num_files:] if num_files > 0 else []

    for file in files_to_copy:
        shutil.copy(os.path.join(input_path, file), os.path.join(output_path, file))

    return f"{len(files_to_copy)} files coppied from {input_path} to {output_path}"

def json_to_df(folder_path):
    """Load every ``.json`` file in ``folder_path`` into one flattened DataFrame.

    Each file is parsed with ``json.load`` and flattened with
    ``pd.json_normalize``; the per-file frames are concatenated with a fresh
    index. Files are read in sorted name order so the row order is stable.
    Files that fail to decode are reported and skipped.

    Args:
        folder_path: Folder that is scanned for ``.json`` files.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when the folder
        holds no decodable ``.json`` file.
    """
    df_list = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {filename}: {e}")
            continue
        df_list.append(pd.json_normalize(data))

    if not df_list:
        # Returning an empty frame avoids referencing an unassigned variable.
        print("No valid JSON files found")
        return pd.DataFrame()

    merged_df = pd.concat(df_list, ignore_index=True)
    print('merged_df')
    return merged_df

In [None]:
## image data copy and renaming

def copy_image_files(src_path, dst_path, df):
    """Copy the ``.jpg`` files listed in ``df['images.fname']`` to ``dst_path``.

    Only entries of ``src_path`` whose name ends with ``.jpg`` and appears in
    the ``images.fname`` column are copied. ``dst_path`` is created when it
    does not exist yet.

    Args:
        src_path: Folder that is scanned for image files.
        dst_path: Folder that receives the copies.
        df: DataFrame with an ``images.fname`` column of file names.

    Returns:
        A summary string stating how many files were copied.
    """
    # A set gives constant-time lookups; the list scan was repeated for every
    # directory entry.
    wanted_names = set(df['images.fname'])
    os.makedirs(dst_path, exist_ok=True)

    files_to_copy = [
        file_src for file_src in os.listdir(src_path)
        if file_src.endswith('.jpg') and file_src in wanted_names
    ]

    for file in files_to_copy:
        shutil.copy(os.path.join(src_path, file), os.path.join(dst_path, file))

    return f"{len(files_to_copy)} files coppied from {src_path} to {dst_path}"

def rename_kor_en(df, kor, en, image_path):
    """Translate Korean image file names to English and rename the files on disk.

    In every ``images.fname`` value the substrings '딸기', '설향' and ``kor``
    are replaced by 'berry', 'snow' and ``en``. Each file in ``image_path`` is
    then renamed from its original name to the translated one; missing source
    files and already-existing targets are reported and skipped.

    The input frame is left untouched. Mutating it would make a re-run of the
    image-copy step look for the already-translated names in the source folder.

    Args:
        df: DataFrame with an ``images.fname`` column.
        kor: Korean class name to replace.
        en: English replacement for ``kor``.
        image_path: Folder that contains the image files to rename.

    Returns:
        A new DataFrame whose ``images.fname`` holds the translated names.
    """
    korean_names = df['images.fname']
    english_names = korean_names
    for korean, english in (('딸기', 'berry'), ('설향', 'snow'), (kor, en)):
        # regex=False: the names are literal text, not patterns.
        english_names = english_names.str.replace(korean, english, regex=False)

    for old_name, new_name in zip(korean_names, english_names):
        old_file_path = os.path.join(image_path, old_name)
        new_file_path = os.path.join(image_path, new_name)
        try:
            # NOTE(review): on POSIX os.rename silently replaces an existing
            # target, so the FileExistsError branch is expected on Windows only.
            os.rename(old_file_path, new_file_path)
            print(f'Renamed: {old_file_path} to {new_file_path}')
        except FileNotFoundError:
            print(f'File not found: {old_file_path}')
        except FileExistsError:
            print(f'File already exists: {new_file_path}')

    # A stale helper column left by an earlier in-place run is dropped as well.
    df_en = df.drop(columns=['fname_kor'], errors='ignore').copy()
    df_en['images.fname'] = english_names
    return df_en

In [None]:
## selecting columns from raw dataframe

def selecting_columns(df, selected_col_list):
    """Return an independent copy of ``df`` restricted to ``selected_col_list``."""
    subset = df[selected_col_list]
    return subset.copy()

In [None]:
## transforming json type dict to pandas type dict

def json_data_to_pandas(df, target_col, saving_col):
    """Parse the serialized records in ``target_col`` into Python objects.

    Every cell of ``target_col`` is converted as follows:

    * a non-string cell is taken as already parsed and kept as is;
    * a string is first read as a Python literal (``ast.literal_eval``), which
      copes with ``None`` and with apostrophes inside values;
    * if that fails, the string is read as JSON after replacing ``None`` with
      ``null`` and single quotes with double quotes.

    The input frame is not modified.

    Args:
        df: Source DataFrame.
        target_col: Column holding the serialized (or already parsed) records.
        saving_col: Name of the column that receives the parsed objects.

    Returns:
        A tuple ``(df_pandas_type, key_list)``. ``df_pandas_type`` is a copy of
        ``df`` with ``saving_col`` added and ``target_col`` removed.
        ``key_list`` holds the keys of the first dict in the first row, or is
        empty when the frame has no rows or the first row has no leading dict.
    """
    def _parse_cell(raw):
        # Decode one cell; see the function docstring for the fallback order.
        if not isinstance(raw, str):
            return raw
        try:
            return ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return json.loads(raw.replace("None", "null").replace("'", "\""))

    parsed = [_parse_cell(raw) for raw in df[target_col]]

    df_parsed = df.copy()
    # dtype=object keeps each parsed list as a single cell value.
    df_parsed[saving_col] = pd.Series(parsed, index=df.index, dtype=object)

    key_list = []
    if parsed:
        # Positional access: the frame's index need not contain the label 0.
        first_row = parsed[0]
        if isinstance(first_row, (list, tuple)) and first_row and isinstance(first_row[0], dict):
            key_list = list(first_row[0].keys())

    df_pandas_type = df_parsed.drop([target_col], axis=1)
    return df_pandas_type, key_list

In [None]:
## processing column data: environments

def extraction_env_values(df_pandas_type, key_list, target_col):
    """Collect the ``*_value`` readings of every row into one column per key.

    ``target_col`` is expected to hold, per row, an iterable of dicts. For each
    key of ``key_list`` that ends with ``_value``, the values found in the
    row's dicts are gathered into a list and stored in a column named after
    the key. Rows without that key keep ``pd.NA``.

    The passed frame is modified in place: it gains a ``value_types`` helper
    column and the per-key columns. The returned frame is a new object without
    ``target_col`` and ``value_types``.
    """
    frame = df_pandas_type
    frame['value_types'] = None
    wanted = [name for name in key_list if name.endswith('_value')]

    # Pass 1: per row, group the readings by key.
    for row_label, record in frame.iterrows():
        collected = {}
        for entry in record[target_col]:
            for name, reading in entry.items():
                if name in wanted:
                    collected.setdefault(name, []).append(reading)
        frame.at[row_label, 'value_types'] = collected

    # Pass 2: spread the grouped readings over one column per key.
    for row_label, record in frame.iterrows():
        for name, readings in record['value_types'].items():
            if name not in frame.columns:
                frame[name] = pd.NA
            frame.at[row_label, name] = readings

    result = frame.drop([target_col, 'value_types'], axis=1)
    print(result.columns)
    return result

def str_to_num(string_list):
    """Convert a list of numeric strings, or its string form, to floats.

    Args:
        string_list: A list/tuple of values, a string holding the Python
            literal of such a list, or a scalar.

    Returns:
        * a list of floats with ``None`` items removed;
        * ``None`` when the input is empty, contains only ``None`` items, or
          is a scalar missing marker (``None``, ``NaN``, ``pd.NA``);
        * a one-element list for any other scalar;
        * the unchanged input string when it is not a valid Python literal.
    """
    if isinstance(string_list, str):
        try:
            list_obj = ast.literal_eval(string_list)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError for text that is not a valid
            # expression, and ValueError for valid non-literals.
            return string_list
    else:
        list_obj = string_list

    try:
        items = list(list_obj)
    except TypeError:
        # A scalar such as None, NaN or pd.NA cannot be iterated.
        if pd.isna(list_obj):
            return None
        return [float(list_obj)]

    list_num = [float(item) for item in items if item is not None]
    return list_num if list_num else None
    
def string_to_number(df):
    """Apply ``str_to_num`` to every column whose name ends with ``_value``.

    The frame is updated in place and also returned.
    """
    for name in df.columns:
        if name.endswith('_value'):
            df[name] = df[name].apply(str_to_num)

    return df

def calculate_avg(df):
    """Replace each list of readings in the ``*_value`` columns by its mean.

    For every column whose name ends with ``_value`` the cells are reduced as
    follows: a non-empty sequence becomes ``np.mean`` of its items; ``None``,
    an empty sequence, or a scalar missing marker (``NaN``/``pd.NA``) becomes
    ``None``; any other scalar becomes ``float(value)``; a string is left
    unchanged, matching the pass-through of unparsed text in ``str_to_num``.

    The frame is updated in place and also returned. The columns keep object
    dtype.
    """
    def _mean_or_none(values):
        # Reduce one cell; see the function docstring for the rules.
        if values is None or isinstance(values, str):
            return values
        try:
            size = len(values)
        except TypeError:
            return None if pd.isna(values) else float(values)
        return np.mean(values) if size else None

    col_list = [col for col in df.columns if col.endswith('_value')]

    for value_name in col_list:
        # One pass per column instead of materializing every row per column.
        averaged = [_mean_or_none(values) for values in df[value_name]]
        df[value_name] = pd.Series(averaged, index=df.index, dtype=object)

    return df

In [None]:
## processing column data: annotations, categories

def data_split(df, target_col):
    """Explode the lists in ``target_col`` into rows and their dicts into columns.

    Each list item becomes its own row (index reset), then every item is
    expanded with ``pd.Series`` so its keys turn into columns that replace
    ``target_col``.
    """
    long_df = df.explode(target_col).reset_index(drop=True)
    expanded = long_df[target_col].apply(pd.Series)
    remainder = long_df.drop([target_col], axis=1)

    return pd.concat([remainder, expanded], axis=1)

In [None]:
## processing column data: bbox, rest data

def split_bbox(df):
    """Spread the four items of each ``bbox`` value over separate columns.

    Items 0 to 3 are written to ``bbox_x``, ``bbox_y``, ``bbox_width`` and
    ``bbox_height``. The frame is updated in place and also returned.
    """
    targets = (('bbox_x', 0), ('bbox_y', 1), ('bbox_width', 2), ('bbox_height', 3))
    for column, position in targets:
        # Bind the position as a default so each lambda keeps its own index.
        df[column] = df['bbox'].apply(lambda box, i=position: box[i])

    return df

def process_rest_values(df, class_kor, class_en):
    """Rename the image columns and translate the remaining Korean labels.

    Steps:
        1. Rename the ``images.*`` columns to ``file_name``, ``image_width``,
           ``image_height``, ``class`` and ``cause_method``.
        2. In ``cause_method`` replace the Korean terms by their English
           counterparts, in the order listed below, then ``class_kor`` by
           ``class_en``.
        3. In ``class`` replace ``class_kor`` by ``class_en``.
        4. In ``name`` replace the Korean plant-part words by English ones.
        5. Drop ``ir_value``, ``tl_value`` and ``rp_value`` when present.

    Args:
        df: Frame holding the ``images.*`` columns and a ``name`` column.
        class_kor: Korean class name to replace.
        class_en: English replacement for ``class_kor``.

    Returns:
        A new, cleaned DataFrame; ``df`` itself is not modified.
    """
    df_cleaned = df.rename(columns={'images.fname': 'file_name',
                                    'images.width': 'image_width',
                                    'images.height': 'image_height',
                                    'images.disease_class': 'class',
                                    'images.disease_cause_method': 'cause_method'})

    cause_translations = (('칼슘부족', 'low_Ca'),
                          ('질소부족', 'low_N'),
                          ('수분제한', 'water_'),
                          ('주입', '_injection'),
                          (class_kor, class_en))
    cause_method = df_cleaned['cause_method']
    for korean, english in cause_translations:
        # .str.replace leaves missing values untouched; regex=False keeps the
        # terms literal.
        cause_method = cause_method.str.replace(korean, english, regex=False)
    df_cleaned['cause_method'] = cause_method
    df_cleaned['class'] = df_cleaned['class'].str.replace(class_kor, class_en, regex=False)

    replace_values = {'잎': 'leaf', '줄기': 'stem', '과실': 'fruit', '화방': 'flower'}
    df_cleaned['name'] = df_cleaned['name'].replace(replace_values, regex=True)
    # These columns only exist when the matching readings occurred in the
    # data, so a missing one must not abort the run.
    df_cleaned = df_cleaned.drop(['ir_value', 'tl_value', 'rp_value'], axis=1, errors='ignore')
    return df_cleaned

In [None]:
# define integrated functions

In [None]:
# environments 컬럼 데이터 처리 통합 함수

def process_col_environments(df, target_col, saving_col):
    """Run the complete environments pipeline on ``df``.

    The column ``target_col`` is parsed into ``saving_col``, its ``*_value``
    readings are split into columns, converted to numbers and averaged.
    """
    parsed_df, env_keys = json_data_to_pandas(df, target_col=target_col, saving_col=saving_col)
    split_values = extraction_env_values(parsed_df, env_keys, saving_col)
    numeric_values = string_to_number(split_values)

    return calculate_avg(numeric_values)

In [None]:
# annotations, categories 컬럼 데이터 처리 통합 함수

def process_col_annotations_categories(df, target_col_1, saving_col_1, target_col_2, saving_col_2):
    """Parse and flatten the annotation and category columns of ``df``.

    The annotation column (``target_col_1``) and the category column
    (``target_col_2``) are parsed with ``json_data_to_pandas`` and exploded
    with ``data_split``. The annotation fields ``coordinates``, ``area``,
    ``isCrowd``, ``id`` and ``image_id`` are dropped, and the category ``id``
    and ``name`` are attached as ``category_id`` and ``name``.

    Args:
        df: Frame holding ``images.fname`` and both target columns.
        target_col_1: Name of the serialized annotation column.
        saving_col_1: Name used for the parsed annotation column.
        target_col_2: Name of the serialized category column.
        saving_col_2: Name used for the parsed category column.

    Returns:
        A DataFrame with one row per annotation.
    """
    # The category column is addressed through target_col_2 rather than a
    # hard-coded name, so the parameter and the column can never disagree.
    df_anno = df.drop([target_col_2], axis=1)
    df_cat = df[['images.fname', target_col_2]].copy()

    anno_pandas, _ = json_data_to_pandas(df_anno, target_col_1, saving_col_1)
    cat_pandas, _ = json_data_to_pandas(df_cat, target_col_2, saving_col_2)

    anno_split = data_split(anno_pandas, target_col=saving_col_1)
    anno_split_sel = anno_split.drop(['coordinates', 'area', 'isCrowd',
                                      'id', 'image_id'],
                                     axis=1)
    cat_split = data_split(cat_pandas, target_col=saving_col_2)
    # NOTE(review): the two exploded frames are matched by row position. This
    # presumably relies on every image listing exactly one category per
    # annotation, in the same order; confirm against the label schema.
    anno_split_sel['category_id'] = cat_split['id']
    anno_split_sel['name'] = cat_split['name']

    return anno_split_sel

In [None]:
## environments, annotations, categories 컬럼 모두 처리하는 종합 함수

def label_preprocess(df, target_0, target_1, target_2, saving_0, saving_1, saving_2, class_kor, class_en):
    """Run all label-processing stages on ``df`` and return the cleaned frame.

    Stage order: environments (``target_0``/``saving_0``), annotations and
    categories (``target_1``/``saving_1``, ``target_2``/``saving_2``), bounding
    box split, and final renaming/translation using ``class_kor``/``class_en``.
    """
    env_done = process_col_environments(df, target_0, saving_0)
    with_boxes = split_bbox(
        process_col_annotations_categories(env_done, target_1, saving_1, target_2, saving_2)
    )

    return process_rest_values(with_boxes, class_kor, class_en)

In [None]:
#####################################################################################################

In [None]:
# activate functions and check results

In [None]:
## train data

In [None]:
### label data copy and merging

# Raw train-label folders, one per class.
input_path_1 = './data_raw/train_label/TL_01.딸기_001.설향_01.정상'
input_path_2 = './data_raw/train_label/TL_01.딸기_001.설향_02.역병'
input_path_3 = './data_raw/train_label/TL_01.딸기_001.설향_03.시들음병'
input_path_4 = './data_raw/train_label/TL_01.딸기_001.설향_04.잎끝마름'
input_path_5 = './data_raw/train_label/TL_01.딸기_001.설향_05.황화'
# Folders that receive the sampled label files.
output_path_1 = './data_raw/train_label/sampling_normal'
output_path_2 = './data_raw/train_label/sampling_blight'
output_path_3 = './data_raw/train_label/sampling_wilt'
output_path_4 = './data_raw/train_label/sampling_scorch'
output_path_5 = './data_raw/train_label/sampling_chlorosis'

input_paths = [input_path_1, input_path_2, input_path_3, input_path_4, input_path_5]
output_paths = [output_path_1, output_path_2, output_path_3, output_path_4, output_path_5]
num_files = 12000

# Create the sampling folders so the cell also runs on a fresh checkout.
for _sampling_dir in output_paths:
    os.makedirs(_sampling_dir, exist_ok=True)

copy_results = [copy_files(input_path, output_path, num_files) for input_path, output_path in zip(input_paths, output_paths)]
# A bare expression in the middle of a cell is not displayed, so print the summaries.
print(*copy_results, sep='\n')

# The sampled folders are exactly the copy destinations.
folder_paths = list(output_paths)

merged_normal = json_to_df(folder_paths[0])
merged_blight = json_to_df(folder_paths[1])
merged_wilt = json_to_df(folder_paths[2])
merged_scorch = json_to_df(folder_paths[3])
merged_chlorosis = json_to_df(folder_paths[4])

In [None]:
### image data copy and renaming

# Raw train-image folders, one per class.
src_path_1 = './data_raw/train_image/TS_01.딸기_001.설향_01.정상'
src_path_2 = './data_raw/train_image/TS_01.딸기_001.설향_02.역병'
src_path_3 = './data_raw/train_image/TS_01.딸기_001.설향_03.시들음병'
src_path_4 = './data_raw/train_image/TS_01.딸기_001.설향_04.잎끝마름'
src_path_5 = './data_raw/train_image/TS_01.딸기_001.설향_05.황화'
# Folders that receive the images matching the sampled labels.
dst_path_1 = './data_raw/train_image/image_sampling_normal'
dst_path_2 = './data_raw/train_image/image_sampling_blight'
dst_path_3 = './data_raw/train_image/image_sampling_wilt'
dst_path_4 = './data_raw/train_image/image_sampling_scorch'
dst_path_5 = './data_raw/train_image/image_sampling_chlorosis'

src_paths = [src_path_1, src_path_2, src_path_3, src_path_4, src_path_5]
dst_paths = [dst_path_1, dst_path_2, dst_path_3, dst_path_4, dst_path_5]
df_list = [merged_normal, merged_blight, merged_wilt, merged_scorch, merged_chlorosis]

for src, dst, data in zip(src_paths, dst_paths, df_list):
    # Create the destination so the cell also runs on a fresh checkout.
    os.makedirs(dst, exist_ok=True)
    copy_images = copy_image_files(src, dst, data)
    # A bare expression inside a loop is never displayed, so print the summary.
    print(copy_images)

# Copies are passed so the merged frames keep their Korean file names; a
# re-run of this cell can then still match them against the source folders.
normal_en = rename_kor_en(merged_normal.copy(), kor='정상', en='normal', image_path=dst_path_1)
blight_en = rename_kor_en(merged_blight.copy(), kor='역병', en='blight', image_path=dst_path_2)
wilt_en = rename_kor_en(merged_wilt.copy(), kor='시들음병', en='wilt', image_path=dst_path_3)
scorch_en = rename_kor_en(merged_scorch.copy(), kor='잎끝마름', en='scorch', image_path=dst_path_4)
chlorosis_en = rename_kor_en(merged_chlorosis.copy(), kor='황화', en='chlorosis', image_path=dst_path_5)

In [None]:
### selecting columns

# Columns kept from the raw label frames.
col_selection = ['environments', 'annotations', 'categories', 'images.fname', 'images.width',
                 'images.height', 'images.disease_class', 'images.disease_cause_method']

# Apply the same column selection to each class frame, in class order.
normal_data, blight_data, wilt_data, scorch_data, chlorosis_data = (
    selecting_columns(class_frame, col_selection)
    for class_frame in (normal_en, blight_en, wilt_en, scorch_en, chlorosis_en)
)

In [None]:
### processing all target columns: environments, annotations, categories

# Run the full label pipeline once per class; only the input frame and the
# Korean/English class names differ between the calls.
(normal_preprocessed,
 blight_preprocessed,
 wilt_preprocessed,
 scorch_preprocessed,
 chlorosis_preprocessed) = (
    label_preprocess(class_frame,
                     'environments', 'annotations', 'categories',
                     'env_values', 'anno_values', 'cat_values',
                     class_kor, class_en)
    for class_frame, class_kor, class_en in (
        (normal_data, '정상', 'normal'),
        (blight_data, '역병', 'blight'),
        (wilt_data, '시들음병', 'wilt'),
        (scorch_data, '잎끝마름', 'scorch'),
        (chlorosis_data, '황화', 'chlorosis'),
    )
)

In [None]:
# merging data and saving into csv files

# Stack the five class frames; ignore_index=True already yields a fresh
# 0..n-1 index, so no separate reset_index call is needed.
train_data = pd.concat([normal_preprocessed,
                        blight_preprocessed,
                        wilt_preprocessed,
                        scorch_preprocessed,
                        chlorosis_preprocessed],
                        axis=0, ignore_index=True)

# to_csv does not create missing folders.
os.makedirs('./data_preprocessed', exist_ok=True)
train_data.to_csv('./data_preprocessed/train_label.csv', encoding='utf-8', index=False)


In [None]:
## test data

In [None]:
### label data copy and merging

# Raw test-label folders, one per class.
vl_input_path_1 = './data_raw/test_label/VL_01.딸기_001.설향_01.정상'
vl_input_path_2 = './data_raw/test_label/VL_01.딸기_001.설향_02.역병'
vl_input_path_3 = './data_raw/test_label/VL_01.딸기_001.설향_03.시들음병'
vl_input_path_4 = './data_raw/test_label/VL_01.딸기_001.설향_04.잎끝마름'
vl_input_path_5 = './data_raw/test_label/VL_01.딸기_001.설향_05.황화'
# Folders that receive the sampled label files.
vl_output_path_1 = './data_raw/test_label/sampling_normal'
vl_output_path_2 = './data_raw/test_label/sampling_blight'
vl_output_path_3 = './data_raw/test_label/sampling_wilt'
vl_output_path_4 = './data_raw/test_label/sampling_scorch'
vl_output_path_5 = './data_raw/test_label/sampling_chlorosis'

vl_input_paths = [vl_input_path_1, vl_input_path_2, vl_input_path_3, vl_input_path_4, vl_input_path_5]
vl_output_paths = [vl_output_path_1, vl_output_path_2, vl_output_path_3, vl_output_path_4, vl_output_path_5]
vl_num_files = 500

# Create the sampling folders so the cell also runs on a fresh checkout.
for _vl_sampling_dir in vl_output_paths:
    os.makedirs(_vl_sampling_dir, exist_ok=True)

vl_copy_results = [copy_files(input_path, output_path, vl_num_files) for input_path, output_path in zip(vl_input_paths, vl_output_paths)]
# Report the test-split summaries; the cell previously referenced the
# train-split variable copy_results here.
print(*vl_copy_results, sep='\n')

# The sampled folders are exactly the copy destinations.
vl_folder_paths = list(vl_output_paths)

vl_merged_normal = json_to_df(vl_folder_paths[0])
vl_merged_blight = json_to_df(vl_folder_paths[1])
vl_merged_wilt = json_to_df(vl_folder_paths[2])
vl_merged_scorch = json_to_df(vl_folder_paths[3])
vl_merged_chlorosis = json_to_df(vl_folder_paths[4])

In [None]:
### image data copy and renaming

# Raw test-image folders, one per class.
vl_src_path_1 = './data_raw/test_image/VS_01.딸기_001.설향_01.정상'
vl_src_path_2 = './data_raw/test_image/VS_01.딸기_001.설향_02.역병'
vl_src_path_3 = './data_raw/test_image/VS_01.딸기_001.설향_03.시들음병'
vl_src_path_4 = './data_raw/test_image/VS_01.딸기_001.설향_04.잎끝마름'
vl_src_path_5 = './data_raw/test_image/VS_01.딸기_001.설향_05.황화'
# Folders that receive the images matching the sampled labels.
vl_dst_path_1 = './data_raw/test_image/image_sampling_normal'
vl_dst_path_2 = './data_raw/test_image/image_sampling_blight'
vl_dst_path_3 = './data_raw/test_image/image_sampling_wilt'
vl_dst_path_4 = './data_raw/test_image/image_sampling_scorch'
vl_dst_path_5 = './data_raw/test_image/image_sampling_chlorosis'

vl_src_paths = [vl_src_path_1, vl_src_path_2, vl_src_path_3, vl_src_path_4, vl_src_path_5]
vl_dst_paths = [vl_dst_path_1, vl_dst_path_2, vl_dst_path_3, vl_dst_path_4, vl_dst_path_5]
vl_df_list = [vl_merged_normal, vl_merged_blight, vl_merged_wilt, vl_merged_scorch, vl_merged_chlorosis]

for src, dst, data in zip(vl_src_paths, vl_dst_paths, vl_df_list):
    # Create the destination so the cell also runs on a fresh checkout.
    os.makedirs(dst, exist_ok=True)
    vl_copy_images = copy_image_files(src, dst, data)
    # A bare expression inside a loop is never displayed, so print the summary.
    print(vl_copy_images)

# Copies are passed so the merged frames keep their Korean file names; a
# re-run of this cell can then still match them against the source folders.
vl_normal_en = rename_kor_en(vl_merged_normal.copy(), kor='정상', en='normal', image_path=vl_dst_path_1)
vl_blight_en = rename_kor_en(vl_merged_blight.copy(), kor='역병', en='blight', image_path=vl_dst_path_2)
vl_wilt_en = rename_kor_en(vl_merged_wilt.copy(), kor='시들음병', en='wilt', image_path=vl_dst_path_3)
vl_scorch_en = rename_kor_en(vl_merged_scorch.copy(), kor='잎끝마름', en='scorch', image_path=vl_dst_path_4)
vl_chlorosis_en = rename_kor_en(vl_merged_chlorosis.copy(), kor='황화', en='chlorosis', image_path=vl_dst_path_5)

In [None]:
### selecting columns

# Columns kept from the raw label frames.
col_selection = ['environments', 'annotations', 'categories', 'images.fname', 'images.width',
                 'images.height', 'images.disease_class', 'images.disease_cause_method']

# Apply the same column selection to each test class frame, in class order.
vl_normal_data, vl_blight_data, vl_wilt_data, vl_scorch_data, vl_chlorosis_data = (
    selecting_columns(class_frame, col_selection)
    for class_frame in (vl_normal_en, vl_blight_en, vl_wilt_en, vl_scorch_en, vl_chlorosis_en)
)

In [None]:
### processing all target columns: environments, annotations, categories

# Run the full label pipeline once per test class; only the input frame and
# the Korean/English class names differ between the calls.
(vl_normal_preprocessed,
 vl_blight_preprocessed,
 vl_wilt_preprocessed,
 vl_scorch_preprocessed,
 vl_chlorosis_preprocessed) = (
    label_preprocess(class_frame,
                     'environments', 'annotations', 'categories',
                     'env_values', 'anno_values', 'cat_values',
                     class_kor, class_en)
    for class_frame, class_kor, class_en in (
        (vl_normal_data, '정상', 'normal'),
        (vl_blight_data, '역병', 'blight'),
        (vl_wilt_data, '시들음병', 'wilt'),
        (vl_scorch_data, '잎끝마름', 'scorch'),
        (vl_chlorosis_data, '황화', 'chlorosis'),
    )
)

In [None]:
# merging data and saving into csv files

# Stack the five class frames; ignore_index=True already yields a fresh
# 0..n-1 index, so no separate reset_index call is needed.
test_data = pd.concat([vl_normal_preprocessed,
                        vl_blight_preprocessed,
                        vl_wilt_preprocessed,
                        vl_scorch_preprocessed,
                        vl_chlorosis_preprocessed],
                        axis=0, ignore_index=True)

# to_csv does not create missing folders.
os.makedirs('./data_preprocessed', exist_ok=True)
test_data.to_csv('./data_preprocessed/test_label.csv', encoding='utf-8', index=False)