In [None]:
# label preprocessing code layout

"""
*원시 json 라벨 파일들 샘플링 및 지정 폴더에 복사
*원시 json 라벨 파일들 병합 -> 단일 데이터프레임
*필요 컬럼 추출
    *images.fname;
    *images.width, height
    *environments: "_value"
    *annotations: "bbox", "id", "status" 
    *categories: "id", "name"
    *disease_class
    *disease_cause_method

*추출 컬럼 데이터 분리(각 데이터는 딕셔너리 데이터를 요소로 갖는 리스트 형태)
    *딕셔너리 변환: json type -> pandas type
        *변환 전 type: str
        *변환 후 type: list
        *변환 case1(None 없는 경우): .replace("'", "\"")
        *변환 case2(None 있는 경우): .replace("None", "null").replace("'", "\"")
            *주의: pandas에서도 {'key':'value'}, {'key': None}와 같이 (', None)로 출력되기 때문에 헷갈리기 쉽다. 출력 형태는 같아도 내부에서 인식하는 type이 다름을 유의할 것

    *딕셔너리 분리: key, value
    *value 변환(문자열 외 다른 데이터형이 필요한 경우): str -> list, number
        *str -> list변환: ast.literal_eval(arg)
        *str -> num_type변환: 대상 리스트 내에서 num_type(arg) 반복 수행(예: list_num = [float(itme) for item in list if item is not None])
        *주의
            *위의 과정으로 분리된 value가 리스트 형태일 때, 이 값의 type은 리스트가 아니라 리스트 형태의 문자열이므로, 리스트로 변환하는 작업이 필요하다.
            *각 리스트 내에서 '1.5'와 같이 숫자형 데이터가 ''안에 있다면, 숫자형 문자열이므로 숫자형 데이터로 변환하는 작업이 필요하다(자동 변환 안됨)
"""

In [4]:
import pandas as pd
import json
import numpy as np
import ast
import os
from pathlib import Path
import shutil

In [2]:
# 원시 데이터 샘플링(라벨)

input_path_1 = './data_raw/train_label/TL_01.딸기_001.설향_01.정상'
input_path_2 = './data_raw/train_label/TL_01.딸기_001.설향_02.역병'
input_path_3 = './data_raw/train_label/TL_01.딸기_001.설향_03.시들음병'
input_path_4 = './data_raw/train_label/TL_01.딸기_001.설향_04.잎끝마름'
input_path_5 = './data_raw/train_label/TL_01.딸기_001.설향_05.황화'
output_path_1 = './data_raw/train_label/sampling_normal'
output_path_2 = './data_raw/train_label/sampling_blight'
output_path_3 = './data_raw/train_label/sampling_wilt'
output_path_4 = './data_raw/train_label/sampling_scorch'
output_path_5 = './data_raw/train_label/sampling_chlorosis'

input_paths = [input_path_1, input_path_2, input_path_3, input_path_4, input_path_5]
output_paths = [output_path_1, output_path_2, output_path_3, output_path_4, output_path_5]
num_files = 12000

def copy_files(input_path, output_path, num_files=12000):
    # output폴더를 여기서 생성하게 할 경우에는 아래의 Path 기능 사용
    # Path(output_path).mkdir(parents=True, exist_ok=True)
    
    files = [f for f in os.listdir(input_path) if f.endswith('.json')]
    files_to_copy = files[-num_files:]

    for file in files_to_copy:
        shutil.copy(os.path.join(input_path, file), os.path.join(output_path, file))

    return f"{len(files_to_copy)} files coppied from {input_path} to {output_path}"

In [3]:
# 원시 데이터 샘플링 실행

copy_results = [copy_files(input_path, output_path, num_files) for input_path, output_path in zip(input_paths, output_paths)]
copy_results

['12000 files coppied from ./data_raw/train_label/TL_01.딸기_001.설향_01.정상 to ./data_raw/train_label/sampling_normal',
 '12000 files coppied from ./data_raw/train_label/TL_01.딸기_001.설향_02.역병 to ./data_raw/train_label/sampling_blight',
 '12000 files coppied from ./data_raw/train_label/TL_01.딸기_001.설향_03.시들음병 to ./data_raw/train_label/sampling_wilt',
 '12000 files coppied from ./data_raw/train_label/TL_01.딸기_001.설향_04.잎끝마름 to ./data_raw/train_label/sampling_scorch',
 '12000 files coppied from ./data_raw/train_label/TL_01.딸기_001.설향_05.황화 to ./data_raw/train_label/sampling_chlorosis']

In [4]:
# 샘플링한 원시 데이터를 데이터프레임으로 변환 및 병합

def json_to_df(folder_path):
    df_list = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith('.json'):
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                df_list.append(pd.json_normalize(data))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in file {filename}: {e}")

    if df_list:
        merged_df = pd.concat(df_list, ignore_index=True)
    else:
        print("No valid JSON files found")

    print('merged_df')
    return merged_df

In [5]:
folder_paths = [r'./data_raw/train_label/sampling_normal', r'./data_raw/train_label/sampling_blight',
                r'./data_raw/train_label/sampling_wilt', r'./data_raw/train_label/sampling_scorch',
                r'./data_raw/train_label/sampling_chlorosis']

merged_normal = json_to_df(folder_paths[0])
merged_blight = json_to_df(folder_paths[1])
merged_wilt = json_to_df(folder_paths[2])
merged_scorch = json_to_df(folder_paths[3])
merged_chlorosis = json_to_df(folder_paths[4])

merged_df
merged_df
merged_df
merged_df
merged_df


In [6]:
merged_normal.to_csv('./data_raw/train_label/merged_df/merged_raw_normal.csv', encoding='utf-8', index=False)
merged_blight.to_csv('./data_raw/train_label/merged_df/merged_raw_blight.csv', encoding='utf-8', index=False)
merged_wilt.to_csv('./data_raw/train_label/merged_df/merged_raw_wilt.csv', encoding='utf-8', index=False)
merged_scorch.to_csv('./data_raw/train_label/merged_df/merged_raw_scorch.csv', encoding='utf-8', index=False)
merged_chlorosis.to_csv('./data_raw/train_label/merged_df/merged_raw_chlorosis.csv', encoding='utf-8', index=False)

In [24]:
merged_normal = pd.read_csv('./data_raw/train_label/merged_df/merged_raw_normal.csv', encoding='utf-8')
merged_blight = pd.read_csv('./data_raw/train_label/merged_df/merged_raw_blight.csv', encoding='utf-8')
merged_wilt = pd.read_csv('./data_raw/train_label/merged_df/merged_raw_wilt.csv', encoding='utf-8')
merged_scorch = pd.read_csv('./data_raw/train_label/merged_df/merged_raw_scorch.csv', encoding='utf-8')
merged_chlorosis = pd.read_csv('./data_raw/train_label/merged_df/merged_raw_chlorosis.csv', encoding='utf-8')

In [12]:
# 원시 데이터 샘플링(이미지)
    # 샘플링한 라벨 데이터의 'images.fname'에 해당하는 이미지 파일 복사

src_path_1 = './data_raw/train_image/TS_01.딸기_001.설향_01.정상'
src_path_2 = './data_raw/train_image/TS_01.딸기_001.설향_02.역병'
src_path_3 = './data_raw/train_image/TS_01.딸기_001.설향_03.시들음병'
src_path_4 = './data_raw/train_image/TS_01.딸기_001.설향_04.잎끝마름'
src_path_5 = './data_raw/train_image/TS_01.딸기_001.설향_05.황화'
dst_path_1 = './data_raw/train_image/image_sampling_normal'
dst_path_2 = './data_raw/train_image/image_sampling_blight'
dst_path_3 = './data_raw/train_image/image_sampling_wilt'
dst_path_4 = './data_raw/train_image/image_sampling_scorch'
dst_path_5 = './data_raw/train_image/image_sampling_chlorosis'

src_paths = [src_path_1, src_path_2, src_path_3, src_path_4, src_path_5]
dst_paths = [dst_path_1, dst_path_2, dst_path_3, dst_path_4, dst_path_5]
df_list = [merged_normal, merged_blight, merged_wilt, merged_scorch, merged_chlorosis]

def copy_image_files(src_path, dst_path, df):

    file_list = list(df['images.fname'])
    files_to_copy = []
    for file_src in os.listdir(src_path):
        if (file_src in file_list) and (file_src.endswith('.jpg')):
            files_to_copy.append(file_src)

    for file in files_to_copy:
        shutil.copy(os.path.join(src_path, file), os.path.join(dst_path, file))

    return f"{len(files_to_copy)} files coppied from {src_path} to {dst_path}"

In [29]:
# 샘플링 이미지 파일명 변경(kor -> en)

def rename_kor_en(df, kor, en, image_path):
    df['fname_kor'] = df['images.fname']
    df['images.fname']=df['images.fname'].apply(lambda x: x.replace('딸기','berry'))
    df['images.fname']=df['images.fname'].apply(lambda x: x.replace('설향','snow'))
    df['images.fname']=df['images.fname'].apply(lambda x: x.replace(kor,en))

    folder_path = image_path

    for index, row in df.iterrows():
        old_file_path = os.path.join(folder_path, row['fname_kor'])
        new_file_path = os.path.join(folder_path, row['images.fname'])
        try:
            os.rename(old_file_path, new_file_path)
            print(f'Renamed: {old_file_path} to {new_file_path}')
        except FileNotFoundError:
            print(f'File not found: {old_file_path}')
        except FileExistsError:
            print(f'File already exists: {new_file_path}')

    df_en = df.drop(['fname_kor'], axis=1)
    return df_en

In [13]:
for src, dst, data in zip(src_paths, dst_paths, df_list):
    copy_images = copy_image_files(src, dst, data)
    copy_images

In [32]:
normal_en = rename_kor_en(merged_normal, kor='정상', en='normal', image_path=dst_path_1)
blight_en = rename_kor_en(merged_blight, kor='역병', en='blight', image_path=dst_path_2)
wilt_en = rename_kor_en(merged_wilt, kor='시들음병', en='wilt', image_path=dst_path_3)
scorch_en = rename_kor_en(merged_scorch, kor='잎끝마름', en='scorch', image_path=dst_path_4)
chlorosis_en = rename_kor_en(merged_chlorosis, kor='황화', en='chlorosis', image_path=dst_path_5)


Renamed: ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030095807.jpg to ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030095807.jpg
Renamed: ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030105807.jpg to ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030105807.jpg
Renamed: ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030115807.jpg to ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030115807.jpg
Renamed: ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030125807.jpg to ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030125807.jpg
Renamed: ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030145807.jpg to ./data_raw/train_image/image_sampling_normal\berry_snow_normal_55_008_221030145807.jpg
Renamed: ./data_raw/train_image/image_sampling_normal\berry_snow_norma

In [33]:
normal_en.to_csv('./data_raw/train_label/merged_df/en_raw_normal.csv', encoding='utf-8', index=False)
blight_en.to_csv('./data_raw/train_label/merged_df/en_raw_blight.csv', encoding='utf-8', index=False)
wilt_en.to_csv('./data_raw/train_label/merged_df/en_raw_wilt.csv', encoding='utf-8', index=False)
scorch_en.to_csv('./data_raw/train_label/merged_df/en_raw_scorch.csv', encoding='utf-8', index=False)
chlorosis_en.to_csv('./data_raw/train_label/merged_df/en_raw_chlorosis.csv', encoding='utf-8', index=False)

In [34]:
# 데이터프레임 정제: 필요 컬럼 추출 및 변환

raw_normal = pd.read_csv('./data_raw/train_label/merged_df/en_raw_normal.csv', encoding='utf-8')
raw_blight = pd.read_csv('./data_raw/train_label/merged_df/en_raw_blight.csv', encoding='utf-8')
raw_wilt = pd.read_csv('./data_raw/train_label/merged_df/en_raw_wilt.csv', encoding='utf-8')
raw_scorch = pd.read_csv('./data_raw/train_label/merged_df/en_raw_scorch.csv', encoding='utf-8')
raw_chlorosis = pd.read_csv('./data_raw/train_label/merged_df/en_raw_chlorosis.csv', encoding='utf-8')

In [35]:
# 필요 컬럼 추출
col_selection = ['environments', 'annotations', 'categories', 'images.fname', 'images.width',
                 'images.height', 'images.disease_class', 'images.disease_cause_method']

normal_data = raw_normal[col_selection].copy()
blight_data = raw_blight[col_selection].copy()
wilt_data = raw_wilt[col_selection].copy()
scorch_data = raw_scorch[col_selection].copy()
chlorosis_data = raw_chlorosis[col_selection].copy()

In [16]:
# 데이터 분리 및 변환: environments
    # '_value'에 해당하는 값들 분리하여 각각 컬럼으로 할당
    # environments의 id는 annotations, categories의 id와 불일치하므로 별도 처리
        # environments의 id는 모두 0~4의 값만 가짐: 각 부위에 대한 측정이 아니라, 측정 회차를 의미하는 것으로 추정됨(예: 각 샘플마다 5회 측정)
    # 모든 회차의 수치를 컬럼화 하기에는 비효율적이므로, 평균값으로 통일

def json_data_to_pandas(df, target_col, saving_col):
    df[saving_col] = None
    
    for idx, row in df.iterrows():
        json_str = row[target_col]
        str_corrected = json_str.replace("None", "null").replace("'", "\"")
        corrected_data = json.loads(str_corrected)
        df.at[idx, saving_col] = corrected_data

    dict_format = df[saving_col][0]
    extraction_keys = dict_format[0].keys()
    key_list = list(extraction_keys)
    df_pandas_type = df.drop([target_col], axis=1)

    print(type(df[saving_col]))
    return df_pandas_type, key_list

def extraction_env_values(df_pandas_type, key_list):
    df = df_pandas_type
    df['value_types'] = None
    value_names = []

    for i in range(len(key_list)):
        candidate = key_list[i]
        if candidate.endswith('_value'):
            value_names.append(candidate)

    for idx, row in df.iterrows():
        value_dict = {}
        dict_list = row['values']
        for dictionary in dict_list:
            for key, value in dictionary.items():
                if key in value_names:
                    if key not in value_dict:
                        value_dict[key] = [value]
                    else:
                        value_dict[key].append(value)
        df.at[idx, 'value_types'] = value_dict
    
    for idx, row in df.iterrows():
        value_to_split = row['value_types']
        for key, value in value_to_split.items():
            if key not in df.columns:
                df[key] = pd.NA
            df.at[idx, key] = value

    df_splited_env_values = df.drop(['values', 'value_types'], axis=1)
    print(df_splited_env_values.columns)
    return df_splited_env_values

def str_to_num(string_list):
    if isinstance(string_list, str):
        try:
            list_obj = ast.literal_eval(string_list)
        except ValueError:
            return string_list
    else:
        list_obj = string_list

    if all(item is None for item in list_obj):
        return None
    else:
        list_num = [float(item) for item in list_obj if item is not None]
        return list_num
    
def string_to_number(df):
    col_list = [col for col in df.columns if col.endswith('_value')]
    
    for col in col_list:
        df[col] = df[col].apply(str_to_num)
        
    return df

def calculate_avg(df):
    col_list = [col for col in df.columns if col.endswith('_value')]

    for value_name in col_list:
        for idx, row in df.iterrows():
            values = row[value_name]
            if values is not None:
                df.at[idx, value_name] = np.mean(values)

            else:
                df.at[idx, value_name] = None

    return df



In [17]:
normal_test, env_list = json_data_to_pandas(normal_data, target_col='environments', saving_col='values')
normal_test_env = extraction_env_values(normal_test, env_list)
normal_test_values = string_to_number(normal_test_env)
normal_test_avg = calculate_avg(normal_test_values)

<class 'pandas.core.series.Series'>
Index(['annotations', 'categories', 'images.fname', 'images.width',
       'images.height', 'images.disease_class', 'images.disease_cause_method',
       'ti_value', 'hi_value', 'ci_value', 'ir_value', 'tl_value', 'ei_value',
       'pl_value', 'sr_value', 'cl_value', 'el_value', 'hl_value', 'pi_value',
       'rp_value'],
      dtype='object')


In [18]:
normal_test_avg.to_csv('normal_avg_test.csv', encoding='utf-8', index=False)

In [134]:
# 데이터 분리 및 변환: annotations, categories
    # 사용 기능: explode(), pd.Series
        # explode(): 리스트의 각 요소(딕셔너리들)를 행으로 분리
        # pd.Series: 딕셔너리의 키를 컬럼명으로, 값을 컬럼 데이터로 분리

    # 주의사항: 대상 컬럼별로 각각 적용 후 병합해야 함
        # "원본 -> 적용 -> 적용한 df에 적용" 이 구조로 안됨(두번째 적용 시점에서 기존 데이터와 합쳐지는 문제 발생
        # 적용 방법(현재 로직에서)
            # 원본df -> 'annotations_value'에 적용(1)
            # 원본df -> 'categories_value'에 적용(2)
            # (2)데이터프레임에서 필요한 컬럼만 (1)데이터프레임에 병합

normal_anno, annotation_list = json_data_to_pandas(normal_test_avg, target_col='annotations', saving_col='annotations_value')
normal_anno_cat, categories_list = json_data_to_pandas(normal_anno, target_col='categories', saving_col='categories_value')

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [135]:
def data_split(df, target_col):
    exploded_df = df.explode(target_col).reset_index(drop=True)
    exploded_df = pd.concat([exploded_df.drop([target_col], axis=1),
                             exploded_df[target_col].apply(pd.Series)], axis=1)
    
    return exploded_df


In [137]:
anno_split = data_split(normal_anno_cat, target_col='annotations_value')

anno_split_sel = anno_split.drop(['coordinates', 'area', 'isCrowd', 'id', 'image_id', 'categories_value'], axis=1)

In [138]:
cat_split = data_split(normal_anno_cat, target_col='categories_value')

In [139]:
anno_split_sel['category_id'] = cat_split['id']
anno_split_sel['name'] = cat_split['name']

In [140]:
# bbox를 구성하는 [x, y, width, height]값을 개별 컬럼으로 분리

anno_split_sel['x'] = anno_split_sel['bbox'].apply(lambda bbox: bbox[0])
anno_split_sel['y'] = anno_split_sel['bbox'].apply(lambda bbox: bbox[1])
anno_split_sel['width'] = anno_split_sel['bbox'].apply(lambda bbox: bbox[2])
anno_split_sel['height'] = anno_split_sel['bbox'].apply(lambda bbox: bbox[3])

Unnamed: 0,images.fname,images.width,images.height,images.disease_class,images.disease_cause_method,ti_value,hi_value,ci_value,ir_value,tl_value,...,hl_value,pi_value,rp_value,category_id,disease_status,name,x,y,width,height
0,딸기_설향_정상_55_008_221030095807.jpg,820,1413,정상,정상,16.7,93.48,596.6,,,...,3.9,11.0,,0,N,잎,256.48,505.46,145.14,100.43
1,딸기_설향_정상_55_008_221030095807.jpg,820,1413,정상,정상,16.7,93.48,596.6,,,...,3.9,11.0,,1,N,잎,226.66,557.57,161.95,83.05
2,딸기_설향_정상_55_008_221030095807.jpg,820,1413,정상,정상,16.7,93.48,596.6,,,...,3.9,11.0,,2,N,줄기,312.32,638.45,107.19,79.26
3,딸기_설향_정상_55_008_221030095807.jpg,820,1413,정상,정상,16.7,93.48,596.6,,,...,3.9,11.0,,3,N,줄기,389.85,880.56,85.50,44.51
4,딸기_설향_정상_55_008_221030095807.jpg,820,1413,정상,정상,16.7,93.48,596.6,,,...,3.9,11.0,,4,N,잎,261.36,904.98,257.38,271.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,딸기_설향_정상_56_009_221021125654.jpg,764,1184,정상,정상,16.7,93.48,596.6,,,...,3.9,11.0,,3,N,잎,411.95,466.72,191.78,203.35
2196,딸기_설향_정상_56_009_221021145654.jpg,764,1184,정상,정상,16.7,93.48,596.6,,,...,3.9,11.0,,0,N,잎,430.86,469.98,220.42,200.95
2197,딸기_설향_정상_56_009_221021145654.jpg,764,1184,정상,정상,16.7,93.48,596.6,,,...,3.9,11.0,,1,N,잎,433.39,615.98,218.16,176.85
2198,딸기_설향_정상_56_009_221021145654.jpg,764,1184,정상,정상,16.7,93.48,596.6,,,...,3.9,11.0,,2,N,잎,162.84,273.70,226.07,184.05


In [141]:
anno_split_sel.to_csv('anno_cat_check.csv', encoding='cp949', index=False)

In [40]:
def label_preprocessing(df, target_col_set, saving_col_set, col_list, class_kor, class_en):
    df_sel = df[col_list].copy()      
    target_col_1 = target_col_set[0]
    target_col_2 = target_col_set[1]
    target_col_3 = target_col_set[2]
    saving_col_1 = saving_col_set[0]
    saving_col_2 = saving_col_set[1]
    saving_col_3 = saving_col_set[2]
    
    # environments 컬럼 전처리
    df_env, env_list = json_data_to_pandas(df_sel, target_col=target_col_1, saving_col = saving_col_1)
    df_env_values = extraction_env_values(df_env, env_list)
    df_env_values_num = string_to_number(df_env_values)
    df_env_avg = calculate_avg(df_env_values_num)
    df_env_processed = df_env_avg

    # annotations, categories 전처리
    df_anno, _ = json_data_to_pandas(df_env_processed, target_col=target_col_2, saving_col=saving_col_2)
    df_anno_cat, _ = json_data_to_pandas(df_anno, target_col=target_col_3, saving_col=saving_col_3)    # bbox 데이터 분리
    
    df_anno_split = data_split(df_anno_cat, target_col=saving_col_2)
    df_anno_sel = df_anno_split.drop(['coordinates', 'area', 'isCrowd', 'id', 'image_id', 'categories_value'], axis=1)
    cat_split = data_split(df_anno_cat, target_col=saving_col_3)

    df_anno_split['category_id'] = cat_split['id']
    df_anno_split['name'] = cat_split['name']

    df_env_anno_cat_processed = df_anno_split

    # bbox 분리
    df_env_anno_cat_processed['x'] = df_env_anno_cat_processed['bbox'].apply(lambda bbox: bbox[0])
    df_env_anno_cat_processed['y'] = df_env_anno_cat_processed['bbox'].apply(lambda bbox: bbox[1])
    df_env_anno_cat_processed['width'] = df_env_anno_cat_processed['bbox'].apply(lambda bbox: bbox[2])
    df_env_anno_cat_processed['height'] = df_env_anno_cat_processed['bbox'].apply(lambda bbox: bbox[3])

    # 한-영변환, 컬럼명 변환, 결측치 처리

    df_rename = df_env_anno_cat_processed.rename(columns={'images.fname':'file_name', 'images.width':'image_width', 'images.height':'image_height', 'images.disease_class':'class', 'images.disease_cause_method':'cause_method'})
    df_cleaned = df_rename.drop(['ir_value', 'tl_value', 'rp_value'])
    
    df_cleaned['cause_method']=df_cleaned['cause_method'].apply(lambda x: x.replace('칼슘부족','low_Ca'))
    df_cleaned['class']=df_cleaned['class'].apply(lambda x: x.replace(class_kor, class_en))

    replace_values = {'잎': 'leaf', '줄기': 'stem', '과실': 'fruit', '화방': 'flower'}
    df_cleaned['name'] = df_cleaned['name'].replace(replace_values, regex=True)
    
    # 최종 데이터프레임 반환
    return df_cleaned

In [38]:
raw_normal = pd.read_csv('./data_raw/train_label/merged_df/en_raw_normal.csv', encoding='utf-8')
raw_blight = pd.read_csv('./data_raw/train_label/merged_df/en_raw_blight.csv', encoding='utf-8')
raw_wilt = pd.read_csv('./data_raw/train_label/merged_df/en_raw_wilt.csv', encoding='utf-8')
raw_scorch = pd.read_csv('./data_raw/train_label/merged_df/en_raw_scorch.csv', encoding='utf-8')
raw_chlorosis = pd.read_csv('./data_raw/train_label/merged_df/en_raw_chlorosis.csv', encoding='utf-8')

In [41]:
target_col_set = ['environments', 'annotations', 'categories']
saving_col_set = ['env_values', 'anno_values', 'cat_values']
class_kor_list = ['정상', '역병', '시들음병', '잎끝마름', '황화']
class_en_list = ['normal', 'blight', 'wilt', 'scorch', 'chlorosis']
col_list = ['environments', 'annotations', 'categories', 'images.fname', 'images.width',
                 'images.height', 'images.disease_class', 'images.disease_cause_method']

processed_normal = label_preprocessing(raw_normal, target_col_set, saving_col_set, col_list, class_kor=class_kor_list[0], class_en=class_en_list[0])

<class 'pandas.core.series.Series'>


KeyError: 'values'

In [None]:
# 샘플링 이미지 파일 복사
# 한글 데이터 영문 변환
    # images.fname
    # images.disease_class
    # images.disease_cause_method
    # name

# 이미지파일명, 컬럼명 간소화
    # 이미지파일명: 클래스명_시간.jpg형태로 축약(예: 'normal_221030095807.jpg')
    # images.fname -> fname
    # images.width -> f_width
    # images.height -> f_height
    # images.disease_class -> class
    # images.disease_cause_method -> cause_method

# images.fname 원본 컬럼은 실제 이미지 파일명 변환에 사용 후 drop할 것