In [169]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

In [171]:
# Read the training price table from its CSV (path is relative to the working directory).
train = pd.read_csv(filepath_or_buffer='../data/train/train.csv')
train

Unnamed: 0,시점,품목명,품종명,거래단위,등급,평년 평균가격(원),평균가격(원)
0,201801상순,건고추,화건,30 kg,상품,381666.666667,590000.0
1,201801중순,건고추,화건,30 kg,상품,380809.666667,590000.0
2,201801하순,건고추,화건,30 kg,상품,380000.000000,590000.0
3,201802상순,건고추,화건,30 kg,상품,380000.000000,590000.0
4,201802중순,건고추,화건,30 kg,상품,376666.666667,590000.0
...,...,...,...,...,...,...,...
29371,202111중순,대파,대파(일반),10키로묶음,상,0.000000,0.0
29372,202111하순,대파,대파(일반),10키로묶음,상,0.000000,0.0
29373,202112상순,대파,대파(일반),10키로묶음,상,0.000000,0.0
29374,202112중순,대파,대파(일반),10키로묶음,상,0.000000,0.0


In [173]:
# Price columns copied from every non-target series into the feature frame.
_PRICE_COLUMNS = ['평년 평균가격(원)', '평균가격(원)']

# Columns whose value combination identifies one price series within an item.
_SERIES_KEYS = ['품종명', '거래단위', '등급']

# Row predicates selecting the target price series for each supported item
# (품목명).  Each predicate receives the rows of one item and returns a boolean
# mask over them.
#
# The commented '공판장' / '도매' lines are the filter specs of two auxiliary
# sources whose merge is currently disabled; they are kept for reference only
# (see the note inside process_data).
_TARGET_FILTERS = {
    # '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['감자'], '품종명': ['수미'], '등급명': ['상']},
    # '도매': {'시장명': ['*전국도매시장'], '품목명': ['감자'], '품종명': ['수미']}
    '감자': lambda df: (df['품종명'] == '감자 수미') & (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
    # '공판장': None,
    # '도매': None
    '건고추': lambda df: (df['품종명'] == '화건') & (df['거래단위'] == '30 kg') & (df['등급'] == '상품'),
    # '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['마늘'], '품종명': ['깐마늘'], '등급명': ['상']},
    # '도매': {'시장명': ['*전국도매시장'], '품목명': ['마늘'], '품종명': ['깐마늘']}
    '깐마늘(국산)': lambda df: (df['거래단위'] == '20 kg') & (df['등급'] == '상품'),
    # '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['대파'], '품종명': ['대파(일반)'], '등급명': ['상']},
    # '도매': {'시장명': ['*전국도매시장'], '품목명': ['대파'], '품종명': ['대파(일반)']}
    '대파': lambda df: (df['품종명'] == '대파(일반)') & (df['거래단위'] == '1키로단') & (df['등급'] == '상'),
    # '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['무'], '품종명': ['기타무'], '등급명': ['상']},
    # '도매': {'시장명': ['*전국도매시장'], '품목명': ['무'], '품종명': ['무']}
    '무': lambda df: (df['거래단위'] == '20키로상자') & (df['등급'] == '상'),
    # '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배추'], '품종명': ['쌈배추'], '등급명': ['상']},
    # '도매': {'시장명': ['*전국도매시장'], '품목명': ['배추'], '품종명': ['배추']}
    '배추': lambda df: (df['거래단위'] == '10키로망대') & (df['등급'] == '상'),
    # '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['사과'], '품종명': ['후지'], '등급명': ['상']},
    # '도매': {'시장명': ['*전국도매시장'], '품목명': ['사과'], '품종명': ['후지']}
    '사과': lambda df: (df['품종명'].isin(['홍로', '후지'])) & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
    # '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['상추'], '품종명': ['청상추'], '등급명': ['상']},
    # '도매': {'시장명': ['*전국도매시장'], '품목명': ['상추'], '품종명': ['청상추']}
    '상추': lambda df: (df['품종명'] == '청') & (df['거래단위'] == '100 g') & (df['등급'] == '상품'),
    # '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['양파'], '품종명': ['기타양파'], '등급명': ['상']},
    # '도매': {'시장명': ['*전국도매시장'], '품목명': ['양파'], '품종명': ['양파(일반)']}
    '양파': lambda df: (df['품종명'] == '양파') & (df['거래단위'] == '1키로') & (df['등급'] == '상'),
    # '공판장': {'공판장명': ['*전국농협공판장'], '품목명': ['배'], '품종명': ['신고'], '등급명': ['상']},
    # '도매': {'시장명': ['*전국도매시장'], '품목명': ['배'], '품종명': ['신고']}
    '배': lambda df: (df['품종명'] == '신고') & (df['거래단위'] == '10 개') & (df['등급'] == '상품'),
}


def process_data(raw_file, 품목명, scaler=None):
    """Build the scaled feature frame for one item's target price series.

    The rows of ``품목명`` are split into the target series (chosen by the
    item's predicate in ``_TARGET_FILTERS``) and all remaining series of the
    same item.  For every remaining (품종명, 거래단위, 등급) combination, its two
    price columns are left-joined onto the target rows by ``시점`` under the
    names ``'{품종명}_{거래단위}_{등급}_{price column}'``.  Only ``시점`` and the
    numeric columns are kept, missing values become 0, and the numeric
    columns are passed through the scaler.

    Parameters
    ----------
    raw_file : pandas.DataFrame
        Already-loaded price table (despite the name, not a path).  It is
        indexed by the columns '품목명', '품종명', '거래단위', '등급', '시점' and
        the two price columns.
    품목명 : str
        Item to process; must be a key of ``_TARGET_FILTERS``.
    scaler : object, optional
        A fitted scaler exposing ``transform``.  When omitted, a new
        ``MinMaxScaler`` is fitted on this call's numeric columns.  When the
        given scaler exposes ``feature_names_in_``, the numeric columns are
        re-aligned to exactly those names and that order before transforming:
        columns missing from this data are created as 0, and columns the
        scaler never saw are dropped.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The frame with '시점' followed by the scaled numeric columns, and the
        scaler that was used (newly fitted or the one passed in).

    Raises
    ------
    KeyError
        If ``품목명`` has no entry in ``_TARGET_FILTERS``.
    """
    if 품목명 not in _TARGET_FILTERS:
        raise KeyError(
            f'unsupported 품목명 {품목명!r}; expected one of {list(_TARGET_FILTERS)}'
        )

    # NOTE: two auxiliary inputs used to be read here with pd.read_csv
    # (산지공판장_file, 전국도매_file), filtered by the commented '공판장' / '도매'
    # specs, prefixed with '공판장_' / '도매_' and left-joined on '시점'.
    # That step is disabled; only raw_file is used.

    # Split the item's rows into the target series and everything else.
    item_rows = raw_file[raw_file['품목명'] == 품목명]
    target_mask = _TARGET_FILTERS[품목명](item_rows)
    filtered_data = item_rows[target_mask]
    other_data = item_rows[~target_mask]

    # One pair of derived price columns per remaining series, in order of first
    # appearance so the column layout is stable for a given input ordering.
    series_keys = other_data[_SERIES_KEYS].drop_duplicates()
    for variety, unit, grade in series_keys.itertuples(index=False, name=None):
        same_series = (
            (other_data['품종명'] == variety)
            & (other_data['거래단위'] == unit)
            & (other_data['등급'] == grade)
        )
        derived_names = {
            col: f'{variety}_{unit}_{grade}_{col}' for col in _PRICE_COLUMNS
        }
        side_features = (
            other_data.loc[same_series, ['시점'] + _PRICE_COLUMNS]
            # A repeated 시점 in the side series would multiply target rows in
            # the join; keep the first occurrence only.
            .drop_duplicates(subset='시점')
            .rename(columns=derived_names)
        )
        # Both price columns are joined in a single merge per series.
        filtered_data = filtered_data.merge(side_features, on='시점', how='left')

    # Keep the period label plus every numeric column.  The explicit copy makes
    # the assignments below operate on an independent frame instead of a
    # slice of raw_file (avoids SettingWithCopyWarning).
    numeric_columns = list(filtered_data.select_dtypes(include=[np.number]).columns)
    filtered_data = filtered_data[['시점'] + numeric_columns].copy()
    filtered_data[numeric_columns] = filtered_data[numeric_columns].fillna(0)

    # Scaling: fit a fresh scaler, or reuse the supplied one.
    if scaler is None:
        scaler = MinMaxScaler()
        filtered_data[numeric_columns] = scaler.fit_transform(filtered_data[numeric_columns])
    else:
        fitted_names = getattr(scaler, 'feature_names_in_', None)
        if fitted_names is not None:
            # Align to the columns the scaler was fitted on so transform()
            # receives the same features in the same order.
            numeric_columns = list(fitted_names)
            filtered_data = filtered_data.reindex(
                columns=['시점'] + numeric_columns, fill_value=0
            )
        filtered_data[numeric_columns] = scaler.transform(filtered_data[numeric_columns])

    return filtered_data, scaler

In [175]:
# Build the scaled feature frame for the '건고추' item. No scaler is supplied,
# so process_data fits a new one and returns it alongside the frame.
filtered_data, scaler = process_data(raw_file=train, 품목명='건고추')
filtered_data

Unnamed: 0,시점,평년 평균가격(원),평균가격(원),햇산양건_30 kg_상품_평년 평균가격(원),햇산양건_30 kg_상품_평균가격(원),햇산화건_30 kg_중품_평년 평균가격(원),햇산화건_30 kg_중품_평균가격(원),햇산화건_30 kg_상품_평년 평균가격(원),햇산화건_30 kg_상품_평균가격(원),양건_30 kg_중품_평년 평균가격(원),양건_30 kg_중품_평균가격(원),양건_30 kg_상품_평년 평균가격(원),양건_30 kg_상품_평균가격(원),화건_30 kg_중품_평년 평균가격(원),화건_30 kg_중품_평균가격(원),햇산양건_30 kg_중품_평년 평균가격(원),햇산양건_30 kg_중품_평균가격(원)
0,201801상순,0.639284,0.380653,0.0,0.0,0.0,0.0,0.0,0.0,0.635248,0.682716,0.639342,0.668177,0.634330,0.649486,0.0,0.0
1,201801중순,0.637848,0.380653,0.0,0.0,0.0,0.0,0.0,0.0,0.634775,0.682716,0.639031,0.668177,0.632567,0.649486,0.0,0.0
2,201801하순,0.636492,0.380653,0.0,0.0,0.0,0.0,0.0,0.0,0.633500,0.682716,0.638230,0.668177,0.630580,0.649486,0.0,0.0
3,201802상순,0.636492,0.380653,0.0,0.0,0.0,0.0,0.0,0.0,0.633500,0.682716,0.638230,0.668177,0.630580,0.649486,0.0,0.0
4,201802중순,0.630909,0.380653,0.0,0.0,0.0,0.0,0.0,0.0,0.630952,0.682716,0.635940,0.668177,0.624331,0.649486,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,202111중순,0.926170,0.317865,0.0,0.0,0.0,0.0,0.0,0.0,0.952255,0.660706,0.923662,0.658832,0.940204,0.589662,0.0,0.0
140,202111하순,0.928058,0.331881,0.0,0.0,0.0,0.0,0.0,0.0,0.952739,0.660706,0.924589,0.658832,0.941692,0.596471,0.0,0.0
141,202112상순,0.936313,0.342392,0.0,0.0,0.0,0.0,0.0,0.0,0.953054,0.660706,0.928317,0.658832,0.946558,0.601579,0.0,0.0
142,202112중순,0.924589,0.342392,0.0,0.0,0.0,0.0,0.0,0.0,0.941714,0.660706,0.918130,0.658832,0.933433,0.601579,0.0,0.0


In [177]:
# Keep only the period label and the scaled average-price column as the target.
target = filtered_data.loc[:, ['시점', '평균가격(원)']]
target

Unnamed: 0,시점,평균가격(원)
0,201801상순,0.380653
1,201801중순,0.380653
2,201801하순,0.380653
3,201802상순,0.380653
4,201802중순,0.380653
...,...,...
139,202111중순,0.317865
140,202111하순,0.331881
141,202112상순,0.342392
142,202112중순,0.342392
