In [111]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt, freqz
import pickle
import os
from sklearn.preprocessing import QuantileTransformer
from statsmodels.tsa.seasonal import STL
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Attention, Input
from tensorflow.keras.models import Model
from sklearn.cluster import KMeans

from sklearn.cluster import KMeans, SpectralClustering, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import MeanShift
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler
from scipy.spatial import ConvexHull

# GPU setup: report whether TensorFlow sees a GPU and, when it does, switch
# every visible GPU to memory growth (dynamic allocation).
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    print("\033[91mNo GPU detected. Running on CPU.\033[0m")
else:
    print("\033[92mGPU Detected. Configuring GPU...\033[0m")
    for device in physical_devices:
        # Dynamic memory allocation for this device.
        tf.config.experimental.set_memory_growth(device, True)

[92mGPU Detected. Configuring GPU...[0m


In [8]:
def create_first_sales_column_dict(df, save_result=False):
    """Map every (state_id, item_id) pair to the column where its data starts.

    For each row of ``df`` the column that immediately follows the row's last
    NaN is recorded. Rows that contain no NaN at all map to ``'d_1'``.

    Rows are addressed by position, so ``df`` may carry any index (filtered,
    shuffled, non-integer). Rows whose very last column is NaN have no column
    after the last NaN; they are left out of the result instead of raising
    ``IndexError``.

    Args:
        df: DataFrame with ``state_id`` and ``item_id`` columns plus the
            per-day columns (presumably ``d_1`` ... in chronological order;
            confirm against the preprocessing step).
        save_result: When True, the mapping is also pickled to
            ``../data/preprocessed/first_sales_column_dict.pkl``.

    Returns:
        dict: ``{(state_id, item_id): column_label}``. When the same key
        occurs in several rows, the last row wins.
    """
    first_sales_column_dict = {}

    # One vectorised NaN scan over the whole frame instead of one per row.
    nan_mask = df.isna().to_numpy()
    n_columns = nan_mask.shape[1]
    row_has_nan = nan_mask.any(axis=1)
    # argmax on the column-reversed mask finds the first NaN from the right,
    # i.e. the last NaN of each row (only meaningful where row_has_nan).
    last_nan_positions = n_columns - 1 - nan_mask[:, ::-1].argmax(axis=1)

    keys = zip(df['state_id'], df['item_id'])
    for key, has_nan, last_nan_position in zip(keys, row_has_nan, last_nan_positions):
        if not has_nan:
            # No NaN anywhere: data is present from the first day column.
            first_sales_column_dict[key] = 'd_1'
        elif last_nan_position + 1 < n_columns:
            # First column after the last NaN.
            first_sales_column_dict[key] = df.columns[last_nan_position + 1]
        # else: the row ends in NaN, so there is no first data column to record.

    if save_result:
        with open('../data/preprocessed/first_sales_column_dict.pkl', 'wb') as f:
            pickle.dump(first_sales_column_dict, f)

    return first_sales_column_dict

# Sales volume data (differs per item)
sales = pd.read_csv("../data/preprocessed/sales.csv")
# Sell price data (differs per item)
sell_prices = pd.read_csv("../data/preprocessed/sell_prices.csv")
# Weekend flag data (1941 days)
is_weekend = pd.read_csv("../data/preprocessed/is_weekend.csv")
# Event data (1941 days), one table per event type
(
    event_type_cultural,
    event_type_national,
    event_type_religious,
    event_type_sporting,
) = (
    pd.read_csv(path)
    for path in (
        "../data/preprocessed/event_type_cultural.csv",
        "../data/preprocessed/event_type_national.csv",
        "../data/preprocessed/event_type_religious.csv",
        "../data/preprocessed/event_type_sporting.csv",
    )
)

# First sale day per item: loaded from the cached pickle. To rebuild it, run
# first_sales_column_dict = create_first_sales_column_dict(sell_prices, save_result=True)
# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
with open('../data/preprocessed/first_sales_column_dict.pkl', 'rb') as f:
    first_sales_column_dict = pickle.load(f)


In [9]:
def align_by_first_sale_day(state_item_id, first_sales_day):
    """Slice every series so that it starts at one item's first sale day.

    Reads the module-level frames ``sales``, ``sell_prices``, ``is_weekend``
    and the four ``event_type_*`` frames.

    Args:
        state_item_id: ``(state_id, item_id)`` tuple selecting the item.
        first_sales_day: Column label from which the data is kept.

    Returns:
        dict with the keys ``sales``, ``price``, ``weekend``, ``cultural``,
        ``national``, ``religious`` and ``sporting``; each value is the array
        of entries from ``first_sales_day`` to the last column.
    """
    # State ID, item ID (referenced by name from the query string below).
    state_id, item_id = state_item_id

    # First matching row of the per-item tables.
    item_filter = "state_id == @state_id and item_id == @item_id"
    sales_df = sales.query(item_filter).iloc[0]
    price_df = sell_prices.query(item_filter).iloc[0]

    aligned = {
        "sales": sales_df.iloc[sales_df.index.get_loc(first_sales_day):].values,
        "price": price_df.iloc[price_df.index.get_loc(first_sales_day):].values,
    }

    # Calendar tables are shared by all items. The offset is looked up once in
    # is_weekend and reused for every event table.
    calendar_offset = is_weekend.columns.get_loc(first_sales_day)
    calendar_frames = (
        ("weekend", is_weekend),
        ("cultural", event_type_cultural),
        ("national", event_type_national),
        ("religious", event_type_religious),
        ("sporting", event_type_sporting),
    )
    for name, frame in calendar_frames:
        aligned[name] = frame.iloc[0, calendar_offset:].values

    return aligned

In [10]:
def create_generator_for_each_item(num_kernels, look_back_window_size, look_forward_window_size):
    """Yield sliding-window samples for every item, one item per iteration.

    Reads the module-level ``first_sales_column_dict`` and ``fourier_results``
    and aligns each item's series with ``align_by_first_sale_day``.

    Fixes relative to the earlier version:
      * calls ``align_by_first_sale_day`` (``sort_by_first_sale_day`` is not
        defined in this notebook);
      * the look-back window is widened per item only, so a long period of
        one item no longer enlarges the window of every later item;
      * the last complete window of each series is included;
      * the builtin ``input`` is no longer shadowed.

    Args:
        num_kernels: How many of the item's leading ``selected_periods`` to
            use as kernel sizes.
        look_back_window_size: Minimum number of past steps per input window;
            raised to the item's largest kernel size when that is larger.
        look_forward_window_size: Number of future sales steps per target.

    Yields:
        tuple: ``(input_data, output_data, kernel_sizes)`` where
        ``input_data`` is a list of dicts (one slice per feature),
        ``output_data`` the matching list of future ``sales`` slices and
        ``kernel_sizes`` the item's selected periods.
    """
    feature_names = ("sales", "price", "weekend", "cultural", "national", "religious", "sporting")

    for state_item_id, first_sales_day in first_sales_column_dict.items():  # per item
        data = align_by_first_sale_day(state_item_id, first_sales_day)

        # NOTE(review): kernel sizes are used as slice lengths below, so
        # 'selected_periods' is assumed to hold integers (confirm).
        kernel_sizes = list(fourier_results[state_item_id]['selected_periods'][:num_kernels])
        # Per-item look-back: at least the requested size, and never smaller
        # than the largest kernel. Also safe when kernel_sizes is empty.
        item_look_back = max([look_back_window_size, *kernel_sizes])

        # Build the sliding windows (stride 1).
        input_data = []
        output_data = []
        n_windows = len(data["sales"]) - item_look_back - look_forward_window_size + 1
        for start in range(n_windows):  # empty when the series is too short
            split = start + item_look_back
            input_data.append({name: data[name][start:split] for name in feature_names})
            output_data.append(data["sales"][split:split + look_forward_window_size])

        yield input_data, output_data, kernel_sizes  # samples of one item

In [188]:
def extract_periods_in_sales_data(sales_data, c_algorithm, n_clusters, save_plot, fourier_save_path):
    """Extract dominant periods of one item's sales series via FFT + clustering.

    Steps:
      1. FFT of ``sales_data``; magnitudes are divided by the series length.
      2. The non-negative-frequency bins are clustered on
         (frequency, magnitude) with the chosen algorithm.
      3. Each cluster is represented by its largest-magnitude bin.
      4. The signal is rebuilt as a sum of cosines from those representatives
         (non-DC terms doubled to account for the negative-frequency twin)
         and the SNR of that reconstruction is computed in dB.
      5. Periods (1 / frequency, DC excluded) and the SNR are pickled to
         ``periods_{c_algorithm}_{n_clusters-1}.pkl`` in ``fourier_save_path``.
         This happens regardless of ``save_plot``.

    Args:
        sales_data: 1-D sequence of sales values.
        c_algorithm: One of ``'KMeans'``, ``'Spectral'``, ``'DBSCAN'``,
            ``'Agglomerative'``, ``'GMM'``, ``'MeanShift'``.
        n_clusters: Cluster/component count for the algorithms that take one.
        save_plot: When True, also save the cluster scatter plot and the
            original-vs-reconstructed plot as PNG files.
        fourier_save_path: Output folder; created if missing.

    Returns:
        list: Periods of the cluster representatives, in cluster-label order.

    Raises:
        KeyError: If ``c_algorithm`` is not one of the supported names.
    """
    # The caller may pass an object-dtype array (e.g. a slice of a row that
    # also held id columns); force a numeric dtype before any math.
    sales_data = np.asarray(sales_data, dtype=float)
    n_samples = len(sales_data)

    # The periods pickle is always written, so the folder must exist.
    if fourier_save_path:
        os.makedirs(fourier_save_path, exist_ok=True)

    # Fourier transform
    spectrum = np.fft.fft(sales_data)
    frequencies = np.fft.fftfreq(n_samples)
    magnitudes = np.abs(spectrum) / n_samples
    phases = np.angle(spectrum)

    # Clustering algorithms, built lazily so only the requested estimator is
    # ever constructed.
    clustering_factories = {
        'KMeans': lambda: KMeans(n_clusters=n_clusters, random_state=42),
        'Spectral': lambda: SpectralClustering(n_clusters=n_clusters, random_state=42),
        'DBSCAN': lambda: DBSCAN(eps=0.3),
        'Agglomerative': lambda: AgglomerativeClustering(n_clusters=n_clusters),
        'GMM': lambda: GaussianMixture(n_components=n_clusters, random_state=42),
        'MeanShift': lambda: MeanShift(bandwidth=0.6),
    }
    clustering_algorithm = clustering_factories[c_algorithm]()

    # Clustering on (frequency, magnitude) of the non-negative frequencies;
    # the phase column is carried along for the reconstruction only.
    positive_mask = frequencies >= 0
    frequencies_magnitudes_phases = np.column_stack((
        frequencies[positive_mask],
        magnitudes[positive_mask],
        phases[positive_mask],
    ))
    predicts = clustering_algorithm.fit_predict(frequencies_magnitudes_phases[:, :2])
    labels = np.unique(predicts)

    # Cluster visualisation. This file is named with n_clusters, whereas the
    # reconstruction plot and the pickle below use n_clusters - 1.
    if save_plot:
        plt.figure(figsize=(12, 6))
        plt.scatter(frequencies_magnitudes_phases[:, 0], frequencies_magnitudes_phases[:, 1], c=predicts, alpha=0.6)
        plt.xlabel('Frequency')
        plt.ylabel('Magnitude')
        plt.tight_layout()
        plt.savefig(os.path.join(fourier_save_path, f"cluster_{c_algorithm}_{n_clusters}.png"))
        plt.close()

    # Representative of each cluster: the bin with the largest magnitude.
    cluster_represents = []
    for label in labels:
        members = frequencies_magnitudes_phases[predicts == label]
        frequency, magnitude, phase = members[np.argmax(members[:, 1])]
        cluster_represents.append({
            'frequency': frequency,
            'magnitude': magnitude,
            'phase': phase,
        })

    # Inverse transform restricted to the representatives.
    t = np.arange(n_samples)
    reconstructed_sales_data = np.zeros_like(sales_data, dtype=float)
    for cluster_represent in cluster_represents:
        # DC appears once in the spectrum; every other kept bin stands for
        # both its positive and its negative frequency.
        weight = 1 if cluster_represent['frequency'] == 0 else 2
        reconstructed_sales_data += weight * cluster_represent['magnitude'] * np.cos(
            2 * np.pi * cluster_represent['frequency'] * t + cluster_represent['phase']
        )

    # Reconstruction visualisation
    if save_plot:
        plt.figure(figsize=(12, 6))
        plt.plot(t, sales_data, label="Original Signal", alpha=0.7)
        plt.plot(t, reconstructed_sales_data, label="Reconstructed Signal", alpha=0.7)
        plt.xlabel("Time")
        plt.ylabel("Magnitude")
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(fourier_save_path, f"reconstruct_{c_algorithm}_{n_clusters-1}.png"))
        plt.close()

    # SNR in dB; an exact reconstruction is reported as +inf instead of
    # dividing by zero.
    original_energy = np.sum(np.abs(sales_data) ** 2)
    error_energy = np.sum(np.abs(sales_data - reconstructed_sales_data) ** 2)
    if error_energy == 0:
        snr = np.inf
    else:
        with np.errstate(divide='ignore'):
            snr = 10 * np.log10(original_energy / error_energy)

    # Periods of the representatives (DC removed)
    periods = [
        1 / cluster_represent['frequency']
        for cluster_represent in cluster_represents
        if cluster_represent['frequency'] != 0
    ]

    # Persist periods and SNR
    with open(os.path.join(fourier_save_path, f"periods_{c_algorithm}_{n_clusters-1}.pkl"), 'wb') as f:
        pickle.dump({"periods": periods, "snr": snr}, f)

    return periods

In [189]:
### Parameter settings

## Fourier transform & clustering
c_algorithm = 'KMeans'  # alternatives: 'Spectral', 'DBSCAN', 'Agglomerative', 'GMM', 'MeanShift'
n_clusters = 30 + 1  # including DC
save_plot = True

## Train/test split
test_size = 0.2

## Training
n_epochs = 300
batch_size = 64
initial_learning_rate = 5e-4
lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
    initial_learning_rate=initial_learning_rate,
    first_decay_steps=1000,
    t_mul=2.0,
    m_mul=0.9,
    alpha=1e-5,
)

## Windows & model
num_kernels = 20  # how many of the top kernels to use
look_back_window_size = 1
look_forward_window_size = 1
num_filters = 128

In [185]:
# Per-item pipeline. Only the period-extraction stage at the top of the loop
# body runs: the unconditional `continue` below skips the rest of the body on
# every iteration, so the STL / seq2seq code further down is never executed.
for idx, (state_item_id, first_sales_day) in enumerate(first_sales_column_dict.items()): # per item
    aligned_data = align_by_first_sale_day(state_item_id, first_sales_day)

    ### Sales data
    sales_data = aligned_data["sales"]

    ## Period extraction (cached per item, algorithm and cluster count)
    fourier_save_path = f"../data/fourier/{state_item_id[0]}_{state_item_id[1]}/"
    periods_file_path = os.path.join(fourier_save_path, f"periods_{c_algorithm}_{n_clusters-1}.pkl")
    if os.path.exists(periods_file_path):
        # Cache hit: reuse the periods pickled by an earlier run.
        with open(periods_file_path, 'rb') as f:
            periods_data = pickle.load(f)
            periods = periods_data["periods"]
    else:
        os.makedirs(fourier_save_path, exist_ok=True) # create the folder
        periods = extract_periods_in_sales_data(sales_data, c_algorithm, n_clusters, save_plot, fourier_save_path)


    # NOTE(review): unconditional -- everything below this line inside the
    # loop body is unreachable.
    continue






































    # NOTE(review): unreachable (see `continue` above). This section also uses
    # extract_periods_in_sales_data_with_fourier, low_cutoff_period,
    # high_cutoff_period, snr_percentile and fourier_save_plot, none of which
    # is defined in the cells visible here.
    ## Extract periods via Fourier transform
    periods_path = f"../data/fourier/{state_item_id[0]}_{state_item_id[1]}/periods.pkl"
    if os.path.exists(periods_path): 
        with open(periods_path, 'rb') as f:
            periods = pickle.load(f)
    else:
        periods = extract_periods_in_sales_data_with_fourier(
                        sales_data, 
                        low_cutoff_period=low_cutoff_period,
                        high_cutoff_period=high_cutoff_period,
                        snr_percentile=snr_percentile, 
                        fourier_save_plot=fourier_save_plot, 
                        fourier_save_path=f"../data/fourier/{state_item_id[0]}_{state_item_id[1]}/"
                    )
    
    ## Run STL decomposition repeatedly, once per extracted period
    seasonals = np.zeros_like(sales_data)
    residuals = np.zeros_like(sales_data)
    for period in periods:
        stl = STL(sales_data, period=period)
        result = stl.fit()
        print(result.seasonal.shape, result.resid.shape)
        # NOTE(review): this break precedes the two accumulation statements,
        # so seasonals and residuals would stay all-zero.
        break
        seasonals += result.seasonal
        residuals += result.resid

    ### External data
    # price_data = aligned_data["price"]
    # weekend_data = aligned_data["weekend"]
    # cultural_data = aligned_data["cultural"]
    # national_data = aligned_data["national"]
    # religious_data = aligned_data["religious"]
    # sporting_data = aligned_data["sporting"]

    # NOTE(review): both helpers are defined inside the loop body, i.e. they
    # would be re-created on every iteration.
    def create_windows(data, look_back_window_size, look_forward_window_size):
        """Cut ``data`` into sliding (input, target) windows with stride 1.

        Returns two arrays: the look-back slices and the look-forward slices
        that directly follow them. The loop runs over
        ``len(data) - look_back_window_size - look_forward_window_size``
        start positions.
        """
        input_data = []
        output_data = []
        for i in range(len(data) - look_back_window_size - look_forward_window_size):
            input_data.append(data[i:i+look_back_window_size])
            output_data.append(data[i+look_back_window_size:i+look_back_window_size+look_forward_window_size])
        return np.array(input_data), np.array(output_data)

    def build_seq2seq_model(input_shape, num_filters, look_forward_window_size):
        """Build an encoder/decoder Keras model with two inputs.

        Encoder: Conv1D -> self-Attention -> LSTM(128) whose final states
        initialise the decoder LSTM(128). The decoder reads a second input of
        shape ``(None, input_shape[1])`` and its sequence output goes through
        a linear Dense layer with ``look_forward_window_size`` units.
        """
        inputs = Input(shape=input_shape)
        x = Conv1D(filters=num_filters, kernel_size=3, padding='same', activation='relu')(inputs)
        x = Attention()([x, x])
        encoder_output = LSTM(128, return_state=True)
        encoder_outputs, state_h, state_c = encoder_output(x)
        encoder_states = [state_h, state_c]

        decoder_inputs = Input(shape=(None, input_shape[1]))
        decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
        decoder_dense = Dense(look_forward_window_size, activation='linear')
        decoder_outputs = decoder_dense(decoder_outputs)

        model = Model([inputs, decoder_inputs], decoder_outputs)
        return model

    # Create sliding windows
    # NOTE(review): these two assignments overwrite the module-level
    # look_back_window_size / look_forward_window_size set in the parameter cell.
    look_back_window_size = 30
    look_forward_window_size = 7
    input_data, output_data = create_windows(seasonals, look_back_window_size, look_forward_window_size)

    # Split data into training and testing sets (chronological, no shuffling)
    split_index = int(len(input_data) * (1 - test_size))
    X_train, X_test = input_data[:split_index], input_data[split_index:]
    y_train, y_test = output_data[:split_index], output_data[split_index:]

    # Build and compile the model
    # NOTE(review): input_shape declares a trailing feature axis of size 1;
    # confirm the arrays returned by create_windows carry that axis.
    input_shape = (look_back_window_size, 1)
    model = build_seq2seq_model(input_shape, num_filters, look_forward_window_size)
    model.compile(optimizer='adam', loss='mse')

    # Train the model (the same array is passed as encoder and decoder input)
    history = model.fit([X_train, X_train], y_train, epochs=n_epochs, batch_size=batch_size, validation_split=0.2)

    # Evaluate the model
    test_loss = model.evaluate([X_test, X_test], y_test)
    print(f"Test Loss: {test_loss}")

    # Predict and plot results
    predictions = model.predict([X_test, X_test])
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.flatten(), label='True')
    plt.plot(predictions.flatten(), label='Predicted')
    plt.legend()
    plt.show()

    # Would stop after the first item if this point were reachable.
    break
    #     # print(f"\033[92mAnalyzing\033[0m")
    #     # # Model interpretation stage
    #     # analyzer = AnalyzeModel(model, X_sales_test, X_aux_test)
    #     # analyzer.pdp(feature_index=0)  # example: PDP for the first auxiliary feature
    #     # analyzer.ice(feature_index=0)  # example: ICE for the first auxiliary feature
    #     # analyzer.feature_weights()  # print the model's learned weights

    # # # Save the model
    # # model.save(f"model_{state_item_id}.h5")

    # # # Reset the model weights
    # # model.reset_states()

    # break