# In [None]:  (notebook cell marker left over from the export)
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
# ------------------- 🔹 Load data -------------------

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 🔹 Keep geology_id aside (it is added back to the submission later)
test_ids = test["geology_id"]

# 🔹 Split train by column position: columns 1..300 are the inputs (X) and
#    columns 301..600 are the targets (y); column 0 is skipped.
X_train_full = train.iloc[:, 1:301]  # presumably positions -299 ~ 0 — confirm against the CSV header
y_train_full = train.iloc[:, 301:601]   # presumably positions 1 ~ 300 — confirm against the CSV header

# 🔹 Take the same positional slice (columns 1..300) from test as the input
X_test_full = test.iloc[:, 1:301]  # same 300 column positions as X_train_full

# ------------------- 🔹 데이터 전처리 함수 -------------------

# Locate the first observed (non-NaN) entry of a row and report its column label.
def find_k(series):
    """Return the label of the first non-NaN entry of ``series`` as an ``int``.

    The label is converted with ``int()``, so it must be an integer or an
    integer-like string such as ``"-42"``.

    Returns ``None`` when the series holds no non-NaN value at all.
    """
    first_label = series.first_valid_index()
    return None if first_label is None else int(first_label)

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from scipy.interpolate import PchipInterpolator, UnivariateSpline

# 🔹 Linear-regression predictor (the earlier linear helper under its new name)
def Linear_prediction(x_known, y_known, x_missing):
    """Fit a straight line to the known points and evaluate it at ``x_missing``.

    Args:
        x_known: 1-D numpy array of x positions with known values.
        y_known: values observed at ``x_known``.
        x_missing: 1-D numpy array of x positions to predict.

    Returns:
        Array of predictions, one per entry of ``x_missing``.
    """
    features = x_known.reshape(-1, 1)  # sklearn expects a 2-D (n_samples, 1) matrix
    regressor = LinearRegression().fit(features, y_known)
    queries = x_missing.reshape(-1, 1)
    return regressor.predict(queries)

# 🔹 PCHIP-based predictor
def PCHIP_prediction(x_known, y_known, x_missing):
    """Evaluate a PCHIP interpolant of the known points at ``x_missing``.

    The interpolator is built with ``extrapolate=True``, so positions outside
    the range of ``x_known`` are evaluated as well.

    Args:
        x_known: x positions of the known points.
        y_known: values observed at ``x_known``.
        x_missing: x positions to predict.

    Returns:
        Array of predictions, one per entry of ``x_missing``.
    """
    return PchipInterpolator(x_known, y_known, extrapolate=True)(x_missing)

# 🔹 K-nearest-neighbours regression predictor
def KNN_prediction(x_known, y_known, x_missing, k=3):
    """Predict values at ``x_missing`` with a k-nearest-neighbours regressor.

    Args:
        x_known: 1-D numpy array of x positions with known values.
        y_known: values observed at ``x_known``.
        x_missing: 1-D numpy array of x positions to predict.
        k: requested number of neighbours; capped at the number of known
            points so the regressor never asks for more neighbours than exist.

    Returns:
        Array of predictions, one per entry of ``x_missing``.
    """
    neighbour_count = min(k, len(x_known))
    regressor = KNeighborsRegressor(n_neighbors=neighbour_count)
    regressor.fit(x_known.reshape(-1, 1), y_known)
    queries = x_missing.reshape(-1, 1)
    return regressor.predict(queries)

# 🔹 UnivariateSpline-based predictor
def Spline_prediction(x_known, y_known, x_missing, s=0):
    """Evaluate a ``UnivariateSpline`` fitted to the known points at ``x_missing``.

    Args:
        x_known: x positions of the known points.
        y_known: values observed at ``x_known``.
        x_missing: x positions to predict.
        s: smoothing factor forwarded to ``UnivariateSpline``.

    Returns:
        Array of predictions, one per entry of ``x_missing``.
    """
    # ext=0 is passed through unchanged from the original call.
    return UnivariateSpline(x_known, y_known, ext=0, s=s)(x_missing)

def evaluate_smoothness(y_full, w1, w2, w3):
    """Score how rough a 1-D sequence is; lower means smoother.

    The score is a weighted sum of three statistics of the second numerical
    derivative (``np.gradient`` applied twice):

    * ``w1`` times its variance,
    * ``w2`` times its largest absolute value,
    * ``w3`` times the number of points whose absolute value exceeds 0.01.

    Args:
        y_full: 1-D sequence of values (``np.gradient`` needs at least two).
        w1, w2, w3: weights of the three terms above.

    Returns:
        The weighted roughness score.
    """
    second_derivative = np.gradient(np.gradient(y_full))
    magnitude = np.abs(second_derivative)

    variance_term = np.var(second_derivative) * w1
    peak_term = np.max(magnitude) * w2
    sharp_turn_term = np.sum(magnitude > 0.01) * w3

    return variance_term + peak_term + sharp_turn_term

# 🔹 Unified missing-value imputation
def predict_missing_values(data, w1=1, w2=0.5, w3=0.3, verbose=True):
    """Fill the leading run of NaNs in columns 1..300 of every row.

    For each row, the observed (non-NaN) values found in columns 1..300 are
    used to fit three predictors (linear regression, PCHIP, spline).  Every
    position before the first observed value then receives the candidate
    whose value, prepended to the observed values, yields the lowest
    ``evaluate_smoothness`` score.

    Each missing cell is predicted from the observed values only; cells filled
    earlier are not fed back into later predictions.  NaNs located after the
    first observed value are left untouched, as are rows with no observed
    value and rows whose first column is already observed.

    Args:
        data: DataFrame whose columns 1..300 hold numeric values and whose
            column labels for that range can be converted to ``float``.
        w1, w2, w3: weights forwarded to ``evaluate_smoothness``.
        verbose: when true (the default), print one line per filled cell
            naming the method that was chosen.

    Returns:
        A copy of ``data`` with the filled cells; ``data`` is not modified.
    """
    filled_data = data.copy()
    predictors = {
        "linear": Linear_prediction,
        "pchip": PCHIP_prediction,
        "spline": Spline_prediction,
    }

    for i in range(data.shape[0]):
        # A row sliced out of a mixed-dtype frame can come back as object
        # dtype; force float so every predictor receives a numeric array.
        row_data = data.iloc[i, 1:301].astype(float)
        observed = row_data.notna().to_numpy()
        if not observed.any():
            continue  # nothing to extrapolate from

        first_known = int(observed.argmax())
        if first_known == 0:
            continue  # no leading gap in this row

        x_all = row_data.index.astype(float).to_numpy()
        x_known_idx = x_all[observed]
        y_known = row_data.to_numpy()[observed]
        x_missing_idx = x_all[:first_known]

        # The known points are the same for every missing cell of the row, so
        # each model is fitted once and evaluated at all missing positions.
        candidates = {}
        for method, predictor in predictors.items():
            try:
                candidates[method] = np.asarray(
                    predictor(x_known_idx, y_known, x_missing_idx), dtype=float
                )
            except Exception:
                # Best effort: a method that cannot handle this row (for
                # example too few known points) is simply left out.
                continue

        # Walk from the cell next to the first observed value back to column 1.
        for j in range(first_known - 1, -1, -1):
            best_method = None
            best_score = float("inf")
            for method, y_pred in candidates.items():
                # Judge the candidate by the roughness of "candidate + observed values".
                trial = np.concatenate(([y_pred[j]], y_known))
                score = evaluate_smoothness(trial, w1, w2, w3)
                if score < best_score:
                    best_score = score
                    best_method = method

            if best_method is not None:
                # +1 because row_data starts at column 1 of the frame.
                filled_data.iat[i, j + 1] = candidates[best_method][j]
                if verbose:
                    print(f"Row {i}, Col {row_data.index[j]} -> {best_method}")

    return filled_data

# ------------------- 🔹 Save the preprocessing results -------------------

# 🔹 Impute the missing values of the train data and save the result
filled_train = predict_missing_values(train)
filled_train.to_csv("filled_train2.csv", index=False)
# The message names the file that was actually written above.
print("결측치 보완 완료! 'filled_train2.csv' 파일로 저장되었습니다.")

# 🔹 Impute the missing values of the test data and save the result
filled_test = predict_missing_values(test)
filled_test.to_csv("filled_test2.csv", index=False)
# The message names the file that was actually written above.
print("결측치 보완 완료! 'filled_test2.csv' 파일로 저장되었습니다.")

# 🔹 Drop the first column (geology_id) and keep columns 1..300 as model input
X_train_clean = filled_train.iloc[:, 1:301]  # geology_id excluded
X_test_clean = filled_test.iloc[:, 1:301]  # geology_id excluded
# ------------------- 🔹 Model definition and training -------------------
# 🔹 Normalisation: the scaler is fitted on the train inputs only and the
#    same fitted scaler is then applied to both train and test.
scaler = MinMaxScaler()
scaler.fit(X_train_clean)
X_train_scaled = scaler.transform(X_train_clean)
X_test_scaled = scaler.transform(X_test_clean)

# Add a trailing feature axis: (rows, 300) -> (rows, 300, 1)
X_train_seq = np.reshape(X_train_scaled, (-1, 300, 1))
X_test_seq = np.reshape(X_test_scaled, (-1, 300, 1))

from tensorflow.keras.callbacks import EarlyStopping

# EarlyStopping callback: monitors the training loss with a patience of 20
# epochs and restores the best weights.
early_stop = EarlyStopping(
    monitor="loss",
    patience=20,
    restore_best_weights=True,
    verbose=1,
)

# Model factory: one 256-unit GRU layer followed by two dense layers
def build_gru(input_shape=(300, 1)):
    """Build and compile the GRU regression model.

    Layers, in order: ``GRU(256)`` returning only its last output,
    ``Dense(256, relu)`` and ``Dense(300)``.  The model is compiled with the
    Adam optimizer and mean-squared-error loss.

    Args:
        input_shape: shape of one input sample, passed to the GRU layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(tf.keras.layers.GRU(256, return_sequences=False, input_shape=input_shape))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(300))
    model.compile(optimizer='adam', loss='mse')
    return model

# Instantiate the model
model_gru = build_gru()

# Train with the EarlyStopping callback
model_gru.fit(
    x=X_train_seq,
    y=y_train_full.to_numpy(),
    batch_size=16,
    epochs=200,
    verbose=1,
    callbacks=[early_stop],
)

# Predict on the test sequences
pred_gru = model_gru.predict(X_test_seq)

# Only one model is trained here, so the "ensemble" is just the GRU output
y_test_pred = pred_gru

# ------------------- 🔹 Build the submission -------------------

# Column layout: "1".."300" followed by r_1_pos_1 .. r_9_pos_300
columns_new = [str(pos) for pos in range(1, 301)]
columns_new.extend(
    f"r_{r}_pos_{pos}" for r in range(1, 10) for pos in range(1, 301)
)

# Right-pad the prediction matrix with NaN up to the full column count
missing_cols = len(columns_new) - y_test_pred.shape[1]
if missing_cols > 0:
    y_test_pred = np.pad(
        y_test_pred,
        ((0, 0), (0, missing_cols)),
        constant_values=np.nan,
    )

submission = pd.DataFrame(y_test_pred, columns=columns_new)
submission.insert(0, "geology_id", test_ids)

# Copy the first 300 value columns into each of the nine following
# 300-column groups (column 0 is geology_id, hence the offset of 1).
copied_part = submission.iloc[:, 1:301].to_numpy(copy=True)
for group in range(1, 10):
    start = 1 + 300 * group
    submission.iloc[:, start:start+300] = copied_part

submission.to_csv("voting_unit_submission.csv", index=False)
print("✅ voting_unit_submission.csv 생성됨")