<a href="https://colab.research.google.com/github/hykim-1/Study/blob/main/weather_scaling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# -----------------------------
# Weather - Scaling only (X만 스케일링, 모델 없음)
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1) 데이터 로드: 'NA' 문자열을 결측으로 인식, Date를 인덱스/날짜로 파싱
df = pd.read_csv(
    "/content/drive/MyDrive/weather.csv",
    na_values=["NA"],
    parse_dates=["Date"],
)
df = df.set_index("Date")

# 2) 누설/불필요 컬럼 제거
# - RainTomorrow: 내일 비 여부(미래 정보)
# - RISK_MM: 보통 내일 비 양(미래 정보로 취급되는 경우가 많음)
# - RainToday: 오늘 비 여부 → 우리가 예측하려는 '오늘 Rainfall'에서 파생된 **타깃 누설** 가능
for col in ["RainTomorrow", "RISK_MM", "WeatherTomorrow", "weather_tomorrow", "내일날씨", "Tomorrow"]:
    if col in df.columns:
        df = df.drop(columns=[col])

# 3) 결측치 제거
print("결측치 현황(원본):\n", df.isnull().sum())
df = df.dropna()
print("dropna 후:\n", df.isnull().sum())

# 4) 범주형(object) → 라벨인코딩 (예: Location, WindGustDir, WindDir9am, WindDir3pm, RainToday 등)
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# 5) X, y 분리 (타깃 y = Rainfall; y에는 스케일링 적용하지 않음)
y = df["Rainfall"]
X = df.drop(columns=["Rainfall"])

# 6) train/test 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=12
)

# 7) X만 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # train으로만 fit
X_test_scaled  = scaler.transform(X_test)

# 8) 결과 점검/저장
print("스케일러 train 평균:", scaler.mean_)
print("스케일러 train var :", scaler.var_)
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape :", X_test_scaled.shape)

pd.DataFrame(X_train_scaled, columns=X.columns).to_csv("/content/X_train_scaled.csv", index=False)
pd.DataFrame(X_test_scaled,  columns=X.columns).to_csv("/content/X_test_scaled.csv",  index=False)
y_train.to_csv("/content/y_train.csv", index=False)
y_test.to_csv("/content/y_test.csv", index=False)


결측치 현황(원본):
 Location          0
MinTemp           0
MaxTemp           0
Rainfall          0
Evaporation       0
Sunshine          3
WindGustDir       3
WindGustSpeed     2
WindDir9am       31
WindDir3pm        1
WindSpeed9am      7
WindSpeed3pm      0
Humidity9am       0
Humidity3pm       0
Pressure9am       0
Pressure3pm       0
Cloud9am          0
Cloud3pm          0
Temp9am           0
Temp3pm           0
RainToday         0
dtype: int64
dropna 후:
 Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
dtype: int64
스케일러 train 평균: [0.00000000e+00 7.67328244e+00 2.10694656e+01 4.63969466e+00
 8.04809160e+00 6.16030534e+00 3.99847328e+01 7.0534351