In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor

In [55]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 함수

In [56]:
# class_interval 범주화 함수
def convert_to_class_interval(value):
    if value < 0.1:
        return 0
    elif 0.1 <= value < 0.2:
        return 1
    elif 0.2 <= value < 0.5:
        return 2
    elif 0.5 <= value < 1.0:
        return 3
    elif 1.0 <= value < 2.0:
        return 4
    elif 2.0 <= value < 5.0:
        return 5
    elif 5.0 <= value < 10.0:
        return 6
    elif 10.0 <= value < 20.0:
        return 7
    elif 20.0 <= value < 30.0:
        return 8
    else:
        return 9

#csi 계산
def calculate_csi(y_true, y_pred):
    hits = ((y_true == y_pred) & (y_true > 0)).sum()
    false_alarms = ((y_true != y_pred) & (y_pred > 0)).sum()
    misses = ((y_true != y_pred) & (y_true > 0) & (y_pred == 0)).sum()
    csi = hits / (hits + false_alarms + misses)
    return csi

#달에 가중치 추가
def month_w(value):
    if value == 8:
        return value * 10000
    elif value == 7:
        return value * 1000
    elif value == 9:
        return value * 100
    elif value == 6:
        return value * 10
    elif value == 10:
        return value * 1
    elif value == 5:
        return value * 0.1
    else:
        return value

# 계절 매핑 함수
def get_season(month):
    if month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    elif month in [9, 10, 11]:
        return 'autumn'
    else:
        return 'winter'

# XGBoostRegressor


In [62]:
#기본세팅
df_train=pd.read_csv('/content/drive/MyDrive/BDA_2024 날씨 빅데이터 콘테스트/rainfall_train.csv')

# class_interval 범주화 함수
def convert_to_class_interval(value):
    if value < 0.1:
        return 0
    elif 0.1 <= value < 0.2:
        return 1
    elif 0.2 <= value < 0.5:
        return 2
    elif 0.5 <= value < 1.0:
        return 3
    elif 1.0 <= value < 2.0:
        return 4
    elif 2.0 <= value < 5.0:
        return 5
    elif 5.0 <= value < 10.0:
        return 6
    elif 10.0 <= value < 20.0:
        return 7
    elif 20.0 <= value < 30.0:
        return 8
    else:
        return 9

#csi 계산
def calculate_csi(y_true, y_pred):
    hits = ((y_true == y_pred) & (y_true > 0)).sum()
    false_alarms = ((y_true != y_pred) & (y_pred > 0)).sum()
    misses = ((y_true != y_pred) & (y_true > 0) & (y_pred == 0)).sum()
    csi = hits / (hits + false_alarms + misses)
    return csi

#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train = df_train[df_train['class_interval'] != -999]

mapping = {'A': 2021, 'B': 2022, 'C': 2023}
df_train['fc_year'] = df_train['fc_year'].map(mapping)
df_train['ef_year'] = df_train['ef_year'].map(mapping)

# 계절 매핑 함수
def get_season(month):
    if month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    elif month in [9, 10, 11]:
        return 'autumn'
    else:
        return 'winter'

# fc_month를 계절로 변환하여 season 컬럼 추가
df_train['season'] = df_train['fc_month'].apply(get_season)

# 계절을 더미 변수로 변환 (One-Hot Encoding)
season_dummies_train = pd.get_dummies(df_train['season'], prefix='season')
df_train = pd.concat([df_train, season_dummies_train], axis=1)

# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

#표준편차
df_train['prob_std'] = df_train[['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']].std(axis=1)

# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_train['vclass']=df_train['vclass1']+df_train['vclass2']*10+df_train['vclass3']*100

df_train = df_train.drop(columns=['stn4contest', 'season', 'v07', 'v08', 'v09', 'fc_year', 'fc_month'])

X = df_train.drop(columns=['vv','class_interval'])
y = df_train['class_interval']

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# 하이퍼파라미터 그리드 설정
param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9, 12],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 5, 10]
}

# RandomizedSearchCV 설정
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', random_state=42)

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# 하이퍼파라미터 튜닝 수행
random_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터 출력
best_params = random_search.best_params_
print("Best Parameters:", best_params)

# 최적의 하이퍼파라미터로 모델 학습
best_xgb = random_search.best_estimator_
best_xgb.fit(X_train, y_train)

# 예측
y_pred_train_best_xgb = best_xgb.predict(X_train)
y_pred_test_best_xgb = best_xgb.predict(X_test)

# 학습 데이터 평가
mse_train_best_xgb = mean_squared_error(y_train, y_pred_train_best_xgb)
r2_train_best_xgb = r2_score(y_train, y_pred_train_best_xgb)

# 테스트 데이터 평가
mse_test_best_xgb = mean_squared_error(y_test, y_pred_test_best_xgb)
r2_test_best_xgb = r2_score(y_test, y_pred_test_best_xgb)

print("Best XGBoost Regression")
print(f"Train MSE: {mse_train_best_xgb}, Train R2: {r2_train_best_xgb}")
print(f"Test MSE: {mse_test_best_xgb}, Test R2: {r2_test_best_xgb}")


# 실제값과 예측값을 class_interval로 변환
y_test_class = np.array([convert_to_class_interval(value) for value in y_test])
y_pred_class = np.array([convert_to_class_interval(value) for value in y_pred_test_best_xgb])


# CSI 계산
csi_value = calculate_csi(y_test_class, y_pred_class)
print(f"CSI: {csi_value}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['fc_year'] = df_train['fc_year'].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['ef_year'] = df_train['ef_year'].map(mapping)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


KeyboardInterrupt: 

## Version1. CSI : 0.05093704041713412
- 하이퍼파라미터
- 공통파일에서 변수 활용
- CSI 계산법 수정

In [73]:
#기본세팅
df_train=pd.read_csv('/content/drive/MyDrive/BDA_2024 날씨 빅데이터 콘테스트/rainfall_train.csv')

# class_interval 범주화 함수
def convert_to_class_interval(value):
    if value < 0.1:
        return 0
    elif 0.1 <= value < 0.2:
        return 1
    elif 0.2 <= value < 0.5:
        return 2
    elif 0.5 <= value < 1.0:
        return 3
    elif 1.0 <= value < 2.0:
        return 4
    elif 2.0 <= value < 5.0:
        return 5
    elif 5.0 <= value < 10.0:
        return 6
    elif 10.0 <= value < 20.0:
        return 7
    elif 20.0 <= value < 30.0:
        return 8
    else:
        return 9

#csi 계산
def calculate_csi(y_true, y_pred):
    hits = ((y_true == y_pred) & (y_true > 0)).sum()
    false_alarms = ((y_true != y_pred) & (y_pred > 0)).sum()
    misses = ((y_true != y_pred) & (y_true > 0) & (y_pred == 0)).sum()
    csi = hits / (hits + false_alarms + misses)
    return csi

#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train = df_train[df_train['class_interval'] != -999]

mapping = {'A': 2021, 'B': 2022, 'C': 2023}
df_train['fc_year'] = df_train['fc_year'].map(mapping)
df_train['ef_year'] = df_train['ef_year'].map(mapping)

# 계절 매핑 함수
def get_season(month):
    if month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    elif month in [9, 10, 11]:
        return 'autumn'
    else:
        return 'winter'

# fc_month를 계절로 변환하여 season 컬럼 추가
df_train['season'] = df_train['fc_month'].apply(get_season)

# 계절을 더미 변수로 변환 (One-Hot Encoding)
season_dummies_train = pd.get_dummies(df_train['season'], prefix='season')
df_train = pd.concat([df_train, season_dummies_train], axis=1)

# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

#표준편차
df_train['prob_std'] = df_train[['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']].std(axis=1)

# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_train['vclass']=df_train['vclass1']+df_train['vclass2']*10+df_train['vclass3']*100

df_train = df_train.drop(columns=['stn4contest', 'season', 'v07', 'v08', 'v09', 'fc_year', 'fc_month'])

X = df_train.drop(columns=['vv','class_interval'])
y = df_train['vv']

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

# 모델 생성 및 학습
best_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',  # GPU 사용 설정
    device='cuda',  # GPU 예측기 설정
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=10,
    colsample_bytree=0.8,
    subsample=1.0,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42
)
best_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=True)

# 예측
y_pred_train_best_xgb = best_xgb.predict(X_train)
y_pred_test_best_xgb = best_xgb.predict(X_test)

# 학습 데이터 평가
mse_train_best_xgb = mean_squared_error(y_train, y_pred_train_best_xgb)
r2_train_best_xgb = r2_score(y_train, y_pred_train_best_xgb)

# 테스트 데이터 평가
mse_test_best_xgb = mean_squared_error(y_test, y_pred_test_best_xgb)
r2_test_best_xgb = r2_score(y_test, y_pred_test_best_xgb)

print("Best XGBoost Regression")
print(f"Train MSE: {mse_train_best_xgb}, Train R2: {r2_train_best_xgb}")
print(f"Test MSE: {mse_test_best_xgb}, Test R2: {r2_test_best_xgb}")

#실제값과 예측값을 class_interval로 변환
y_test_class = np.array([convert_to_class_interval(value) for value in y_test])
y_pred_class = np.array([convert_to_class_interval(value) for value in y_pred_test_best_xgb])

# CSI 계산
csi_value = calculate_csi(y_test_class, y_pred_class)
print(f"CSI: {csi_value}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['fc_year'] = df_train['fc_year'].map(mapping)


[0]	validation_0-rmse:4.33478
[1]	validation_0-rmse:4.25416
[2]	validation_0-rmse:4.18647
[3]	validation_0-rmse:4.13265
[4]	validation_0-rmse:4.09131
[5]	validation_0-rmse:4.04843
[6]	validation_0-rmse:4.01629
[7]	validation_0-rmse:3.99111
[8]	validation_0-rmse:3.96443
[9]	validation_0-rmse:3.94069
[10]	validation_0-rmse:3.92797
[11]	validation_0-rmse:3.91418
[12]	validation_0-rmse:3.90065
[13]	validation_0-rmse:3.88929
[14]	validation_0-rmse:3.88072
[15]	validation_0-rmse:3.87059
[16]	validation_0-rmse:3.86098
[17]	validation_0-rmse:3.85533
[18]	validation_0-rmse:3.84592
[19]	validation_0-rmse:3.83782
[20]	validation_0-rmse:3.83150
[21]	validation_0-rmse:3.82784
[22]	validation_0-rmse:3.82615
[23]	validation_0-rmse:3.82464
[24]	validation_0-rmse:3.82025
[25]	validation_0-rmse:3.81854
[26]	validation_0-rmse:3.81349
[27]	validation_0-rmse:3.81035
[28]	validation_0-rmse:3.80412
[29]	validation_0-rmse:3.80377
[30]	validation_0-rmse:3.80124
[31]	validation_0-rmse:3.79914
[32]	validation_0-

## CSI: 0.052012989518740706
- 예은님이 주신 month 가중치 추가


In [74]:
#기본세팅
df_train=pd.read_csv('/content/drive/MyDrive/BDA_2024 날씨 빅데이터 콘테스트/rainfall_train.csv')

# class_interval 범주화 함수
def convert_to_class_interval(value):
    if value < 0.1:
        return 0
    elif 0.1 <= value < 0.2:
        return 1
    elif 0.2 <= value < 0.5:
        return 2
    elif 0.5 <= value < 1.0:
        return 3
    elif 1.0 <= value < 2.0:
        return 4
    elif 2.0 <= value < 5.0:
        return 5
    elif 5.0 <= value < 10.0:
        return 6
    elif 10.0 <= value < 20.0:
        return 7
    elif 20.0 <= value < 30.0:
        return 8
    else:
        return 9

#csi 계산
def calculate_csi(y_true, y_pred):
    hits = ((y_true == y_pred) & (y_true > 0)).sum()
    false_alarms = ((y_true != y_pred) & (y_pred > 0)).sum()
    misses = ((y_true != y_pred) & (y_true > 0) & (y_pred == 0)).sum()
    csi = hits / (hits + false_alarms + misses)
    return csi

#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train = df_train[df_train['class_interval'] != -999]

def month_w(value):
    if value == 8:
        return value * 10000
    elif value == 7:
        return value * 1000
    elif value == 9:
        return value * 100
    elif value == 6:
        return value * 10
    elif value == 10:
        return value * 1
    elif value == 5:
        return value * 0.1
    else:
        return value

df_train['fc_month_w'] = df_train['fc_month'].apply(month_w)
df_train['ef_month_w'] = df_train['ef_month'].apply(month_w)


mapping = {'A': 2021, 'B': 2022, 'C': 2023}
df_train['fc_year'] = df_train['fc_year'].map(mapping)
df_train['ef_year'] = df_train['ef_year'].map(mapping)

# 계절 매핑 함수
def get_season(month):
    if month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    elif month in [9, 10, 11]:
        return 'autumn'
    else:
        return 'winter'

# fc_month를 계절로 변환하여 season 컬럼 추가
df_train['season'] = df_train['fc_month'].apply(get_season)

# 계절을 더미 변수로 변환 (One-Hot Encoding)
season_dummies_train = pd.get_dummies(df_train['season'], prefix='season')
df_train = pd.concat([df_train, season_dummies_train], axis=1)

# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

#표준편차
df_train['prob_std'] = df_train[['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']].std(axis=1)

# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_train['vclass']=df_train['vclass1']+df_train['vclass2']*10+df_train['vclass3']*100

df_train = df_train.drop(columns=['stn4contest', 'season', 'v07', 'v08', 'v09', 'fc_year', 'fc_month'])

X = df_train.drop(columns=['vv','class_interval'])
y = df_train['vv']

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

# 모델 생성 및 학습
best_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',  # GPU 사용 설정
    device='cuda',  # GPU 예측기 설정
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=10,
    colsample_bytree=0.8,
    subsample=1.0,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42
)
best_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=True)

# 예측
y_pred_train_best_xgb = best_xgb.predict(X_train)
y_pred_test_best_xgb = best_xgb.predict(X_test)

# 학습 데이터 평가
mse_train_best_xgb = mean_squared_error(y_train, y_pred_train_best_xgb)
r2_train_best_xgb = r2_score(y_train, y_pred_train_best_xgb)

# 테스트 데이터 평가
mse_test_best_xgb = mean_squared_error(y_test, y_pred_test_best_xgb)
r2_test_best_xgb = r2_score(y_test, y_pred_test_best_xgb)

print("Best XGBoost Regression")
print(f"Train MSE: {mse_train_best_xgb}, Train R2: {r2_train_best_xgb}")
print(f"Test MSE: {mse_test_best_xgb}, Test R2: {r2_test_best_xgb}")

# 실제값과 예측값을 class_interval로 변환
y_test_class = np.array([convert_to_class_interval(value) for value in y_test])
y_pred_class = np.array([convert_to_class_interval(value) for value in y_pred_test_best_xgb])

# CSI 계산
csi_value = calculate_csi(y_test_class, y_pred_class)
print(f"CSI: {csi_value}")



[0]	validation_0-rmse:4.33353
[1]	validation_0-rmse:4.25783
[2]	validation_0-rmse:4.18832
[3]	validation_0-rmse:4.12580
[4]	validation_0-rmse:4.08069
[5]	validation_0-rmse:4.03487
[6]	validation_0-rmse:4.00155
[7]	validation_0-rmse:3.97425
[8]	validation_0-rmse:3.95400
[9]	validation_0-rmse:3.93107
[10]	validation_0-rmse:3.91594
[11]	validation_0-rmse:3.89630
[12]	validation_0-rmse:3.88098
[13]	validation_0-rmse:3.87254
[14]	validation_0-rmse:3.86359
[15]	validation_0-rmse:3.85132
[16]	validation_0-rmse:3.84487
[17]	validation_0-rmse:3.84039
[18]	validation_0-rmse:3.83320
[19]	validation_0-rmse:3.82596
[20]	validation_0-rmse:3.82053
[21]	validation_0-rmse:3.81677
[22]	validation_0-rmse:3.81484
[23]	validation_0-rmse:3.81241
[24]	validation_0-rmse:3.80895
[25]	validation_0-rmse:3.80619
[26]	validation_0-rmse:3.80506
[27]	validation_0-rmse:3.80422
[28]	validation_0-rmse:3.80198
[29]	validation_0-rmse:3.80100
[30]	validation_0-rmse:3.80102
[31]	validation_0-rmse:3.79933
[32]	validation_0-

## CSI: 0.055201367146239876
- randomstate 값 : 42 -> 180으로 변경

In [75]:
#기본세팅
df_train=pd.read_csv('/content/drive/MyDrive/BDA_2024 날씨 빅데이터 콘테스트/rainfall_train.csv')


#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train = df_train[df_train['class_interval'] != -999]

mapping = {'A': 2021, 'B': 2022, 'C': 2023}
df_train['fc_year'] = df_train['fc_year'].map(mapping)
df_train['ef_year'] = df_train['ef_year'].map(mapping)

def month_w(value):
    if value == 8:
        return value * 10000
    elif value == 7:
        return value * 1000
    elif value == 9:
        return value * 100
    elif value == 6:
        return value * 10
    elif value == 10:
        return value * 1
    elif value == 5:
        return value * 0.1
    else:
        return value


# fc_month를 계절로 변환하여 season 컬럼 추가
df_train['season'] = df_train['fc_month'].apply(get_season)

# 계절을 더미 변수로 변환 (One-Hot Encoding)
season_dummies_train = pd.get_dummies(df_train['season'], prefix='season')
df_train = pd.concat([df_train, season_dummies_train], axis=1)

# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

#표준편차
df_train['prob_std'] = df_train[['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']].std(axis=1)

# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_train['vclass']=df_train['vclass1']+df_train['vclass2']*10+df_train['vclass3']*100

df_train = df_train.drop(columns=['stn4contest', 'season', 'v07', 'v08', 'v09', 'fc_year', 'fc_month'])

X = df_train.drop(columns=['vv','class_interval'])
y = df_train['vv']

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=180)

# 모델 생성 및 학습
best_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',  # GPU 사용 설정
    device='cuda',  # GPU 예측기 설정
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=10,
    colsample_bytree=0.8,
    subsample=1.0,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42
)
best_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=True)

# 예측
y_pred_train_best_xgb = best_xgb.predict(X_train)
y_pred_test_best_xgb = best_xgb.predict(X_test)

# 학습 데이터 평가
mse_train_best_xgb = mean_squared_error(y_train, y_pred_train_best_xgb)
r2_train_best_xgb = r2_score(y_train, y_pred_train_best_xgb)

# 테스트 데이터 평가
mse_test_best_xgb = mean_squared_error(y_test, y_pred_test_best_xgb)
r2_test_best_xgb = r2_score(y_test, y_pred_test_best_xgb)

print("Best XGBoost Regression")
print(f"Train MSE: {mse_train_best_xgb}, Train R2: {r2_train_best_xgb}")
print(f"Test MSE: {mse_test_best_xgb}, Test R2: {r2_test_best_xgb}")

# 실제값과 예측값을 class_interval로 변환
y_test_class = np.array([convert_to_class_interval(value) for value in y_test])
y_pred_class = np.array([convert_to_class_interval(value) for value in y_pred_test_best_xgb])

# CSI 계산
csi_value = calculate_csi(y_test_class, y_pred_class)
print(f"CSI: {csi_value}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['fc_year'] = df_train['fc_year'].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['ef_year'] = df_train['ef_year'].map(mapping)


[0]	validation_0-rmse:4.36525
[1]	validation_0-rmse:4.28560
[2]	validation_0-rmse:4.21582
[3]	validation_0-rmse:4.16133
[4]	validation_0-rmse:4.12041
[5]	validation_0-rmse:4.07700
[6]	validation_0-rmse:4.04094
[7]	validation_0-rmse:4.01288
[8]	validation_0-rmse:3.98658
[9]	validation_0-rmse:3.96365
[10]	validation_0-rmse:3.95037
[11]	validation_0-rmse:3.93560
[12]	validation_0-rmse:3.91852
[13]	validation_0-rmse:3.91045
[14]	validation_0-rmse:3.90394
[15]	validation_0-rmse:3.89325
[16]	validation_0-rmse:3.88309
[17]	validation_0-rmse:3.87557
[18]	validation_0-rmse:3.86397
[19]	validation_0-rmse:3.85402
[20]	validation_0-rmse:3.84715
[21]	validation_0-rmse:3.84199
[22]	validation_0-rmse:3.84044
[23]	validation_0-rmse:3.83883
[24]	validation_0-rmse:3.83339
[25]	validation_0-rmse:3.82972
[26]	validation_0-rmse:3.82477
[27]	validation_0-rmse:3.82009
[28]	validation_0-rmse:3.81664
[29]	validation_0-rmse:3.81659
[30]	validation_0-rmse:3.81387
[31]	validation_0-rmse:3.80945
[32]	validation_0-

##  CSI: 0.055562949794176086
- randomstate 180
- 예은님 달 가중치 추가



In [77]:
#기본세팅
df_train=pd.read_csv('/content/drive/MyDrive/BDA_2024 날씨 빅데이터 콘테스트/rainfall_train.csv')


#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train = df_train[df_train['class_interval'] != -999]

mapping = {'A': 2021, 'B': 2022, 'C': 2023}
df_train['fc_year'] = df_train['fc_year'].map(mapping)
df_train['ef_year'] = df_train['ef_year'].map(mapping)

def month_w(value):
    if value == 8:
        return value * 10000
    elif value == 7:
        return value * 1000
    elif value == 9:
        return value * 100
    elif value == 6:
        return value * 10
    elif value == 10:
        return value * 1
    elif value == 5:
        return value * 0.1
    else:
        return value

df_train['fc_month_w'] = df_train['fc_month'].apply(month_w)
df_train['ef_month_w'] = df_train['ef_month'].apply(month_w)

# fc_month를 계절로 변환하여 season 컬럼 추가
df_train['season'] = df_train['fc_month'].apply(get_season)

# 계절을 더미 변수로 변환 (One-Hot Encoding)
season_dummies_train = pd.get_dummies(df_train['season'], prefix='season')
df_train = pd.concat([df_train, season_dummies_train], axis=1)

# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

#표준편차
df_train['prob_std'] = df_train[['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']].std(axis=1)

# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_train['vclass']=df_train['vclass1']+df_train['vclass2']*10+df_train['vclass3']*100

df_train = df_train.drop(columns=['stn4contest', 'season', 'v07', 'v08', 'v09', 'fc_year', 'fc_month'])

X = df_train.drop(columns=['vv','class_interval'])
y = df_train['vv']

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=180)

# 모델 생성 및 학습
best_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',  # GPU 사용 설정
    device='cuda',  # GPU 예측기 설정
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=10,
    colsample_bytree=0.8,
    subsample=1.0,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42
)
best_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=True)

# 예측
y_pred_train_best_xgb = best_xgb.predict(X_train)
y_pred_test_best_xgb = best_xgb.predict(X_test)

# 학습 데이터 평가
mse_train_best_xgb = mean_squared_error(y_train, y_pred_train_best_xgb)
r2_train_best_xgb = r2_score(y_train, y_pred_train_best_xgb)

# 테스트 데이터 평가
mse_test_best_xgb = mean_squared_error(y_test, y_pred_test_best_xgb)
r2_test_best_xgb = r2_score(y_test, y_pred_test_best_xgb)

print("Best XGBoost Regression")
print(f"Train MSE: {mse_train_best_xgb}, Train R2: {r2_train_best_xgb}")
print(f"Test MSE: {mse_test_best_xgb}, Test R2: {r2_test_best_xgb}")

# 실제값과 예측값을 class_interval로 변환
y_test_class = np.array([convert_to_class_interval(value) for value in y_test])
y_pred_class = np.array([convert_to_class_interval(value) for value in y_pred_test_best_xgb])

# CSI 계산
csi_value = calculate_csi(y_test_class, y_pred_class)
print(f"CSI: {csi_value}")



[0]	validation_0-rmse:4.36273
[1]	validation_0-rmse:4.28897
[2]	validation_0-rmse:4.21965
[3]	validation_0-rmse:4.15556
[4]	validation_0-rmse:4.11013
[5]	validation_0-rmse:4.06778
[6]	validation_0-rmse:4.03337
[7]	validation_0-rmse:4.00391
[8]	validation_0-rmse:3.98341
[9]	validation_0-rmse:3.96105
[10]	validation_0-rmse:3.94407
[11]	validation_0-rmse:3.92110
[12]	validation_0-rmse:3.90822
[13]	validation_0-rmse:3.90084
[14]	validation_0-rmse:3.89228
[15]	validation_0-rmse:3.87714
[16]	validation_0-rmse:3.87138
[17]	validation_0-rmse:3.86665
[18]	validation_0-rmse:3.85307
[19]	validation_0-rmse:3.84566
[20]	validation_0-rmse:3.83818
[21]	validation_0-rmse:3.82780
[22]	validation_0-rmse:3.82358
[23]	validation_0-rmse:3.82172
[24]	validation_0-rmse:3.81745
[25]	validation_0-rmse:3.81659
[26]	validation_0-rmse:3.81517
[27]	validation_0-rmse:3.81446
[28]	validation_0-rmse:3.81424
[29]	validation_0-rmse:3.81380
[30]	validation_0-rmse:3.81261
[31]	validation_0-rmse:3.81130
[32]	validation_0-

In [None]:
#기본세팅
df_train=pd.read_csv('/content/drive/MyDrive/BDA_2024 날씨 빅데이터 콘테스트/rainfall_train.csv')


#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train = df_train[df_train['class_interval'] != -999]

mapping = {'A': 2021, 'B': 2022, 'C': 2023}
df_train['fc_year'] = df_train['fc_year'].map(mapping)
df_train['ef_year'] = df_train['ef_year'].map(mapping)

def month_w(value):
    if value == 8:
        return value * 10000
    elif value == 7:
        return value * 1000
    elif value == 9:
        return value * 100
    elif value == 6:
        return value * 10
    elif value == 10:
        return value * 1
    elif value == 5:
        return value * 0.1
    else:
        return value

df_train['fc_month_w'] = df_train['fc_month'].apply(month_w)
df_train['ef_month_w'] = df_train['ef_month'].apply(month_w)

# fc_month를 계절로 변환하여 season 컬럼 추가
df_train['season'] = df_train['fc_month'].apply(get_season)

# 계절을 더미 변수로 변환 (One-Hot Encoding)
season_dummies_train = pd.get_dummies(df_train['season'], prefix='season')
df_train = pd.concat([df_train, season_dummies_train], axis=1)

# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

#표준편차
df_train['prob_std'] = df_train[['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']].std(axis=1)

# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_train['vclass']=df_train['vclass1']+df_train['vclass2']*10+df_train['vclass3']*100

df_train = df_train.drop(columns=['stn4contest', 'season', 'v07', 'v08', 'v09', 'fc_year', 'fc_month'])

X = df_train.drop(columns=['vv','class_interval'])
y = df_train['vv']

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=180)

# 모델 생성 및 학습
best_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',  # GPU 사용 설정
    device='cuda',  # GPU 예측기 설정
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=10,
    colsample_bytree=0.8,
    subsample=1.0,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42
)
best_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=True)

# 예측
y_pred_train_best_xgb = best_xgb.predict(X_train)
y_pred_test_best_xgb = best_xgb.predict(X_test)

# 학습 데이터 평가
mse_train_best_xgb = mean_squared_error(y_train, y_pred_train_best_xgb)
r2_train_best_xgb = r2_score(y_train, y_pred_train_best_xgb)

# 테스트 데이터 평가
mse_test_best_xgb = mean_squared_error(y_test, y_pred_test_best_xgb)
r2_test_best_xgb = r2_score(y_test, y_pred_test_best_xgb)

print("Best XGBoost Regression")
print(f"Train MSE: {mse_train_best_xgb}, Train R2: {r2_train_best_xgb}")
print(f"Test MSE: {mse_test_best_xgb}, Test R2: {r2_test_best_xgb}")

# 실제값과 예측값을 class_interval로 변환
y_test_class = np.array([convert_to_class_interval(value) for value in y_test])
y_pred_class = np.array([convert_to_class_interval(value) for value in y_pred_test_best_xgb])

# CSI 계산
csi_value = calculate_csi(y_test_class, y_pred_class)
print(f"CSI: {csi_value}")

## CSI: 0.0544362929520943
- 일부 변수에 가중치 추가


In [76]:
#기본세팅
df_train=pd.read_csv('/content/drive/MyDrive/BDA_2024 날씨 빅데이터 콘테스트/rainfall_train.csv')



#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train = df_train[df_train['class_interval'] != -999]


mapping = {'A': 2021, 'B': 2022, 'C': 2023}
df_train['fc_year'] = df_train['fc_year'].map(mapping)
df_train['ef_year'] = df_train['ef_year'].map(mapping)



# fc_month를 계절로 변환하여 season 컬럼 추가
df_train['season'] = df_train['fc_month'].apply(get_season)

# 계절을 더미 변수로 변환 (One-Hot Encoding)
season_dummies_train = pd.get_dummies(df_train['season'], prefix='season')
df_train = pd.concat([df_train, season_dummies_train], axis=1)

# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

#표준편차
df_train['prob_std'] = df_train[['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']].std(axis=1)

# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_train['vclass']=df_train['vclass1']+df_train['vclass2']*10+df_train['vclass3']*100

df_train['prob_mean_w'] = df_train['prob_mean'] * 1000
df_train['prob_sum_w'] = df_train['prob_sum'] * 1000
df_train['v04_w'] = df_train['v04'] * 1000
df_train['v03_w'] = df_train['v03'] * 1000
df_train['prob_std_w'] = df_train['prob_std'] * 1000


df_train = df_train.drop(columns=['stn4contest', 'season', 'v07', 'v08', 'v09', 'fc_year', 'fc_month'])

X = df_train.drop(columns=['vv','class_interval'])
y = df_train['vv']

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=500)

# 모델 생성 및 학습
best_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',  # GPU 사용 설정
    device='cuda',  # GPU 예측기 설정
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=10,
    colsample_bytree=0.8,
    subsample=1.0,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42
)
best_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=True)

# 예측
y_pred_train_best_xgb = best_xgb.predict(X_train)
y_pred_test_best_xgb = best_xgb.predict(X_test)

# 학습 데이터 평가
mse_train_best_xgb = mean_squared_error(y_train, y_pred_train_best_xgb)
r2_train_best_xgb = r2_score(y_train, y_pred_train_best_xgb)

# 테스트 데이터 평가
mse_test_best_xgb = mean_squared_error(y_test, y_pred_test_best_xgb)
r2_test_best_xgb = r2_score(y_test, y_pred_test_best_xgb)

print("Best XGBoost Regression")
print(f"Train MSE: {mse_train_best_xgb}, Train R2: {r2_train_best_xgb}")
print(f"Test MSE: {mse_test_best_xgb}, Test R2: {r2_test_best_xgb}")

# 실제값과 예측값을 class_interval로 변환
y_test_class = np.array([convert_to_class_interval(value) for value in y_test])
y_pred_class = np.array([convert_to_class_interval(value) for value in y_pred_test_best_xgb])

# CSI 계산
csi_value = calculate_csi(y_test_class, y_pred_class)
print(f"CSI: {csi_value}")



[0]	validation_0-rmse:4.37894
[1]	validation_0-rmse:4.30036
[2]	validation_0-rmse:4.23286
[3]	validation_0-rmse:4.17631
[4]	validation_0-rmse:4.12956
[5]	validation_0-rmse:4.09038
[6]	validation_0-rmse:4.05682
[7]	validation_0-rmse:4.02530
[8]	validation_0-rmse:4.00466
[9]	validation_0-rmse:3.97747
[10]	validation_0-rmse:3.96268
[11]	validation_0-rmse:3.94801
[12]	validation_0-rmse:3.93760
[13]	validation_0-rmse:3.92170
[14]	validation_0-rmse:3.91251
[15]	validation_0-rmse:3.90059
[16]	validation_0-rmse:3.88661
[17]	validation_0-rmse:3.87904
[18]	validation_0-rmse:3.87269
[19]	validation_0-rmse:3.86895
[20]	validation_0-rmse:3.86380
[21]	validation_0-rmse:3.85870
[22]	validation_0-rmse:3.85569
[23]	validation_0-rmse:3.85299
[24]	validation_0-rmse:3.84862
[25]	validation_0-rmse:3.84827
[26]	validation_0-rmse:3.84670
[27]	validation_0-rmse:3.84413
[28]	validation_0-rmse:3.84147
[29]	validation_0-rmse:3.84127
[30]	validation_0-rmse:3.83849
[31]	validation_0-rmse:3.83834
[32]	validation_0-

## CSI: 0.049604165923176666
- 일부 변수 삭제
- randomstate 전부 180
-

-> 변화X

In [81]:
#기본세팅
df_train=pd.read_csv('/content/drive/MyDrive/BDA_2024 날씨 빅데이터 콘테스트/rainfall_train.csv')


#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train = df_train[df_train['class_interval'] != -999]


mapping = {'A': 2021, 'B': 2022, 'C': 2023}
df_train['fc_year'] = df_train['fc_year'].map(mapping)
df_train['ef_year'] = df_train['ef_year'].map(mapping)



# fc_month를 계절로 변환하여 season 컬럼 추가
df_train['season'] = df_train['fc_month'].apply(get_season)

# 계절을 더미 변수로 변환 (One-Hot Encoding)
season_dummies_train = pd.get_dummies(df_train['season'], prefix='season')
df_train = pd.concat([df_train, season_dummies_train], axis=1)

# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

#표준편차
df_train['prob_std'] = df_train[['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']].std(axis=1)

# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_train['vclass']=df_train['vclass1']+df_train['vclass2']*10+df_train['vclass3']*100

df_train = df_train.drop(columns=['stn4contest', 'season', 'v07', 'v08', 'v09', 'fc_year', 'fc_month','season_spring','ef_hour','season_autumn','ef_day','fc_dayall','fc_day','dh'])

X = df_train.drop(columns=['vv','class_interval'])
y = df_train['vv']

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=180)

# 모델 생성 및 학습
best_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',  # GPU 사용 설정
    device='cuda',  # GPU 예측기 설정
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=10,
    colsample_bytree=0.8,
    subsample=1.0,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=180
)
best_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=True)

# 예측
y_pred_train_best_xgb = best_xgb.predict(X_train)
y_pred_test_best_xgb = best_xgb.predict(X_test)

# 학습 데이터 평가
mse_train_best_xgb = mean_squared_error(y_train, y_pred_train_best_xgb)
r2_train_best_xgb = r2_score(y_train, y_pred_train_best_xgb)

# 테스트 데이터 평가
mse_test_best_xgb = mean_squared_error(y_test, y_pred_test_best_xgb)
r2_test_best_xgb = r2_score(y_test, y_pred_test_best_xgb)

print("Best XGBoost Regression")
print(f"Train MSE: {mse_train_best_xgb}, Train R2: {r2_train_best_xgb}")
print(f"Test MSE: {mse_test_best_xgb}, Test R2: {r2_test_best_xgb}")

# 실제값과 예측값을 class_interval로 변환
y_test_class = np.array([convert_to_class_interval(value) for value in y_test])
y_pred_class = np.array([convert_to_class_interval(value) for value in y_pred_test_best_xgb])

# CSI 계산
csi_value = calculate_csi(y_test_class, y_pred_class)
print(f"CSI: {csi_value}")



[0]	validation_0-rmse:4.37883
[1]	validation_0-rmse:4.31624
[2]	validation_0-rmse:4.25713
[3]	validation_0-rmse:4.20647
[4]	validation_0-rmse:4.16434
[5]	validation_0-rmse:4.13242
[6]	validation_0-rmse:4.10303
[7]	validation_0-rmse:4.08644
[8]	validation_0-rmse:4.06662
[9]	validation_0-rmse:4.05029
[10]	validation_0-rmse:4.04157
[11]	validation_0-rmse:4.03254
[12]	validation_0-rmse:4.02425
[13]	validation_0-rmse:4.01545
[14]	validation_0-rmse:4.01195
[15]	validation_0-rmse:4.00597
[16]	validation_0-rmse:4.00204
[17]	validation_0-rmse:3.99859
[18]	validation_0-rmse:3.99742
[19]	validation_0-rmse:3.99173
[20]	validation_0-rmse:3.98735
[21]	validation_0-rmse:3.98264
[22]	validation_0-rmse:3.98225
[23]	validation_0-rmse:3.98058
[24]	validation_0-rmse:3.97939
[25]	validation_0-rmse:3.97931
[26]	validation_0-rmse:3.97912
[27]	validation_0-rmse:3.97556
[28]	validation_0-rmse:3.97415
[29]	validation_0-rmse:3.97098
[30]	validation_0-rmse:3.97027
[31]	validation_0-rmse:3.97053
[32]	validation_0-

## CSI: 0.04997015067227814
- 일부 변수 삭제
- randomstate 180

In [82]:
#기본세팅
df_train=pd.read_csv('/content/drive/MyDrive/BDA_2024 날씨 빅데이터 콘테스트/rainfall_train.csv')


#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train = df_train[df_train['class_interval'] != -999]


mapping = {'A': 2021, 'B': 2022, 'C': 2023}
df_train['fc_year'] = df_train['fc_year'].map(mapping)
df_train['ef_year'] = df_train['ef_year'].map(mapping)



# fc_month를 계절로 변환하여 season 컬럼 추가
df_train['season'] = df_train['fc_month'].apply(get_season)

# 계절을 더미 변수로 변환 (One-Hot Encoding)
season_dummies_train = pd.get_dummies(df_train['season'], prefix='season')
df_train = pd.concat([df_train, season_dummies_train], axis=1)

# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

#표준편차
df_train['prob_std'] = df_train[['v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']].std(axis=1)

# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_train['vclass']=df_train['vclass1']+df_train['vclass2']*10+df_train['vclass3']*100

df_train = df_train.drop(columns=['stn4contest', 'season', 'v07', 'v08', 'v09', 'fc_year', 'fc_month','season_spring','ef_hour','season_autumn','ef_day','fc_dayall','fc_day','dh'])

X = df_train.drop(columns=['vv','class_interval'])
y = df_train['vv']

# 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=180)

# 모델 생성 및 학습
best_xgb = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',  # GPU 사용 설정
    device='cuda',  # GPU 예측기 설정
    n_estimators=300,
    max_depth=12,
    learning_rate=0.1,
    gamma=0,
    min_child_weight=10,
    colsample_bytree=0.8,
    subsample=1.0,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42
)
best_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_test, y_test)], verbose=True)

# 예측
y_pred_train_best_xgb = best_xgb.predict(X_train)
y_pred_test_best_xgb = best_xgb.predict(X_test)

# 학습 데이터 평가
mse_train_best_xgb = mean_squared_error(y_train, y_pred_train_best_xgb)
r2_train_best_xgb = r2_score(y_train, y_pred_train_best_xgb)

# 테스트 데이터 평가
mse_test_best_xgb = mean_squared_error(y_test, y_pred_test_best_xgb)
r2_test_best_xgb = r2_score(y_test, y_pred_test_best_xgb)

print("Best XGBoost Regression")
print(f"Train MSE: {mse_train_best_xgb}, Train R2: {r2_train_best_xgb}")
print(f"Test MSE: {mse_test_best_xgb}, Test R2: {r2_test_best_xgb}")

# 실제값과 예측값을 class_interval로 변환
y_test_class = np.array([convert_to_class_interval(value) for value in y_test])
y_pred_class = np.array([convert_to_class_interval(value) for value in y_pred_test_best_xgb])

# CSI 계산
csi_value = calculate_csi(y_test_class, y_pred_class)
print(f"CSI: {csi_value}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['fc_year'] = df_train['fc_year'].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['ef_year'] = df_train['ef_year'].map(mapping)


[0]	validation_0-rmse:4.38310
[1]	validation_0-rmse:4.32330
[2]	validation_0-rmse:4.26242
[3]	validation_0-rmse:4.22646
[4]	validation_0-rmse:4.18333
[5]	validation_0-rmse:4.14984
[6]	validation_0-rmse:4.11882
[7]	validation_0-rmse:4.10045
[8]	validation_0-rmse:4.07818
[9]	validation_0-rmse:4.05947
[10]	validation_0-rmse:4.04898
[11]	validation_0-rmse:4.04210
[12]	validation_0-rmse:4.03272
[13]	validation_0-rmse:4.02441
[14]	validation_0-rmse:4.01810
[15]	validation_0-rmse:4.00896
[16]	validation_0-rmse:4.00734
[17]	validation_0-rmse:4.00160
[18]	validation_0-rmse:3.99593
[19]	validation_0-rmse:3.99390
[20]	validation_0-rmse:3.99323
[21]	validation_0-rmse:3.99259
[22]	validation_0-rmse:3.99014
[23]	validation_0-rmse:3.98664
[24]	validation_0-rmse:3.98483
[25]	validation_0-rmse:3.98363
[26]	validation_0-rmse:3.98237
[27]	validation_0-rmse:3.98145
[28]	validation_0-rmse:3.98157
[29]	validation_0-rmse:3.97884
[30]	validation_0-rmse:3.97826
[31]	validation_0-rmse:3.97688
[32]	validation_0-