In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

#### 데이터 설명
- fc_ : 기준 발표시각
- stn4contest : AWS 지점코드
- dh : 기준시각 - 예측시간
- ef_ : 예측 시각
- v~ : 강수량 누적확률(0.1 / 0.2 / 0.5 / 1.0 / 2.0 / 5.0 / 10.0 / 20.0 / 30.0)
- vv : 실강수량
- class_Interval : 강수계급

In [2]:
df_train=pd.read_csv('rainfall_train.csv')
#새롭게 인덱스가 정의되어 삭제
df_train = df_train.drop('Unnamed: 0', axis=1)
df_train = df_train.rename(columns=lambda x: x.replace('rainfall_train.', ''))
df_train

Unnamed: 0,fc_year,fc_month,fc_day,fc_hour,stn4contest,dh,ef_year,ef_month,ef_day,ef_hour,...,v02,v03,v04,v05,v06,v07,v08,v09,vv,class_interval
0,A,5,1,9,STN001,3,A,5,1,12,...,0,0,0,0,0,0,0,0,0,0
1,A,5,1,9,STN001,6,A,5,1,15,...,0,0,0,0,0,0,0,0,0,0
2,A,5,1,9,STN001,9,A,5,1,18,...,0,0,0,0,0,0,0,0,0,0
3,A,5,1,9,STN001,12,A,5,1,21,...,0,0,0,0,0,0,0,0,0,0
4,A,5,1,9,STN001,15,A,5,2,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,C,6,10,21,STN004,240,C,6,20,21,...,17,14,12,12,8,4,0,0,0,0
1048571,C,6,11,9,STN004,3,C,6,11,12,...,16,7,3,3,0,0,0,0,0,0
1048572,C,6,11,9,STN004,6,C,6,11,15,...,5,4,3,2,0,0,0,0,0,0
1048573,C,6,11,9,STN004,9,C,6,11,18,...,71,54,44,33,14,3,0,0,0,0


In [3]:
df_test=pd.read_csv('rainfall_test.csv')
df_test = df_test.drop('Unnamed: 0', axis=1)
df_test = df_test.rename(columns=lambda x: x.replace('rainfall_test.', ''))
df_test

Unnamed: 0,fc_year,fc_month,fc_day,fc_hour,stn4contest,dh,ef_year,ef_month,ef_day,ef_hour,v01,v02,v03,v04,v05,v06,v07,v08,v09,class_interval
0,D,5,1,9,STN031,3,D,5,1,12,68,46,13,1,0,0,0,0,0,
1,D,5,1,9,STN031,6,D,5,1,15,83,66,26,5,0,0,0,0,0,
2,D,5,1,9,STN031,9,D,5,1,18,17,6,0,0,0,0,0,0,0,
3,D,5,1,9,STN031,12,D,5,1,21,0,0,0,0,0,0,0,0,0,
4,D,5,1,9,STN031,15,D,5,2,0,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121995,D,9,30,21,STN035,228,D,10,10,9,4,4,4,2,2,0,0,0,0,
121996,D,9,30,21,STN035,231,D,10,10,12,8,6,6,5,0,0,0,0,0,
121997,D,9,30,21,STN035,234,D,10,10,15,8,6,6,5,0,0,0,0,0,
121998,D,9,30,21,STN035,237,D,10,10,18,10,8,8,8,6,2,0,0,0,


## 데이터 확인

In [4]:
df_train.info()
# 결측값 처리는 따로 필요 없어 보임

##인코딩
# 원핫 : fc_year, ef_year
# 라벨 : stn4contest

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 21 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   fc_year         1048575 non-null  object
 1   fc_month        1048575 non-null  int64 
 2   fc_day          1048575 non-null  int64 
 3   fc_hour         1048575 non-null  int64 
 4   stn4contest     1048575 non-null  object
 5   dh              1048575 non-null  int64 
 6   ef_year         1048575 non-null  object
 7   ef_month        1048575 non-null  int64 
 8   ef_day          1048575 non-null  int64 
 9   ef_hour         1048575 non-null  int64 
 10  v01             1048575 non-null  int64 
 11  v02             1048575 non-null  int64 
 12  v03             1048575 non-null  int64 
 13  v04             1048575 non-null  int64 
 14  v05             1048575 non-null  int64 
 15  v06             1048575 non-null  int64 
 16  v07             1048575 non-null  int64 
 17  v08     

In [5]:
#AWS의 지점에 따른 강수계급
distribution_by_class_interval = df_train.groupby(['stn4contest', 'class_interval']).size().unstack(fill_value=0)
distribution_by_class_interval

##정리
# 0의 지점에서 다른 값들과 비교하였을 때 유난히 높은 값을 가진다. => 계급이 낮은 것이 많다...
# 999는 수정X

class_interval,(999),0,1,2,3,4,5,6,7,8,9
stn4contest,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
STN001,182,60799,994,1508,1406,1678,2435,1520,1477,540,382
STN002,260,57373,1732,2360,2500,2243,2593,1923,1286,394,254
STN003,255,61084,1010,1643,1504,1794,2187,1554,1068,482,340
STN004,442,46481,618,1204,1016,1475,1404,1197,662,260,280
STN005,193,40539,796,1249,1012,901,1472,974,578,220,180
STN006,331,40155,881,1360,954,966,1647,1123,551,220,260
STN007,151,39617,939,1316,1154,1080,1864,922,839,374,220
STN008,180,40488,763,1294,1190,1464,1261,843,629,220,144
STN009,173,39628,766,1544,1057,1194,1567,1253,774,320,200
STN010,234,39431,800,1403,1282,1336,1623,1077,816,294,180


In [6]:
df_train['stn4contest'].value_counts()

stn4contest
STN001    72921
STN003    72921
STN002    72918
STN004    55039
STN019    48476
STN018    48476
STN015    48476
STN014    48476
STN012    48476
STN020    48476
STN010    48476
STN009    48476
STN008    48476
STN007    48476
STN011    48455
STN013    48449
STN006    48448
STN016    48313
STN017    48237
STN005    48114
Name: count, dtype: int64

In [7]:
df_train['class_interval'].value_counts()

class_interval
0        868411
5         32260
4         26227
2         25841
3         24108
6         22071
1         16823
7         16278
8          6271
9          5275
(999)      5010
Name: count, dtype: int64

In [8]:
df_train['dh'].value_counts()

dh
3      13120
9      13120
12     13120
6      13120
42     13119
       ...  
207    13069
234    13069
237    13068
240    13064
210    13063
Name: count, Length: 80, dtype: int64

In [9]:
#관측 월에 따른 강수계급
distribution_by_class_interval = df_train.groupby(['ef_month', 'class_interval']).size().unstack(fill_value=0)
distribution_by_class_interval

##정리
# 강수계급이 높아질수록 비가 많이 오는것이 아닐까..
# 8월의 강수계급이 높았을 때가 아마 장마기간?
# 999는 수정X

class_interval,(999),0,1,2,3,4,5,6,7,8,9
ef_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5,1021,148401,2391,3209,3702,3278,4741,2798,1536,306,24
6,1240,180148,3177,4359,3540,4017,5505,3410,2035,680,620
7,840,170277,3970,6137,5389,5810,6894,5447,4676,1939,1631
8,900,165471,4120,5980,6100,6720,8517,6140,5100,2160,2000
9,740,171729,2758,5437,4876,5795,6039,3960,2760,1120,940
10,269,32385,407,719,501,607,564,316,171,66,60


In [10]:
#생략없이
pd.set_option('display.max_rows', None)

df_train['v01'].value_counts()
#pd.reset_option('display.max_rows')

## 정리
# 확률이 커질수록 해당 값은 적어진다.

v01
0      154905
2       37525
1       28244
4       26777
6       21925
3       19843
8       19723
100     18891
10      17865
5       16500
12      16417
14      15447
7       14730
16      14688
18      13688
9       13558
20      12986
11      12953
13      12251
22      12065
15      11931
17      11678
24      11535
19      11446
23      11240
21      11240
25      10878
26      10781
27      10505
29      10393
31      10201
28      10122
33       9900
30       9615
35       9580
37       9389
32       9229
39       9036
34       8850
41       8485
43       8323
36       8229
45       8089
38       7944
47       7758
40       7556
42       7513
49       7386
51       7371
53       7301
44       7301
46       7189
55       6965
57       6842
48       6836
59       6733
50       6698
61       6507
52       6386
54       6270
56       6081
63       6062
65       6027
67       5941
58       5836
60       5810
69       5808
71       5636
62       5635
64       5455
98       5425
68

In [11]:
#생략없이
pd.set_option('display.max_rows', None)

df_train['v02'].value_counts()
#pd.reset_option('display.max_rows')

## 정리
# 확률이 커질수록 해당 값은 적어진다.

v02
0      189978
2       43842
1       31852
4       30926
6       25053
8       22156
3       22140
10      19744
5       18513
12      17810
7       16699
14      16621
16      15466
9       15112
18      14441
11      14304
13      13588
100     13388
20      13331
15      12662
17      12522
22      12212
19      12099
21      11790
24      11598
23      11371
25      11243
27      10600
26      10593
29      10144
28      10079
31       9818
33       9466
35       9304
30       9097
32       8837
37       8497
34       8325
39       8322
41       8042
36       7745
43       7568
40       7391
45       7353
38       7296
47       6964
42       6894
51       6695
44       6663
49       6638
46       6405
53       6398
48       6151
55       6150
57       5850
52       5689
50       5639
59       5516
54       5407
61       5335
63       5187
56       5084
58       5048
65       5039
60       4756
67       4687
62       4618
69       4576
71       4405
66       4391
64       4359
73

In [12]:
# 강수계급 0이 어느 변수에서든 가장 많은 값을 가지게 된다.-> 비가 안오는 경우가 많다.

In [29]:
import pandas as pd

# df_train 데이터프레임이 이미 정의되어 있다고 가정합니다

# 'class_interval' 열의 값이 '(999)'인 경우를 999로 변경
df_train['class_interval'] = df_train['class_interval'].replace('(999)', '999')

# 'class_interval' 열을 숫자형으로 변환
df_train['class_interval'] = pd.to_numeric(df_train['class_interval'], errors='coerce')

## 전처리

### 내용 전처리
train, test 모두 동일하게 적용

- 원래 있는 컬럼의 값을 전처리
- 새로운 변수 생성(중요도를 보고 서로 곱하거나 더하거나, KNN이나 학회에서 진행했던거 등등 참고해서)

In [14]:
df_train['vv'].value_counts()

vv
0        911075
1         39312
2         18620
3         12289
4          8850
5          6500
6          5387
(999)      5010
8          4413
7          4313
9          3556
10         2820
11         2342
12         2181
13         2058
14         1877
16         1580
15         1252
17         1152
18          968
20          967
19          937
22          873
21          842
25          694
24          654
28          637
23          573
26          454
30          439
27          420
29          394
31          375
33          360
36          260
38          260
32          238
34          220
40          220
39          220
44          200
37          180
35          180
45          160
42          153
43          140
47          120
46          120
48          100
52          100
54          100
61           80
51           80
64           79
67           60
65           60
81           60
53           60
73           60
105          40
70           40
58           40
57   

In [15]:
#결측치 대체
df_train['vv'] = df_train['vv'].replace('(999)', 999)
df_train['vv'] = df_train['vv'].astype(float)

### 수치형변수
train, test 모두 동일하게 적용

- 이상치 처리

In [16]:
num_col = ['fc_month', 'fc_day', 'fc_hour', 'dh', 'ef_month', 'ef_day', 'ef_hour', 
                   'v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']

df_train[num_col] = df_train[num_col].apply(lambda x: x.clip(x.quantile(0.05), x.quantile(0.95)), axis=0)
df_test[num_col] = df_test[num_col].apply(lambda x: x.clip(x.quantile(0.05), x.quantile(0.95)), axis=0)

### 스케일링
train에만 적용(test에서는 적용X)

- Standard Scaling

In [17]:
scaler = StandardScaler()

df_train[num_col] = scaler.fit_transform(df_train[num_col])
df_test[num_col] = scaler.fit_transform(df_test[num_col])

### 범주형변수
train, test 모두 동일하게 적용

- 인코딩 필요
    - 값이 별로 없다면 get_dummies
    - 값이 많다면 label_encoding..고민 필요
- vv를 예측하여 class를 나눌 것이라 vv, class 인코딩X

In [18]:
df_train = pd.get_dummies(df_train, columns=['fc_year', 'ef_year'])
df_test = pd.get_dummies(df_test, columns=['fc_year', 'ef_year'])

# 훈련 세트와 테스트 세트에 동일한 열이 존재하도록 정렬
df_train, df_test = df_train.align(df_test, join='outer', axis=1, fill_value=0)

In [19]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_train['stn4contest'] = le.fit_transform(df_train['stn4contest'])
df_test['stn4contest'] = le.fit_transform(df_test['stn4contest'])

### 새로운 Feature 생성

In [20]:
# df_train에 가중치 부여
df_train['v01_w'] = df_train['v01'] * 0.01
df_train['v02_w'] = df_train['v02'] * 0.1
df_train['v03_w'] = df_train['v03']
df_train['v04_w'] = df_train['v04']
df_train['v05_w'] = df_train['v05'] * 10
df_train['v06_w'] = df_train['v06'] * 100
df_train['v07_w'] = df_train['v07'] * 1000
df_train['v08_w'] = df_train['v08'] * 10000
df_train['v09_w'] = df_train['v09'] * 100000

# df_test에 가중치 부여
df_test['v01_w'] = df_test['v01'] * 0.01
df_test['v02_w'] = df_test['v02'] * 0.1
df_test['v03_w'] = df_test['v03']
df_test['v04_w'] = df_test['v04']
df_test['v05_w'] = df_test['v05'] * 10
df_test['v06_w'] = df_test['v06'] * 100
df_test['v07_w'] = df_test['v07'] * 1000
df_test['v08_w'] = df_test['v08'] * 10000
df_test['v09_w'] = df_test['v09'] * 100000

In [21]:
# df_train에 확률 값의 합과 평균 계산
df_train['prob_sum'] = df_train['v01'] + df_train['v02'] + df_train['v03'] + df_train['v04'] + df_train['v05'] + df_train['v06'] + df_train['v07'] + df_train['v08'] + df_train['v09']
df_train['prob_mean'] = df_train['prob_sum'] / 9

# df_test에 확률 값의 합과 평균 계산
df_test['prob_sum'] = df_test['v01'] + df_test['v02'] + df_test['v03'] + df_test['v04'] + df_test['v05'] + df_test['v06'] + df_test['v07'] + df_test['v08'] + df_test['v09']
df_test['prob_mean'] = df_test['prob_sum'] / 9

In [22]:
# 날짜 전체
df_train['fc_dayall']=df_train['fc_month']*1000+df_train['fc_day']
df_train['ef_dayall']=df_train['ef_month']*1000+df_train['ef_day']

df_test['fc_dayall']=df_test['fc_month']*1000+df_test['fc_day']
df_test['ef_dayall']=df_test['ef_month']*1000+df_test['ef_day']

In [23]:

df_train['vclass1']=df_train['v01']+df_train['v02']+df_train['v03']
df_train['vclass2']=df_train['v04']+df_train['v05']+df_train['v06']
df_train['vclass3']=df_train['v07']+df_train['v08']+df_train['v09']

df_test['vclass1']=df_test['v01']*1000+df_test['v02']+df_test['v03']
df_test['vclass2']=df_test['v04']*1000+df_test['v05']+df_test['v06']
df_test['vclass3']=df_test['v07']*1000+df_test['v08']+df_test['v09']

In [24]:
df_train['stn*ef']=df_train['stn4contest']*df_train['ef_month']
df_test['stn*ef']=df_test['stn4contest']*df_test['ef_month']

#### 필요하다면 Feture Selection 진행

In [25]:
X = df_train.drop(columns=['vv', 'class_interval'])
y=df_train['class_interval']
# X_test = df_train.drop(columns=['vv','class_interval'])

In [39]:
# y를 int 타입으로 변환
y = y.astype(int)

# 다시 데이터 타입 확인
print(y.dtypes)

int32


In [26]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# import shap

# # 데이터 분할
# X_train, X_test, y_train, y_test =train_test_split(X,y, test_size=0.3, random_state=111)

# #모델 간단한 학습
# model = RandomForestClassifier(random_state=111)
# model.fit(X_train, y_train)

# ## SHAP 값 구하기 
# ## SHAP Explainer 초기화 및 SHAP 값 계산

# explainer = shap.TreeExplainer(model)
# shap_values= explainer.shap_values(X_test) # shap 값을 간단하게 계산한다.

# # shap 요약 플롯
# shap.summary_plot(shap_values[:,:,0], X_test, plot_type='bar', feature_names = X_test.columns.tolist())

## 모델

In [40]:
def calculate_csi(df, predicted_col):
    # Hit (H): 예보가 정확히 맞아떨어진 경우
    hits = ((y_test == df[predicted_col]) & (y_test > 0)).sum()

    # False Alarm (F): 잘못된 예보의 수
    false_alarms = ((y_test != df[predicted_col]) & (df[predicted_col] > 0)).sum()

    # Miss (M): 놓친 예보의 수
    misses = ((y_test != df[predicted_col]) & (y_test > 0) & (df[predicted_col] == 0)).sum()

    # CSI 계산
    csi = hits / (hits + false_alarms + misses)

    return csi

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =train_test_split(X,y, test_size=0.3, random_state=111)


rf = RandomForestClassifier(random_state=111)
rf.fit(X_train, y_train)

df1 = X_test.copy()
df1['rf_pred']=rf.predict(X_test)

In [42]:
df1['rf_pred'].value_counts()

rf_pred
0      288339
5        5980
6        4058
4        3626
7        3078
3        2837
2        2744
1        1478
8        1131
9         945
999       357
Name: count, dtype: int64

In [43]:
# CSI 계산
csi_value = calculate_csi(df1, 'rf_pred')
print(f'CSI: {csi_value:.4f}')

CSI: 0.1100


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
import pandas as pd
import numpy as np

# 탐색할 하이퍼파라미터 공간 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
   # 'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}


# CSI 스코어러 만들기
csi_scorer = make_scorer(calculate_csi, greater_is_better=True)

# 랜덤 서치 수행
random_search = RandomizedSearchCV(estimator=RandomForestClassifier(), 
                                   param_distributions=param_grid, 
                                   n_iter=30, 
                                   cv=3, 
                                   scoring=csi_scorer)
random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_

df2 = X_test.copy()
df2['rf_pred']=best_model.predict(X_test)

# CSI 계산
csi_value = calculate_csi(df2, 'rf_pred')
print(f'CSI: {csi_value:.4f}')

Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_27744\3858456063.py", line 3, in calculate_csi
    hits = ((y_test == df[predicted_col]) & (y_test > 0)).sum()
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\series.py", line 1153, in __getitem__
    return self._get_with(key)
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\series.py", line 1180, in _get_with
    return self.loc[key]
  File "C:\User

Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_27744\3858456063.py", line 3, in calculate_csi
    hits = ((y_test == df[predicted_col]) & (y_test > 0)).sum()
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\series.py", line 1153, in __getitem__
    return self._get_with(key)
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\series.py", line 1180, in _get_with
    return self.loc[key]
  File "C:\User

Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_27744\3858456063.py", line 3, in calculate_csi
    hits = ((y_test == df[predicted_col]) & (y_test > 0)).sum()
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\series.py", line 1153, in __getitem__
    return self._get_with(key)
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\series.py", line 1180, in _get_with
    return self.loc[key]
  File "C:\User

Traceback (most recent call last):
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "C:\Users\user\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_27744\3858456063.py", line 3, in calculate_csi
    hits = ((y_test == df[predicted_col]) & (y_test > 0)).sum()
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\series.py", line 1153, in __getitem__
    return self._get_with(key)
  File "C:\Users\user\anaconda3\lib\site-packages\pandas\core\series.py", line 1180, in _get_with
    return self.loc[key]
  File "C:\User

#### 마지막 최종 파일 생성

In [None]:
# X_train = df_train.drop(columns=['vv', 'class_interval'])
# y_train=df_train['vv']
# X_test = df_test.drop(columns=['vv','class_interval'])

In [None]:
# y_pred = rf.predict(X_test)