## import

In [75]:
import pandas as pd
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from IPython.core.interactiveshell import InteractiveShell
from matplotlib import pyplot as plt
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
# Notebook-wide settings.
# Font family for matplotlib text (presumably chosen so Korean labels render -- confirm).
plt.rcParams['font.family'] = 'GULIM'
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## 데이터 호출 및 DROP

In [76]:
# Load the training and test tables; the first CSV column becomes the index.
df = pd.read_csv('Database/rainfall_train.csv', index_col=0)
df_test = pd.read_csv('Database/rainfall_test.csv', index_col=0)

# The fc_* / ef_* year, month, day and hour columns are removed from both tables.
_time_fields = [f'{stamp}_{part}' for stamp in ('fc', 'ef') for part in ('year', 'month', 'day', 'hour')]

# 'rainfall_train.vv' is removed from the training table only.
# The previous hand-written list named 'rainfall_train.ef_year' twice; building the
# list from _time_fields guarantees each label appears exactly once.
df = df.drop(columns=[f'rainfall_train.{field}' for field in _time_fields] + ['rainfall_train.vv'])
df_test = df_test.drop(columns=[f'rainfall_test.{field}' for field in _time_fields])

## 파생변수 1

In [77]:
# Names of the nine columns rainfall_train.v01 .. rainfall_train.v09.
predict_variable = [f'rainfall_train.v{idx:02d}' for idx in range(1, 10)]

In [78]:
# Per row, count how many of the v01..v09 columns are exactly 0.
df['zero_count'] = df[predict_variable].eq(0).sum(axis=1)


## 파생변수 2

In [79]:
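# NOTE: intentionally disabled. The same v(i) - v(i+1) difference features
# (rainfall_train.m01 .. m08) are created further down on X_resampled,
# i.e. after the resampling step instead of on the full training frame.
# for i in range(1, 9):
#     current_col = f'rainfall_train.v{i:02d}'
#     next_col = f'rainfall_train.v{i+1:02d}'
#     new_col = f'rainfall_train.m{i:02d}'
#     df[new_col] = df[current_col] - df[next_col]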


## -999 마스킹

In [80]:
# Remove rows whose target label equals the sentinel value -999.
mask = df['rainfall_train.class_interval'].eq(-999)
df = df.loc[~mask]

## 지역코드 인코딩

In [81]:
def extract_and_convert(value):
    """Return the last three characters of ``value`` as an integer.

    ``value`` is converted to text first, so values that are already numeric
    (for example when the encoding cell is re-run on a converted column) are
    accepted instead of raising ``TypeError``.

    :param value: station code; any object whose ``str()`` ends in digits.
    :return: ``int`` built from the last three characters of ``str(value)``.
    :raises ValueError: if those characters do not form an integer.
    """
    return int(str(value)[-3:])

# Replace the station column of each table with its integer code.
for frame, column in (
    (df, 'rainfall_train.stn4contest'),
    (df_test, 'rainfall_test.stn4contest'),
):
    frame[column] = frame[column].map(extract_and_convert)

In [82]:
# Relative frequency of each target class in the training frame
# (rows labelled -999 were removed in an earlier cell).
df['rainfall_train.class_interval'].value_counts(normalize=True)

rainfall_train.class_interval
0    0.844386
5    0.028581
2    0.023506
4    0.023073
3    0.021185
6    0.019302
1    0.014883
7    0.014611
8    0.005651
9    0.004822
Name: proportion, dtype: float64

## 데이터 불균형 및 분리

In [83]:
# Split the frame into the target column and the feature columns.
target_col = 'rainfall_train.class_interval'
y = df[target_col]
X = df.drop(columns=target_col)

# Undersample class 0 to 1.5x the size of the rarest class; class 0 is the only
# class named in the sampling strategy.
# NOTE(review): resampling runs before the train/validation split below, so the
# validation metrics are measured on resampled class proportions.
rarest_class_size = y.value_counts().min()
rus = RandomUnderSampler(sampling_strategy={0: int(rarest_class_size * 1.5)}, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Apply Tomek links to the undersampled data.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X_resampled, y_resampled)

In [85]:
# Add rainfall_train.m01 .. m08: each v column minus the next one
# (v01 - v02, v02 - v03, ..., v08 - v09).
for step in range(1, 9):
    minuend = X_resampled[f'rainfall_train.v{step:02d}']
    subtrahend = X_resampled[f'rainfall_train.v{step + 1:02d}']
    X_resampled[f'rainfall_train.m{step:02d}'] = minuend - subtrahend


In [86]:
# Relative frequency of each target class after undersampling and Tomek links.
y_resampled.value_counts(normalize=True)

rainfall_train.class_interval
5    0.170997
2    0.147879
4    0.135224
3    0.126834
6    0.108302
1    0.093167
7    0.079352
0    0.056840
9    0.053675
8    0.027729
Name: proportion, dtype: float64

## 랜덤포레스트 적용

In [94]:
# Hold out 20% of the resampled rows as a validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)

# Random forest classifier with 100 trees and a fixed seed.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the training split (kept as a bare expression so the notebook still
# displays the fitted estimator).
model.fit(X_train, y_train)

# Predict the class of every validation row.
y_pred = model.predict(X_valid)

# Overall accuracy, then the per-class precision/recall/f1 report.
accuracy = accuracy_score(y_valid, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_valid, y_pred))

# Optional: feature importances, most important first.
importances = model.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_resampled.columns, 'Importance': importances}
)
print(feature_importance_df.sort_values(by='Importance', ascending=False))

Accuracy: 0.27367369674618724
              precision    recall  f1-score   support

           0       0.49      0.33      0.40      1513
           1       0.23      0.14      0.18      2451
           2       0.24      0.34      0.28      3864
           3       0.24      0.21      0.22      3364
           4       0.25      0.24      0.24      3543
           5       0.26      0.43      0.32      4393
           6       0.30      0.26      0.28      2749
           7       0.36      0.24      0.29      2050
           8       0.57      0.11      0.18       739
           9       0.35      0.18      0.24      1365

    accuracy                           0.27     26031
   macro avg       0.33      0.25      0.26     26031
weighted avg       0.29      0.27      0.27     26031

                       Feature  Importance
1            rainfall_train.dh    0.104752
0   rainfall_train.stn4contest    0.081617
2           rainfall_train.v01    0.069384
3           rainfall_train.v02    0.065

In [100]:
# Interval label for each class index; list position == class index (0..9).
interval_labels = [
    "<0.1",
    "0.1<=X<0.2",
    "0.2<=X<0.5",
    "0.5<=X<1",
    "1<=X<2",
    "2<=X<5",
    "5<=X<10",
    "10<=X<20",
    "20<=X<30",
    "X>=30",
]
mapping = dict(enumerate(interval_labels))

# Wrap predictions and validation labels in frames; reset_index gives the
# validation frame a fresh 0..n-1 index (the old index is kept as a column).
y_pred_df = pd.DataFrame({'y_pred': y_pred})
y_valid_df = y_valid.to_frame().reset_index()

# Replace the class indices with their interval labels.
for frame, column in (
    (y_pred_df, 'y_pred'),
    (y_valid_df, 'rainfall_train.class_interval'),
):
    frame[column] = frame[column].map(mapping)

In [101]:
# Interval labels, in class order, used for both axes of the CSI table.
csi_categories = [
    "<0.1",
    "0.1<=X<0.2",
    "0.2<=X<0.5",
    "0.5<=X<1",
    "1<=X<2",
    "2<=X<5",
    "5<=X<10",
    "10<=X<20",
    "20<=X<30",
    "X>=30",
]
# Square table of zero counts, one row and one column per interval label.
csi_table = pd.DataFrame({category: 0 for category in csi_categories}, index=csi_categories)

In [102]:
# Build the contingency table with observed labels on the rows and predicted
# labels on the columns.
# The previous loop stored the predictions under `observed` and the validation
# labels under `predicted`, so the table was transposed relative to its names.
# It also added to the existing counts, so re-running the cell doubled them.
# Rebuilding the table from scratch makes the cell safe to re-run.
observed = y_valid_df['rainfall_train.class_interval'].to_numpy()
predicted = y_pred_df['y_pred'].to_numpy()
csi_table = (
    pd.crosstab(index=observed, columns=predicted, rownames=['observed'], colnames=['predicted'])
    # Keep every interval on both axes, in class order, even if it never occurs.
    .reindex(index=csi_categories, columns=csi_categories, fill_value=0)
)

In [103]:
import numpy as np
def cal_CSI(csi_table: pd.DataFrame):
    """Compute the CSI score from a square contingency table.

    With ``H`` = sum of the diagonal excluding cell [0, 0],
    ``F`` = sum of every column except the first, minus ``H``, and
    ``M`` = sum of the first column excluding its first row,
    the score is ``H / (H + F + M)``. The denominator therefore equals the
    total count minus cell [0, 0].

    :param csi_table: square table of counts whose first row/column is the
        lowest class (presumably rows = observed, columns = predicted -- verify).
    :return: the CSI score.
    """
    counts = csi_table.values
    hits = counts.diagonal().sum() - counts[0, 0]
    false_alarms = counts[:, 1:].sum() - hits
    misses = counts[1:, 0].sum()
    return hits / (hits + false_alarms + misses)


In [104]:
# CSI score of the validation predictions.
cal_CSI(csi_table)

np.float64(0.25933325498491794)

In [None]:
# Validation CSI by feature set:
#   no derived features                               : 0.2322
#   + zero_count feature                              : 0.235
#   + marginal-probability features (m01..m08 diffs)  : 0.2593 (0.25933325498491794)

## 모델저장

In [112]:
# Persist the fitted classifier to the working directory.
import joblib

joblib.dump(model, filename='random_forest_model.pkl')

['random_forest_model.pkl']

## test 데이터

In [142]:
# Reload the raw test table (this discards the changes made to df_test in the
# earlier cells) and remove the fc_* / ef_* date-part columns again.
_dropped_parts = ['fc_year', 'fc_month', 'fc_day', 'fc_hour',
                  'ef_year', 'ef_month', 'ef_day', 'ef_hour']
df_test = pd.read_csv('Database/rainfall_test.csv', index_col=0).drop(
    columns=[f'rainfall_test.{part}' for part in _dropped_parts]
)

In [143]:
# Names of the nine columns rainfall_test.v01 .. rainfall_test.v09
# (rebinds the list that previously held the training column names).
predict_variable = [f'rainfall_test.v{idx:02d}' for idx in range(1, 10)]

In [144]:
# Per row, count how many of the v01..v09 columns are exactly 0.
df_test['zero_count'] = df_test[predict_variable].eq(0).sum(axis=1)

In [145]:
# Add rainfall_test.m01 .. m08: each v column minus the next one
# (v01 - v02, v02 - v03, ..., v08 - v09), mirroring the training features.
for step in range(1, 9):
    minuend = df_test[f'rainfall_test.v{step:02d}']
    subtrahend = df_test[f'rainfall_test.v{step + 1:02d}']
    df_test[f'rainfall_test.m{step:02d}'] = minuend - subtrahend

In [146]:
# Rename the test columns to the names used in training.
# NOTE(review): every occurrence of 'test' is replaced, so 'stn4contest' becomes
# 'stn4contrain' as well; the station-encoding cell further down reads the
# mangled name and restores the expected one.
df_test = df_test.rename(columns=lambda name: name.replace('test', 'train'))
print(df_test.columns)

Index(['rainfall_train.stn4contrain', 'rainfall_train.dh',
       'rainfall_train.v01', 'rainfall_train.v02', 'rainfall_train.v03',
       'rainfall_train.v04', 'rainfall_train.v05', 'rainfall_train.v06',
       'rainfall_train.v07', 'rainfall_train.v08', 'rainfall_train.v09',
       'rainfall_train.class_interval', 'zero_count', 'rainfall_train.m01',
       'rainfall_train.m02', 'rainfall_train.m03', 'rainfall_train.m04',
       'rainfall_train.m05', 'rainfall_train.m06', 'rainfall_train.m07',
       'rainfall_train.m08'],
      dtype='object')


In [152]:
# NOTE(review): this repeats the definition from the station-code encoding cell
# near the top of the notebook; it is kept so this cell can run on its own.
def extract_and_convert(value):
    """Return the last three characters of ``value`` as an integer.

    ``value`` is converted to text first, so values that are already numeric
    (for example when this cell is re-run on a converted column) are accepted
    instead of raising ``TypeError``.

    :param value: station code; any object whose ``str()`` ends in digits.
    :return: ``int`` built from the last three characters of ``str(value)``.
    :raises ValueError: if those characters do not form an integer.
    """
    return int(str(value)[-3:])

# Create the integer station column under its expected name (appended as the
# last column) and remove the mis-renamed 'stn4contrain' source column.
df_test = (
    df_test
    .assign(**{
        'rainfall_train.stn4contest':
            df_test['rainfall_train.stn4contrain'].apply(extract_and_convert)
    })
    .drop(columns='rainfall_train.stn4contrain')
)

In [None]:
# Move the station column to the front; the remaining columns keep their order.
station_col = 'rainfall_train.stn4contest'
new_columns = [station_col, *df_test.columns.drop(station_col)]
df_test = df_test[new_columns]

In [158]:
# Separate the (renamed) class_interval column from the test features.
label_col = 'rainfall_train.class_interval'
y_test = df_test[label_col]
X_test = df_test.drop(columns=[label_col])

In [159]:
# Feature columns the model was trained on, for visual comparison with X_test.
X_train.columns

Index(['rainfall_train.stn4contest', 'rainfall_train.dh', 'rainfall_train.v01',
       'rainfall_train.v02', 'rainfall_train.v03', 'rainfall_train.v04',
       'rainfall_train.v05', 'rainfall_train.v06', 'rainfall_train.v07',
       'rainfall_train.v08', 'rainfall_train.v09', 'zero_count',
       'rainfall_train.m01', 'rainfall_train.m02', 'rainfall_train.m03',
       'rainfall_train.m04', 'rainfall_train.m05', 'rainfall_train.m06',
       'rainfall_train.m07', 'rainfall_train.m08'],
      dtype='object')

In [160]:
# Feature columns of the test matrix; names and order should match X_train.columns.
X_test.columns

Index(['rainfall_train.stn4contest', 'rainfall_train.dh', 'rainfall_train.v01',
       'rainfall_train.v02', 'rainfall_train.v03', 'rainfall_train.v04',
       'rainfall_train.v05', 'rainfall_train.v06', 'rainfall_train.v07',
       'rainfall_train.v08', 'rainfall_train.v09', 'zero_count',
       'rainfall_train.m01', 'rainfall_train.m02', 'rainfall_train.m03',
       'rainfall_train.m04', 'rainfall_train.m05', 'rainfall_train.m06',
       'rainfall_train.m07', 'rainfall_train.m08'],
      dtype='object')

In [161]:
# Reload the classifier saved earlier in this notebook.
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
loaded_model = joblib.load(filename='random_forest_model.pkl')

# Predict a class index for every row of the test features.
test_prediction = loaded_model.predict(X_test)


In [162]:
# Predicted class index for each test row.
test_prediction

array([1, 2, 1, ..., 5, 1, 6])

## submission

In [168]:
# Start the submission from the untouched test CSV (first column as index).
df_submission = pd.read_csv('Database/rainfall_test.csv', index_col=0)

In [165]:
# Write the predictions into the class_interval column. Rows line up by position:
# X_test was built from the same CSV without removing or reordering rows.
df_submission = df_submission.assign(**{'rainfall_test.class_interval': test_prediction})

In [167]:
# Write the submission file (index included) to the working directory.
df_submission.to_csv('./240105.csv')