# **Packages**

In [2]:
import os
import time
import random
import platform
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from sklearn.model_selection import train_test_split

# 한글 문제
from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus'] = False

if platform.system() == 'Darwin':  # 맥OS
    rc('font', family='AppleGothic')
elif platform.system() == 'Windows':  # 윈도우
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    print('Unknown system...  sorry~~~')

from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 17,8

import warnings
warnings.filterwarnings(action='ignore')

# **데이터 전처리**

### **ECLO만 뽑아내기위한 작업**

In [3]:
# ECLO만 뽑아내기 위한 train 호출. (전처리 안된 원본데이터)
train = pd.read_csv('/Users/euijinlee/KDT_DATA/Project_1/data/train.csv')
# 전국데이터 호출. (전처리 안된 원본데이터)
countrywide = pd.read_csv('/Users/euijinlee/KDT_DATA/Project_1/data/external_open/countrywide_accident.csv')
#전국 데이터와 대구 데이터들의 eclo만 뽑아낸것. 총 row 642,384개
eclo = pd.concat([train['ECLO'], countrywide['ECLO']], sort=False)

### **요일, 기상, 노면, 유형, 연, 월, 시, 도시, 도로형태로 전처리된 데이터**

In [4]:
countrywide_df = pd.read_csv('/Users/euijinlee/KDT_DATA/Project_1/data/countrywide_processed.csv')
train_df = pd.read_csv('/Users/euijinlee/KDT_DATA/Project_1/data/train_processed.csv')
test_df = pd.read_csv('/Users/euijinlee/KDT_DATA/Project_1/data/test_processed.csv')

### **모든 컬럼 일괄적으로 원핫인 하기위해 데이터타입 통일**

In [5]:
# 문자열로 변환하는 함수 정의
def convert_to_str(df, columns):
    for col in columns:
        df[col] = df[col].astype(str)
    return df
# train_df, test_df, countrywide에 대한 문자열 변환
columns_to_convert = ['연', '월', '시간']
train_df = convert_to_str(train_df, columns_to_convert)
test_df = convert_to_str(test_df, columns_to_convert)
countrywide_df = convert_to_str(countrywide_df, columns_to_convert)

### **data라는 변수에 모두 넣어서 한번에 원핫인코딩**

In [6]:
#데이터 3개 합쳐
data = pd.concat([train_df, countrywide_df, test_df], sort=False)
data = pd.get_dummies(data)

### **트레인과 전국데이터 만큼 덜어내기**

In [7]:
sum_train_ctw_len = len(train) + len(countrywide)
train_enco = data[:sum_train_ctw_len]
test_enco = data[sum_train_ctw_len:]

# **LightGBM**

In [8]:
import lightgbm as lgb

## **1. 기본 / 로그변환 O, 전국데이터 O**
- 로그역변환 후 최종 RMSLE: 0.44126351291254134
- y_true : [1.584963, 1.584963, 2.321928 ... 1.584963, 2.584963, 2.584963]
- y_pred : [2.01827512 2.03540713 1.9335683  ... 2.20620982 2.20789858 2.21193198]

## => 제출 점수 : 0.4278996463	

### **Data Split**

In [24]:
# X_train은 대구+전국 데이터 원핫인코딩 된 데이터(642,384개), y_train은 eclo만(642,384개) 뽑아낸것. 
X_train = train_enco
y_train = np.log2(eclo)

# train 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

### **LightGBM 모델 생성, 학습, 예측**

In [25]:
# LightGBM Dataset format
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)  

# 기본 parameters
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}


# 학습
num_round = 1000
lgb_model = lgb.train(params, train_data, num_round, valid_sets=[train_data, val_data], early_stopping_rounds=50, verbose_eval=10)


# 예측
## 테스트 데이터 예측
lgb_predictions = lgb_model.predict(X_valid, num_iteration=lgb_model.best_iteration)
## 테스트 데이터의 실제 값
y_true = y_valid
## 테스트 데이터의 예측 값
y_pred = lgb_predictions

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 50 rounds
[10]	training's l2: 0.650534	valid_1's l2: 0.649335
[20]	training's l2: 0.644646	valid_1's l2: 0.643758
[30]	training's l2: 0.641721	valid_1's l2: 0.640987
[40]	training's l2: 0.640121	valid_1's l2: 0.639528
[50]	training's l2: 0.639141	valid_1's l2: 0.638659
[60]	training's l2: 0.638349	valid_1's l2: 0.637999
[70]	training's l2: 0.637777	valid_1's l2: 0.637539
[80]	training's l2: 0.637318	valid_1's l2: 0.637237
[90]	training's l2: 0.636946	valid_1's l2: 0.637034
[100]	training's l2: 0.636633	valid_1's l2: 0.636896
[110]	training's l2: 0.636352	valid_1's l2: 0.636794
[120]	training's l2: 0.636077	valid_1's l2: 0.63667
[130]	training's l2: 0.635832	valid_1's l2: 0.636616
[140]	training's l2: 0.635612	valid_1's l2: 0.636541
[150]	training's l2: 0.635401	valid_1's l2: 0.636504
[160]	training's l2: 0.635199	va

### **RMSLE 평가**

In [26]:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_true) - np.log1p(y_pred))))

In [27]:
# 로그 역변환 후 RMSLE 계산
predicted_eclo_unlog = np.exp2(lgb_predictions)
y_true_unlog = np.exp2(y_true)
lgb_rmsle = rmsle(y_true_unlog, predicted_eclo_unlog)
print("로그역변환 후 최종 RMSLE:", lgb_rmsle)

로그역변환 후 최종 RMSLE: 0.44126351291254134


In [28]:
display("y 실제 값", y_true)
print("y 예측 값", y_pred)

'y 실제 값'

252646    1.584963
522737    1.584963
77768     2.321928
427198    1.584963
179583    2.321928
            ...   
85878     1.584963
163400    1.584963
173291    1.584963
421151    2.584963
56653     2.584963
Name: ECLO, Length: 128477, dtype: float64

y 예측 값 [2.01827512 2.03540713 1.9335683  ... 2.20620982 2.20789858 2.21193198]


In [31]:
predictions_sub = lgb_model.predict(test_enco, num_iteration=lgb_model.best_iteration)
predictions_sub = np.exp2(predictions_sub)

In [32]:
predictions_sub

array([3.71895975, 3.45751446, 4.58013359, ..., 4.16624029, 4.17119203,
       4.23119101])

In [33]:
#기존 제출파일 불러오기 
sub = pd.read_csv('/Users/euijinlee/KDT_DATA/Project_1/data/sample_submission.csv') 
# 제출용 파일 만들기
sub['ECLO'] = predictions_sub
submission_lgb = sub[['ID', 'ECLO']]
# CSV 파일로 저장
submission_lgb.to_csv('LGBM_2_submission.csv', index=False)

## **2. RandomizedSearchCV / 로그변환 O, 전국데이터 O**
- 로그역변환 후 최종 RMSLE : 0.4413957760810627
- y_true : [1.584963, 1.584963, 2.321928 ... 1.584963, 2.584963, 2.584963]
- y_pred : [2.01690808 2.03878801 1.94848396 ... 2.1999051  2.19096902 2.21673301]

In [19]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

### **RandomizedSearchCV 적용 모델 생성, 학습, 예측**

In [20]:
# LightGBM Dataset format
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)  



# RandomizedSearchCV
## 최적의 params 찾을 범위 설정 
param_dist = {
    'objective': ['regression'],
    'num_leaves': sp_randint(6, 50),
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'feature_fraction': [0.5, 0.7, 0.9],
    'bagging_fraction': [0.5, 0.7, 0.9],
    'bagging_freq': sp_randint(1, 10),
    'boosting_type': ['gbdt', 'dart', 'goss'],
    'metric': ['rmse']
}

## RandomizedSearchCV 적용전 LightGBM 객체 생성
lgb_model_base = lgb.LGBMRegressor(objective='regression', verbose=0)

## RandomizedSearchCV 객체 생성 
random_search = RandomizedSearchCV(
    lgb_model_base, 
    param_distributions=param_dist, 
    n_iter=10,  
    cv=3,  # Number of cross-validation folds
    random_state=42  # Set a random seed for reproducibility
)

## RandomizedSearchCV 데이터에 학습
random_search.fit(X_train, y_train)

## 최적의 하이퍼파라미터는?
print("Best Hyperparameters:", random_search.best_params_)

## 최적의 하이퍼파라미터 적용한 모델 생성
lgb_model_RSCV = random_search.best_estimator_



# 예측
## 테스트 데이터 예측 
lgb_predictions_RSCV = lgb_model_RSCV.predict(X_valid)
## 테스트 데이터의 실제 값
y_true = y_valid
## 테스트 데이터의 예측 값
y_pred_RSCV = lgb_predictions_RSCV

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


[LightGBM] [Fatal] Cannot use bagging in GOSS


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


[LightGBM] [Fatal] Cannot use bagging in GOSS


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


[LightGBM] [Fatal] Cannot use bagging in GOSS


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_w

[LightGBM] [Fatal] Cannot use bagging in GOSS


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


[LightGBM] [Fatal] Cannot use bagging in GOSS


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


[LightGBM] [Fatal] Cannot use bagging in GOSS


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_w

Best Hyperparameters: {'bagging_fraction': 0.9, 'bagging_freq': 7, 'boosting_type': 'gbdt', 'feature_fraction': 0.5, 'learning_rate': 0.1, 'metric': 'rmse', 'num_leaves': 23, 'objective': 'regression'}

### **RMSLE 평가**

In [21]:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_true) - np.log1p(y_pred))))

In [22]:
# 로그 역변환 후 RMSLE 계산
predicted_eclo_unlog = np.exp2(lgb_predictions_RSCV)
y_true_unlog = np.exp2(y_true)
lgb_rmsle = rmsle(y_true_unlog, predicted_eclo_unlog)
print("로그역변환 후 최종 RMSLE:", lgb_rmsle)

로그역변환 후 최종 RMSLE: 0.4413957760810627


In [23]:
display("y 실제 값", y_true)
print("y 예측 값", y_pred_RSCV)

'y 실제 값'

252646    1.584963
522737    1.584963
77768     2.321928
427198    1.584963
179583    2.321928
            ...   
85878     1.584963
163400    1.584963
173291    1.584963
421151    2.584963
56653     2.584963
Name: ECLO, Length: 128477, dtype: float64

y 예측 값 [2.01690808 2.03878801 1.94848396 ... 2.1999051  2.19096902 2.21673301]
