In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and stores
    the seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            ``str()`` form is written to ``os.environ['PYTHONHASHSEED']``.

    Returns:
        None.
    """
    random.seed(seed)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this assignment presumably only influences child processes - confirm if
    # hash determinism inside this kernel matters.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


seed_everything(42)  # fix the seed

### Data & Split

In [2]:
from sklearn.model_selection import train_test_split

# Load the "2-3" preprocessed train/test tables and the submission template.
# (The raw './test.parquet' is not read; the preprocessed test file is used.)
train = pd.read_parquet('./data/train_preprocess_2-3.parquet')
test = pd.read_parquet('./data/test_preprocess_2-3.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

# Table size and class counts of the 'Delay' label, followed by a blank line.
print(train.shape, train['Delay'].value_counts(), '', sep='\n')

# Hold out 20% of the rows. Features are every column except 'ID' and the two
# label columns; the target is 'Delay_num'.
train_x, val_x, train_y, val_y = train_test_split(
    train.drop(columns=['ID', 'Delay', 'Delay_num']),
    train['Delay_num'],
    test_size=0.2,
    random_state=42,
)

# Shapes of the two feature partitions, then per-class counts of each target.
print(train_x.shape, val_x.shape)
print(train_y.value_counts(), val_y.value_counts(), sep='\n')

(255001, 20)
Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64

(204000, 17) (51001, 17)
0    168109
1     35891
Name: Delay_num, dtype: int64
0    41892
1     9109
Name: Delay_num, dtype: int64


In [3]:
# Test-set model inputs: every column except 'ID'.
test_x = test.drop(columns='ID')
# Sizes of the loaded train and test tables.
print(train.shape, test.shape)

(255001, 20) (1000000, 18)


### 모델적합 1 - GRID SEARCH
+ 최저의 Loss를 갖는 모델 적합

#### 2-3 전처리 이용

In [5]:
from xgboost import XGBClassifier

In [19]:
# Grid search #1: coarse sweep on the "2-3" preprocessed data.
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from xgboost import XGBClassifier

# Reload both preprocessed tables.
train = pd.read_parquet('./data/train_preprocess_2-3.parquet')
test = pd.read_parquet('./data/test_preprocess_2-3.parquet')

# Use every training row (no hold-out split): features are all columns except
# 'ID' and the two label columns, the target is 'Delay_num'.
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

# Base estimator; only the hyperparameters listed in the grid are varied.
xgboost = XGBClassifier(random_state=42)

# 3 x 4 x 3 = 36 candidate settings.
# Not searched: "reg_lambda" in [0.1, 0.3, 0.5, 1] and
# "early_stopping_rounds" = 150 (the latter needs an eval set).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7, 9],
    learning_rate=[0.01, 0.1, 0.2],
)

# Shuffled, seeded 5-fold stratified splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidates are scored with negated log loss.
grid_search = GridSearchCV(
    xgboost,
    param_grid,
    scoring="neg_log_loss",
    n_jobs=-1,
    cv=cv,
    verbose=1,
)

# Run the search on the full training set.
grid_search.fit(train_x, train_y)

# Report the best parameters and the best score with its sign flipped back.
print(f"최적의 파라미터: {grid_search.best_params_}")
print(f"최고의 점수: {-1 * grid_search.best_score_:.4f}")

# Keep the best estimator for the prediction step.
best_xgboost = grid_search.best_estimator_


Fitting 5 folds for each of 36 candidates, totalling 180 fits
최적의 파라미터: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
최고의 점수: 0.4391


In [13]:
# Grid search #2: narrower sweep on the same "2-3" preprocessed data.
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from xgboost import XGBClassifier

# Reload both preprocessed tables.
train = pd.read_parquet('./data/train_preprocess_2-3.parquet')
test = pd.read_parquet('./data/test_preprocess_2-3.parquet')

# Fit on all training rows: drop 'ID' and both label columns for the features,
# and use 'Delay_num' as the target.
train_x, train_y = train.drop(columns=['ID', 'Delay', 'Delay_num']), train['Delay_num']
test_x = test.drop(columns=['ID'])

# Base estimator; only the hyperparameters listed in the grid are varied.
xgboost = XGBClassifier(random_state=42)

# 4 x 3 x 3 = 36 candidate settings.
# Not searched: "reg_lambda" in [0.1, 0.3, 0.5, 1] and
# "early_stopping_rounds" = 150 (the latter needs an eval set).
param_grid = {
    "learning_rate": [0.05, 0.075, 0.1, 0.125],
    "max_depth": [6, 7, 8],
    "n_estimators": [150, 200, 250],
}

# cv=5 leaves fold construction to scikit-learn; no explicit shuffled, seeded
# StratifiedKFold splitter is passed here.
# NOTE(review): fold assignments presumably differ from a search that uses
# such a splitter - keep that in mind when comparing scores across searches.
grid_search = GridSearchCV(
    estimator=xgboost,
    param_grid=param_grid,
    cv=5,
    scoring="neg_log_loss",
    n_jobs=-1,
    verbose=1,
)

# Run the search on the full training set.
grid_search.fit(train_x, train_y)

# Report the best parameters and the best score with its sign flipped back.
print(f"최적의 파라미터: {grid_search.best_params_}")
print(f"최고의 점수: {-1 * grid_search.best_score_:.4f}")

# Keep the best estimator for the prediction step.
best_xgboost2 = grid_search.best_estimator_


Fitting 5 folds for each of 36 candidates, totalling 180 fits
최적의 파라미터: {'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 250}
최고의 점수: 0.4384


### 예측

In [16]:
# Class-probability predictions on the test features from both tuned models:
# y_pred from the first grid search, y_pred2 from the second.
y_pred, y_pred2 = (
    fitted.predict_proba(test_x) for fitted in (best_xgboost, best_xgboost2)
)
print(y_pred.shape, y_pred2.shape)

(1000000, 2) (1000000, 2)


### 제출

In [18]:
# Build the submission table from y_pred2 (probabilities of the second
# grid-search model), reusing the template's index and column labels.
# NOTE(review): this assumes the template's column order matches the order of
# the probability columns returned by predict_proba - confirm before submitting.
submission = pd.DataFrame(data=y_pred2, columns=sample_submission.columns, index=sample_submission.index)

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (a fresh checkout would otherwise fail here).
os.makedirs('./submission', exist_ok=True)
submission.to_csv('./submission/submission_xgboost2.csv', index=True)