In [2]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    """Seed every random source this notebook configures.

    Exports ``PYTHONHASHSEED`` into the process environment and seeds both
    the standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Integer seed applied to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the seed as their single argument.
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
seed_everything(42)  # fix the seed

### Data & Split

In [4]:
from sklearn.model_selection import train_test_split

# Load the preprocessed train/test tables and the submission template.
# (The preprocessed test file is read here rather than the raw './test.parquet'.)
train = pd.read_parquet('./data/train_preprocess_2-1.parquet')
test = pd.read_parquet('./data/test_preprocess_2-1.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

# Overall size and class balance of the training table.
print(train.shape)
print(train['Delay'].value_counts())
print()

# Hold out 20% for validation; ID and both target columns are not features.
train_x, val_x, train_y, val_y = train_test_split(
    train.drop(columns=['ID', 'Delay', 'Delay_num']),
    train['Delay_num'],
    test_size=0.2,
    random_state=42,
)

# Shapes and label distribution of each split.
print(train_x.shape, val_x.shape)
for split_labels in (train_y, val_y):
    print(split_labels.value_counts())

(255001, 20)
Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64

(204000, 17) (51001, 17)
0    168109
1     35891
Name: Delay_num, dtype: int64
0    41892
1     9109
Name: Delay_num, dtype: int64


### 여러 모델 적합
- Extra Trees Classifier
- Random Forest Classifier
- Light Gradient Boosting Machine
- Decision Tree Classifier
- Gradient Boosting Classifier
- Ada Boost Classifier
- Logistic Regression

In [None]:
# Fit without SMOTE: baseline comparison on the original training split
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier


# Candidate estimators; the tuple order is the order they are fitted and reported in.
model_classes = (
    ("Extra Trees Classifier", ExtraTreesClassifier),
    ("Random Forest Classifier", RandomForestClassifier),
    ("Light Gradient Boosting Machine", LGBMClassifier),
    ("Decision Tree Classifier", DecisionTreeClassifier),
    ("Gradient Boosting Classifier", GradientBoostingClassifier),
    ("Ada Boost Classifier", AdaBoostClassifier),
    ("Logistic Regression", LogisticRegression),
)

# Define the models: every candidate uses defaults apart from the shared random_state.
models = {label: estimator_cls(random_state=42) for label, estimator_cls in model_classes}

# Compare the performance of each model: validation log loss on predicted probabilities.
for name, model in models.items():
    fitted = model.fit(train_x, train_y)
    loss = log_loss(val_y, fitted.predict_proba(val_x))
    print(f"{name}: Log Loss = {loss:.4f}")

Extra Trees Classifier: Log Loss = 0.4799
Random Forest Classifier: Log Loss = 0.4646
Light Gradient Boosting Machine: Log Loss = 0.4433
Decision Tree Classifier: Log Loss = 10.5274
Gradient Boosting Classifier: Log Loss = 0.4480
Ada Boost Classifier: Log Loss = 0.6823
Logistic Regression: Log Loss = 0.4621


In [None]:
# Fit with SMOTE: same comparison, trained on an oversampled training split
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier


# Mitigate class imbalance with SMOTE. Only the training split is resampled;
# val_x / val_y are scored as-is below.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(train_x, train_y)


# Define the models: defaults plus a shared random_state, listed in reporting order.
model_classes = [
    ("Extra Trees Classifier", ExtraTreesClassifier),
    ("Random Forest Classifier", RandomForestClassifier),
    ("Light Gradient Boosting Machine", LGBMClassifier),
    ("Decision Tree Classifier", DecisionTreeClassifier),
    ("Gradient Boosting Classifier", GradientBoostingClassifier),
    ("Ada Boost Classifier", AdaBoostClassifier),
    ("Logistic Regression", LogisticRegression),
]
models = {label: estimator_cls(random_state=42) for label, estimator_cls in model_classes}

# Compare the performance of each model on the untouched validation split.
for name, model in models.items():
    fitted = model.fit(X_train_smote, y_train_smote)
    loss = log_loss(val_y, fitted.predict_proba(val_x))
    print(f"{name}: Log Loss = {loss:.4f}")

Extra Trees Classifier: Log Loss = 0.5113
Random Forest Classifier: Log Loss = 0.5092
Light Gradient Boosting Machine: Log Loss = 0.5127
Decision Tree Classifier: Log Loss = 11.6489
Gradient Boosting Classifier: Log Loss = 0.6037
Ada Boost Classifier: Log Loss = 0.6896
Logistic Regression: Log Loss = 0.6813


### 모델적합
+ 최저의 Loss를 갖는 모델 적합
    + lgbm 이용

In [10]:
# Grid search 1: coarse sweep over the main LightGBM hyperparameters
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier

# Use the full training data (this rebinds train_x / train_y from the earlier 80/20 split).
train_y = train['Delay_num']
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
test_x = test.drop(columns=['ID'])

# Base estimator handed to the search.
lgbm = LGBMClassifier(random_state=42)

# Parameter grid for the search: 3 x 3 x 3 x 3 = 81 candidates.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    num_leaves=[7, 15, 31],
)

# Build the GridSearchCV object and run the search in one step (fit returns the search itself).
grid_search = GridSearchCV(
    lgbm,
    param_grid,
    scoring="neg_log_loss",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(train_x, train_y)

# Print the best parameters and the best score (negated back to a positive log loss).
print(f"최적의 파라미터: {grid_search.best_params_}")
print(f"최고의 점수: {-1 * grid_search.best_score_:.4f}")

# Final model refit with the best parameters.
best_lgbm = grid_search.best_estimator_


Fitting 5 folds for each of 81 candidates, totalling 405 fits
최적의 파라미터: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'num_leaves': 31}
최고의 점수: 0.4395


In [14]:
# Grid search 2: narrower ranges chosen from the result of grid search 1
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier

# Use the full training data.
train_y = train['Delay_num']
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
test_x = test.drop(columns=['ID'])

# Base estimator handed to the search.
lgbm = LGBMClassifier(random_state=42)

# Parameter grid for the search: 4 x 3 x 4 x 3 = 144 candidates.
param_grid2 = dict(
    n_estimators=[150, 200, 250, 300],
    max_depth=[5, 7, 9],
    learning_rate=[0.05, 0.075, 0.1, 0.125],
    num_leaves=[20, 30, 40],
)

# Build the GridSearchCV object and run the search in one step (fit returns the search itself).
grid_search2 = GridSearchCV(
    lgbm,
    param_grid2,
    scoring="neg_log_loss",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(train_x, train_y)

# Print the best parameters and the best score (negated back to a positive log loss).
print(f"최적의 파라미터: {grid_search2.best_params_}")
print(f"최고의 점수: {-1 * grid_search2.best_score_:.4f}")

# Final model refit with the best parameters.
best_lgbm2 = grid_search2.best_estimator_


Fitting 5 folds for each of 144 candidates, totalling 720 fits
최적의 파라미터: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'num_leaves': 30}
최고의 점수: 0.4391


In [11]:
# Test after changing the preprocessing method: fit one LGBM with fixed hyperparameters
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier

# Use the full training data.
train_y = train['Delay_num']
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
test_x = test.drop(columns=['ID'])

# Define the model. These values equal the best parameters printed by grid search 2.
fixed_params = {
    "learning_rate": 0.1,
    "max_depth": 7,
    "n_estimators": 300,
    "num_leaves": 30,
}
lgbm = LGBMClassifier(random_state=42, **fixed_params)
lgbm.fit(train_x, train_y)

In [14]:
# Grid search 3: ranges refined from the result of grid search 2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier

# Use the full training data.
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

# Define the base estimator under its own name.
# GridSearchCV fits clones of the estimator it is given, so this object itself is
# never fitted. Binding it to `lgbm` would replace the fitted fixed-hyperparameter
# model from the preceding cell, and the prediction section's
# `lgbm.predict_proba(test_x)` would then fail when the notebook runs top to bottom.
lgbm_search3_base = LGBMClassifier(random_state=42)

# Parameter grid for the search: 4 x 3 x 3 x 5 = 180 candidates.
param_grid3 = {
    "n_estimators": [300, 400, 500, 600],
    "max_depth": [7, 8, 9],
    "learning_rate": [0.075, 0.1, 0.125],
    "num_leaves": [25, 30, 35, 40, 45],
}

# Create the GridSearchCV object (5-fold CV, scored by negative log loss).
grid_search3 = GridSearchCV(
    estimator=lgbm_search3_base,
    param_grid=param_grid3,
    scoring="neg_log_loss",
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Run the grid search.
grid_search3.fit(train_x, train_y)

# Print the best parameters and the best score (negated back to a positive log loss).
print(f"최적의 파라미터: {grid_search3.best_params_}")
print(f"최고의 점수: {-1 * grid_search3.best_score_:.4f}")

# Final model refit with the best parameters.
best_lgbm3 = grid_search3.best_estimator_

Fitting 5 folds for each of 180 candidates, totalling 900 fits
최적의 파라미터: {'learning_rate': 0.075, 'max_depth': 7, 'n_estimators': 300, 'num_leaves': 45}
최고의 점수: 0.4379


### 예측

In [11]:
# Class-probability predictions for the test features from grid search 1's best estimator.
y_pred = best_lgbm.predict_proba(test_x)

In [15]:
# Class-probability predictions for the test features from grid search 2's best estimator.
y_pred2 = best_lgbm2.predict_proba(test_x)

In [12]:
# Class-probability predictions from the fixed-hyperparameter model.
# NOTE(review): `lgbm` must be the *fitted* model (learning_rate=0.1, max_depth=7,
# n_estimators=300, num_leaves=30), not an unfitted base estimator — verify the name
# has not been rebound before running this cell.
y_pred3 = lgbm.predict_proba(test_x)

In [16]:
# Class-probability predictions for the test features from grid search 3's best estimator.
y_pred4 = best_lgbm3.predict_proba(test_x)

### 제출

In [12]:
# Shape the grid search 1 predictions like the sample submission (same index and columns) and save.
submission = pd.DataFrame(
    y_pred,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission.to_csv('submission_lgbm.csv', index=True)

In [16]:
# Shape the grid search 2 predictions like the sample submission (same index and columns) and save.
submission2 = pd.DataFrame(
    y_pred2,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission2.to_csv('submission_lgbm2.csv', index=True)

In [13]:
# Shape the fixed-hyperparameter model's predictions like the sample submission and save.
submission3 = pd.DataFrame(
    y_pred3,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission3.to_csv('submission_lgbm3.csv', index=True)

In [17]:
# Shape the grid search 3 predictions like the sample submission (same index and columns).
submission4 = pd.DataFrame(data=y_pred4, columns=sample_submission.columns, index=sample_submission.index)
# Unlike the other submissions, this one is written into a subdirectory. DataFrame.to_csv
# does not create missing parent directories, so make sure the folder exists first.
os.makedirs('./submission', exist_ok=True)
submission4.to_csv('./submission/submission_lgbm4.csv', index=True)