In [2]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

### Data  & SPLIT

In [4]:
train = pd.read_parquet('./data/train_preprocess_2-1.parquet')
# test = pd.read_parquet('./test.parquet')
test = pd.read_parquet('./data/test_preprocess_2-1.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

print(train.shape)
print(train.Delay.value_counts())
print()

from sklearn.model_selection  import train_test_split
train_x, val_x, train_y, val_y = train_test_split(train.drop(columns=['ID', 'Delay', 'Delay_num']), train['Delay_num'], test_size=0.2, random_state=42)

print(train_x.shape, val_x.shape)
print(train_y.value_counts())
print(val_y.value_counts())

(255001, 20)
Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64

(204000, 17) (51001, 17)
0    168109
1     35891
Name: Delay_num, dtype: int64
0    41892
1     9109
Name: Delay_num, dtype: int64


### 여러 모델 적합
Extra Trees Classifier	
Random Forest Classifier	
Light Gradient Boosting Machine
Decision Tree Classifier	
Gradient Boosting Classifier	
Ada Boost Classifier	
Logistic Regression

In [5]:
# SMOTE 없이 적합
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


# 모델 정의
models = {
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Light Gradient Boosting Machine": LGBMClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
}

# 각 모델의 성능을 비교
for name, model in models.items():
    model.fit(train_x, train_y)
    y_pred = model.predict_proba(val_x)
    loss = log_loss(val_y, y_pred)
    print(f"{name}: Log Loss = {loss:.4f}")

Extra Trees Classifier: Log Loss = 0.5019
Random Forest Classifier: Log Loss = 0.4750
Light Gradient Boosting Machine: Log Loss = 0.4424
Decision Tree Classifier: Log Loss = 10.4489
Gradient Boosting Classifier: Log Loss = 0.4474
Ada Boost Classifier: Log Loss = 0.6823
Logistic Regression: Log Loss = 0.4622


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# SMOTE 이용해서 적합
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


# SMOTE로 데이터 불균형 완화
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(train_x, train_y)


# 모델 정의
models = {
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Light Gradient Boosting Machine": LGBMClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
}

# 각 모델의 성능을 비교
for name, model in models.items():
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict_proba(val_x)
    loss = log_loss(val_y, y_pred)
    print(f"{name}: Log Loss = {loss:.4f}")

Extra Trees Classifier: Log Loss = 0.5167
Random Forest Classifier: Log Loss = 0.5130
Light Gradient Boosting Machine: Log Loss = 0.5160
Decision Tree Classifier: Log Loss = 11.7125
Gradient Boosting Classifier: Log Loss = 0.5985
Ada Boost Classifier: Log Loss = 0.6901
Logistic Regression: Log Loss = 0.6810


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
