<a href="https://colab.research.google.com/github/khwwang/coding-training/blob/master/%ED%95%AD%EA%B3%B5%ED%8E%B8%EC%A7%80%EC%97%B0%EC%98%88%EC%B8%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import os
import numpy as np
import pandas as pd
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer
from xgboost import XGBClassifier

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Fixed Seed

def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [3]:
# drive에 연결합니다.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# 현재 데이터가 있는 공간으로 작업 경로를 변경해줍니다.
%cd "/content/drive/MyDrive/dacon/open_1"

Mounted at /content/drive
/content/drive/MyDrive/dacon/open_1


In [4]:
csv_to_parquet('train.csv', 'train')
csv_to_parquet('test.csv', 'test')

train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

train Done.
test Done.


In [5]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)

    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [None]:
# cols = train.select_dtypes(include=['object'])
# le = LabelEncoder()

# for col in cols:
#   train[col] = le.fit_transform(train[col])
#   test[col] = le.transform(test[col])

In [6]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])

    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [7]:
#레이블이 없는 데이터들을 제거합니다
train = train.dropna()

In [8]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i

def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

Done.


In [9]:
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

In [10]:
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [11]:
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)
test_x = scaler.transform(test_x)

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, log_loss

dt = DecisionTreeClassifier()
dt.fit(train_x, train_y)
pred_1 = dt.predict(val_x)

accuracy = accuracy_score(val_y, pred_1)
f1 = f1_score(val_y, pred_1, average='weighted')
precision = precision_score(val_y, pred_1, average='weighted')
recall = recall_score(val_y, pred_1, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

cm1 = confusion_matrix(val_y, pred_1)
print(cm1)

Accuracy: 0.7101625458324347
F1 Score: 0.7157154247660936
Precision: 0.7216520312641815
Recall: 0.7101625458324347
[[34023  7869]
 [ 6913  2196]]


In [13]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

rf.fit(train_x, train_y)
pred_2 = rf.predict(val_x)

accuracy = accuracy_score(val_y, pred_2)
f1 = f1_score(val_y, pred_2, average='weighted')
precision = precision_score(val_y, pred_2, average='weighted')
recall = recall_score(val_y, pred_2, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

cm2 = confusion_matrix(val_y, pred_2)
print(cm2)

Accuracy: 0.8181996431442521
F1 Score: 0.7489386921906548
Precision: 0.7463246276782165
Recall: 0.8181996431442521
[[41449   443]
 [ 8829   280]]


In [15]:
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=42)

xgb.fit(train_x, train_y)
pred_3 = xgb.predict(val_x)

accuracy = accuracy_score(val_y, pred_3)
f1 = f1_score(val_y, pred_3, average='weighted')
precision = precision_score(val_y, pred_3, average='weighted')
recall = recall_score(val_y, pred_3, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

cm3 = confusion_matrix(val_y, pred_3)
print(cm3)

Accuracy: 0.8212976216152624
F1 Score: 0.7490495880677704
Precision: 0.7654504972612673
Recall: 0.8212976216152624
[[41655   237]
 [ 8877   232]]


In [18]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier()
lgbm.fit(train_x, train_y)
pred_4 = lgbm.predict(val_x)

accuracy = accuracy_score(val_y, pred_4)
f1 = f1_score(val_y, pred_4, average='weighted')
precision = precision_score(val_y, pred_4, average='weighted')
recall = recall_score(val_y, pred_4, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')

cm4 = confusion_matrix(val_y, pred_4)
print(cm4)

[LightGBM] [Info] Number of positive: 35891, number of negative: 168109
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2098
[LightGBM] [Info] Number of data points in the train set: 204000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.175936 -> initscore=-1.544126
[LightGBM] [Info] Start training from score -1.544126
Accuracy: 0.8217682006235172
F1 Score: 0.7429650884383274
Precision: 0.7845486323828018
Recall: 0.8217682006235172
[[41859    33]
 [ 9057    52]]


In [20]:
from sklearn.metrics import log_loss

logloss_1 = log_loss(val_y, pred_1)
logloss_2 = log_loss(val_y, pred_2)
logloss_3 = log_loss(val_y, pred_3)
logloss_4 = log_loss(val_y, pred_4)

f1_1 = f1_score(val_y, pred_1, average='weighted')
f1_2 = f1_score(val_y, pred_2, average='weighted')
f1_3 = f1_score(val_y, pred_3, average='weighted')
f1_4 = f1_score(val_y, pred_4, average='weighted')

# 결과 출력
print(f'F1_1 Score: {f1_1}')
print(f'F1_2 Score: {f1_2}')
print(f'F1_3 Score: {f1_3}')
print(f'F1_4 Score: {f1_4}')

print(f"Log Loss_1: {logloss_1}")
print(f"Log Loss_1: {logloss_2}")
print(f"Log Loss_3: {logloss_3}")
print(f"Log Loss_4: {logloss_4}")

F1_1 Score: 0.7157154247660936
F1_2 Score: 0.7489386921906548
F1_3 Score: 0.7490495880677704
F1_4 Score: 0.7429650884383274
Log Loss_1: 10.446800737199853
Log Loss_1: 6.552749048526386
Log Loss_3: 6.441086586310341
Log Loss_4: 6.424125199644614


In [21]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [27]:
param_grid = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5 ,8],
    'n_estimators': [100, 200],
}

In [28]:
grid = GridSearchCV(xgb,
                    param_grid,
                    cv=cv,
                    scoring='accuracy',
                    n_jobs=-1,
                    verbose=1)

In [29]:
grid.fit(train_x,train_y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




In [30]:
best_model = grid.best_estimator_
pred_5 = best_model.predict(val_x)

In [31]:
accuracy = accuracy_score(val_y, pred_5)
f1 = f1_score(val_y, pred_5, average='weighted')
precision = precision_score(val_y, pred_5, average='weighted')
recall = recall_score(val_y, pred_5, average='weighted')
logloss_5 = log_loss(val_y, pred_5)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f"Log Loss_5: {logloss_5}")

Accuracy: 0.8215329111193899
F1 Score: 0.7418136740327891
Precision: 0.7795085218522191
Recall: 0.8215329111193899
Log Loss_5: 6.4326058929774765
