In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import GridSearchCV, train_test_split

from xgboost import XGBClassifier as xgb
from lightgbm import LGBMClassifier as lgbm

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

# Data Load

In [3]:
train = pd.read_parquet('train.parquet').drop('ID', axis=1)
test = pd.read_parquet('test.parquet').drop('ID', axis=1)
submission = pd.read_csv('./dts/sample_submission.csv', index_col=0)

print('Data Loading Done')

Data Loading Done


# Data Pre-Processing

In [4]:
train.head(2)

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
1,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 18 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Month                     1000000 non-null  int64  
 1   Day_of_Month              1000000 non-null  int64  
 2   Estimated_Departure_Time  890981 non-null   float64
 3   Estimated_Arrival_Time    890960 non-null   float64
 4   Cancelled                 1000000 non-null  int64  
 5   Diverted                  1000000 non-null  int64  
 6   Origin_Airport            1000000 non-null  object 
 7   Origin_Airport_ID         1000000 non-null  int64  
 8   Origin_State              890985 non-null   object 
 9   Destination_Airport       1000000 non-null  object 
 10  Destination_Airport_ID    1000000 non-null  int64  
 11  Destination_State         890921 non-null   object 
 12  Distance                  1000000 non-null  float64
 13  Airline                   89

- 레이블을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다

In [6]:
NaN_col = []

for i in range(len(train.columns)):
    if(sum(train[train.columns[i]].notna())<1000000):
        NaN_col.append(train.columns[i])
        
print(NaN_col)

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)

print('결측치 처리 완료(최빈값으로 대체)')

['Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Origin_State', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Delay']
결측치 처리 완료(최빈값으로 대체)


- 질적 변수들을 수치화합니다

In [7]:
qual_col = []

for i in range(len(train.dtypes)):
    if (train.dtypes[i] == 'object'):
        qual_col.append(train.columns[i])
        
qual_col.remove('Delay')
print(qual_col)

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train[i])
    train[i] = le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
            
    test[i] = le.transform(test[i])
    
print('질적 변수 수치화 완료')

['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']
질적 변수 수치화 완료


- 레이블이 없는 데이터를 분리합니다

In [8]:
train = train.dropna()

column_number = {}
for i,column in enumerate(submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))

train_x = train.drop(columns = ['Delay','Delay_num'])
train_y = train['Delay_num']
print('레이블이 없는 데이터 제거 완료')

레이블이 없는 데이터 제거 완료


- 데이터 표준화 절차

In [9]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

train_x = pd.DataFrame(ss.fit_transform(train_x), columns=train_x.columns)

# 모델 튜닝

In [10]:
train_y.value_counts()[0]/train_y.value_counts()[1]

21.22222222222222

In [11]:
model = xgb(scale_pos_weight=21.2, random_state=42)

param_grid = {
    'eta': [.05, .1, .2, .3],
    'max_depth': [4,5,6],
    'n_estimators': [100, 300, 500],
    'reg_alpha':[0,1,2],
    'reg_lambda':[0,1,2]
}

grid = GridSearchCV(model,
                    param_grid,
                    cv=5,
                    scoring='neg_log_loss',
                    verbose=1)

grid.fit(train_x, train_y)

best_model = grid.best_estimator_

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


KeyboardInterrupt: 

# 예측 및 제출

In [None]:
y_pred = best_model.predict_proba(test)

ubmission = pd.DataFrame(data=y_pred, columns=submission.columns, index=submission.index)
submission.to_csv('xgbgrid_submission.csv', index=True)

In [None]:
# mod_xgb = xgb(booster='gbtree', n_estimators=2000, reg_alpha=2, reg_lambda=2,
#               eval_metric='rmse', num_parallel_tree=3, eta=.1, max_depth=4, random_state=42)
# mod_xgb.fit(train_test, train_y)

# print('xgb_rmse')
# pred_score = mod_xgb.predict(val_test)
# print(mse(val_y,pred_score,squared=False))