## 결측치처리방식
### 1. 해당 노트북
+ Train에 대해서만 결측치 처리함
+ 출발&도착State와 Airline&Carrier(DOT)의 결측치를 처리함


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
path = '/content/drive/MyDrive/github/Dacon/항공편지연예측AI경진대회(2023.04.03-2023.05.08)'
os.chdir(path)
print(os.getcwd())

/content/drive/MyDrive/github/Dacon/항공편지연예측AI경진대회(2023.04.03-2023.05.08)


In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

### Data Load

In [105]:
train = pd.read_parquet('./data/train.parquet')
test = pd.read_parquet('./data/test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

print(train.shape)
print(train.Delay.value_counts())


(1000000, 19)
Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64


In [19]:
# column별 고유한 값 개수
print(train.nunique())
print()
print(test.nunique())

ID                          1000000
Month                            12
Day_of_Month                     31
Estimated_Departure_Time       1365
Estimated_Arrival_Time         1428
Cancelled                         1
Diverted                          1
Origin_Airport                  374
Origin_Airport_ID               374
Origin_State                     52
Destination_Airport             375
Destination_Airport_ID          375
Destination_State                52
Distance                       1597
Airline                          28
Carrier_Code(IATA)               11
Carrier_ID(DOT)                  28
Tail_Number                    6430
Delay                             2
dtype: int64

ID                          1000000
Month                            12
Day_of_Month                     31
Estimated_Departure_Time       1413
Estimated_Arrival_Time         1436
Cancelled                         1
Diverted                          1
Origin_Airport                  379
Origin_Airport

In [3]:
train.isnull().sum()

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time    109019
Estimated_Arrival_Time      109040
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                109015
Destination_Airport              0
Destination_Airport_ID           0
Destination_State           109079
Distance                         0
Airline                     108920
Carrier_Code(IATA)          108990
Carrier_ID(DOT)             108997
Tail_Number                      0
Delay                       744999
dtype: int64

In [94]:
# delay가 notnull인 경우 결측값 수
train[train['Delay'].notnull()].isnull().sum()

ID                              0
Month                           0
Day_of_Month                    0
Estimated_Departure_Time    27841
Estimated_Arrival_Time      27684
Cancelled                       0
Diverted                        0
Origin_Airport                  0
Origin_Airport_ID               0
Origin_State                27856
Destination_Airport             0
Destination_Airport_ID          0
Destination_State           27678
Distance                        0
Airline                     27540
Carrier_Code(IATA)          27888
Carrier_ID(DOT)             27770
Tail_Number                     0
Delay                           0
dtype: int64

### 결측치처리1 - State

In [39]:
# 코드 작동 test
i = 0
for airport, df in train.groupby('Origin_Airport')['Origin_State']:
    print(airport)
    print(df[df.notnull()].unique()[0])
    if i ==0:
        break

ABE
Pennsylvania


In [108]:
# 1. airport - state 페어가 잘 못된 것 있는지 파악
print('train origin')
for idx, df in train.groupby('Origin_Airport')['Origin_State']:
    if df.nunique() != 1:
        print(idx, 'values : ',df.nunique(),' something wrong')
        print(df.value_counts())
print()

print('train destination')
for idx, df in train.groupby('Destination_Airport')['Destination_State']:
    if df.nunique() != 1:
        print(idx, 'values : ',df.nunique(),' something wrong')
        print(df.value_counts())
print()

print('test origin')
for idx, df in test.groupby('Origin_Airport')['Origin_State']:
    if df.nunique() != 1:
        print(idx, 'values : ',df.nunique(),' something wrong')
        print(df.value_counts())
print()

print('test destination')
for idx, df in test.groupby('Destination_Airport')['Destination_State']:
    if df.nunique() != 1:
        print(idx, 'values : ',df.nunique(),' something wrong')
        print(df.value_counts())
print('-'*100)


# 2. airport : state dictionary 만들기
airport_state_dict = {}
for airport, df in train.groupby('Origin_Airport')['Origin_State']:
    state = df[df.notnull()].unique()
    if state:  # null아닌 값이 존재한다면 dict에 넣기
        state = state[0]
        
        if airport not in airport_state_dict:
            airport_state_dict[airport] = state
        else:
            if airport_state_dict[airport] != state:
                print('something wrong : ', airport)

for airport, df in train.groupby('Destination_Airport')['Destination_State']:
    state = df[df.notnull()].unique()
    if state: # null아닌 값이 존재한다면 dict에 넣기
        state = state[0]
        
        if airport not in airport_state_dict:
            airport_state_dict[airport] = state
        else:
            if airport_state_dict[airport] != state:
                print('something wrong : ', airport)

                
for airport, df in test.groupby('Origin_Airport')['Origin_State']:
    state = df[df.notnull()].unique()
    if state:  # null아닌 값이 존재한다면 dict에 넣기
        state = state[0]
        
        if airport not in airport_state_dict:
            airport_state_dict[airport] = state
        else:
            if airport_state_dict[airport] != state:
                print('something wrong : ', airport)

for airport, df in test.groupby('Destination_Airport')['Destination_State']:
    state = df[df.notnull()].unique()
    if state: # null아닌 값이 존재한다면 dict에 넣기
        state = state[0]
        
        if airport not in airport_state_dict:
            airport_state_dict[airport] = state
        else:
            if airport_state_dict[airport] != state:
                print('something wrong : ', airport)               

train origin

train destination
YNG values :  0  something wrong
Series([], Name: Destination_State, dtype: int64)

test origin
DDC values :  0  something wrong
Series([], Name: Origin_State, dtype: int64)
DEC values :  0  something wrong
Series([], Name: Origin_State, dtype: int64)
RIW values :  0  something wrong
Series([], Name: Origin_State, dtype: int64)
SHR values :  0  something wrong
Series([], Name: Origin_State, dtype: int64)

test destination
DEC values :  0  something wrong
Series([], Name: Destination_State, dtype: int64)
RIW values :  0  something wrong
Series([], Name: Destination_State, dtype: int64)
SHR values :  0  something wrong
Series([], Name: Destination_State, dtype: int64)
----------------------------------------------------------------------------------------------------


  if state: # null아닌 값이 존재한다면 dict에 넣기
  if state:  # null아닌 값이 존재한다면 dict에 넣기
  if state: # null아닌 값이 존재한다면 dict에 넣기


In [109]:
# 3. train 결측치 처리
## 3.1 Origin_State 결측치 처리
for i in train[train.Origin_State.isnull()].index:
    origin_airport = train.loc[i, 'Origin_Airport']
    train.loc[i, 'Origin_State'] = airport_state_dict[origin_airport]
train.isnull().sum()

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time    109019
Estimated_Arrival_Time      109040
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                     0
Destination_Airport              0
Destination_Airport_ID           0
Destination_State           109079
Distance                         0
Airline                     108920
Carrier_Code(IATA)          108990
Carrier_ID(DOT)             108997
Tail_Number                      0
Delay                       744999
dtype: int64

In [110]:
## 3.2 Destination_State 결측치 처리
for i in train[train.Destination_State.isnull()].index:
    destination_airport = train.loc[i, 'Destination_Airport']
    train.loc[i, 'Destination_State'] = airport_state_dict[destination_airport]
train.isnull().sum()

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time    109019
Estimated_Arrival_Time      109040
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                     0
Destination_Airport              0
Destination_Airport_ID           0
Destination_State                0
Distance                         0
Airline                     108920
Carrier_Code(IATA)          108990
Carrier_ID(DOT)             108997
Tail_Number                      0
Delay                       744999
dtype: int64

In [112]:
# 4. test 결측치 처리
for i in test[test.Origin_State.isnull()].index:
    origin_airport = test.loc[i, 'Origin_Airport']
    if origin_airport in airport_state_dict:
        test.loc[i, 'Origin_State'] = airport_state_dict[origin_airport]


for i in test[test.Destination_State.isnull()].index:
    destination_airport = test.loc[i, 'Destination_Airport']
    if destination_airport in airport_state_dict:
        test.loc[i, 'Destination_State'] = airport_state_dict[destination_airport]


### 결측치처리2 - Carrier_ID(DOT) 
+ **Airline - Carrier_ID(DOT)  페어로 진행**

+ Airline - Carrier_Code(IATA)
    Airline - Carrier_Code(IATA)는 Airline에 따라 값이 여러개인 것이 많아서 생략

+ Tail_Number - Carrier_Code(IATA) or Carrier_ID(DOT)
    Tail_Number 마다 값 여러개인 것 많아서 이용하지 않음

In [None]:
# print('train Carrier_Code(IATA)')
# for idx, df in train.groupby('Airline')['Carrier_Code(IATA)']:
#     if df.nunique() != 1:
#         print(idx, 'values : ',df.nunique(),' something wrong')
#         print(df.value_counts())
# print('-'*100)

In [113]:
# 1. 결측치 확인 : Airline - Carrier pair 확인 -> 확인 결과 모두 1대1 대응이거나 결측
print('train Carrier_ID(DOT)')
for idx, df in train.groupby('Airline')['Carrier_ID(DOT)']:
    if df.nunique() != 1:
        print(idx, 'values : ',df.nunique(),' something wrong')
        print(df.value_counts())
print('-'*100)


print('test Carrier_ID(DOT)')
for idx, df in test.groupby('Airline')['Carrier_ID(DOT)']:
    if df.nunique() != 1:
        print(idx, 'values : ',df.nunique(),' something wrong')
        print(df.value_counts())
print('-'*100)

print('train Airline')
for idx, df in train.groupby('Carrier_ID(DOT)')['Airline']:
    if df.nunique() != 1:
        print(idx, 'values : ',df.nunique(),' something wrong')
        print(df.value_counts())
print('-'*100)


print('test Airline')
for idx, df in test.groupby('Carrier_ID(DOT)')['Airline']:
    if df.nunique() != 1:
        print(idx, 'values : ',df.nunique(),' something wrong')
        print(df.value_counts())
print('-'*100)

print('airline null & DOT not null')
print(len(train[train.Airline.isnull() & train['Carrier_ID(DOT)'].notnull()]))
print('-'*100)

print('airline not null & DOT null')
print(len(train[train.Airline.notnull() & train['Carrier_ID(DOT)'].isnull()]))

train Carrier_ID(DOT)
----------------------------------------------------------------------------------------------------
test Carrier_ID(DOT)
----------------------------------------------------------------------------------------------------
train Airline
----------------------------------------------------------------------------------------------------
test Airline
----------------------------------------------------------------------------------------------------
airline null & DOT not null
97037
----------------------------------------------------------------------------------------------------
airline not null & DOT null
97114


In [114]:
# 2. { Airline : DOT }dictionary 만들기
airline_dot_dict = {}

## 2.1. train 의 airline 1개 당 DOT 1개인지 파악
for airline, df in train.groupby('Airline')['Carrier_ID(DOT)']:
    DOT = df[df.notnull()].unique()
    if DOT:  # null아닌 값이 존재한다면 dict에 넣기
        DOT = DOT[0]
        
        if airline not in airline_dot_dict:
            airline_dot_dict[airline] = DOT
        else:
            if airline_dot_dict[airline] != DOT:
                print('something wrong : ', airline)

## 2.2. train의 DOT 1개당 airline 1개인지 파악하고 이전과 일치하는지 파악
for DOT, df in train.groupby('Carrier_ID(DOT)')['Airline']:
    airline = df[df.notnull()].unique()
    if airline:  # null아닌 값이 존재한다면 dict에 넣기
        airline = airline[0]
        
        if airline not in airline_dot_dict:
            airline_dot_dict[airline] = DOT
        else:
            if airline_dot_dict[airline] != DOT:
                print('something wrong : ', airline)
                
## 2.3. test 의 airline 1개 당 DOT 1개인지 파악하고 이전과 일치하는지 파악
for airline, df in test.groupby('Airline')['Carrier_ID(DOT)']:
    DOT = df[df.notnull()].unique()
    if DOT:  # null아닌 값이 존재한다면 dict에 넣기
        DOT = DOT[0]
        
        if airline not in airline_dot_dict:
            airline_dot_dict[airline] = DOT
        else:
            if airline_dot_dict[airline] != DOT:
                print('something wrong : ', airline)

## 2.4. test의 DOT 1개당 airline 1개인지 파악하고 이전과 일치하는지 파악
for DOT, df in test.groupby('Carrier_ID(DOT)')['Airline']:
    airline = df[df.notnull()].unique()
    if airline:  # null아닌 값이 존재한다면 dict에 넣기
        airline = airline[0]
        
        if airline not in airline_dot_dict:
            airline_dot_dict[airline] = DOT
        else:
            if airline_dot_dict[airline] != DOT:
                print('something wrong : ', airline)

In [115]:
# 3. { DOT : Airline } dict 만들기
print(airline_dot_dict)
dot_airline_dict = { dot : airline for airline, dot in airline_dot_dict.items()}
print(dot_airline_dict)

{'Air Wisconsin Airlines Corp': 20046.0, 'Alaska Airlines Inc.': 19930.0, 'Allegiant Air': 20368.0, 'American Airlines Inc.': 19805.0, 'Cape Air': 20253.0, 'Capital Cargo International': 20427.0, 'Comair Inc.': 20397.0, 'Commutair Aka Champlain Enterprises, Inc.': 20445.0, 'Compass Airlines': 21167.0, 'Delta Air Lines Inc.': 19790.0, 'Empire Airlines Inc.': 20263.0, 'Endeavor Air Inc.': 20363.0, 'Envoy Air': 20398.0, 'ExpressJet Airlines Inc.': 20366.0, 'Frontier Airlines Inc.': 20436.0, 'GoJet Airlines, LLC d/b/a United Express': 20500.0, 'Hawaiian Airlines Inc.': 19690.0, 'Horizon Air': 19687.0, 'JetBlue Airways': 20409.0, 'Mesa Airlines Inc.': 20378.0, 'Peninsula Airways Inc.': 20225.0, 'Republic Airlines': 20452.0, 'SkyWest Airlines Inc.': 20304.0, 'Southwest Airlines Co.': 19393.0, 'Spirit Air Lines': 20416.0, 'Trans States Airlines': 20237.0, 'United Air Lines Inc.': 19977.0, 'Virgin America': 21171.0}
{20046.0: 'Air Wisconsin Airlines Corp', 19930.0: 'Alaska Airlines Inc.', 2036

In [116]:
# 4. train 결측치 처리
## 4.1 Airline 결측치 처리
for i in train[train['Airline'].isnull()].index:
    DOT = train.loc[i, 'Carrier_ID(DOT)']
    if DOT in dot_airline_dict:
        train.loc[i, 'Airline'] = dot_airline_dict[DOT]
train.isnull().sum()

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time    109019
Estimated_Arrival_Time      109040
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                     0
Destination_Airport              0
Destination_Airport_ID           0
Destination_State                0
Distance                         0
Airline                      11883
Carrier_Code(IATA)          108990
Carrier_ID(DOT)             108997
Tail_Number                      0
Delay                       744999
dtype: int64

In [117]:
##. 4.2 DOT 결측치 처리
for i in train[train['Carrier_ID(DOT)'].isnull()].index:
    airline = train.loc[i, 'Airline']
    if airline in airline_dot_dict:
        train.loc[i, 'Carrier_ID(DOT)'] = airline_dot_dict[airline]
train.isnull().sum()

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time    109019
Estimated_Arrival_Time      109040
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                     0
Destination_Airport              0
Destination_Airport_ID           0
Destination_State                0
Distance                         0
Airline                      11883
Carrier_Code(IATA)          108990
Carrier_ID(DOT)              11883
Tail_Number                      0
Delay                       744999
dtype: int64

In [118]:
# 5. test 결측치 처리
for i in test[test['Airline'].isnull()].index:
    DOT = test.loc[i, 'Carrier_ID(DOT)']
    if DOT in dot_airline_dict:
        test.loc[i, 'Airline'] = dot_airline_dict[DOT]
        
for i in test[test['Carrier_ID(DOT)'].isnull()].index:
    airline = test.loc[i, 'Airline']
    if airline in airline_dot_dict:
        test.loc[i, 'Carrier_ID(DOT)'] = airline_dot_dict[airline]

In [119]:
test.isnull().sum()

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time    108984
Estimated_Arrival_Time      109048
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                    28
Destination_Airport              0
Destination_Airport_ID           0
Destination_State               21
Distance                         0
Airline                      11543
Carrier_Code(IATA)          108993
Carrier_ID(DOT)              11543
Tail_Number                      0
dtype: int64

### 결측치 처리한 것 저장

In [120]:
save_idx = input('몇 번째 전처리 방법인지 정수를 입력하세요 : ')
train_save_name = 'train_preprocess_' + save_idx
test_save_name = 'test_preprocess_' + save_idx
train.to_parquet(f'./data/{train_save_name}.parquet')
test.to_parquet(f'./data/{test_save_name}.parquet')

몇 번째 전처리 방법인지 정수를 입력하세요 : 2


### ===================결측치 이후 =======================================

In [50]:
# 카테고리 변수
category_cols = ['Month', 'Day_of_Month', 'Cancelled', 'Diverted', 
                 'Origin_Airport','Origin_Airport_ID', 'Origin_State', 
                 'Destination_Airport', 'Destination_Airport_ID', 'Destination_State',
                 'Airline', 'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number', 
                 'Delay']



# 수치형 변수
numeric_cols = [ 'Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Distance']


# 중복되어서 버릴 변수
['Destination_Airport_ID', 'Origin_Airport_ID']


print(train.columns)


Index(['ID', 'Month', 'Day_of_Month', 'Estimated_Departure_Time',
       'Estimated_Arrival_Time', 'Cancelled', 'Diverted', 'Origin_Airport',
       'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
       'Destination_Airport_ID', 'Destination_State', 'Distance', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number', 'Delay'],
      dtype='object')


In [81]:
# 레이블(Delay)을 제외한 결측값이 존재하는 변수들을  -1 value로 바꾸고, lable을 가지도록 합니다.
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    if type(mode) == str:
        train[col] = train[col].fillna('null')
    else:
        train[col] = train[col].fillna(-1)
    
    if col in test.columns:
        if type(mode) == str:
            test[col] = test[col].fillna('null')
        else:
            test[col] = test[col].fillna(-1)
print('Done.')

Done.


In [82]:
# 질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: # train에 없는 label인 경우
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [None]:
train.head()

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,600.0,1900.0,0,0,252,13851,34,159,12191,42,419.0,23,10,19393.0,4319,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,195,12892,4,119,11618,28,2454.0,26,8,19393.0,3021,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,322,14771,4,7,10157,4,250.0,22,8,20304.0,556,


In [83]:
# 레이블이 없는 데이터들을 제거합니다
train = train.dropna()
print(train.shape)

# Delay -> 1, Not_Delayed -> 0
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

train.head()

(255001, 19)
Done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))


Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
5,TRAIN_000005,4,13,1545.0,-1.0,0,0,119,11618,28,93,11278,47,199.0,21,8,20452.0,3435,Not_Delayed,0
6,TRAIN_000006,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19977.0,3495,Not_Delayed,0
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,59,10821,18,74,11057,31,361.0,23,10,19393.0,4083,Not_Delayed,0
10,TRAIN_000010,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,20452.0,241,Delayed,1
12,TRAIN_000012,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,Not_Delayed,0


In [84]:
# train & test 데이터 생성
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

### train_test_split

In [85]:
from sklearn.model_selection  import train_test_split
train_x, val_x, train_y, val_y = train_test_split(train.drop(columns=['ID', 'Delay', 'Delay_num']), train['Delay_num'], test_size=0.2, random_state=42)

print(train_x.shape, val_x.shape)
print(train_y.value_counts())
print(val_y.value_counts())

(204000, 17) (51001, 17)
0    168109
1     35891
Name: Delay_num, dtype: int64
0    41892
1     9109
Name: Delay_num, dtype: int64


### 여러 모델 적합
Extra Trees Classifier	
Random Forest Classifier	
Light Gradient Boosting Machine
Decision Tree Classifier	
Gradient Boosting Classifier	
Ada Boost Classifier	
Logistic Regression

In [None]:
# SMOTE 없이 적합
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


# 모델 정의
models = {
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Light Gradient Boosting Machine": LGBMClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
}

# 각 모델의 성능을 비교
for name, model in models.items():
    model.fit(train_x, train_y)
    y_pred = model.predict_proba(val_x)
    loss = log_loss(val_y, y_pred)
    print(f"{name}: Log Loss = {loss:.4f}")

Extra Trees Classifier: Log Loss = 0.4799
Random Forest Classifier: Log Loss = 0.4646
Light Gradient Boosting Machine: Log Loss = 0.4433
Decision Tree Classifier: Log Loss = 10.5274
Gradient Boosting Classifier: Log Loss = 0.4480
Ada Boost Classifier: Log Loss = 0.6823
Logistic Regression: Log Loss = 0.4621


In [None]:
# SMOTE 이용해서 적합
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


# SMOTE로 데이터 불균형 완화
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(train_x, train_y)


# 모델 정의
models = {
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Light Gradient Boosting Machine": LGBMClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
}

# 각 모델의 성능을 비교
for name, model in models.items():
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict_proba(val_x)
    loss = log_loss(val_y, y_pred)
    print(f"{name}: Log Loss = {loss:.4f}")

Extra Trees Classifier: Log Loss = 0.5113
Random Forest Classifier: Log Loss = 0.5092
Light Gradient Boosting Machine: Log Loss = 0.5127
Decision Tree Classifier: Log Loss = 11.6489
Gradient Boosting Classifier: Log Loss = 0.6037
Ada Boost Classifier: Log Loss = 0.6896
Logistic Regression: Log Loss = 0.6813


### 모델적합
+ 최저의 Loss를 갖는 모델 적합
    + lgbm 이용

In [10]:
# grid search 1
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier

# 전체 데이터 이용
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

# 모델 정의
lgbm = LGBMClassifier(random_state=42)

# 그리드 서치를 위한 파라미터 그리드 정의
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "num_leaves": [7, 15, 31],
}

# 그리드 서치를 위한 GridSearchCV 객체 생성
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# 그리드 서치를 수행
grid_search.fit(train_x, train_y)

# 최적의 파라미터와 최고의 점수 출력
print(f"최적의 파라미터: {grid_search.best_params_}")
print(f"최고의 점수: {-1 * grid_search.best_score_:.4f}")

# 최적의 파라미터를 사용한 최종 모델
best_lgbm = grid_search.best_estimator_


Fitting 5 folds for each of 81 candidates, totalling 405 fits
최적의 파라미터: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'num_leaves': 31}
최고의 점수: 0.4395


In [14]:
# grid search2 from grid search1
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier

# 전체 데이터 이용
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

# 모델 정의
lgbm = LGBMClassifier(random_state=42)

# 그리드 서치를 위한 파라미터 그리드 정의
param_grid2 = {
    "n_estimators": [150, 200, 250, 300],
    "max_depth": [5, 7, 9],
    "learning_rate": [0.05, 0.075, 0.1, 0.125],
    "num_leaves": [20, 30, 40],
}

# 그리드 서치를 위한 GridSearchCV 객체 생성
grid_search2 = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid2,
    scoring="neg_log_loss",
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# 그리드 서치를 수행
grid_search2.fit(train_x, train_y)

# 최적의 파라미터와 최고의 점수 출력
print(f"최적의 파라미터: {grid_search2.best_params_}")
print(f"최고의 점수: {-1 * grid_search2.best_score_:.4f}")

# 최적의 파라미터를 사용한 최종 모델
best_lgbm2 = grid_search2.best_estimator_


Fitting 5 folds for each of 144 candidates, totalling 720 fits
최적의 파라미터: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'num_leaves': 30}
최고의 점수: 0.4391


In [88]:
# 전처리 방법 바꾼 후 test
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier

# 전체 데이터 이용
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

# 모델 정의
lgbm = LGBMClassifier(random_state=42, learning_rate=0.1, max_depth=7, n_estimators=300, num_leaves=30)
lgbm.fit(train_x, train_y)

### 예측

In [11]:
y_pred = best_lgbm.predict_proba(test_x)

In [15]:
y_pred2 = best_lgbm2.predict_proba(test_x)

In [89]:
y_pred3 = lgbm.predict_proba(test_x)

### 제출

In [12]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission_lgbm.csv', index=True)

In [16]:
submission2 = pd.DataFrame(data=y_pred2, columns=sample_submission.columns, index=sample_submission.index)
submission2.to_csv('submission_lgbm2.csv', index=True)

In [90]:
submission3 = pd.DataFrame(data=y_pred3, columns=sample_submission.columns, index=sample_submission.index)
submission3.to_csv('submission_lgbm3.csv', index=True)