In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir('/content/drive/MyDrive/github/Dacon/항공편지연예측AI경진대회(2023.04.03-2023.05.08)')
os.getcwd()

'/content/drive/MyDrive/github/Dacon/항공편지연예측AI경진대회(2023.04.03-2023.05.08)'

## 1. 결측치처리
**해당 노트북**
+ 전처리방법2-3 + y 결측치를 예측하기 + validation set 만들기

In [3]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

### 1.1. 전처리방법2 데이터 가져오기

In [63]:
train = pd.read_parquet('./data/train_preprocess_2.parquet')
# test = pd.read_parquet('./test.parquet')
test = pd.read_parquet('./data/test_preprocess_2.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

print(train.shape)
print(train.Delay.value_counts())

(1000000, 19)
Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64


### 1.2. 남은 결측치 처리 - 삭제

In [66]:
# print(train.isnull().sum())
# print(train.dropna().shape)
# print(train.dropna().isnull().sum())
train = train.dropna(subset=['Estimated_Departure_Time','Estimated_Arrival_Time','Carrier_Code(IATA)','Airline','Carrier_ID(DOT)'])
train.isnull().sum()


# 레이블(Delay)을 제외한 결측값이 존재하는 변수들을 unknown으로 대체합니다.
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    # mode = train[col].mode()[0]
    # train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna('Unknown')
print('Done.')

Done.


In [67]:
# 질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: # train에 없는 label인 경우
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

  if label not in le.classes_: # train에 없는 label인 경우


Done.


### 1.3. y값 결측 데이터 추출

In [68]:
# 레이블이 없는 데이터들을 따로 추출합니다.
null_train = train[train['Delay'].isnull()].copy()
null_train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
1,TRAIN_000001,8,15,740.0,1024.0,0,0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,195,12892,4,119,11618,28,2454.0,26,8,19977.0,3013,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,322,14771,4,7,10157,4,250.0,22,8,20304.0,554,
7,TRAIN_000007,4,20,1815.0,1955.0,0,0,256,13930,11,217,13198,23,403.0,22,8,20304.0,173,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,256,13930,11,270,14100,36,678.0,26,8,19977.0,2471,
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,122,11637,32,242,13487,21,223.0,22,3,20304.0,2288,
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,248,13796,4,159,12191,42,1642.0,23,10,19393.0,991,
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,45,10693,41,22,10397,8,214.0,9,3,19790.0,6189,


### 1.4. 결측없는 데이터 추출

In [69]:
# 레이블이 없는 데이터들을 삭제합니다.
train = train.dropna()
print(train.shape)

# Delay -> 1, Not_Delayed -> 0
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

train.head()

(178176, 19)
Done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))


Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
6,TRAIN_000006,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19977.0,3487,Not_Delayed,0
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,59,10821,18,74,11057,31,361.0,23,10,19393.0,4071,Not_Delayed,0
10,TRAIN_000010,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,20452.0,241,Delayed,1
12,TRAIN_000012,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5156,Not_Delayed,0
13,TRAIN_000013,9,19,615.0,706.0,0,0,215,13158,42,94,11292,5,563.0,22,8,20304.0,577,Not_Delayed,0


### 1.5. train, validation 나누기

In [70]:
from sklearn.model_selection import train_test_split
train1, val1 = train_test_split(train, random_state=42, test_size=0.2)
print(train.shape, train1.shape, val1.shape)

(178176, 20) (142540, 20) (35636, 20)


## 2. 준지도학습진행

### 2.1. y값 예측모델 구축1
+ lgbm 이용해서 y값 예측
    + 기존에 좋은 예측 정확도가 나왔던 {'learning_rate': 0.075, 'max_depth': 8, 'n_estimators': 400, 'num_leaves': 35} 모형 사용

In [71]:
from lightgbm import LGBMClassifier

# train1 데이터 이용헤서 모델 적합
train_x = train1.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train1['Delay_num']
null_train_x = null_train.drop(columns=['ID','Delay'])

# 모델 정의
lgbm = LGBMClassifier(random_state=42, learning_rate=0.075, max_depth=8, n_estimators=400, num_leaves=35)

# 모델 fit
lgbm.fit(train_x, train_y)

# 예측
pred_y = lgbm.predict_proba(null_train_x)
pred_y # [Not Delayed, Delayed]


array([[0.77829525, 0.22170475],
       [0.7516171 , 0.2483829 ],
       [0.75301447, 0.24698553],
       ...,
       [0.84875011, 0.15124989],
       [0.79058554, 0.20941446],
       [0.8458111 , 0.1541889 ]])

In [72]:
null_train_x['NotDelayed'] = pred_y[:,0]
null_train_x['Delayed'] = pred_y[:,1]
null_train_x[null_train_x['NotDelayed']>0.9]

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,NotDelayed,Delayed
9,6,6,650.0,838.0,0,0,207,12992,3,169,12266,42,374.0,13,8,20366.0,468,0.922365,0.077635
11,3,18,600.0,748.0,0,0,270,14100,36,103,11433,20,453.0,9,3,19790.0,1855,0.952534,0.047466
14,12,18,845.0,855.0,0,0,271,14107,2,193,12889,26,255.0,23,10,19393.0,3726,0.910124,0.089876
22,3,6,1030.0,1150.0,0,0,57,10792,30,167,12264,47,283.0,0,8,20046.0,2183,0.911253,0.088747
26,8,26,644.0,805.0,0,0,57,10792,30,47,10721,19,395.0,18,2,20409.0,1441,0.940960,0.059040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999935,7,7,650.0,755.0,0,0,328,14831,4,58,10800,4,296.0,23,10,19393.0,4318,0.968947,0.031053
999964,7,2,830.0,1010.0,0,0,93,11278,47,217,13198,23,949.0,21,0,20452.0,2106,0.919053,0.080947
999973,1,9,1035.0,1440.0,0,0,195,12892,4,250,13830,9,2486.0,27,9,21171.0,1336,0.902894,0.097106
999976,10,9,600.0,715.0,0,0,135,11884,48,320,14747,48,224.0,17,1,19687.0,2324,0.943784,0.056216


### 2.2. 임계치 이상의 y값들을 0으로 예측


In [86]:
# 0.9 이상인 것들 지연 안됨으로 예측
semi_super_train3 = null_train[null_train_x['NotDelayed']>0.96]
semi_super_train3['Delay_num'] = 0
semi_super_train3


# 예측값 중 max index 찾아 semi_super_train의 y값으로 넣어주기
# max_prob_idx = np.argmax(pred_y, axis=1)
# print(max_prob_idx)
# from collections import Counter
# print('불연착/연착 분포 : ',Counter(max_prob_idx))
# semi_super_train['Delay_num'] = max_prob_idx
# semi_super_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  semi_super_train3['Delay_num'] = 0


Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
406,TRAIN_000406,12,7,730.0,800.0,0,0,72,11042,33,229,13342,50,328.0,23,10,19393.0,935,,0
478,TRAIN_000478,5,29,527.0,659.0,0,0,30,10529,6,93,11278,47,313.0,6,0,20397.0,1125,,0
619,TRAIN_000619,4,3,645.0,1150.0,0,0,335,14908,4,90,11259,42,1216.0,23,10,19393.0,1090,,0
701,TRAIN_000701,11,9,610.0,730.0,0,0,72,11042,33,59,10821,18,314.0,23,10,19393.0,3998,,0
870,TRAIN_000870,1,8,745.0,935.0,0,0,356,15376,2,94,11292,5,639.0,23,10,19393.0,4907,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998766,TRAIN_998766,9,25,625.0,743.0,0,0,60,10849,24,331,14869,45,347.0,9,3,19790.0,1828,,0
998808,TRAIN_998808,5,31,615.0,729.0,0,0,33,10561,4,322,14771,4,238.0,22,8,20304.0,5468,,0
999403,TRAIN_999403,3,31,831.0,918.0,0,0,189,12758,9,157,12173,9,163.0,16,6,19690.0,2495,,0
999429,TRAIN_999429,10,31,605.0,745.0,0,0,307,14635,7,22,10397,8,515.0,23,10,19393.0,4967,,0


### 2.3. 기존 데이터와 준지도학습 데이터 합치기

In [87]:
# 데이터 합치기
semi_super_train = pd.concat([train1, semi_super_train3])
semi_super_train.sort_index(inplace=True)
semi_super_train


Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,59,10821,18,74,11057,31,361.0,23,10,19393.0,4071,Not_Delayed,0
10,TRAIN_000010,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,20452.0,241,Delayed,1
12,TRAIN_000012,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5156,Not_Delayed,0
13,TRAIN_000013,9,19,615.0,706.0,0,0,215,13158,42,94,11292,5,563.0,22,8,20304.0,577,Not_Delayed,0
19,TRAIN_000019,7,14,1907.0,2145.0,0,0,30,10529,6,22,10397,8,859.0,9,3,19790.0,6129,Not_Delayed,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999941,TRAIN_999941,8,8,655.0,757.0,0,0,95,11298,42,335,14908,4,1205.0,3,0,19805.0,6217,Not_Delayed,0
999943,TRAIN_999943,9,30,1100.0,1155.0,0,0,174,12339,12,193,12889,26,1590.0,23,10,19393.0,4907,Not_Delayed,0
999950,TRAIN_999950,11,7,1401.0,1950.0,0,0,287,14262,4,256,13930,11,1652.0,26,8,19977.0,5362,Not_Delayed,0
999955,TRAIN_999955,4,12,1310.0,1356.0,0,0,320,14747,48,266,14057,35,129.0,17,1,19687.0,2244,Delayed,1


### 2.4. 준지도학습 결과 validation

+ 준지도 없는 경우 validation score
```
Extra Trees Classifier: Log Loss = 0.5265
Random Forest Classifier: Log Loss = 0.4804
Light Gradient Boosting Machine: Log Loss = 0.4399
Decision Tree Classifier: Log Loss = 10.2489
Gradient Boosting Classifier: Log Loss = 0.4437
Ada Boost Classifier: Log Loss = 0.6821
Logistic Regression: Log Loss = 0.4554
```

+ semi_super_train1 : 0.9
```
Extra Trees Classifier: Log Loss = 0.7188
Random Forest Classifier: Log Loss = 0.6225
Light Gradient Boosting Machine: Log Loss = 0.4508
Decision Tree Classifier: Log Loss = 9.5804
Gradient Boosting Classifier: Log Loss = 0.4542
Ada Boost Classifier: Log Loss = 0.6788
Logistic Regression: Log Loss = 0.4711
```
+ semi_super_train2 : 0.95
```
Extra Trees Classifier: Log Loss = 0.5437
Random Forest Classifier: Log Loss = 0.4938
Light Gradient Boosting Machine: Log Loss = 0.4410
Decision Tree Classifier: Log Loss = 10.1023
Gradient Boosting Classifier: Log Loss = 0.4445
Ada Boost Classifier: Log Loss = 0.6815
Logistic Regression: Log Loss = 0.4563
```


+ semi_super_train3 : 0.96
```
Extra Trees Classifier: Log Loss = 0.5296
Random Forest Classifier: Log Loss = 0.4845
Light Gradient Boosting Machine: Log Loss = 0.4401
Decision Tree Classifier: Log Loss = 10.2965
Gradient Boosting Classifier: Log Loss = 0.4440
Ada Boost Classifier: Log Loss = 0.6818
Logistic Regression: Log Loss = 0.4556
```

In [88]:
# train, semi_super_train, val의 shape 파악

print(train1.shape, semi_super_train.shape, val1.shape)

(142540, 20) (147903, 20) (35636, 20)


In [89]:
# 1. validation data 모델에 적합하게 바꿔주기 (val1)

# val_x & val_y

val_x = val1.drop(columns=['ID', 'Delay', 'Delay_num'])
val_y =  val1['Delay_num']


# 2. 준지도학습 경우 데이터 적합하게 바꿔주기

## 2.1. 준지도학습한 경우 (train1 + 준지도)
# semi_train_x & semi_train_y
semi_train_x = semi_super_train.drop(columns=['ID', 'Delay', 'Delay_num'])
semi_train_y =  semi_super_train['Delay_num']

## 2.2. 준지도학습 없이 기존 데이터 (train1)
# train_x &  train_y

train_x = train1.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train1['Delay_num']




print(val_x.shape, semi_train_x.shape, train_x.shape)

(35636, 17) (147903, 17) (142540, 17)


In [90]:
# 1. semi 의 경우 validation
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


# 모델 정의
models = {
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Light Gradient Boosting Machine": LGBMClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
}

# 각 모델의 성능을 비교
for name, model in models.items():
    model.fit(semi_train_x, semi_train_y)
    y_pred = model.predict_proba(val_x)
    loss = log_loss(val_y, y_pred)
    print(f"{name}: Log Loss = {loss:.4f}")

Extra Trees Classifier: Log Loss = 0.5296
Random Forest Classifier: Log Loss = 0.4845
Light Gradient Boosting Machine: Log Loss = 0.4401
Decision Tree Classifier: Log Loss = 10.2965
Gradient Boosting Classifier: Log Loss = 0.4440
Ada Boost Classifier: Log Loss = 0.6818
Logistic Regression: Log Loss = 0.4556


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [78]:
# 2. 기존 데이터 validation
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


# 모델 정의
models = {
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "Light Gradient Boosting Machine": LGBMClassifier(random_state=42),
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "Ada Boost Classifier": AdaBoostClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
}

# 각 모델의 성능을 비교
for name, model in models.items():
    model.fit(train_x, train_y)
    y_pred = model.predict_proba(val_x)
    loss = log_loss(val_y, y_pred)
    print(f"{name}: Log Loss = {loss:.4f}")

Extra Trees Classifier: Log Loss = 0.5265
Random Forest Classifier: Log Loss = 0.4804
Light Gradient Boosting Machine: Log Loss = 0.4399
Decision Tree Classifier: Log Loss = 10.2489
Gradient Boosting Classifier: Log Loss = 0.4437
Ada Boost Classifier: Log Loss = 0.6821
Logistic Regression: Log Loss = 0.4554


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 3. 최선의 결과로 준지도학습 데이터 생성

### 전처리방법5 저장

In [None]:
save_idx = input('몇 번째 전처리 방법인지 정수-정수를 입력하세요 : ')
train_save_name = 'train_preprocess_' + save_idx
test_save_name = 'test_preprocess_' + save_idx
train.to_parquet(f'./data/{train_save_name}.parquet')
test.to_parquet(f'./data/{test_save_name}.parquet')

몇 번째 전처리 방법인지 정수-정수를 입력하세요 : 4
