# EDA

In [57]:
import pandas as pd
import numpy as np
import random
import os
import gc
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

## Data Load

In [58]:
train = pd.read_csv('train.csv')
train = train.drop('ID',axis=1)

test = pd.read_csv('test.csv')
test = test.drop('ID',axis=1)

sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

## Data Pre-Processing

In [59]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [60]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


## Self Training

In [61]:
# 레이블 된 데이터, 아닌 데이터 분리
train_labled = train[train['Delay'].notnull()]
train_unlabled = train[train['Delay'].isnull()]

In [62]:
#Not_Delayed = 1, Delayed =0
train_labled['Delay'] = np.where(train_labled['Delay'] == 'Not_Delayed', 1, 0)
train_labled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_labled['Delay'] = np.where(train_labled['Delay'] == 'Not_Delayed', 1, 0)


Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
5,4,13,1545.0,1900.0,0,0,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,1
6,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,1
8,6,13,1420.0,1550.0,0,0,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,1
10,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,19393.0,241,0
12,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999962,10,11,600.0,2003.0,0,0,310,14683,42,256,13930,11,1041.0,22,8,20304.0,488,1
999963,5,2,1759.0,1926.0,0,0,204,12953,30,93,11278,47,214.0,23,3,20452.0,5204,0
999969,10,10,940.0,1056.0,0,0,223,13256,42,169,12266,42,316.0,19,8,20378.0,5350,0
999985,8,8,1914.0,2039.0,0,0,296,14492,31,183,12451,7,407.0,14,4,20436.0,1499,1


In [63]:
# 레이블이 된 데이터에서 delay변수 따로 저장
train_labled_x = train_labled.drop(["Delay"],axis=1)
train_labled_y = train_labled["Delay"]

In [64]:
train_unlabled_x = train_unlabled.drop(["Delay"],axis=1)

In [65]:
# self-training 모델을 설정
clf = SelfTrainingClassifier(RandomForestClassifier(), max_iter=100, verbose=True)

# labeled 데이터를 사용하여 모델을 학습
clf.fit(train_labled_x, train_labled_y)

# unlabeled 데이터를 사용하여 self-training을 수행
pred_unlabled = clf.predict(train_unlabled_x)
pred_unlabled



array([1, 1, 1, ..., 1, 1, 1])

In [66]:
# 예측한 데이터를 레이블되지 않은 데이터에 붙이기
train_unlabled["Delay"] = pred_unlabled
train_unlabled

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_unlabled["Delay"] = pred_unlabled


Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,4,15,600.0,1900.0,0,0,252,13851,34,159,12191,42,419.0,23,10,19393.0,4319,1
1,8,15,740.0,1024.0,0,0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,1
2,9,6,1610.0,1805.0,0,0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,1
3,7,10,905.0,1735.0,0,0,195,12892,4,119,11618,28,2454.0,26,8,19393.0,3021,1
4,1,11,900.0,1019.0,0,0,322,14771,4,7,10157,4,250.0,22,8,20304.0,556,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,9,18,936.0,1243.0,0,0,256,13930,4,270,14100,4,678.0,26,8,19977.0,2477,1
999996,5,30,920.0,1028.0,0,0,122,11637,4,242,13487,21,223.0,22,3,19393.0,2294,1
999997,6,28,800.0,1340.0,0,0,248,13796,4,159,12191,42,1642.0,23,10,19393.0,994,1
999998,9,27,1613.0,1824.0,0,0,45,10693,41,22,10397,4,214.0,9,3,19790.0,6207,1


In [67]:
# 다시 하나의 데이터로 합치기
selftrain_data = pd.concat([train_labled,train_unlabled])
selftrain_data

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
5,4,13,1545.0,1900.0,0,0,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,1
6,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,1
8,6,13,1420.0,1550.0,0,0,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,1
10,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,19393.0,241,0
12,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,9,18,936.0,1243.0,0,0,256,13930,4,270,14100,4,678.0,26,8,19977.0,2477,1
999996,5,30,920.0,1028.0,0,0,122,11637,4,242,13487,21,223.0,22,3,19393.0,2294,1
999997,6,28,800.0,1340.0,0,0,248,13796,4,159,12191,42,1642.0,23,10,19393.0,994,1
999998,9,27,1613.0,1824.0,0,0,45,10693,41,22,10397,4,214.0,9,3,19790.0,6207,1


# Model

In [68]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

In [69]:
selftrain_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 5 to 999999
Data columns (total 18 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Month                     1000000 non-null  int64  
 1   Day_of_Month              1000000 non-null  int64  
 2   Estimated_Departure_Time  1000000 non-null  float64
 3   Estimated_Arrival_Time    1000000 non-null  float64
 4   Cancelled                 1000000 non-null  int64  
 5   Diverted                  1000000 non-null  int64  
 6   Origin_Airport            1000000 non-null  int64  
 7   Origin_Airport_ID         1000000 non-null  int64  
 8   Origin_State              1000000 non-null  int64  
 9   Destination_Airport       1000000 non-null  int64  
 10  Destination_Airport_ID    1000000 non-null  int64  
 11  Destination_State         1000000 non-null  int64  
 12  Distance                  1000000 non-null  float64
 13  Airline                   10

In [70]:
selftrain_data['Delay'].value_counts()

1    938657
0     61343
Name: Delay, dtype: int64

In [71]:
train_x = selftrain_data.drop(['Delay'],axis=1)
train_y = selftrain_data['Delay']

In [72]:
X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, stratify=selftrain_data['Delay'], random_state=40)

In [73]:
# 스케일링
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [74]:
clf2 = RandomForestClassifier(criterion='log_loss',random_state=42)
clf2.fit(X_train_scaled, y_train)

y_pred2 = clf2.predict_proba(X_valid_scaled)

#예측 확률과 실제값 사이의 로그 손실 계산
logloss = log_loss(y_valid, y_pred2)
logloss

0.2572317285125607

# Test data predict

In [75]:
test

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number
0,12,16,1156.0,1900.0,0,0,169,12266,42,310,14683,42,191.0,26,8,19393.0,4387
1,9,12,1500.0,1715.0,0,0,119,11618,28,22,10397,4,746.0,9,3,19790.0,1936
2,3,6,1600.0,1915.0,0,0,256,13930,11,204,12953,30,733.0,26,8,19977.0,2147
3,5,18,1920.0,2045.0,0,0,248,13796,4,195,12892,4,337.0,23,10,19393.0,5486
4,7,7,1915.0,2152.0,0,0,127,11697,7,195,12892,4,2343.0,18,2,20409.0,5965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,6,2,600.0,2240.0,0,0,195,12892,4,94,11292,5,862.0,23,10,19393.0,3831
999996,6,8,600.0,648.0,0,0,57,10792,30,256,13930,11,473.0,26,8,19977.0,2069
999997,6,11,1710.0,1907.0,0,0,167,12264,4,103,11433,20,383.0,19,8,20378.0,2619
999998,11,17,600.0,500.0,0,0,309,14679,4,47,10721,19,2588.0,18,2,20409.0,6343


In [76]:
test_scaled = scaler.transform(test)
test_scaled

array([[ 0.83333333,  0.        , -0.07716371, ...,  0.16666667,
        -0.55190134,  0.37212276],
       [ 0.33333333, -0.26666667,  0.28154327, ..., -0.66666667,
        -0.14388489, -0.41144501],
       [-0.66666667, -0.66666667,  0.38581856, ...,  0.16666667,
         0.04830421, -0.34398977],
       ...,
       [-0.16666667, -0.33333333,  0.50052138, ...,  0.16666667,
         0.46043165, -0.19309463],
       [ 0.66666667,  0.06666667, -0.65693431, ..., -0.83333333,
         0.49229188,  0.99744246],
       [ 0.5       , -0.2       , -0.01042753, ...,  0.5       ,
        -0.55190134,  0.217711  ]])

In [77]:
predict = clf2.predict_proba(test)
predict



array([[0.31, 0.69],
       [0.31, 0.69],
       [0.31, 0.69],
       ...,
       [0.31, 0.69],
       [0.31, 0.69],
       [0.31, 0.69]])

In [78]:
submission = pd.DataFrame(data=predict, columns=sample_submission.columns, index=sample_submission.index)
submission

Unnamed: 0_level_0,Not_Delayed,Delayed
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
TEST_000000,0.31,0.69
TEST_000001,0.31,0.69
TEST_000002,0.31,0.69
TEST_000003,0.31,0.69
TEST_000004,0.31,0.69
...,...,...
TEST_999995,0.31,0.69
TEST_999996,0.31,0.69
TEST_999997,0.31,0.69
TEST_999998,0.31,0.69


In [79]:
submission.to_csv('submission_eda rf_ model rf_RobustScaler.csv', index=True)